Esempio n. 1
0
import codecs
import sys
import os
from wav_transcribe import wav_transcribe

sys.path.append("C:/Users/Daneel/GitHub/for-fun/top_words")
from top_words import top_words

# Folder that holds the WAV fragments of each candidate's speech.
WAV_PATH = "C:/Users/Daneel/Google Drive/Data Science/7D minuto decisivo"
# Candidates, processed in this order.
candidatos = ["sanchez", "iglesias", "rivera", "saenz"]
# Number of WAV fragments per candidate; fragment files are numbered from 1.
num_partes = {"sanchez": 7, "saenz": 8, "rivera": 7, "iglesias": 9}
# Transcribed text per candidate, filled in by the loop below.
text_speech = {"sanchez": "", "saenz": "", "rivera": "", "iglesias": ""}

# For every candidate: obtain the transcript (from a cached text file when one
# exists, otherwise by transcribing the WAV fragments and caching the result),
# then report the most used words.
#
# Every print call takes a single argument so the same line produces the same
# output under both the Python 2 print statement and the Python 3 function.
for candidato in candidatos:
    print("--- Discurso de " + candidato + ":")

    outfile = "minuto_decisivo_" + candidato + ".txt"
    if os.path.isfile(outfile):
        print("%s %s" % ("Cargando transcripcion de", outfile))
        # Read through a context manager so the handle is closed right away
        # instead of being left to the garbage collector.
        with codecs.open(outfile, "r", "utf8") as cached:
            text_speech[candidato] = cached.read()
    else:
        # No cached transcript: transcribe each numbered fragment and append
        # it, separated by a single space.
        for num in range(1, num_partes[candidato] + 1):
            WAV_FILE = WAV_PATH + "/" + "minuto_decisivo_" + candidato + "_" + str(num) + ".wav"
            text_speech[candidato] += " " + wav_transcribe(WAV_FILE, "es-ES")

        # Cache the transcript so later runs skip the transcription step.
        with codecs.open(outfile, "w", "utf8") as out:
            out.write("%s" % text_speech[candidato])

    # A unicode literal replaces the Python 2-only str.decode("utf8") call; it
    # yields the same text as long as the source file is UTF-8 encoded.
    print(u"Palabras más usadas:")
    top_words(text_speech[candidato], lang="spanish")
Esempio n. 2
0
    for art in listing:
        if art.startswith('article_') and art.endswith('.html'):
            article_list.append("")
            for line in codecs.open(art, 'r', 'utf8').readlines():
                if "Las revistas " in line or "Para poder comentar" in line:
                    isCuerpo = False
                if isCuerpo and line == "</div>" and prevline == "</div>":
                    isCuerpo = False
                    break
                if isCuerpo and line.startswith('<p>'):
                    article_list[-1] += " " + line.split('>')[1].split('<')[0]
                if 'cuerpo_' in line:
                    isCuerpo = True
                prevline = line

    return article_list


# The download step is disabled; only the extraction runs.
# download_articles()
# NOTE(review): presumably a list with one text string per article - confirm
# against extract_articles.
artlist = extract_articles()

# Print each article followed by its most common words.  Every print call takes
# a single pre-formatted argument so the output is the same under the Python 2
# print statement and the Python 3 print function.
for number, art in enumerate(artlist, 1):
    # The literals keep their own inner spaces, so the article number ends up
    # surrounded by two spaces on each side, as in the comma-separated print.
    print("%s %s %s" % ("----------------- ARTICLE ", number, " ------------------"))
    print(art)
    print("%s %s" % ("Most common words on article", number))
    top_words(art, "spanish")

# Finally, the word ranking over all the articles joined into one text.
print("")
print("Most common words on all the articles")
top_words(" ".join(artlist), "spanish")
Esempio n. 3
0
from top_words import top_words

# Folder that holds the WAV fragments of each candidate's speech.
WAV_PATH = "C:/Users/Daneel/Google Drive/Data Science/7D minuto decisivo"
# Candidates, processed in this order.
candidatos = ["sanchez", "iglesias", "rivera", "saenz"]
# Number of WAV fragments per candidate; fragment files are numbered from 1.
num_partes = {
    "sanchez": 7,
    "saenz": 8,
    "rivera": 7,
    "iglesias": 9,
}
# Transcribed text per candidate, filled in by the loop below.
text_speech = {
    "sanchez": "",
    "saenz": "",
    "rivera": "",
    "iglesias": "",
}

# For every candidate: load the cached transcript when its text file exists,
# otherwise transcribe the WAV fragments and cache the result; then report the
# most used words.
#
# Each print call receives exactly one argument, which keeps the output
# identical under the Python 2 print statement and the Python 3 function.
for candidato in candidatos:
    print("--- Discurso de " + candidato + ":")

    outfile = "minuto_decisivo_" + candidato + ".txt"
    if os.path.isfile(outfile):
        print("%s %s" % ("Cargando transcripcion de", outfile))
        # The context manager closes the file as soon as it has been read;
        # a bare codecs.open(...).read() leaves the handle open.
        with codecs.open(outfile, "r", "utf8") as cached:
            text_speech[candidato] = cached.read()
    else:
        # Transcribe fragments 1..num_partes[candidato], appending each one
        # after a single space.
        for num in range(1, num_partes[candidato] + 1):
            WAV_FILE = WAV_PATH + "/" + "minuto_decisivo_" + candidato + "_" + str(num) + ".wav"
            text_speech[candidato] += " " + wav_transcribe(WAV_FILE, "es-ES")

        # Save the transcript so the next run can load it instead.
        with codecs.open(outfile, "w", "utf8") as out:
            out.write("%s" % text_speech[candidato])

    # Unicode literal instead of the Python 2-only str.decode("utf8"); same
    # text provided the source file is UTF-8 encoded.
    print(u"Palabras más usadas:")
    top_words(text_speech[candidato], lang="spanish")
Esempio n. 4
0
    prevline = ""
    for art in listing:
        if art.startswith('article_') and art.endswith('.html'):
            article_list.append("")
            for line in codecs.open(art,'r','utf8').readlines():
                if "Las revistas " in line or "Para poder comentar" in line:
                    isCuerpo = False
                if isCuerpo and line == "</div>" and prevline == "</div>":
                    isCuerpo = False
                    break
                if isCuerpo and line.startswith('<p>'):
                    article_list[-1] += " " + line.split('>')[1].split('<')[0]
                if 'cuerpo_' in line:
                    isCuerpo = True
                prevline = line

    return article_list

# The download step is disabled; only the extraction runs.
# download_articles()
# NOTE(review): presumably a list with one text string per article - confirm
# against extract_articles.
artlist = extract_articles()

# Report per article: a banner, the article text and its most common words.
# Single-argument print calls behave the same as a Python 2 statement and as a
# Python 3 function call.
for position, art in enumerate(artlist):
    article_number = position + 1
    # Banner literals carry their own inner spaces; joining them with one more
    # space reproduces the spacing of the comma-separated print statement.
    banner = " ".join(["----------------- ARTICLE ", str(article_number), " ------------------"])
    print(banner)
    print(art)
    print(" ".join(["Most common words on article", str(article_number)]))
    top_words(art, "spanish")

# Word ranking over the concatenation of all the articles.
print("")
print("Most common words on all the articles")
top_words(" ".join(artlist), "spanish")