Esempio n. 1
0
def genVocab(vocabfile):
    """Build a token-frequency vocabulary from the database and write it out.

    Rows are read from the ``news`` and ``crawldata`` tables.  A row whose
    URL has already been seen (in either table) is skipped.  The title, the
    brief (``news`` rows only) and the content are each passed through
    ``Data.extract_html`` and their space-separated tokens are counted.

    Side effects:
        * ``./data/len.csv`` is written with the extracted brief/content
          lengths of every row whose brief length is greater than zero.
        * *vocabfile* is written with one ``"<token> <count>"`` line per
          entry.  Tokens counted fewer than 5 times are folded into the
          ``Data.UNKNOWN_TOKEN`` entry; the pad / sentence-start /
          sentence-end tokens are written with a count of -1.

    Args:
        vocabfile: Path of the vocabulary file to create (overwritten).
    """
    mysql = MySQL()
    mysql.login()

    vocab = defaultdict(int)

    def imdict(ab):
        """Add every accepted space-separated token of *ab* to ``vocab``."""
        for a in ab.split(" "):
            a = a.strip()
            # Drop English words that are entirely lowercase (translated
            # original note), plus empty tokens and anything matching
            # ``rec0``.  ``rec`` / ``rec0`` are patterns defined outside
            # this function.
            if not a or (rec.match(a) and a.islower()) or rec0.match(a):
                continue
            vocab[a] += 1

    urlset = set()
    dalist = []
    try:
        cursor = mysql.get_cursor()
        for table in ("news", "crawldata"):
            cursor.execute("select title,brief,content,url from %s where 1" % table)

            for title, brief, content, url in cursor.fetchall():
                # De-duplicate by URL across both tables.
                if url in urlset:
                    continue
                urlset.add(url)

                imdict(Data.extract_html(title, False))

                # Only the ``news`` table contributes a brief.
                brieflen = 0
                if table == "news" and brief is not None:
                    brief = re.sub("摘要:", "", brief)
                    brief = Data.extract_html(brief, False)
                    imdict(brief)
                    brieflen = len(brief)

                # A NULL content column would make re.sub raise TypeError
                # and abort the whole run; skip such rows instead.
                if content is None:
                    continue

                # NOTE(review): the parentheses are unescaped, so they form
                # a regex group and the trailing lazy ``.*?`` matches the
                # empty string -- only the literal text "资料图图源:" is
                # removed.  If literal parentheses were intended, the
                # pattern needs escaping; left unchanged here.
                content = re.sub("资料图(图源:.*?)", "", content)
                try:
                    content = Data.extract_html(content)
                except Exception:
                    # Best effort: rows whose content cannot be extracted
                    # are skipped (their title/brief are already counted).
                    continue
                # Fixed pause per processed row, kept from the original;
                # its purpose is not visible in this function.
                time.sleep(0.1)
                imdict(content)
                dalist.append([brieflen, len(content)])
    finally:
        # Release the connection even if a query or extraction fails.
        mysql.close()

    data = pd.DataFrame(columns=["brief", "content"], data=dalist)
    data = data[data['brief'] > 0]
    data.to_csv("./data/len.csv", index=False)

    newvocab = {
        Data.UNKNOWN_TOKEN: 0,
        Data.PAD_TOKEN: -1,
        Data.SENTENCE_START: -1,
        Data.SENTENCE_END: -1,
    }
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            # Rare tokens are accounted for under the unknown token.
            newvocab[Data.UNKNOWN_TOKEN] += value

    # NOTE(review): the file is opened with the platform default encoding;
    # confirm what the vocabulary reader expects before pinning one.
    with open(vocabfile, 'w') as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
Esempio n. 2
0
def ExampleGen(num_epochs=None):
    """Yield ``(title, content, brief)`` examples from the ``news`` table.

    Each epoch re-runs the query for rows with a non-empty brief and yields
    one tuple per row, with ``content`` and ``brief`` passed through
    ``extract_html`` (the title is yielded as stored).

    Args:
        num_epochs: Number of passes over the table.  ``None`` (the default)
            repeats indefinitely.

    Yields:
        Tuples ``(title, extracted_content, extracted_brief)``.
    """
    epoch = 0
    mysql = MySQL(sqldb="HWCom")
    mysql.login()
    try:
        cursor = mysql.get_cursor()
        sent = "select title,brief,content from news where brief !=''"
        while num_epochs is None or epoch < num_epochs:
            cursor.execute(sent)
            for title, brief, content in cursor.fetchall():
                yield (title, extract_html(content), extract_html(brief, False))
            epoch += 1
    finally:
        # Runs when the epochs are exhausted, on error, or when the consumer
        # closes / discards the generator, so the connection is not leaked.
        mysql.close()