def genVocab(vocabfile):
    """Build a token-frequency vocabulary from the database and write it out.

    Reads ``title, brief, content, url`` from the ``news`` and ``crawldata``
    tables, skipping any row whose ``url`` was already seen. Titles, briefs
    (``news`` table only) and contents are passed through
    ``Data.extract_html`` and their space-separated tokens are counted.

    Side effects:
        * Writes ``./data/len.csv`` with the lengths of the extracted brief
          and content for every row that has a non-empty brief.
        * Writes *vocabfile*, one ``"<token> <count>"`` line per entry.
          Tokens seen fewer than 5 times are folded into the count of
          ``Data.UNKNOWN_TOKEN``; the PAD / sentence start / sentence end
          tokens are written with a count of ``-1``.

    Args:
        vocabfile: Path of the vocabulary file to (over)write.
    """
    mysql = MySQL()
    mysql.login()
    vocab = defaultdict(int)

    def imdict(ab):
        """Add the space-separated tokens of *ab* to the ``vocab`` counts."""
        for a in ab.split(" "):
            a = a.strip()
            # Drop English words that are entirely lowercase (tokens matching
            # ``rec`` that are all lowercase), empty tokens, and anything
            # matching ``rec0``. Both patterns are defined outside this
            # function.
            if not a or (rec.match(a) and a.islower()) or rec0.match(a):
                continue
            vocab[a] += 1

    urlset = set()
    dalist = []
    try:
        cursor = mysql.get_cursor()
        for table in ("news", "crawldata"):
            sent = "select title,brief,content,url from %s where 1" % table
            cursor.execute(sent)
            for title, brief, content, url in cursor.fetchall():
                # The same article may appear in both tables; count it once.
                if url in urlset:
                    continue
                urlset.add(url)

                title = Data.extract_html(title, False)
                imdict(title)

                if table == "news" and brief is not None:
                    brief = re.sub("摘要:", "", brief)
                    brief = Data.extract_html(brief, False)
                    imdict(brief)
                    brieflen = len(brief)
                else:
                    brieflen = 0

                # NOTE(review): the parentheses in this pattern form a regex
                # group rather than matching literal parentheses, so only the
                # text "资料图图源:" is removed. Confirm whether escaped
                # (literal) parentheses were intended.
                content = re.sub("资料图(图源:.*?)", "", content)
                try:
                    content = Data.extract_html(content)
                except Exception:
                    # Best effort: skip rows whose content cannot be
                    # extracted, but let KeyboardInterrupt / SystemExit
                    # propagate (a bare ``except`` would swallow them).
                    continue
                # Short pause between rows, kept from the original code; the
                # reason for it is not evident from this function.
                time.sleep(0.1)
                imdict(content)
                dalist.append([brieflen, len(content)])
    finally:
        # Release the connection even if a row fails half-way through.
        mysql.close()

    # Length statistics, restricted to rows that actually have a brief.
    data = pd.DataFrame(columns=["brief", "content"], data=dalist)
    data = data[data['brief'] > 0]
    data.to_csv("./data/len.csv", index=False)

    newvocab = {
        Data.UNKNOWN_TOKEN: 0,
        Data.PAD_TOKEN: -1,
        Data.SENTENCE_START: -1,
        Data.SENTENCE_END: -1,
    }
    for key, value in vocab.items():
        if value >= 5:
            newvocab[key] = value
        else:
            # Rare tokens contribute their occurrences to the unknown token.
            newvocab[Data.UNKNOWN_TOKEN] += value

    with open(vocabfile, 'w') as f:
        for word, num in newvocab.items():
            f.write(word + " " + str(num) + "\n")
def ExampleGen(num_epochs=None):
    """Yield ``(title, content, brief)`` examples from the ``news`` table.

    Each epoch re-runs the query for rows whose ``brief`` is not the empty
    string and yields one tuple per row. ``content`` is passed through
    ``extract_html(content)`` and ``brief`` through
    ``extract_html(brief, False)``; ``title`` is yielded as stored.

    The database connection is closed when the generator finishes, is
    explicitly closed, or is garbage-collected.

    Args:
        num_epochs: Number of passes over the table. ``None`` (the default)
            repeats forever.

    Yields:
        Tuples of ``(title, content, brief)``.
    """
    mysql = MySQL(sqldb="HWCom")
    mysql.login()
    try:
        cursor = mysql.get_cursor()
        sent = "select title,brief,content from news where brief !=''"
        epoch = 0
        while num_epochs is None or epoch < num_epochs:
            cursor.execute(sent)
            for title, brief, content in cursor.fetchall():
                content = extract_html(content)
                brief = extract_html(brief, False)
                yield (title, content, brief)
            epoch += 1
    finally:
        # Runs on exhaustion and on generator.close(), so the connection is
        # not leaked when the consumer stops iterating early.
        mysql.close()