コード例 #1
0
ファイル: test.py プロジェクト: JohnDannl/NewsTech2
def getSecondRmDuplicationResult(split=True):
    """Dump the titles and summaries of the second merge table to text files.

    Rows come from ``tablemerge2.getTitleSummary(dbconfig.mergetable2)``.
    For every row one line of the form ``"<count> <mtid> <time> <text>"`` is
    printed and written: the title line goes to ``merge2_title_file`` and the
    summary line goes to ``merge2_summary_file``. ``<count>`` is a 1-based
    running index and ``<time>`` is ``ctime`` rendered as local time in
    ``%Y-%m-%d %H:%M:%S`` format.

    Args:
        split: When true (the default), title and summary are lower-cased,
            passed through ``delpunc``, segmented with ``jieba.cut``, joined
            with single spaces and encoded as UTF-8. When false, they are
            written as fetched, only stripped of surrounding whitespace.

    Returns:
        None. No file is opened or written when the fetch returns ``-1``.
    """
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows == -1:
        # NOTE(review): -1 is presumably the fetch helper's failure
        # sentinel -- confirm against tablemerge2.
        return

    # Both files are managed by ``with`` so they are closed even if a row
    # fails to format or a write raises part-way through the dump.
    with open(merge2_summary_file, 'w') as f_summary:
        with open(merge2_title_file, 'w') as fout:
            for count, row in enumerate(rows, 1):
                # Row layout as indexed here: mtid, title, summary, ctime
                mtid, ctime = row[0], row[3]
                title = row[1].strip()
                summary = row[2].strip()
                if split:
                    title = ' '.join(
                        jieba.cut(delpunc(title.lower()))).encode('utf-8')
                    summary = ' '.join(
                        jieba.cut(delpunc(summary.lower()))).encode('utf-8')
                timeStr = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(ctime))
                msg_t = '%s %s %s %s' % (count, mtid, timeStr, title)
                msg_s = '%s %s %s %s' % (count, mtid, timeStr, summary)
                # Single-argument parenthesised print behaves exactly like
                # the print statement on Python 2.
                print(msg_t)
                print(msg_s)
                fout.write(msg_t + '\n')
                f_summary.write(msg_s + '\n')
コード例 #2
0
ファイル: getdoc.py プロジェクト: JohnDannl/NewsTechNLP
def getMerge2Title():
    """Dump segmented titles and summaries of the second merge table.

    Rows come from ``tablemerge2.getTitleSummary(dbconfig.mergetable2)``.
    For every row one line of the form ``"<count> <mtid> <ctime> <text>"`` is
    printed and written: the title line goes to ``merge2_title_file`` and the
    summary line goes to ``merge2_summary_file``. ``<count>`` is a 1-based
    running index.

    Text preparation per row:
      * the title is stripped; the summary has all whitespace removed;
      * both are segmented with ``jieba.cut``, joined with single spaces,
        lower-cased, passed through ``delpunc`` and encoded as UTF-8;
      * if the encoded summary is shorter than the encoded title, the title
        is used as the summary instead.

    Returns:
        None. No file is opened or written when the fetch returns ``-1``.
    """
    rows = tablemerge2.getTitleSummary(dbconfig.mergetable2)
    if rows == -1:
        # NOTE(review): -1 is presumably the fetch helper's failure
        # sentinel -- confirm against tablemerge2.
        return

    # Both files are managed by ``with`` so they are closed even if a row
    # fails to process or a write raises part-way through the dump.
    with open(merge2_summary_file, 'w') as f_summary:
        with open(merge2_title_file, 'w') as fout:
            for count, row in enumerate(rows, 1):
                # Row layout as indexed here: mtid, title, summary, ctime
                mtid, ctime = row[0], row[3]
                title = row[1].strip()
                summary = re.sub(r'\s+', '', row[2])
                title = delpunc(
                    ' '.join(jieba.cut(title)).lower()).encode('utf-8')
                summary = delpunc(
                    ' '.join(jieba.cut(summary)).lower()).encode('utf-8')
                # Fall back to the title when the summary carries less text
                # (lengths are compared on the UTF-8 encoded bytes).
                if len(summary) < len(title):
                    summary = title
                # ctime is written as stored; it is not formatted as a date.
                msg_t = '%s %s %s %s' % (count, mtid, ctime, title)
                msg_s = '%s %s %s %s' % (count, mtid, ctime, summary)
                # Single-argument parenthesised print behaves exactly like
                # the print statement on Python 2.
                print(msg_t)
                print(msg_s)
                fout.write(msg_t + '\n')
                f_summary.write(msg_s + '\n')