def getMergeNews():
    """Dump the title and summary of every merged news row to ``news_file``.

    Prints the row count of ``dbconfig.mergetable``, then fetches its rows
    through ``tablemerge.getTitleSummary``.  If that call returns the ``-1``
    sentinel, nothing is written and the file is not opened.  Otherwise
    ``news_file`` is (re)created and one line is written per row in the form
    ``"<n> <title> <summary>"``, where ``<n>`` is a 1-based running counter.

    Each row is indexed as (id, title, summary, ctime, ...); only the title
    (index 1) and the summary (index 2) are used.
    """
    print(tablemerge.getAllCount(dbconfig.mergetable))

    rows = tablemerge.getTitleSummary(dbconfig.mergetable)
    if rows == -1:
        # -1 is the sentinel checked by the original code; presumably it
        # signals a failed query -- TODO confirm against tablemerge.
        return

    def _normalize(text):
        # Segment with jieba, join the tokens with single spaces, lowercase,
        # pass through delpunc (presumably strips punctuation -- verify),
        # and encode to UTF-8 bytes for writing.
        tokens = ' '.join(jieba.cut(text))
        return delpunc(tokens.lower()).encode('utf-8')

    with open(news_file, 'w') as fout:
        for index, row in enumerate(rows, 1):
            title = _normalize(row[1].strip())
            # All whitespace is removed from the summary before segmentation.
            summary = _normalize(re.sub(r'\s+', '', row[2]))
            fout.write('%s %s %s\n' % (index, title, summary))
def getDuplicationRate():
    """Print how many rows each merge stage removed as duplicates.

    The first rate compares the summed row counts of every table listed in
    ``dbconfig.tableName`` with the row count of ``dbconfig.mergetable``.
    The second rate compares ``dbconfig.mergetable`` with
    ``dbconfig.mergetable2``.  Each line shows the ratio followed by
    ``(removed/total)``.

    When a total is zero the rate is reported as 0.0 rather than raising
    ZeroDivisionError.
    """
    def _rate(removed, total):
        # An empty denominator means there was nothing to de-duplicate.
        if not total:
            return 0.0
        return float(removed) / total

    count = sum(table.getAllCount(tablename)
                for tablename in dbconfig.tableName.itervalues())
    mcount = tablemerge.getAllCount(dbconfig.mergetable)
    m2count = tablemerge2.getAllCount(dbconfig.mergetable2)

    first_removed = count - mcount
    second_removed = mcount - m2count

    print('First Duplication Rate: %.4f (%d/%d)'
          % (_rate(first_removed, count), first_removed, count))
    print('Second Duplication Rate: %.4f (%d/%d)'
          % (_rate(second_removed, mcount), second_removed, mcount))