def near_duplicate(content1, content2, cuttor=None):
    """Return the cosine similarity of the word-count vectors of two texts.

    Both texts are segmented with the cuttor, every word is lower-cased,
    and words found in any of the stop dictionaries (STOP_SENTENCE,
    EXT_STOPWORD, STOPWORD) are dropped before counting.

    Args:
        content1: first text to compare.
        content2: second text to compare.
        cuttor: optional segmenter exposing ``cut(text)``.  When it is
            falsy a fresh ``Cuttor`` is created.

    Returns:
        ``dot(c1, c2) / sqrt(|c1|^2 * |c2|^2)`` as a float, where ``c1`` and
        ``c2`` are the per-word occurrence counts of the two texts.  Returns
        ``0.0`` when either text has no countable words, instead of raising
        ZeroDivisionError.
    """
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # NOTE(review): the stage-1 regex (digit runs / Latin-letter runs)
        # is applied to the default cuttor only; a caller-supplied cuttor
        # is used as-is -- confirm against the original layout.
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    stopwords = [
        get_dict(DICTS.STOP_SENTENCE),
        get_dict(DICTS.EXT_STOPWORD),
        get_dict(DICTS.STOPWORD),
    ]

    # word -> [occurrences in content1, occurrences in content2]
    file_words = {}
    for slot, content in enumerate((content1, content2)):
        for w in tmp_cuttor.cut(content):
            lw = w.lower()
            if any(lw in stopword for stopword in stopwords):
                continue
            # Direct dict lookup; avoids building a key list per word.
            file_words.setdefault(lw, [0, 0])[slot] += 1

    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for count1, count2 in file_words.values():
        sum_2 += count1 * count2
        sum_file1 += count1 ** 2
        sum_file2 += count2 ** 2

    denominator = sqrt(sum_file1 * sum_file2)
    if not denominator:
        # At least one text contributed no words: nothing to compare.
        return 0.0
    return sum_2 / denominator
def near_duplicate(content1, content2, cuttor=None):
    """Score how similar two texts are using cosine similarity of word counts.

    Each text is cut into words by the cuttor; words are lower-cased and
    skipped when present in any of the STOP_SENTENCE, EXT_STOPWORD or
    STOPWORD dictionaries.  The remaining words are counted per text.

    Args:
        content1: first text.
        content2: second text.
        cuttor: optional segmenter providing ``cut(text)``; a new ``Cuttor``
            is built when this is falsy.

    Returns:
        A float: the dot product of the two count vectors divided by the
        square root of the product of their squared norms.  ``0.0`` is
        returned when either text yields no words, rather than raising
        ZeroDivisionError.
    """
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # NOTE(review): regex is set on the default cuttor only; a cuttor
        # passed by the caller is not reconfigured -- confirm intent.
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    stopwords = [get_dict(DICTS.STOP_SENTENCE),
                 get_dict(DICTS.EXT_STOPWORD),
                 get_dict(DICTS.STOPWORD)]

    def count_words(content):
        """Return {lower-cased word: occurrences} for non-stop words."""
        counts = {}
        for w in tmp_cuttor.cut(content):
            lw = w.lower()
            if any(lw in stopword for stopword in stopwords):
                continue
            counts[lw] = counts.get(lw, 0) + 1
        return counts

    counts1 = count_words(content1)
    counts2 = count_words(content2)

    # Only words present in both texts contribute to the dot product.
    sum_2 = sum(n * counts2.get(w, 0) for w, n in counts1.items())
    sum_file1 = sum(n ** 2 for n in counts1.values())
    sum_file2 = sum(n ** 2 for n in counts2.values())

    norm = sqrt(sum_file1 * sum_file2)
    if not norm:
        # One side is empty after stop-word filtering; avoid dividing by 0.
        return 0.0
    return sum_2 / norm
def extract_keywords(content, topk=18, cuttor=None):
    """Return up to ``topk`` keywords of ``content`` ranked by TF-IDF.

    The text is segmented with the cuttor.  A word is ignored when its
    stripped length is below 2, when its lower-cased form is in the
    STOPWORD dictionary, or when the cuttor knows the word
    (``exist``) but ``word_type(word, 'n')`` is false.

    Args:
        content: text to analyse.
        topk: maximum number of keywords to return.
        cuttor: optional segmenter providing ``cut``, ``exist`` and
            ``word_type``; a new ``Cuttor`` is built when this is falsy.

    Returns:
        A list of lower-cased words sorted by descending TF-IDF score
        (ties fall back to descending word order).  The IDF of a word is
        looked up in ``idf_freq`` and defaults to ``median_idf``.
    """
    stopwords = get_dict(DICTS.STOPWORD)

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # support for number and english 21/08/13 08:43:23
        # NOTE(review): regex is set on the default cuttor only -- confirm.
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    freq = {}
    total = 0
    for word in tmp_cuttor.cut(content):
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if lower_word in stopwords:
            continue
        # TODO only leave the 'n' word? 21/08/13 09:13:36
        if (tmp_cuttor.exist(lower_word)
                and not tmp_cuttor.word_type(lower_word, 'n')):
            continue
        total += 1
        # Count under the lower-cased key; testing the raw word here would
        # reset the count of mixed-case words on every occurrence.
        freq[lower_word] = freq.get(lower_word, 0) + 1

    # float() forces true division so the term frequency is not truncated
    # to 0 by integer division on Python 2.
    tf_idf_list = [
        (float(count) / total * idf_freq.get(key, median_idf), key)
        for key, count in freq.items()
    ]
    st_list = sorted(tf_idf_list, reverse=True)
    return [key for _, key in st_list[:topk]]
#for seg in seglist: # print ','.join(seg) #for s in cuttor.cut_to_sentence(str): # print s #str = "伟大祖国是中华人民共和国" #str = "九孔不好看来" #str = "而迈入社会后..." str = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作" #You can set WORD_MAX to 8 for better match #cuttor.WORD_MAX = 8 #Normal cut seglist = cuttor.cut(str) print 'Normal cut \n%s\n' % ','.join(list(seglist)) #All cut seglist = cuttor.cut_all(str) print 'All cut \n%s\n' % ','.join(list(seglist)) #Tokenize for search print 'Cut for search (term,start,end)' for term, start, end in cuttor.tokenize(str.decode('utf-8'), search=True): print term, start, end re_line = re.compile("\W+|[a-zA-Z0-9]+", re.UNICODE) def sentence_from_file(filename): with codecs.open(filename, 'r', 'utf-8') as file: for line in file: