def __init_stop_words():
    global STOP_WORDS
    stop_words = []
    for dict_name in (DICTS.EXT_STOPWORD, DICTS.STOPWORD, DICTS.STOP_SENTENCE):
        for t, v in get_dict(dict_name).iteritems():
            stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)
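# A minimal usage sketch for STOP_WORDS (the helper below is illustrative,
# not part of the original module): once __init_stop_words() has run,
# dropping stop words is an O(1) frozenset membership test per token.
def _filter_stop_words(tokens):
    return [t for t in tokens if t.lower() not in STOP_WORDS]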
def near_duplicate(content1, content2, cuttor=None):
    """Return the cosine similarity of the two contents' term frequencies."""
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    stopwords = [get_dict(DICTS.STOP_SENTENCE),
                 get_dict(DICTS.EXT_STOPWORD),
                 get_dict(DICTS.STOPWORD)]

    # file_words maps each term to [count in content1, count in content2].
    file_words = {}
    for index, content in enumerate((content1, content2)):
        for w in tmp_cuttor.cut(content):
            lw = w.lower()
            if any(lw in stopword for stopword in stopwords):
                continue
            if lw not in file_words:
                file_words[lw] = [0, 0]
            file_words[lw][index] += 1

    # Cosine similarity: dot product divided by the product of the norms.
    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0] * word[1]
        sum_file1 += word[0] ** 2
        sum_file2 += word[1] ** 2
    if sum_file1 == 0 or sum_file2 == 0:
        # No countable terms in one of the contents; avoid dividing by zero.
        return 0.0
    return sum_2 / sqrt(sum_file1 * sum_file2)
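# near_duplicate() above is plain cosine similarity over the two term
# frequency vectors: dot(v1, v2) / (|v1| * |v2|). A self-contained sketch of
# the same formula over pre-tokenized input (no segmentation or stop word
# filtering; the helper name is illustrative):
def _cosine_similarity(tokens1, tokens2):
    # counts maps each term to [count in tokens1, count in tokens2].
    counts = {}
    for i, tokens in enumerate((tokens1, tokens2)):
        for t in tokens:
            counts.setdefault(t, [0, 0])[i] += 1
    dot = sum(a * b for a, b in counts.values())
    norm1 = sqrt(sum(a * a for a, b in counts.values()))
    norm2 = sqrt(sum(b * b for a, b in counts.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)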
def extract_keywords(content, topk=18, cuttor=None):
    """Return the topk keywords of content, ranked by TF-IDF weight."""
    stopwords = get_dict(DICTS.STOPWORD)
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # Support for numbers and English. 21/08/13 08:43:23
        tmp_cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    freq = {}
    total = 0
    for word in tmp_cuttor.cut(content):
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if lower_word in stopwords:
            continue
        # TODO: only keep the 'n' (noun) words? 21/08/13 09:13:36
        if tmp_cuttor.exist(lower_word) and not tmp_cuttor.word_type(lower_word, 'n'):
            continue
        total += 1
        if lower_word in freq:
            freq[lower_word] += 1
        else:
            freq[lower_word] = 1

    # Normalize counts to term frequencies (float, to avoid integer
    # division), weight each by its IDF, and keep the topk best words.
    freq = [(k, v / float(total)) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)
    top_tuples = st_list[:topk]
    return [k for v, k in top_tuples]
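# The ranking step in extract_keywords() scores each surviving word as
# (count / total) * idf_freq.get(word, median_idf) and keeps the topk
# highest. A toy sketch of just that scoring step, with made-up counts and
# a stand-in IDF table (_idf and _default_idf are illustrative):
def _demo_tfidf_ranking():
    counts = {u'python': 4, u'code': 3, u'data': 1}
    total = float(sum(counts.itervalues()))
    _idf = {u'python': 6.5, u'code': 4.2}
    _default_idf = 5.0
    scored = [((c / total) * _idf.get(w, _default_idf), w)
              for w, c in counts.iteritems()]
    # Highest TF-IDF first: [u'python', u'code', u'data'].
    return [w for s, w in sorted(scored, reverse=True)]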
def get_modified_dict():
    global MODIFY_INIT
    word_dict = get_dict(DICTS.MAIN)
    if MODIFY_INIT:
        return word_dict
    with MODIFY_LOCK:
        if MODIFY_INIT:
            return word_dict
        for word in word_dict:
            modify_wordbase(word)
        MODIFY_INIT = True
    return word_dict
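# get_modified_dict() above uses double-checked locking: MODIFY_INIT is
# tested once without the lock (the cheap path on every call after
# initialization) and re-tested under MODIFY_LOCK so that two first callers
# cannot both run the modify_wordbase() pass. A self-contained sketch of the
# pattern (all names below are illustrative, not part of the module):
import threading

_demo_done = False
_demo_lock = threading.Lock()
_demo_cache = {}

def _demo_get_cache():
    global _demo_done
    if not _demo_done:              # fast path: no lock once initialized
        with _demo_lock:
            if not _demo_done:      # re-check: another thread may have won
                _demo_cache['ready'] = True   # stands in for expensive init
                _demo_done = True
    return _demo_cache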
def sort_with_tfidf(freq_file_name, ofile):
    """Re-sort a 'word count' frequency file by TF-IDF and write to ofile."""
    stopwords = get_dict(DICTS.STOPWORD)
    old_freqs = {}
    total = 0
    with codecs.open(freq_file_name, "r", "utf-8") as infile:
        for line in infile:
            ws = line.split()
            # Skip blank lines and stop words.
            if not ws or ws[0] in stopwords:
                continue
            if len(ws) >= 2:
                f = int(ws[1])
                old_freqs[ws[0]] = f
                total += f

    # Normalize to term frequencies (float, to avoid integer division),
    # weight by IDF, and sort with the heaviest words first.
    freqs = [(k, v / float(total)) for k, v in old_freqs.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freqs]
    st_list = sorted(tf_idf_list, reverse=True)

    with codecs.open(ofile, "w", "utf-8") as outfile:
        for v, w in st_list:
            outfile.write("%s %d\n" % (w, old_freqs[w]))
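# Usage sketch for sort_with_tfidf(): the input is a UTF-8 file with one
# "word count" pair per line; the output keeps that format, re-ordered by
# TF-IDF weight. File names here are hypothetical:
#
#   sort_with_tfidf('word_freq.txt', 'word_freq_sorted.txt')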