Esempio n. 1
0
def __init_stop_words():
    """Rebuild the module-level ``STOP_WORDS`` frozenset.

    Gathers the keys of the EXT_STOPWORD, STOPWORD and STOP_SENTENCE
    dictionaries obtained from ``get_dict`` and stores their union in the
    global ``STOP_WORDS``.
    """
    global STOP_WORDS
    sources = (DICTS.EXT_STOPWORD, DICTS.STOPWORD, DICTS.STOP_SENTENCE)
    # Only the keys are needed, so iterate each dictionary directly instead
    # of calling the Python-2-only ``iteritems()`` and discarding the values.
    # NOTE(review): assumes get_dict() returns a dict-like object whose
    # iteration yields its keys -- confirm against get_dict's implementation.
    STOP_WORDS = frozenset(
        term for source in sources for term in get_dict(source))
Esempio n. 2
0
def __init_stop_words():
    """Populate the global ``STOP_WORDS`` with every known stop word.

    The keys of the EXT_STOPWORD, STOPWORD and STOP_SENTENCE dictionaries
    returned by ``get_dict`` are merged and frozen into ``STOP_WORDS``.
    """
    global STOP_WORDS
    stop_words = set()
    for source in (DICTS.EXT_STOPWORD, DICTS.STOPWORD, DICTS.STOP_SENTENCE):
        # ``set.update`` consumes the keys of the dictionary; the values were
        # never used, so the Python-2-only ``iteritems()`` call is dropped.
        # NOTE(review): assumes get_dict() returns a dict-like object whose
        # iteration yields its keys -- confirm against get_dict.
        stop_words.update(get_dict(source))
    STOP_WORDS = frozenset(stop_words)
Esempio n. 3
0
def near_duplicate(content1, content2, cuttor=None):
    """Return the cosine similarity of the word counts of two texts.

    Both texts are segmented with ``cuttor`` (or with a freshly created
    ``Cuttor`` when none is given). Every segment is lower-cased, segments
    found in any stop-word dictionary are dropped, and the remaining
    per-text counts are compared as two vectors.

    Args:
        content1: First text to compare.
        content2: Second text to compare.
        cuttor: Optional segmenter exposing ``cut(text)``. When falsy, a
            default ``Cuttor`` is built and given a stage-1 regex matching
            runs of digits or of ASCII letters.

    Returns:
        The dot product of the two count vectors divided by the product of
        their Euclidean norms, or ``0.0`` when either text contributes no
        counted word (the ratio would otherwise divide by zero).
    """
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))
    stopwords = [
        get_dict(DICTS.STOP_SENTENCE),
        get_dict(DICTS.EXT_STOPWORD),
        get_dict(DICTS.STOPWORD),
    ]

    # Maps lower-cased word -> [occurrences in content1, in content2].
    file_words = {}
    for slot, content in enumerate((content1, content2)):
        for w in tmp_cuttor.cut(content):
            lw = w.lower()
            # Check *all* stop-word dictionaries before counting. Counting
            # inside the dictionary loop would tally a word once per
            # dictionary and would count stop words that only appear in a
            # later dictionary.
            if any(lw in stopword for stopword in stopwords):
                continue
            file_words.setdefault(lw, [0, 0])[slot] += 1

    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for count1, count2 in file_words.values():
        sum_2 += count1 * count2
        sum_file1 += count1 ** 2
        sum_file2 += count2 ** 2

    denominator = sqrt(sum_file1 * sum_file2)
    if not denominator:
        # At least one text has no counted words; similarity is undefined,
        # report "nothing in common" instead of raising ZeroDivisionError.
        return 0.0
    return sum_2 / denominator
Esempio n. 4
0
def near_duplicate(content1, content2, cuttor=None):
    """Score how similar two texts are via cosine similarity of word counts.

    Each text is cut into words by ``cuttor`` (a new ``Cuttor`` is created
    when the argument is falsy), words are lower-cased, any word present in
    one of the stop-word dictionaries is ignored, and the two resulting
    count vectors are compared.

    Args:
        content1: First text.
        content2: Second text.
        cuttor: Optional segmenter exposing ``cut(text)``. The default
            ``Cuttor`` gets a stage-1 regex matching runs of digits or of
            ASCII letters.

    Returns:
        ``sum(a*b) / sqrt(sum(a*a) * sum(b*b))`` over the per-word counts
        ``a`` (content1) and ``b`` (content2); ``0.0`` when that denominator
        is zero, i.e. when either text has no counted word.
    """
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))
    stopwords = [
        get_dict(DICTS.STOP_SENTENCE),
        get_dict(DICTS.EXT_STOPWORD),
        get_dict(DICTS.STOPWORD),
    ]

    # word -> [count in content1, count in content2]
    file_words = {}

    def tally(content, slot):
        """Add the non-stop words of *content* to column *slot*."""
        for w in tmp_cuttor.cut(content):
            lw = w.lower()
            is_drop = False
            for stopword in stopwords:
                if lw in stopword:
                    is_drop = True
                    break
            # The drop decision is taken only after every stop-word
            # dictionary has been consulted, so each kept word is counted
            # exactly once and no stop word slips through.
            if is_drop:
                continue
            if lw not in file_words:
                file_words[lw] = [0, 0]
            file_words[lw][slot] += 1

    tally(content1, 0)
    tally(content2, 1)

    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0] * word[1]
        sum_file1 += word[0] ** 2
        sum_file2 += word[1] ** 2

    norm = sqrt(sum_file1 * sum_file2)
    if norm == 0:
        # Empty vector on at least one side: avoid ZeroDivisionError.
        return 0.0
    return sum_2 / norm
Esempio n. 5
0
def extract_keywords(content, topk=18, cuttor=None):
    """Return up to ``topk`` keywords of ``content`` ranked by TF-IDF.

    The text is segmented, and a lower-cased segment is counted unless it is
    shorter than two characters after stripping, is a stop word, or is known
    to the cuttor without being of word type ``'n'``. Each counted word is
    scored as ``count / total * idf`` where ``idf`` comes from the
    module-level ``idf_freq`` table (``median_idf`` for unknown words).

    Args:
        content: Text to analyse.
        topk: Maximum number of keywords to return.
        cuttor: Optional segmenter exposing ``cut``, ``exist`` and
            ``word_type``. When falsy, a default ``Cuttor`` is created and
            given a stage-1 regex matching runs of digits or ASCII letters.

    Returns:
        A list of lower-cased words, highest score first; ties are ordered
        by the word itself in descending order.
    """
    stopwords = get_dict(DICTS.STOPWORD)

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # Support numbers and English words as well.
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    freq = {}
    total = 0
    for word in tmp_cuttor.cut(content):
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if lower_word in stopwords:
            continue
        # TODO: only keep the 'n' words?
        # NOTE(review): 'n' presumably denotes nouns -- confirm with Cuttor.
        if (tmp_cuttor.exist(lower_word)
                and not tmp_cuttor.word_type(lower_word, 'n')):
            continue
        total += 1
        # Count under the lower-cased key; testing the original-case word
        # for membership would reset the count of any mixed-case word.
        freq[lower_word] = freq.get(lower_word, 0) + 1

    # float() forces true division: under Python 2, int / int truncates and
    # would turn every term frequency into 0.
    tf_idf_list = [
        (float(count) / total * idf_freq.get(term, median_idf), term)
        for term, count in freq.items()
    ]
    st_list = sorted(tf_idf_list, reverse=True)

    return [term for _, term in st_list[:topk]]
Esempio n. 6
0
def get_modified_dict():
    """Return the MAIN dictionary, running the one-time modification pass.

    On the first call ``modify_wordbase`` is invoked for every word of the
    dictionary and ``MODIFY_INIT`` is set; later calls return the dictionary
    straight away. The flag is tested again inside ``MODIFY_LOCK`` so the
    pass is skipped if it completed while this caller was waiting.
    """
    global MODIFY_INIT
    main_dict = get_dict(DICTS.MAIN)
    if not MODIFY_INIT:
        with MODIFY_LOCK:
            # Re-check under the lock before doing the work.
            if not MODIFY_INIT:
                for entry in main_dict:
                    modify_wordbase(entry)
                MODIFY_INIT = True
    return main_dict
Esempio n. 7
0
def get_modified_dict():
    """Fetch the MAIN dictionary, applying ``modify_wordbase`` once.

    The dictionary is always fetched first. If ``MODIFY_INIT`` is still
    false, ``MODIFY_LOCK`` is entered, the flag is checked a second time,
    and only then is every word passed to ``modify_wordbase`` before the
    flag is raised.
    """
    global MODIFY_INIT
    word_dict = get_dict(DICTS.MAIN)
    needs_init = not MODIFY_INIT
    if needs_init:
        with MODIFY_LOCK:
            # The flag may have been set while waiting for the lock.
            if MODIFY_INIT:
                return word_dict
            for item in word_dict:
                modify_wordbase(item)
            MODIFY_INIT = True
    return word_dict
Esempio n. 8
0
def sort_with_tfidf(freq_file_name, ofile):
    """Rewrite a word-frequency file ordered by descending TF-IDF score.

    Reads ``freq_file_name`` as UTF-8, taking the first whitespace-separated
    field of each line as the word and the second as its integer count.
    Stop words and lines with fewer than two fields are skipped. Each word
    is scored as ``count / total * idf`` using the module-level ``idf_freq``
    table (``median_idf`` for unknown words), and ``"word count"`` lines are
    written to ``ofile`` as UTF-8, highest score first.

    Args:
        freq_file_name: Path of the input frequency file.
        ofile: Path of the output file (overwritten).

    Raises:
        ValueError: If a count field is not an integer.
    """
    stopwords = get_dict(DICTS.STOPWORD)

    old_freqs = {}
    total = 0
    with codecs.open(freq_file_name, "r", "utf-8") as infile:
        for line in infile:
            ws = line.split()
            # Guard first: a blank line yields an empty list, and indexing
            # ws[0] on it would raise IndexError.
            if len(ws) < 2:
                continue
            if ws[0] in stopwords:
                continue
            f = int(ws[1])
            old_freqs[ws[0]] = f
            total += f

    # float() forces true division (Python 2 int / int truncates to 0);
    # a zero total yields a zero term frequency instead of an exception.
    tf_idf_list = [
        ((float(f) / total if total else 0.0) * idf_freq.get(w, median_idf), w)
        for w, f in old_freqs.items()
    ]
    st_list = sorted(tf_idf_list, reverse=True)

    with codecs.open(ofile, "w", "utf-8") as outfile:
        for _, w in st_list:
            outfile.write("%s %d\n" % (w, old_freqs[w]))
Esempio n. 9
0
def sort_with_tfidf(freq_file_name, ofile):
    """Sort the entries of a frequency file by TF-IDF and write them out.

    Every line of ``freq_file_name`` (UTF-8) is split on whitespace; the
    first field is the word, the second its integer count. Lines with fewer
    than two fields and words found in the STOPWORD dictionary are ignored.
    Words are ranked by ``count / total * idf`` (``idf`` looked up in
    ``idf_freq``, defaulting to ``median_idf``) in descending order, and
    written to ``ofile`` (UTF-8) as ``"word count"`` lines.

    Args:
        freq_file_name: Path of the frequency file to read.
        ofile: Path of the file to write; existing content is replaced.

    Raises:
        ValueError: If the count field of a line is not an integer.
    """
    stopwords = get_dict(DICTS.STOPWORD)

    old_freqs = {}
    total = 0
    with codecs.open(freq_file_name, "r", "utf-8") as source:
        for line in source:
            ws = line.split()
            # Check the field count before touching ws[0]; blank lines
            # would otherwise raise IndexError.
            if len(ws) >= 2 and ws[0] not in stopwords:
                f = int(ws[1])
                old_freqs[ws[0]] = f
                total += f

    def score(word):
        """Return the (tf-idf, word) sort key of *word*."""
        # True division regardless of Python version; a zero total gives
        # tf = 0 rather than ZeroDivisionError.
        tf = float(old_freqs[word]) / total if total else 0.0
        return (tf * idf_freq.get(word, median_idf), word)

    ranked = sorted(old_freqs, key=score, reverse=True)

    with codecs.open(ofile, "w", "utf-8") as target:
        for w in ranked:
            target.write("%s %d\n" % (w, old_freqs[w]))