Example no. 1
import re
from math import sqrt

# Cuttor, get_dict and DICTS are assumed to come from the yaha
# segmentation library that these examples are built on.

def near_duplicate(content1, content2, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # Also emit runs of digits and ASCII letters as single tokens.
        tmp_cuttor.set_stage1_regex(
            re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))
    file_words = {}
    stopwords = [
        get_dict(DICTS.STOP_SENTENCE),
        get_dict(DICTS.EXT_STOPWORD),
        get_dict(DICTS.STOPWORD)
    ]

    # Term frequencies of content1 go in slot 0 of each entry.
    for w in tmp_cuttor.cut(content1):
        lw = w.lower()
        if any(lw in stopword for stopword in stopwords):
            continue
        if lw not in file_words:
            file_words[lw] = [1, 0]
        else:
            file_words[lw][0] += 1

    # Term frequencies of content2 go in slot 1.
    for w in tmp_cuttor.cut(content2):
        lw = w.lower()
        if any(lw in stopword for stopword in stopwords):
            continue
        if lw not in file_words:
            file_words[lw] = [0, 1]
        else:
            file_words[lw][1] += 1

    # Cosine similarity of the two term-frequency vectors.
    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0] * word[1]
        sum_file1 += word[0] ** 2
        sum_file2 += word[1] ** 2

    # Guard against empty or all-stopword input.
    denom = sqrt(sum_file1 * sum_file2)
    return sum_2 / denom if denom else 0.0
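The returned rate is the cosine similarity of the two term-frequency vectors: 1.0 when the word distributions are identical, 0.0 when the texts share no words. A minimal self-contained sketch of the same computation, with whitespace splitting standing in for Cuttor.cut() and no stopword filtering:

from math import sqrt

def cosine_similarity(text1, text2):
    # counts[token] = [frequency in text1, frequency in text2],
    # mirroring the file_words structure above.
    counts = {}
    for i, text in enumerate((text1, text2)):
        for token in text.lower().split():  # stand-in for Cuttor.cut()
            counts.setdefault(token, [0, 0])[i] += 1
    dot = sum(a * b for a, b in counts.values())
    norm1 = sqrt(sum(a * a for a, _ in counts.values()))
    norm2 = sqrt(sum(b * b for _, b in counts.values()))
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

print(cosine_similarity('data mining and text mining', 'text mining at scale'))
# about 0.57: 'text' and 'mining' are shared, the other words are not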
Example no. 2
def extract_keywords(content, topk=18, cuttor=None):
    stopwords = get_dict(DICTS.STOPWORD)

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # Support numbers and English 21/08/13 08:43:23
        tmp_cuttor.set_stage1_regex(re.compile(r'(\d+)|([a-zA-Z]+)', re.I | re.U))

    words = tmp_cuttor.cut(content)
    freq = {}
    total = 0
    for word in words:
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if lower_word in stopwords:
            continue
        # TODO: keep only 'n' (noun) words? 21/08/13 09:13:36
        if tmp_cuttor.exist(lower_word) and not tmp_cuttor.word_type(lower_word, 'n'):
            continue
        total += 1
        if lower_word in freq:
            freq[lower_word] += 1
        else:
            freq[lower_word] = 1
    # Guard against input with no countable words.
    if not total:
        return []
    # Normalized term frequency times inverse document frequency;
    # idf_freq and median_idf are module-level IDF data.
    freq = [(k, v / total) for k, v in freq.items()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

    top_tuples = st_list[:topk]
    keys = [a[1] for a in top_tuples]
    return keys
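The scoring step is plain TF-IDF: each surviving word's count, normalized by the total token count, is weighted by its inverse document frequency, falling back to median_idf for words missing from the table. A toy sketch of just that step; the idf_freq values below are made up for illustration, standing in for the library's corpus-derived table:

# Hand-made stand-ins for the module-level idf_freq table and median_idf.
idf_freq = {'segmentation': 2.5, 'python': 1.2, 'the': 0.1}
median_idf = 1.0

freq = {'segmentation': 3, 'python': 2, 'library': 1}  # counts from one text
total = sum(freq.values())

tf_idf = sorted(
    ((count / total) * idf_freq.get(word, median_idf), word)
    for word, count in freq.items()
)
print([word for score, word in reversed(tf_idf)])
# ['segmentation', 'python', 'library']: high-IDF frequent words rank first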
Example no. 3
import re
import codecs

# Assumption: cuttor is a yaha Cuttor instance created earlier in the script.
cuttor = Cuttor()

#for seg in seglist:
#    print(','.join(seg))

#for s in cuttor.cut_to_sentence(text):
#    print(s)

#text = "伟大祖国是中华人民共和国"
#text = "九孔不好看来"
#text = "而迈入社会后..."
text = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"

# You can set WORD_MAX to 8 for a better match
#cuttor.WORD_MAX = 8

# Normal cut
seglist = cuttor.cut(text)
print('Normal cut\n%s\n' % ','.join(list(seglist)))

# All cut
seglist = cuttor.cut_all(text)
print('All cut\n%s\n' % ','.join(list(seglist)))

# Tokenize for search: yields (term, start, end) triples
print('Cut for search (term, start, end)')
for term, start, end in cuttor.tokenize(text, search=True):
    print(term, start, end)

re_line = re.compile(r'\W+|[a-zA-Z0-9]+', re.UNICODE)
def sentence_from_file(filename):
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            # The original snippet breaks off here; a plausible completion
            # yields the non-empty pieces left after splitting on re_line.
            for sentence in re_line.split(line):
                if sentence:
                    yield sentence
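Assuming the completion above, a short driver can feed the segmenter one sentence at a time; corpus.txt is a placeholder file name, not part of the original example:

# Hypothetical driver: segment every sentence pulled from a UTF-8 file.
for sentence in sentence_from_file('corpus.txt'):
    print(','.join(cuttor.cut(sentence)))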