Example #1
def summarize3(txt, cuttor=None):
    # TODO how to do this better 21/08/13 13:07:22
    # You can replace this with "import numpy"; of course you then have to
    # install the numpy library
    name = "numpy"
    numpy = __import__(name, fromlist=[])

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(
            re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    #TODO do it better 21/08/13 12:36:08
    for s, need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words,
                                         tmp_cuttor)
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return ', '.join(mean_scored_summary) + '.'
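
A minimal usage sketch for this variant, assuming it is exposed as yaha.analyse.summarize3 (the import path used in the later test scripts) and called with a long Chinese text:

from yaha.analyse import summarize3

text = u'...'  # any long Chinese article (placeholder)
# Illustrative call: returns the sentences scoring above mean + 0.5 * std.
print(summarize3(text))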
Example #2
def summarize1(original_text, summary_size=8, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))

    words_sorted = extract_keywords(original_text, 16, tmp_cuttor)
    summary_set = {}
    sentences = []
    for s,need in tmp_cuttor.cut_to_sentence(original_text):
        if need:
            sentences.append(s)

    for word in words_sorted:
        matching_sentence = __search_word(sentences, word)
        if matching_sentence != '':
            summary_set[matching_sentence] = 1
            if len(summary_set) >= summary_size:
                break
    summary = []
    for s in sentences:
        if s in summary_set:
            summary.append(s)
    return ', '.join(summary)+'.'
Example #3
def summarize1(original_text, summary_size=8, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(
            re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    words_sorted = extract_keywords(original_text, 16, tmp_cuttor)
    summary_set = {}
    sentences = []
    for s, need in tmp_cuttor.cut_to_sentence(original_text):
        if need:
            sentences.append(s)

    for word in words_sorted:
        matching_sentence = __search_word(sentences, word)
        if matching_sentence != '':
            summary_set[matching_sentence] = 1
            if len(summary_set) >= summary_size:
                break
    summary = []
    for s in sentences:
        if s in summary_set:
            summary.append(s)
    return ', '.join(summary) + '.'
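
A brief usage sketch, assuming the yaha.analyse import shown in the later test scripts; summary_size caps how many keyword-bearing sentences are kept:

from yaha.analyse import summarize1

text = u'...'  # any long Chinese article (placeholder)
# Illustrative call: keep at most 5 sentences.
print(summarize1(text, summary_size=5))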
Example #4
def summarize3(txt, cuttor=None):
    # TODO how to do this better 21/08/13 13:07:22
    # You can replace this with "import numpy"; of course you then have to
    # install the numpy library
    name = "numpy"
    numpy = __import__(name, fromlist=[])

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    
    sentences = []
    #TODO do it better 21/08/13 12:36:08
    for s,need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return ', '.join(mean_scored_summary) + '.'
Example #5
def near_duplicate(content1, content2, cuttor=None):
    tmp_cuttor = None
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    file_words = {}
    stopwords = [get_dict(DICTS.STOP_SENTENCE), get_dict(DICTS.EXT_STOPWORD), get_dict(DICTS.STOPWORD)]

    seg_list = tmp_cuttor.cut(content1)
    for w in seg_list:
        is_drop = False
        lw = w.lower()
        for stopword in stopwords:
            if lw in stopword:
                is_drop = True
                break
        # count the word once per occurrence in content1 (stopwords skipped)
        if is_drop:
            continue
        if lw not in file_words:
            file_words[lw] = [1, 0]
        else:
            file_words[lw][0] += 1
    
    seg_list = tmp_cuttor.cut(content2)
    for w in seg_list:
        is_drop = False
        lw = w.lower()
        for stopword in stopwords:
            if lw in stopword:
                is_drop = True
                break
        # count the word once per occurrence in content2 (stopwords skipped)
        if is_drop:
            continue
        if lw not in file_words:
            file_words[lw] = [0, 1]
        else:
            file_words[lw][1] += 1

    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0]*word[1]
        sum_file1 += word[0]**2
        sum_file2 += word[1]**2

    rate = sum_2/(sqrt(sum_file1*sum_file2))
    return rate
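
The returned rate is the cosine similarity of the two stopword-filtered word-count vectors, so it lies between 0.0 and 1.0. A hedged usage sketch, assuming the yaha.analyse export used in the later test scripts (the sample strings are illustrative only):

from yaha.analyse import near_duplicate

a = u'唐成牛的长寿乡是个好地方'
b = u'唐成牛的长寿乡'
# Values close to 1.0 suggest near-duplicate content.
print(near_duplicate(a, b))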
Example #6
def extract_keywords(content, topk=18, cuttor=None):
    stopwords = get_dict(DICTS.STOPWORD)

    tmp_cuttor = None
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # support for numbers and English 21/08/13 08:43:23
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))

    words = tmp_cuttor.cut(content)
    freq = {}
    total = 0
    for word in words:
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if lower_word in stopwords:
            continue
        # TODO: keep only noun ('n') words? 21/08/13 09:13:36
        if tmp_cuttor.exist(lower_word) and not tmp_cuttor.word_type(lower_word, 'n'):
            continue
        total += 1
        if lower_word in freq:
            freq[lower_word] += 1
        else:
            freq[lower_word] = 1
    # use float division so term frequencies are not truncated to 0 under Python 2
    freq = [(k, float(v) / total) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

    top_tuples = st_list[:topk]
    keys = [a[1] for a in top_tuples]
    return keys
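
This ranks candidates by TF-IDF: the in-document frequency times an IDF looked up in idf_freq, with median_idf as the fallback, and returns the topk best words. A short usage sketch, again assuming the yaha.analyse export shown in the later test scripts:

from yaha.analyse import extract_keywords

text = u'...'  # any Chinese document (placeholder)
# Illustrative call: print the 10 highest-scoring keywords.
for word in extract_keywords(text, topk=10):
    print(word)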
Example #7
def summarize2(txt, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    
    sentences = []
    for s in cut_sentence(txt):
        sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N_2, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    top_n_summary = [sentences[idx] for (idx, score) in top_n_scored]
    #return ', '.join(top_n_summary) + '.'
    return u'。 '.join(top_n_summary) + u'。 '
Example #8
def summarize3(txt, cuttor=None):
    # Removed the numpy dependency; compute mean and std ourselves 21/08/13 13:07:22
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    
    sentences = []
    for s in cut_sentence(txt):
        sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N_3, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)
    avg,std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    #return ', '.join(mean_scored_summary) + '.'
    return u'。 '.join(mean_scored_summary) + u'。 '
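
The helper _mean_std is not shown in these excerpts. A hypothetical, self-contained version consistent with how it is called above, returning the arithmetic mean and the population standard deviation (matching the numpy.mean/numpy.std pair used in the other summarize3 variant), could look like this:

from math import sqrt

def _mean_std(values):
    # Hypothetical helper, not the library's own code.
    n = len(values)
    if n == 0:
        return 0.0, 0.0
    mean = sum(values) / float(n)
    variance = sum((v - mean) ** 2 for v in values) / float(n)
    return mean, sqrt(variance)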
Example #9
def summarize2(txt, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    
    sentences = []
    #TODO do it better 21/08/13 12:36:08
    for s,need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N_2, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    top_n_summary = [sentences[idx] for (idx, score) in top_n_scored]
    return ', '.join(top_n_summary) + '.'
Example #10
def summarize3(txt, cuttor=None):
    # Removed the numpy dependency; compute mean and std ourselves 21/08/13 13:07:22
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
    
    sentences = []
    #TODO do it better 21/08/13 12:36:08
    for s,need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)
    normalized_sentences = [s.lower() for s in sentences]

    top_n_words = extract_keywords(txt, N_3, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)
    avg,std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return ', '.join(mean_scored_summary) + '.'
Example #11
def near_duplicate(content1, content2, cuttor=None):
    tmp_cuttor = None
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(
            re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
    file_words = {}
    stopwords = [
        get_dict(DICTS.STOP_SENTENCE),
        get_dict(DICTS.EXT_STOPWORD),
        get_dict(DICTS.STOPWORD)
    ]

    seg_list = tmp_cuttor.cut(content1)
    for w in seg_list:
        is_drop = False
        lw = w.lower()
        for stopword in stopwords:
            if lw in stopword:
                is_drop = True
                break
        # count the word once per occurrence in content1 (stopwords skipped)
        if is_drop:
            continue
        if lw not in file_words:
            file_words[lw] = [1, 0]
        else:
            file_words[lw][0] += 1

    seg_list = tmp_cuttor.cut(content2)
    for w in seg_list:
        is_drop = False
        lw = w.lower()
        for stopword in stopwords:
            if lw in stopword:
                is_drop = True
                break
        # count the word once per occurrence in content2 (stopwords skipped)
        if is_drop:
            continue
        if lw not in file_words:
            file_words[lw] = [0, 1]
        else:
            file_words[lw][1] += 1

    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0] * word[1]
        sum_file1 += word[0]**2
        sum_file2 += word[1]**2

    rate = sum_2 / (sqrt(sum_file1 * sum_file2))
    return rate
Example #12
class YahaTokenizer(Tokenizer, Component):

    name = "tokenizer_yaha"

    provides = ["tokens"]

    cuttor = Cuttor()

    def __init__(self):
        pass

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["yaha"]

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        if config['language'] != 'zh':
            raise Exception(
                "tokenizer_yaha is only used for Chinese. Check your configure json file."
            )

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None

        message.set("tokens", self.tokenize(message.text))

    def tokenize(self, text):
        # type: (Text) -> List[Token]
        tokenized = self.cuttor.tokenize(text.decode('utf-8'), search=True)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
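
A hedged sketch of exercising the tokenizer outside a full Rasa NLU pipeline. It assumes Token objects keep the text and offset passed to their constructor above, and that the input is a UTF-8 byte string because of the decode() call in tokenize():

tokenizer = YahaTokenizer()
# Illustrative only: print each token with its character offset.
for token in tokenizer.tokenize('唐成牛的长寿乡是个好地方'):
    print('%s %d' % (token.text, token.offset))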
Example #13
STOP_WORDS = None
def __init_stop_words():
    global STOP_WORDS
    stop_words = []
    for t,v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t,v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)
__init_stop_words()

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())

class ChineseTokenizer(Tokenizer):
    def __call__(self,text,**kargs):
        words = _cuttor.tokenize(text, search=True)
        token  = Token()
        for (w, start_pos, stop_pos) in words:
            # keep multi-character tokens even when they are not pure Chinese
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
            token.original = token.text = w
Example #14
#encoding:utf-8
'''
Created on Feb 27, 2015

@author: root
'''
import sys, re, codecs
import cProfile
from yaha import Cuttor, RegexCutting, SurnameCutting, SurnameCutting2, SuffixCutting
from yaha.wordmaker import WordDict
from yaha.analyse import extract_keywords, near_duplicate, summarize1, summarize2, summarize3

cuttor = Cuttor()


def init():
    cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
    surname = SurnameCutting2()
    cuttor.add_stage(surname)
    suffix = SuffixCutting()
    cuttor.add_stage(suffix)


def cutstring(str, wordsmap, wordsweight):
    seglist = cuttor.cut(str)
    for value in list(seglist):
        word = value.encode('utf-8')
        print word
        if word not in wordsmap:
            wordsmap[word] = 0
        wordsmap[word] += 1
Example #15
# -*- coding=utf-8 -*-
import sys, re, codecs
import cProfile
from yaha import Cuttor, RegexCutting, SurnameCutting, SurnameCutting2, SuffixCutting
from yaha.wordmaker import WordDict
from yaha.analyse import extract_keywords, near_duplicate, summarize1, summarize2, summarize3

str = '唐成真是唐成牛的长寿乡是个1998love唐成真诺维斯基'
cuttor = Cuttor()

# Get 3 shortest paths for choise_best
#cuttor.set_topk(3)

# Use stage 1 to cut English words and numbers
cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))

# Or use stage 2 to cut English words and numbers
#cuttor.add_stage(RegexCutting(re.compile('\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile('[a-zA-Z]+', re.I|re.U)))

# Use stage 3 to cut Chinese names
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Or use stage 4 to cut Chinese names
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Use stage 4 to cut Chinese addresses or English names
suffix = SuffixCutting()
cuttor.add_stage(suffix)
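
Continuing the script, a short hedged sketch of actually cutting the sample string; it assumes cut() expects a Unicode string, as the decode() call in the YahaTokenizer example suggests:

# Illustrative only: segment the sample string and join the tokens with '/'.
print(u'/'.join(cuttor.cut(str.decode('utf-8'))))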
Example #16
    global STOP_WORDS
    stop_words = []
    for t, v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)


__init_stop_words()

accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())


class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = _cuttor.tokenize(text, search=True)
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # keep multi-character tokens even when they are not pure Chinese
            if not accepted_chars.match(w) and len(w) <= 1:
                continue
Example #17
# -*- coding=utf-8 -*-
import sys, re, codecs
import cProfile
from yaha import Cuttor, RegexCutting, SurnameCutting, SurnameCutting2, SuffixCutting
from yaha.wordmaker import WordDict
from yaha.analyse import extract_keywords, near_duplicate, summarize1, summarize2, summarize3

str = '唐成真是唐成牛的长寿乡是个1998love唐成真诺维斯基'
cuttor = Cuttor()

# Get 3 shortest paths for choise_best
#cuttor.set_topk(3)

# Use stage 1 to cut English words and numbers
cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I|re.U))

# Or use stage 2 to cut English words and numbers
#cuttor.add_stage(RegexCutting(re.compile('\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile('[a-zA-Z]+', re.I|re.U)))

# Use stage 3 to cut Chinese names
#surname = SurnameCutting()
#cuttor.add_stage(surname)