def summarize3(txt, cuttor=None):
    # TODO how to do this better 21/08/13 13:07:22
    # You can replace the dynamic __import__ below with a plain "import numpy";
    # of course you have to install the numpy lib first.
    name = "numpy"
    numpy = __import__(name, fromlist=[])

    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    # TODO do it better 21/08/13 12:36:08
    for s, need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)

    normalized_sentences = [s.lower() for s in sentences]
    # N (like N_2/N_3 below) is a module-level keyword-count constant
    # defined elsewhere in the full source.
    top_n_words = extract_keywords(txt, N, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    # Keep sentences scoring more than half a standard deviation above the mean.
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return ', '.join(mean_scored_summary) + '.'
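# Illustration (not part of the original source): the filter above keeps
# sentences scoring more than half a standard deviation above the mean.
# With made-up scores, assuming numpy is installed:
#   import numpy
#   scores = [0.2, 0.4, 0.9, 0.3]
#   avg, std = numpy.mean(scores), numpy.std(scores)   # 0.45, ~0.27
#   kept = [s for s in scores if s > avg + 0.5 * std]  # [0.9]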
def summarize1(original_text, summary_size=8, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    # Pass tmp_cuttor here; the original passed the possibly-None `cuttor`,
    # forcing extract_keywords to build a second Cuttor.
    words_sorted = extract_keywords(original_text, 16, tmp_cuttor)

    summary_set = {}
    sentences = []
    for s, need in tmp_cuttor.cut_to_sentence(original_text):
        if need:
            sentences.append(s)

    # For each keyword, take the first sentence that mentions it, until the
    # summary is large enough.
    for word in words_sorted:
        matching_sentence = __search_word(sentences, word)
        if matching_sentence != '':  # the deprecated `<>` operator replaced with `!=`
            summary_set[matching_sentence] = 1
            if len(summary_set) >= summary_size:
                break

    # Emit the chosen sentences in document order.
    summary = [s for s in sentences if s in summary_set]
    return ', '.join(summary) + '.'
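# Hedged usage sketch (`doc` is illustrative, not from the source):
#   doc = u'...some Chinese article...'
#   print summarize1(doc, summary_size=5)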
def near_duplicate(content1, content2, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    # file_words maps word -> [count in content1, count in content2]
    file_words = {}
    stopwords = [get_dict(DICTS.STOP_SENTENCE),
                 get_dict(DICTS.EXT_STOPWORD),
                 get_dict(DICTS.STOPWORD)]

    seg_list = tmp_cuttor.cut(content1)
    for w in seg_list:
        lw = w.lower()
        if any(stopword.has_key(lw) for stopword in stopwords):
            continue
        if lw not in file_words:
            file_words[lw] = [1, 0]
        else:
            file_words[lw][0] += 1

    seg_list = tmp_cuttor.cut(content2)
    for w in seg_list:
        lw = w.lower()
        if any(stopword.has_key(lw) for stopword in stopwords):
            continue
        if lw not in file_words:
            file_words[lw] = [0, 1]
        else:
            file_words[lw][1] += 1

    # Cosine similarity of the two count vectors; sqrt comes from math
    # (imported at module top in the full source).
    sum_2 = 0
    sum_file1 = 0
    sum_file2 = 0
    for word in file_words.values():
        sum_2 += word[0] * word[1]
        sum_file1 += word[0] ** 2
        sum_file2 += word[1] ** 2

    # Guard added: avoid division by zero when no content words survive.
    if sum_file1 == 0 or sum_file2 == 0:
        return 0.0
    rate = sum_2 / sqrt(sum_file1 * sum_file2)
    return rate
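# The rate returned above is the cosine similarity of the two term-count
# vectors: dot(a, b) / (|a| * |b|). A tiny hand-built example (illustrative
# values, not from the source):
#   from math import sqrt
#   a, b = [1, 2, 0], [1, 1, 1]
#   dot = sum(x * y for x, y in zip(a, b))                                   # 3
#   print dot / (sqrt(sum(x * x for x in a)) * sqrt(sum(y * y for y in b)))  # ~0.775
#   # identical texts give 1.0, disjoint vocabularies give 0.0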
def extract_keywords(content, topk=18, cuttor=None):
    stopwords = get_dict(DICTS.STOPWORD)
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        # support for number and english 21/08/13 08:43:23
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    words = tmp_cuttor.cut(content)
    freq = {}
    total = 0
    for word in words:
        if len(word.strip()) < 2:
            continue
        lower_word = word.lower()
        if stopwords.has_key(lower_word):
            continue
        # TODO only leave the 'n' word? 21/08/13 09:13:36
        if tmp_cuttor.exist(lower_word) and not tmp_cuttor.word_type(lower_word, 'n'):
            continue
        total += 1
        if lower_word in freq:  # the original tested `word`, missing repeats of mixed-case words
            freq[lower_word] += 1
        else:
            freq[lower_word] = 1

    # float(total) avoids Python 2 integer division truncating every tf to 0.
    freq = [(k, v / float(total)) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)

    top_tuples = st_list[:topk]
    keys = [a[1] for a in top_tuples]
    return keys
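# How the ranking above works: tf is count / total kept tokens, multiplied by
# idf_freq[k] (falling back to median_idf for unseen words). Worked example
# with illustrative numbers:
#   tf = {'apple': 2, 'pie': 1}; total = 3.0
#   idf = {'apple': 1.2}; median_idf = 2.0
#   scores = [((c / total) * idf.get(w, median_idf), w) for w, c in tf.items()]
#   # 'apple' scores 0.8, 'pie' scores ~0.67, so 'apple' ranks first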
def summarize2(txt, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    for s in cut_sentence(txt):
        sentences.append(s)

    normalized_sentences = [s.lower() for s in sentences]
    top_n_words = extract_keywords(txt, N_2, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    # Keep the TOP_SENTENCES highest-scoring sentences, then restore
    # document order by sentence index.
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    top_n_summary = [sentences[idx] for (idx, score) in top_n_scored]
    #return ', '.join(top_n_summary) + '.'
    return u'。 '.join(top_n_summary) + u'。 '
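# The double sort above first keeps the TOP_SENTENCES best scores, then
# restores document order. Illustration with made-up (index, score) pairs:
#   scored = [(0, 0.1), (1, 0.9), (2, 0.4), (3, 0.7)]
#   top2 = sorted(sorted(scored, key=lambda s: s[1])[-2:], key=lambda s: s[0])
#   # -> [(1, 0.9), (3, 0.7)]: the two best sentences, in original order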
def summarize3(txt, cuttor=None):
    # numpy removed; mean and std are computed by our own _mean_std 21/08/13 13:07:22
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    for s in cut_sentence(txt):
        sentences.append(s)

    normalized_sentences = [s.lower() for s in sentences]
    top_n_words = extract_keywords(txt, N_3, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    avg, std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    #return ', '.join(mean_scored_summary) + '.'
    return u'。 '.join(mean_scored_summary) + u'。 '
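# _mean_std is referenced above but not included in this excerpt. A minimal
# pure-Python sketch of what it presumably computes (population mean and
# standard deviation); the name and exact behavior are assumptions:
def _mean_std_sketch(values):
    from math import sqrt
    n = float(len(values))  # assumes a non-empty list
    mean = sum(values) / n
    var = sum((v - mean) ** 2 for v in values) / n
    return mean, sqrt(var)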
def summarize2(txt, cuttor=None):
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    # TODO do it better 21/08/13 12:36:08
    for s, need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)

    normalized_sentences = [s.lower() for s in sentences]
    top_n_words = extract_keywords(txt, N_2, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    top_n_summary = [sentences[idx] for (idx, score) in top_n_scored]
    return ', '.join(top_n_summary) + '.'
def summarize3(txt, cuttor=None):
    # numpy removed; mean and std are computed by our own _mean_std 21/08/13 13:07:22
    if cuttor:
        tmp_cuttor = cuttor
    else:
        tmp_cuttor = Cuttor()
        tmp_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

    sentences = []
    # TODO do it better 21/08/13 12:36:08
    for s, need in tmp_cuttor.cut_to_sentence(txt):
        if need:
            sentences.append(s)

    normalized_sentences = [s.lower() for s in sentences]
    top_n_words = extract_keywords(txt, N_3, tmp_cuttor)
    scored_sentences = __score_sentences(normalized_sentences, top_n_words, tmp_cuttor)

    avg, std = _mean_std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    mean_scored_summary = [sentences[idx] for (idx, score) in mean_scored]
    return ', '.join(mean_scored_summary) + '.'
class YahaTokenizer(Tokenizer, Component):
    name = "tokenizer_yaha"

    provides = ["tokens"]

    cuttor = Cuttor()

    def __init__(self):
        pass

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["yaha"]

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        if config['language'] != 'zh':
            raise Exception("tokenizer_yaha is only used for Chinese. "
                            "Check your configure json file.")
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        message.set("tokens", self.tokenize(message.text))

    def tokenize(self, text):
        # type: (Text) -> List[Token]
        tokenized = self.cuttor.tokenize(text.decode('utf-8'), search=True)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
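# Hedged usage sketch outside of Rasa (Python 2: tokenize() decodes a UTF-8
# byte string; the Token attribute names below follow the Token(word, start)
# call above and are assumptions):
#   tokenizer = YahaTokenizer()
#   for t in tokenizer.tokenize('自然语言处理很有趣'):
#       print t.text, t.offset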
STOP_WORDS = None

def __init_stop_words():
    global STOP_WORDS
    stop_words = []
    for t, v in get_dict(DICTS.EXT_STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOPWORD).iteritems():
        stop_words.append(t)
    for t, v in get_dict(DICTS.STOP_SENTENCE).iteritems():
        stop_words.append(t)
    STOP_WORDS = frozenset(stop_words)

__init_stop_words()

# Match runs of CJK unified ideographs.
accepted_chars = re.compile(ur"[\u4E00-\u9FA5]+")

_cuttor = Cuttor()
_cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
_cuttor.add_stage(SurnameCutting())
_cuttor.add_stage(SuffixCutting())

class ChineseTokenizer(Tokenizer):
    def __call__(self, text, **kargs):
        words = _cuttor.tokenize(text, search=True)
        token = Token()
        for (w, start_pos, stop_pos) in words:
            # Skip single non-Chinese characters; keep longer runs.
            if not accepted_chars.match(w):
                if len(w) > 1:
                    pass
                else:
                    continue
            token.original = token.text = w
            # Assumed completion of the truncated source: record position and
            # character offsets, then yield the token (whoosh-style).
            token.pos = start_pos
            token.startchar = start_pos
            token.endchar = stop_pos
            yield token
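# Hedged sketch of plugging the tokenizer into a whoosh schema (standard
# whoosh API assumed, untested here):
#   from whoosh.fields import Schema, TEXT
#   schema = Schema(content=TEXT(analyzer=ChineseTokenizer()))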
#encoding:utf-8
'''
Created on Feb 27, 2015

@author: root
'''
import sys, re, codecs
import cProfile
from yaha import Cuttor, RegexCutting, SurnameCutting, SurnameCutting2, SuffixCutting
from yaha.wordmaker import WordDict
from yaha.analyse import extract_keywords, near_duplicate, summarize1, summarize2, summarize3

cuttor = Cuttor()

def init():
    cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))
    surname = SurnameCutting2()
    cuttor.add_stage(surname)
    suffix = SuffixCutting()
    cuttor.add_stage(suffix)

def cutstring(str, wordsmap, wordsweight):
    # Count word frequencies for one string into wordsmap.
    seglist = cuttor.cut(str)
    for value in list(seglist):
        word = value.encode('utf-8')
        print word
        if not wordsmap.has_key(word):
            wordsmap[word] = 0
        wordsmap[word] += 1
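# Hedged usage sketch (wordsweight is accepted but unused by cutstring above):
#   wordsmap, wordsweight = {}, {}
#   init()
#   cutstring(u'唐成真是唐成牛', wordsmap, wordsweight)
#   # wordsmap now maps each UTF-8 word to its count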
write_db()
add_doc()
search_db()
key_all()

########NEW FILE########
__FILENAME__ = test_cuttor
# -*- coding=utf-8 -*-
import sys, re, codecs
import cProfile
from yaha import Cuttor, RegexCutting, SurnameCutting, SurnameCutting2, SuffixCutting
from yaha.wordmaker import WordDict
from yaha.analyse import extract_keywords, near_duplicate, summarize1, summarize2, summarize3

str = '唐成真是唐成牛的长寿乡是个1998love唐成真诺维斯基'

cuttor = Cuttor()

# Get 3 shortest paths for choise_best
#cuttor.set_topk(3)

# Use stage 1 to cut english and number
cuttor.set_stage1_regex(re.compile('(\d+)|([a-zA-Z]+)', re.I | re.U))

# Or use stage 2 to cut english and number
#cuttor.add_stage(RegexCutting(re.compile('\d+', re.I|re.U)))
#cuttor.add_stage(RegexCutting(re.compile('[a-zA-Z]+', re.I|re.U)))

# Use stage 3 to cut chinese name
#surname = SurnameCutting()
#cuttor.add_stage(surname)

# Or use stage 4 to cut chinese name
surname = SurnameCutting2()
cuttor.add_stage(surname)

# Use stage 4 to cut chinese address or english name
suffix = SuffixCutting()
cuttor.add_stage(suffix)
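# Hedged continuation sketch: with the stages configured above, segmenting the
# sample string is just iterating cuttor.cut(); under Python 2 the byte string
# is decoded first, and exact output depends on the dictionaries loaded:
#   for word in cuttor.cut(str.decode('utf-8')):
#       print word,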