# CLUSTER_THRESHOLD is the max distance (in words) between two significant
# words that still fall in the same cluster; 5 follows the classic Luhn-style
# summarizer (as in "Mining the Social Web") this function is modeled on.
CLUSTER_THRESHOLD = 5

def __score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1
    for s in [list(cppjiebapy.cut(s)) for s in sentences]:
        sentence_idx += 1
        word_idx = []
        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence
                word_idx.append(s.index(w))
            except ValueError:
                # w is not in this particular sentence
                pass
        word_idx.sort()
        # Some sentences may not contain any important words at all
        if len(word_idx) == 0:
            continue
        # Using the word indices, compute clusters with a max distance
        # threshold for any two consecutive words
        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)
        # Score each cluster; the max score of any cluster is the score
        # for the sentence
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster
            if score > max_cluster_score:
                max_cluster_score = score
        # Append the best cluster score (the original appended the last
        # cluster's score, which contradicted the comment above)
        scores.append((sentence_idx, max_cluster_score))
    return scores
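# A minimal usage sketch, not part of the original code: one way the
# (index, score) pairs from __score_sentences could drive an extractive
# summary. `top_n_summary` and `n` are hypothetical names introduced here
# for illustration.
import heapq

def top_n_summary(sentences, important_words, n=5):
    # Score every sentence, keep the n highest-scoring ones, then re-sort by
    # sentence index so the summary reads in document order.
    scored = __score_sentences(sentences, important_words)
    top = heapq.nlargest(n, scored, key=lambda pair: pair[1])
    return [sentences[idx] for idx, _ in sorted(top)]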
def test4():
    corpus = [
        # A passage about the SCANV URL-safety service: mixed Chinese text,
        # Latin letters, and an embedded URL.
        u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。',
        # A classic segmentation-ambiguity test sentence for Chinese tokenizers.
        u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
    ]
    tagger = Tagger()
    for text in corpus:  # renamed from `str` to avoid shadowing the built-in
        print '\nCPPJIEBA:'
        for s in tagger.pos(list(cppjiebapy.cut(text))):
            print s[0], '/', s[1], " ",
        print '\nCRF:'
        for s in tagger.cut_pos(text):
            print s[0], '/', s[1], " ",
        print '\n'
def tokenize_1(text):
    # Yield (term, start, end) character offsets for each segmented term;
    # offsets are cumulative, assuming cppjiebapy.cut returns the terms of
    # `text` in document order.
    start = 0
    for term in cppjiebapy.cut(text):
        width = len(term)
        yield (term, start, start + width)
        start += width
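# A quick sanity check, illustrative only: every (term, start, end) triple
# yielded by tokenize_1 should slice back out of the original text, under the
# same in-order assumption noted above. `check_offsets` is a hypothetical
# helper introduced here.
def check_offsets(text):
    # Fails loudly if any offset pair does not reproduce its term.
    for term, start, end in tokenize_1(text):
        assert text[start:end] == term
    return True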
def test1():
    # `filename` is assumed to be defined at module level; `with` ensures the
    # output file is flushed and closed (the original never closed it).
    with codecs.open(filename, 'r', 'utf-8') as f, \
            codecs.open('./jieba', 'w', 'utf-8') as out:
        for line in f:
            out.write(' '.join(cppjiebapy.cut(line)))