Example #1
import cppjiebapy

# Maximum gap between two consecutive significant words before a new
# cluster starts. The value is assumed here; the original module defines
# it at top level.
CLUSTER_THRESHOLD = 5


def __score_sentences(sentences, important_words):
    scores = []

    # Segment each sentence into a token list and track its index.
    for sentence_idx, s in enumerate(list(cppjiebapy.cut(sent)) for sent in sentences):
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Record the first position at which this important word occurs
                word_idx.append(s.index(w))
            except ValueError:  # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all
        if len(word_idx) == 0:
            continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words

        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence

        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, max_cluster_score))

    return scores
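The scorer can be exercised directly once cppjiebapy is importable. A minimal sketch with made-up sentences and an assumed important_words list (in a full pipeline these would come from a frequency analysis of the document):

sentences = [u'网址安全中心是一个网址安全服务平台。',
             u'用户可以查询网址是否存在恶意行为。']
important_words = [u'网址', u'安全', u'恶意']

for idx, score in __score_sentences(sentences, important_words):
    print(idx, score)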
Example #2
import cppjiebapy

def test4():
    # Tagger is assumed to be importable from the surrounding project (a CRF-based tagger).
    corpus = [
        u'SCANV网址安全中心(http://scanv.com)是一个综合性的网址安全服务平台。通过网址安全中心,用户可以方便的查询到要访问的网址是否存在恶意行为,同时可以在SCANV中在线举报曝光违法恶意网站。',
        u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作',
    ]

    tagger = Tagger()
    for text in corpus:  # 'text' rather than 'str', to avoid shadowing the built-in
        print('\nCPPJIEBA:')
        for s in tagger.pos(list(cppjiebapy.cut(text))):
            print(s[0], '/', s[1], ' ', end='')

        print('\nCRF:')
        for s in tagger.cut_pos(text):
            print(s[0], '/', s[1], ' ', end='')
        print('\n')
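For reference, the CPPJIEBA half of the comparison can be reproduced with segmentation alone (a sketch assuming only cppjiebapy; the CRF Tagger is project-specific):

import cppjiebapy

for token in cppjiebapy.cut(u'工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'):
    print(token, end=' ')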
Example #3
import cppjiebapy

def tokenize_1(text):
    """Yield (term, start, end) character offsets for each token.

    Assumes cppjiebapy's tokens cover the input contiguously, so each
    token starts where the previous one ended.
    """
    start = 0
    for term in cppjiebapy.cut(text):
        width = len(term)
        yield (term, start, start + width)
        start += width
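A quick check of the offsets (input is made up; exact token boundaries depend on the cppjiebapy dictionary):

for term, start, end in tokenize_1(u'网址安全中心'):
    print(term, start, end)
# With a typical dictionary this might print:
#   网址 0 2
#   安全 2 4
#   中心 4 6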
Example #4
import codecs
import cppjiebapy

def test1():
    # 'filename' is assumed to be defined at module level in the original project.
    with codecs.open(filename, 'r', 'utf-8') as infile, \
         codecs.open('./jieba', 'w', 'utf-8') as out:
        for line in infile:
            out.write(' '.join(cppjiebapy.cut(line)))
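Since each written line is just the space-joined segmentation, the output can be sanity-checked by reading a line back (a minimal sketch; './jieba' matches the path used above):

import codecs

with codecs.open('./jieba', 'r', 'utf-8') as segmented:
    print(segmented.readline())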