Code Example #1
def extract_key_terms(doc, num_terms=50, ngrams=(1, 2, 3), algo='yake'):
    """Compute the most important terms in a post.

    This is particularly useful for search and for finding related posts.

    Args:
        doc (spacy.tokens.Doc): Doc to extract terms from.
        num_terms (int, optional): How many terms to return. Defaults to 50.
        ngrams (tuple, optional): Which sizes of ngrams to consider.
        algo (str, optional): Which algorithm to use to find key terms.
    """
    if not len(doc):
        return []

    # special case for very short docs
    if len(doc) < 3:
        return softmax(str(doc).split(' '))

    if algo == 'textrank':
        return softmax(textrank(doc, n_keyterms=num_terms))
    elif algo == 'yake':
        return softmax(yake(doc, ngrams=ngrams, topn=num_terms),
                       reverse=True)
    elif algo == 'scake':
        return softmax(scake(doc, topn=num_terms))
    elif algo == 'sgrank':
        return softmax(sgrank(doc, ngrams=ngrams,
                              n_keyterms=num_terms))
    else:
        raise ValueError('Unknown key term extraction method: %s' % algo)
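
Code Example #1 leans on a module-level softmax helper (and the textrank/yake/scake/sgrank extractors) imported elsewhere in its project. As a rough standalone sketch of the same idea, calling textacy's YAKE extractor directly (assuming textacy >= 0.10, where the extractors live in textacy.ke, plus spaCy's en_core_web_sm model; the sample text is illustrative) might look like:

import spacy
from textacy.ke import yake  # textacy's YAKE extractor; module path varies by textacy version

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Key term extraction pulls the most informative words and phrases out of "
    "a blog post so that they can drive search and related-post lookups."
)

# yake() returns (term, score) pairs; for YAKE, lower scores mean more important terms
for term, score in yake(doc, ngrams=(1, 2, 3), topn=10):
    print(f"{score:.4f}\t{term}")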
Code Example #2
def get_vote_ranking(Q_A, include_POS=('NOUN', 'PROPN'), topN=0.99):
    """Rank terms in ``Q_A`` by pooling the positional votes of four textacy
    extractors (TextRank, YAKE, SGRank, sCAKE) and weighting by inverse rank."""
    # pd.DataFrame(textrank(Q_A, normalize='lemma', include_pos=('NOUN','PROPN'), window_size=2, edge_weighting='binary', position_bias=False, topn=0.99)).sort_values(by=1, ascending=False)
    x = pd.DataFrame(
        textrank(Q_A,
                 normalize='lemma',
                 include_pos=include_POS,
                 window_size=3,
                 edge_weighting='binary',
                 position_bias=False,
                 topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('textrank_')
    y = pd.DataFrame(
        yake(Q_A,
             normalize='lemma',
             ngrams=(1, 2, 3, 4, 5),
             include_pos=include_POS,
             window_size=3,
             topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'],
            ascending=False).reset_index(drop=True).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('yake_')
    z = pd.DataFrame(
        sgrank(
            Q_A,
            normalize='lemma',
            ngrams=(1, 2, 3, 4, 5),
            include_pos=include_POS,
            window_size=1500,
            topn=topN,
            idf=None,
        ),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('sgrank_')
    h = pd.DataFrame(
        scake(Q_A, normalize='lemma', include_pos=include_POS, topn=topN),
        columns=['NOUN', 'rank']).sort_values(
            by=['rank'], ascending=False).reset_index().set_index(
                keys=['NOUN']).drop(columns=['rank']).add_prefix('scake_')

    # combine the per-extractor vote columns; terms an extractor missed get that
    # extractor's mean vote, then the votes are summed and sorted best-first
    d = pd.concat([x, y, z, h], axis=1)
    dd = d.fillna(d.mean(axis=0)).sum(axis=1).sort_values().reset_index().drop(
        columns=0).reset_index()
    dd.columns = ['rank', 'NOUN']
    dd['rank'] = 1 / (1 + dd['rank'])  # inverse weight: earlier (better) terms score higher
    return dd[['NOUN', 'rank']]
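
Code Example #2 presupposes that pandas and the four textacy extractors are already imported at module level. A hedged sketch of those imports and a call (again assuming textacy >= 0.10 and spaCy's en_core_web_sm model; the sample text is illustrative) could be:

import pandas as pd
import spacy
from textacy.ke import scake, sgrank, textrank, yake

nlp = spacy.load("en_core_web_sm")
Q_A = nlp(
    "Which key term extraction algorithm works best on short question-and-answer "
    "text? TextRank, YAKE, SGRank, and sCAKE each rank candidate noun phrases "
    "differently, so pooling their votes can give a more robust ranking."
)

# best-voted terms come first, with an inverse-rank weight in the 'rank' column
ranking = get_vote_ranking(Q_A, include_POS=("NOUN", "PROPN"), topN=0.99)
print(ranking.head(10))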
Code Example #3
def process_file(key, input_filename, output_filename):

    # open the given file and read it; line breaks are unwrapped below
    with open(input_filename, 'r') as f:
        text = f.read()

    # consolidate whitespace
    text = re.sub(r'\n+', ' ', text)

    doc = nlp(text)

    with open(output_filename, 'w') as handle:
        # output a header
        print("id\tkeyword", file=handle)

        # process each keyword, skipping very short ones, and output it
        for keyword, score in yake(doc, ngrams=NGRAMS, topn=TOPN):
            if len(keyword) < 3:
                continue
            print("\t".join([key, keyword]), file=handle)
Code Example #4
File: txt2keywords.py, Project: aculich/reader
# sanity check
if len(sys.argv) != 2:
    sys.stderr.write('Usage: ' + sys.argv[0] + " <file>\n")
    quit()

# initialize
file = sys.argv[1]

# open the given file and unwrap it
text = open(file).read()
text = textacy.preprocessing.normalize.normalize_quotation_marks(text)
text = textacy.preprocessing.normalize.normalize_hyphenated_words(text)
text = textacy.preprocessing.normalize.normalize_whitespace(text)

# compute the identifier
id = os.path.basename(os.path.splitext(file)[0])

# initialize model
maximum = len(text) + 1
model = spacy.load('en', max_length=maximum)
doc = model(text)

# output a header
print("id\tkeyword")

# process and output each keyword and done; can't get much simpler
for keyword, score in yake(doc, ngrams=NGRAMS, topn=TOPN):
    print("\t".join([id, keyword]))
exit()
Code Example #5
text = textacy.preprocessing.normalize.normalize_whitespace(text)

# compute the identifier
id = os.path.basename(os.path.splitext(file)[0])

# initialize model
maximum = len(text) + 1
model = spacy.load('en_core_web_sm', max_length=maximum)
doc = model(text)

# output a header
print("id\tkeyword")

# track found keywords to avoid duplicates
keywords = set()

# process and output each keyword with yake, will produce unigrams
for keyword, score in yake(doc, topn=TOPN):
    if keyword not in keywords:
        print("\t".join([id, keyword]))
        keywords.add(keyword)

# process and output each keyword with scake, will typically produce keyphrases
# removing lemmatization with normalize=None seems to produce better results
for keyword, score in scake(doc, normalize=None, topn=TOPN):
    if keyword not in keywords:
        print("\t".join([id, keyword]))
        keywords.add(keyword)

exit()