Example #1
from pke.unsupervised import TopicRank  # import needed by this snippet

def automatic_labelling(df, col='corpus', label_col='labels'):
    """Name each cluster with the top TopicRank keyphrases of its documents."""
    mapping = {}

    for label in df[label_col].unique():
        extractor = TopicRank()
        tx = ' '.join(df[df[label_col] == label][col].tolist())

        extractor.load_document(input=tx, language="en", normalization=None)

        # select the keyphrase candidates, for TopicRank the longest sequences of
        # nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

        # weight the candidates using a random walk. The threshold parameter sets the
        # minimum similarity for clustering, and the method parameter defines the
        # linkage method
        extractor.candidate_weighting(threshold=0.5, method='average')

        # take the 10 best keyphrases and keep the first 5 as the cluster label
        cluster_labels = ' | '.join(
            [keyphrase for keyphrase, score in extractor.get_n_best(n=10, stemming=True)][:5])
        mapping[label] = cluster_labels
        print('-' * 30)
        print(cluster_labels)

    df['cluster_name'] = df[label_col].map(mapping)

    return df
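A minimal usage sketch for the helper above; the tiny DataFrame is made up purely for illustration:

import pandas as pd

# hypothetical toy data: two clusters of short documents
df = pd.DataFrame({
    'corpus': ['Solar panels convert sunlight into electricity at falling cost.',
               'Wind turbines and solar farms supply renewable electricity.',
               'Convolutional networks classify images with high accuracy.',
               'Transformer models dominate natural language processing benchmarks.'],
    'labels': [0, 0, 1, 1],
})

labelled = automatic_labelling(df, col='corpus', label_col='labels')
print(labelled[['labels', 'cluster_name']].drop_duplicates())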
Example #2
from pke.unsupervised import TopicRank  # import needed by this snippet

def keywordExtractor(text):
    """Extract keyphrases and their TopicRank scores from a text string."""

    extractor = TopicRank()
    extractor.load_document(input=text, language="en", normalization=None)
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
    extractor.candidate_weighting(threshold=0.5, method='average')
    # every candidate is returned, mapped to its TopicRank weight (not a raw count)
    keyword_counts_list = dict(extractor.get_n_best(n=len(extractor.candidates), stemming=True))
    return keyword_counts_list
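A quick, hypothetical call to the function above; the input sentence is invented:

scores = keywordExtractor("Topic modelling and keyphrase extraction are common steps "
                          "in unsupervised text mining pipelines.")
for phrase, weight in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(phrase, weight)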
Example #3
from pke.unsupervised import TopicRank  # import needed; TOP_N_KEYWORDS is defined elsewhere in the module

def topic_rank_kw_extraction(temp_file, text):
    # save the text to a temporary '.txt' file so pke can load it from disk
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(text)

    tr = TopicRank()
    tr.load_document(temp_file)
    tr.candidate_selection()
    tr.candidate_weighting()
    keywords = tr.get_n_best(n=TOP_N_KEYWORDS)
    keys = [kw for kw, _ in keywords]

    return keys
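TOP_N_KEYWORDS is a module-level constant in the original project; a hypothetical call, with an assumed value for it, could look like this:

TOP_N_KEYWORDS = 10  # assumed value, not part of the original snippet

keys = topic_rank_kw_extraction(
    "/tmp/pke_input.txt",  # hypothetical temporary file path
    "Graph-based ranking models such as TopicRank cluster candidate phrases into topics.")
print(keys)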
Example #4
from pke.unsupervised import TopicRank  # import needed by this snippet

def get_terms(row_data):
    clusters = []
    for key, val in row_data.items():
        extractor = TopicRank()
        text = ". ".join(val)
        extractor.load_document(input=text)
        try:
            extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
            extractor.candidate_weighting(threshold=0.74, method='average')
            top_words = []
            for (key_phrases, score) in extractor.get_n_best(n=10):
                #if len(key_phrases.split(" ")) <= 2:
                top_words.append(key_phrases)
                #if len(top_words) == 10: break
            clusters.append({"size": len(val), "topWords": top_words})
        except Exception:
            # candidate selection/weighting can fail on empty or very short clusters
            clusters.append({"size": len(val), "topWords": []})

    return clusters
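A sketch of how get_terms might be called; row_data here is an invented mapping from cluster ids to lists of sentences:

row_data = {
    'cluster_0': ['Electric cars reduce urban air pollution.',
                  'Battery prices keep falling every year.'],
    'cluster_1': ['The championship final went to extra time.',
                  'The home striker scored twice in the second half.'],
}

for cluster in get_terms(row_data):
    print(cluster['size'], cluster['topWords'])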
Example #5
from nltk.corpus import stopwords  # imports needed by this snippet
from pke.unsupervised import TopicRank

def key_phrase_extract(
    posts, number_of_candidates
):  # join the given text fragments and extract the top keyphrases from them
    extractor = TopicRank()
    print("key_phrase extraction started")
    data_hp = " ".join(posts)
    # write the joined text to a .txt file so the extractor can load it from disk
    with open('temp_text.txt', 'w', encoding='utf-8') as f:
        f.write(data_hp)

    extractor.load_document(
        input='temp_text.txt',
        language="en",
        max_length=10000000,  #load text file
        normalization='stemming')
    #get stop words list
    stoplist = stopwords.words('english')

    # select the keyphrase candidates, for TopicRank the longest sequences of
    # nouns and adjectives
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'},
                                  stoplist=stoplist)

    # weight the candidates using a random walk. The threshold parameter sets the
    # minimum similarity for clustering, and the method parameter defines the
    # linkage method
    try:
        extractor.candidate_weighting(threshold=0.74, method='average')
    except ValueError:  #handling exceptions if corpus is empty
        print("Observations set is empty or not valid")

    # collect the number_of_candidates highest-scored candidates
    kpe_results = []
    for (keyphrase, score) in extractor.get_n_best(n=number_of_candidates,
                                                   stemming=True):
        kpe_results.append([keyphrase, score])
    print("key phrase extraction completed")
    print(kpe_results)
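A hypothetical call for the variant above; posts is simply a list of text fragments, and the function prints its results rather than returning them:

posts = ['Cloud providers offer managed Kubernetes clusters for container workloads.',
         'Serverless functions scale automatically with incoming traffic.']
key_phrase_extract(posts, number_of_candidates=10)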
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor
extractor = TopicRank()

# load the content of the document, here in CoreNLP XML format
# the input language is set to English (used for the stoplist)
# normalization is set to stemming (computed with Porter's stemming algorithm)
extractor.load_document(input='C-1.xml',
                        language="en",
                        normalization='stemming')

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10, stemming=True):
    print(keyphrase, score)
Example #7
from nltk.corpus import stopwords  # imports needed by this snippet
from pke.unsupervised import TopicRank

def key_phrase_extract(
    entry_id, number_of_candidates
):  # fetch the stored header text for a data entry and extract the topics from it
    extractor = TopicRank()

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("key_phrase extraction started", str(data[0]['_id']),
          data[0]['link'])
    try:
        h_data = data[0]["header_text"]  # do topic extraction on the stored header text

        data_hp = " ".join(h_data)
        # write the extracted header text to a .txt file so the extractor can load it from disk
        with open('temp_text.txt', 'w', encoding='utf-8') as f:
            f.write(data_hp)

        extractor.load_document(
            input='temp_text.txt',
            language="en",
            max_length=10000000,  #load text file
            normalization='stemming')
        #get stop words list
        stoplist = stopwords.words('english')

        # select the keyphrase candidates, for TopicRank the longest sequences of
        # nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'},
                                      stoplist=stoplist)

        # weight the candidates using a random walk. The threshold parameter sets the
        # minimum similarity for clustering, and the method parameter defines the
        # linkage method

        extractor.candidate_weighting(threshold=0.74, method='average')

        # collect the number_of_candidates highest-scored candidates
        kpe_results = []
        for (keyphrase, score) in extractor.get_n_best(n=number_of_candidates,
                                                       stemming=True):
            kpe_results.append([keyphrase, score])
        print("key phrase extraction completed")
        # print(kpe_results)
        kpe_words = [i[0] for i in kpe_results]
        # print(kpe_words)
        print(kpe_words)
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'kpe_results': kpe_words
                         }})
        print("Successfully extended the data entry with kpe results",
              entry_id)

    except Exception:  #handling exceptions if corpus is empty
        print("Observations set is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'kpe_results': []}})
        return "Observations set is empty or not valid"
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = TopicRank(input_file='C-1.xml', language='english')

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter allows to choose using CoreNLP lemmas or stems
# computed using nltk
extractor.read_document(format='corenlp', use_lemmas=False)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(
    pos=['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the n-highest (10) scored candidates
print(';'.join([u for u, v in extractor.get_n_best(n=10)]))
Example #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = TopicRank(input_file='C-1.xml', language='english')

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter allows to choose using CoreNLP lemmas or stems
# computed using nltk
extractor.read_document(format='corenlp', use_lemmas=False)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(
    pos=['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=10):
    print(keyphrase, score)
Example #10
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor
extractor = TopicRank()

# load the content of the document, here in CoreNLP XML format
# the input language is set to English (used for the stoplist)
# normalization is set to stemming (computed with Porter's stemming algorithm)
# extractor.load_document(input='examples/C-2.txt',
#                         language="en",
#                         normalization='stemming')

extractor.load_document(input='examples/thai_sentence.txt',
                        language="th",
                        normalization=None)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos={'NOUN', 'ADJ'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.5, method='average')

# print the n-highest (10) scored candidates
for (keyphrase, score) in extractor.get_n_best(n=5, stemming=False):
    print(keyphrase, score)
Example #11
import os
import sys

import joblib
import numpy

from pke.unsupervised import TopicRank
# Data and dataUtil are helper modules from the original project; their imports are omitted here

def main():
    data = Data()
    data.parse_input_data()

    list_id = sys.argv[1]

    review_list = data.list_id_to_reviews.get(list_id)

    if not review_list:
        print("This listing has no review. Aborting...")
        return

    print("Total # of reviews: ", len(review_list))
    for i in range(len(review_list)):
        print("Review #{}: {}".format(i + 1, review_list[i]))

    # use a pre-trained svm and ngram that used 8000 data to train
    loaded_svm = joblib.load("./trainedModel/svm_8000_1576207965.joblib")
    transformed_input = dataUtil.one_func_transform(
        data.list_id_to_reviews.get(list_id),
        "./trainedModel/ngram_8000_1576207965.joblib")
    predictions = loaded_svm.predict(transformed_input)

    print("==========================================================")

    # sentiment score predictions
    print("average sentiment score: ", numpy.mean(predictions))

    with open('temp_review_text.txt', 'w') as txt_file:
        for review in data.list_id_to_reviews.get(list_id):
            txt_file.write(review)

    # create a TopicRank extractor
    extractor = TopicRank()

    # get all review text for the target listing
    extractor.load_document(input='temp_review_text.txt', language="en")

    # select the keyphrase candidates, for TopicRank the longest sequences of
    # nouns and adjectives
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

    # weight the candidates using a random walk. The threshold parameter sets the
    # minimum similarity for clustering, and the method parameter defines the
    # linkage method
    extractor.candidate_weighting(threshold=0.8,
                                  method='average',
                                  heuristic='frequent')

    count_output = 0
    filter_words = ['place', 'room', 'journey', 'trip']

    # scan the 10 best candidates and print the first three that are not generic filler words
    print("Top three key words (phrases):")

    for (keyphrase, _) in extractor.get_n_best(n=10, stemming=True):
        if count_output >= 3:
            break
        if keyphrase not in filter_words:
            print(keyphrase)
            count_output += 1

    os.remove("temp_review_text.txt")