from pke.unsupervised import TopicRank


def automatic_labelling(df, col='corpus', label_col='labels'):
    """Name each cluster of documents with its top TopicRank keyphrases."""
    mapping = {}
    for label in df[label_col].unique():
        extractor = TopicRank()
        # concatenate all documents that share this label into one text
        tx = ' '.join(df[df[label_col] == label][col].tolist())
        extractor.load_document(input=tx, language="en", normalization=None)
        # select the keyphrase candidates, for TopicRank the longest sequences
        # of nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
        # weight the candidates using a random walk. The threshold parameter
        # sets the minimum similarity for clustering, and the method parameter
        # defines the linkage method
        extractor.candidate_weighting(threshold=0.5, method='average')
        # build the cluster name from the top 5 of the 10 highest-scored candidates
        cluster_labels = ' | '.join(
            [keyphrase for keyphrase, score
             in extractor.get_n_best(n=10, stemming=True)][:5])
        mapping[label] = cluster_labels
        print('-' * 30)
        print(cluster_labels)
    df['cluster_name'] = df[label_col].map(mapping)
    return df

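# Hypothetical usage of automatic_labelling (not from the original source); it
# assumes pandas plus pke with the spaCy English model are installed, and the
# toy DataFrame below is invented for illustration.
import pandas as pd

toy_df = pd.DataFrame({
    "corpus": ["the hotel room was spotless", "great location near the beach",
               "the battery drains far too fast", "the screen cracked after a week"],
    "labels": [0, 0, 1, 1],
})
toy_df = automatic_labelling(toy_df)
print(toy_df[["labels", "cluster_name"]].drop_duplicates())
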
from pke.unsupervised import TopicRank


def keywordExtractor(text):
    """Extract keyphrases and their TopicRank scores from a text string."""
    extractor = TopicRank()
    extractor.load_document(input=text, language="en", normalization=None)
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
    extractor.candidate_weighting(threshold=0.5, method='average')
    # return every candidate with its score as a {keyphrase: score} dict
    keyword_scores = dict(
        extractor.get_n_best(n=len(extractor.candidates), stemming=True))
    return keyword_scores

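# Minimal usage sketch for keywordExtractor (not from the original source);
# it assumes pke and the spaCy English model are installed, and the sample
# text is invented for illustration.
sample = ("Keyphrase extraction identifies the single or multi-word "
          "expressions that best represent the main topics of a document.")
scores = keywordExtractor(sample)
for phrase, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(phrase, round(score, 4))
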
from pke.unsupervised import TopicRank

# TOP_N_KEYWORDS is a module-level constant defined elsewhere in the original file


def topic_rank_kw_extraction(temp_file, text):
    # save the text to a temporary '.txt' file (the only way it works with
    # this version of the pke library)
    with open(temp_file, "w", encoding="utf-8") as f:
        f.write(text)
    tr = TopicRank()
    tr.load_document(temp_file)
    # default candidate selection and weighting parameters
    tr.candidate_selection()
    tr.candidate_weighting()
    keywords = tr.get_n_best(n=TOP_N_KEYWORDS)
    keys = [kw for kw, _ in keywords]
    return keys

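# Illustrative call of topic_rank_kw_extraction (not from the original source);
# TOP_N_KEYWORDS is defined elsewhere in the original module, so an assumed
# value is set here, and the file name and text are invented. Note that the
# temporary file is left on disk.
TOP_N_KEYWORDS = 10  # assumed value for this sketch
print(topic_rank_kw_extraction("topicrank_tmp.txt",
                               "TopicRank groups candidate phrases into topics "
                               "and ranks the topics with a random walk."))
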
from pke.unsupervised import TopicRank


def get_terms(row_data):
    """Return the top TopicRank keyphrases for each group of texts."""
    clusters = []
    for key, val in row_data.items():
        extractor = TopicRank()
        text = ". ".join(val)
        extractor.load_document(input=text)
        try:
            extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
            extractor.candidate_weighting(threshold=0.74, method='average')
            top_words = [key_phrases for key_phrases, score
                         in extractor.get_n_best(n=10)]
            clusters.append({"size": len(val), "topWords": top_words})
        except ValueError:
            # pke raises ValueError when the text yields no candidates
            clusters.append({"size": len(val), "topWords": []})
    return clusters

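# Hypothetical input for get_terms (not from the original source); the assumed
# shape is {group_id: [text, ...]}, and the example texts are invented.
example_rows = {
    "cluster_0": ["The battery life is excellent", "The battery charges quickly"],
    "cluster_1": ["The screen is too dim outdoors"],
}
print(get_terms(example_rows))
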
from nltk.corpus import stopwords
from pke.unsupervised import TopicRank


def key_phrase_extract(posts, number_of_candidates):
    # extract the topics from the given header and paragraph texts
    extractor = TopicRank()
    print("key_phrase extraction started")
    data_hp = " ".join(posts)
    # write the extracted header and paragraph text to a .txt file, as this
    # pke setup reads its input from a text file
    with open('temp_text.txt', 'w', encoding='utf-8') as f:
        f.write(data_hp)
    extractor.load_document(input='temp_text.txt',
                            language="en",
                            max_length=10000000,
                            normalization='stemming')
    # get the stop words list
    stoplist = stopwords.words('english')
    # select the keyphrase candidates, for TopicRank the longest sequences of
    # nouns and adjectives
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'},
                                  stoplist=stoplist)
    # weight the candidates using a random walk. The threshold parameter sets
    # the minimum similarity for clustering, and the method parameter defines
    # the linkage method
    try:
        extractor.candidate_weighting(threshold=0.74, method='average')
    except ValueError:
        # pke raises ValueError if the corpus is empty or not valid
        print("Observations set is empty or not valid")
        return []
    # collect the n highest-scored candidates
    kpe_results = []
    for keyphrase, score in extractor.get_n_best(n=number_of_candidates,
                                                 stemming=True):
        kpe_results.append([keyphrase, score])
    print("key phrase extraction completed")
    print(kpe_results)
    return kpe_results

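# Hypothetical call of the key_phrase_extract above (not from the original
# source); the posts are invented for illustration.
key_phrase_extract(["Great location near the old town.",
                    "The host was friendly and the room was clean."],
                   number_of_candidates=5)
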
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor
extractor = TopicRank()

# load the content of the document, here in CoreNLP XML format
# the input language is set to English (used for the stoplist)
# normalization is set to stemming (computed with Porter's stemming algorithm)
extractor.load_document(input='C-1.xml',
                        language="en",
                        normalization='stemming')

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the 10 highest-scored candidates
for keyphrase, score in extractor.get_n_best(n=10, stemming=True):
    print(keyphrase, score)

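# A minimal variant of the example above on a raw string (not from the
# original source); recent pke releases let load_document take plain text
# directly instead of a file path, and the sample text is invented.
extractor = TopicRank()
extractor.load_document(input="TopicRank clusters candidate phrases into "
                              "topics and ranks the topics with a random "
                              "walk over the topic graph.",
                        language="en")
extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
extractor.candidate_weighting(threshold=0.74, method='average')
for keyphrase, score in extractor.get_n_best(n=5):
    print(keyphrase, score)
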
from nltk.corpus import stopwords
from pke.unsupervised import TopicRank

# refer_collection is a project-local helper (assumed imported elsewhere) that
# returns the MongoDB collection holding the crawled entries


def key_phrase_extract(entry_id, number_of_candidates):
    # extract the header text of the given data entry and extract topics from it
    extractor = TopicRank()
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("key_phrase extraction started", str(data[0]['_id']), data[0]['link'])
    try:
        # do topic extraction on the paragraph and header text
        h_data = data[0]["header_text"]
        data_hp = " ".join(h_data)
        # write the extracted header and paragraph text to a .txt file, as
        # this pke setup reads its input from a text file
        with open('temp_text.txt', 'w', encoding='utf-8') as f:
            f.write(data_hp)
        extractor.load_document(input='temp_text.txt',
                                language="en",
                                max_length=10000000,
                                normalization='stemming')
        # get the stop words list
        stoplist = stopwords.words('english')
        # select the keyphrase candidates, for TopicRank the longest sequences
        # of nouns and adjectives
        extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'},
                                      stoplist=stoplist)
        # weight the candidates using a random walk. The threshold parameter
        # sets the minimum similarity for clustering, and the method parameter
        # defines the linkage method
        extractor.candidate_weighting(threshold=0.74, method='average')
        # collect the n highest-scored candidates
        kpe_results = []
        for keyphrase, score in extractor.get_n_best(n=number_of_candidates,
                                                     stemming=True):
            kpe_results.append([keyphrase, score])
        print("key phrase extraction completed")
        kpe_words = [i[0] for i in kpe_results]
        print(kpe_words)
        mycol.update_one({'_id': entry_id},
                         {'$set': {'kpe_results': kpe_words}})
        print("Successfully extended the data entry with kpe results", entry_id)
    except Exception:
        # handle the case where the corpus is empty or not valid
        print("Observations set is empty or not valid")
        mycol.update_one({'_id': entry_id}, {'$set': {'kpe_results': []}})
        return "Observations set is empty or not valid"

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank via the older pke 1.x (Python 2) API
from pke.unsupervised import TopicRank

# create a TopicRank extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = TopicRank(input_file='C-1.xml', language='english')

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter allows choosing between CoreNLP lemmas and stems
# computed with nltk
extractor.read_document(format='corenlp', use_lemmas=False)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(
    pos=['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the 10 highest-scored candidates, separated by semicolons
print(';'.join([u for u, v in extractor.get_n_best(n=10)]).encode('utf-8'))

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank via the older pke 1.x (Python 2) API
from pke.unsupervised import TopicRank

# create a TopicRank extractor and set the input language to English (used for
# the stoplist in the candidate selection method)
extractor = TopicRank(input_file='C-1.xml', language='english')

# load the content of the document, here in CoreNLP XML format
# the use_lemmas parameter allows choosing between CoreNLP lemmas and stems
# computed with nltk
extractor.read_document(format='corenlp', use_lemmas=False)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(
    pos=['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'])

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74, method='average')

# print the 10 highest-scored candidates
for keyphrase, score in extractor.get_n_best(n=10):
    print(keyphrase, score)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# this example uses TopicRank
from pke.unsupervised import TopicRank

# create a TopicRank extractor
extractor = TopicRank()

# load the content of the document: the commented-out call loads an English
# plain-text file with stemming normalization (computed with Porter's stemming
# algorithm), while the active call loads a Thai plain-text file with no
# normalization
# extractor.load_document(input='examples/C-2.txt',
#                         language="en",
#                         normalization='stemming')
extractor.load_document(input='examples/thai_sentence.txt',
                        language="th",
                        normalization=None)

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos={'NOUN', 'ADJ'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.5, method='average')

# print the 5 highest-scored candidates
for keyphrase, score in extractor.get_n_best(n=5, stemming=False):
    print(keyphrase, score)

import os
import sys

import joblib
import numpy
from pke.unsupervised import TopicRank

# Data and dataUtil are project-local helpers (assumed imported elsewhere)


def main():
    data = Data()
    data.parse_input_data()
    list_id = sys.argv[1]
    review_list = data.list_id_to_reviews.get(list_id)
    if not review_list:
        print("This listing has no review. Aborting...")
        return
    print("Total # of reviews: ", len(review_list))
    for i in range(len(review_list)):
        print("Review #{}: {}".format(i + 1, review_list[i]))
    # use a pre-trained SVM and n-gram vectorizer that were trained on 8000 samples
    loaded_svm = joblib.load("./trainedModel/svm_8000_1576207965.joblib")
    transformed_input = dataUtil.one_func_transform(
        review_list, "./trainedModel/ngram_8000_1576207965.joblib")
    predictions = loaded_svm.predict(transformed_input)
    print("==========================================================")
    # sentiment score predictions
    print("average sentiment score: ", numpy.mean(predictions))
    # dump all review text for the target listing to a temporary file
    with open('temp_review_text.txt', 'w') as txt_file:
        for review in review_list:
            txt_file.write(review)
    # create a TopicRank extractor
    extractor = TopicRank()
    extractor.load_document(input='temp_review_text.txt', language="en")
    # select the keyphrase candidates, for TopicRank the longest sequences of
    # nouns and adjectives
    extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})
    # weight the candidates using a random walk. The threshold parameter sets
    # the minimum similarity for clustering, and the method parameter defines
    # the linkage method
    extractor.candidate_weighting(threshold=0.8, method='average',
                                  heuristic='frequent')
    count_output = 0
    filter_words = ['place', 'room', 'journey', 'trip']
    # print the first three of the 10 highest-scored candidates, skipping
    # overly generic words
    print("Top three key words (phrases):")
    for keyphrase, _ in extractor.get_n_best(n=10, stemming=True):
        if count_output >= 3:
            break
        if keyphrase not in filter_words:
            print(keyphrase)
            count_output += 1
    os.remove("temp_review_text.txt")

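# Entry-point guard (not in the original snippet) so the script can be run as
# `python <script>.py <listing_id>`.
if __name__ == "__main__":
    main()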