def print_representative_np(df, product, n=50):
    """Print the top-n noun phrases for a product, ranked by summed TF-IDF."""

    def _identity_tokenizer(text):
        # The noun phrases are already tokenized; pass them through unchanged.
        return text

    tfidf = TfidfVectorizer(tokenizer=_identity_tokenizer,
                            stop_words='english',
                            lowercase=False)
    try:
        result = tfidf.fit_transform(df['nounPhrases'])
    except Exception:
        # Fall back to recomputing the noun phrases when the column is missing
        # or malformed. The comprehension variable `np` shadows numpy only
        # inside the comprehension scope.
        df['posTagged'] = df['tokenizedSentences'].apply(
            lambda tokenizedSentences:
            [pos_tag(sentence) for sentence in tokenizedSentences])
        df['nounPhrases'] = df['posTagged'].apply(lambda posTagged: [
            np.lower()
            for np in flatten([extract_NP(tag) for tag in posTagged])
        ])
        result = tfidf.fit_transform(df['nounPhrases'])

    # Sum each phrase's TF-IDF weight over all reviews.
    # (sklearn >= 1.2 renames get_feature_names() to get_feature_names_out().)
    scores = zip(tfidf.get_feature_names(),
                 np.asarray(result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

    print('=' * 30 + product + '=' * 30)
    for item in sorted_scores[:n]:
        print("{0:50} Score: {1}".format(item[0], item[1]))
    print()
    print()
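# A hypothetical usage sketch (names and data are illustrative, not from the
# source); assumes the module-level `import pandas as pd`, `import numpy as np`,
# and `from sklearn.feature_extraction.text import TfidfVectorizer` that the
# function above relies on.
def _demo_print_representative_np():
    demo_df = pd.DataFrame({'nounPhrases': [['battery life', 'screen'],
                                            ['battery life', 'price']]})
    print_representative_np(demo_df, product='B000EXAMPLE', n=5)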
def getKeyPhrases(student_summaryList, sennafile, method=None,
                  MalformedFilter=False, save2file=None):
    # Read the pre-parsed SENNA output file.
    sentences = SennaParser.SennaParse(sennafile)

    phrases = []
    for s in sentences:
        if method == 'syntax':
            NPs = s.getSyntaxNP()
        else:
            NPs = []
        for np in NPs:
            # Optionally skip malformed noun phrases.
            if MalformedFilter and isMalformed(np):
                continue
            phrases.append(np.lower())
    return phrases
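# A hypothetical call sketch (the .senna path is a placeholder, not from the
# source). With method=None the function returns an empty list, since no noun
# phrases are ever collected; 'syntax' is the only mode handled above.
# phrases = getKeyPhrases(None, 'summaries/C1.senna', method='syntax',
#                         MalformedFilter=True)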
def get_position_score(keyphrase_candidate_list, position_bias):
    position_score = {}
    for i, kc in enumerate(keyphrase_candidate_list):
        np = kc[0].lower()  # candidate phrase text
        p = kc[1][0]        # start offset of the candidate (unused below)
        if np in position_score:
            # Only the first occurrence contributes; repeats add nothing.
            position_score[np] += 0.0
        else:
            position_score[np] = 1 / (float(i) + 1 + position_bias)

    # Normalize the first-occurrence weights with a softmax; relies on dicts
    # preserving insertion order (Python 3.7+).
    score_list = softmax(list(position_score.values()))
    for i, np in enumerate(position_score):
        position_score[np] = score_list[i]
    return position_score
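# `softmax` above is an assumed helper; scipy.special.softmax is one compatible
# choice. A toy call sketch (candidate shape inferred from the kc[0]/kc[1][0]
# accesses above: (phrase, (start, end)) pairs in document order):
def _demo_get_position_score():
    candidates = [('battery life', (0, 2)), ('screen', (5, 6)),
                  ('battery life', (9, 11))]
    # Earlier first occurrences get larger pre-softmax weight; the repeat of
    # 'battery life' at index 2 contributes nothing extra.
    print(get_position_score(candidates, position_bias=0.0))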
def sampleComments(data):
    data['NounChunks'] = data['Comment'].apply(lambda x: nounPhraseChunking(x))
    data['NounPhrase'] = data['NounChunks'].apply(lambda x: extractNPPhrases(x))

    # Count each noun phrase overall and per sentiment label.
    npDict = {}
    nounPhrasePos = {}
    nounPhraseNeg = {}
    nounPhraseUnc = {}
    for idx, nounPhrase in enumerate(data['NounPhrase']):
        if len(nounPhrase) == 0:
            continue
        for np in nounPhrase:
            np = np.lower()
            npDict[np] = npDict.get(np, 0) + 1
            if data.iloc[idx].Sentiment == 'POS':
                nounPhrasePos[np] = nounPhrasePos.get(np, 0) + 1
            if data.iloc[idx].Sentiment == 'NEG':
                nounPhraseNeg[np] = nounPhraseNeg.get(np, 0) + 1
            if data.iloc[idx].Sentiment == 'UNC':
                nounPhraseUnc[np] = nounPhraseUnc.get(np, 0) + 1

    df = pd.DataFrame(
        data=[npDict, nounPhraseNeg, nounPhrasePos, nounPhraseUnc]).transpose()
    df.rename(columns={0: 'All', 1: 'Neg', 2: 'Pos', 3: 'Unc'}, inplace=True)
    df.fillna(0, inplace=True)
    df['PosPercentage'] = df.eval('Pos/All')
    df['NegPercentage'] = df.eval('Neg/All')
    df['Diff'] = df.eval('PosPercentage-NegPercentage')

    # Rank noun phrases by sentiment skew, then by overall frequency, and keep
    # at most ten on each side (slicing handles shorter lists).
    topPosNP = df[df['Diff'] > 0].sort_values(by=['Diff', 'All'],
                                              ascending=False)
    topNegNP = df[df['Diff'] < 0].sort_values(by=['Diff', 'All'],
                                              ascending=[True, False])
    topPosNPList = topPosNP.index[:10].tolist()
    topNegNPList = topNegNP.index[:10].tolist()

    # Collect the comments that mention a top phrase with matching sentiment.
    indexListPos = []
    indexListNeg = []
    for idx, nounPhrase in enumerate(data['NounPhrase']):
        if len(nounPhrase) == 0:
            continue
        for np in nounPhrase:
            np = np.lower()
            if np in topPosNPList and data.iloc[idx].Sentiment == 'POS':
                indexListPos.append(idx)
            if np in topNegNPList and data.iloc[idx].Sentiment == 'NEG':
                indexListNeg.append(idx)

    samplePosComments = data.iloc[indexListPos].sort_values(by=['Polarity'],
                                                            ascending=True)
    samplePosComments.reset_index(drop=True, inplace=True)
    sampleNegComments = data.iloc[indexListNeg].sort_values(by=['Polarity'],
                                                            ascending=True)
    sampleNegComments.reset_index(drop=True, inplace=True)
    return samplePosComments, sampleNegComments, topPosNPList, topNegNPList
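# A hypothetical usage sketch (column names follow the code above: 'Comment',
# 'Sentiment' in {'POS', 'NEG', 'UNC'}, and a numeric 'Polarity'; the helpers
# nounPhraseChunking and extractNPPhrases must be defined elsewhere):
# pos_comments, neg_comments, top_pos, top_neg = sampleComments(comments_df)
# print(top_pos)  # the (up to) ten most positively skewed noun phrases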
data = []
with open(data_path) as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame.from_dict(data)
df = df.drop(columns=['overall', 'reviewTime', 'summary', 'unixReviewTime'])
df = df[:10000]

df['sentences'] = df['reviewText'].apply(segment_sent)
df['tokenizedSentences'] = df['sentences'].apply(
    lambda sentences: [tokenize(sentence) for sentence in sentences])
df['cleanedTokenizedSentences'] = df['sentences'].apply(
    lambda sentences: [preprocessSentence(sentence) for sentence in sentences])
cleanedTokenizedSentences = flatten(df['cleanedTokenizedSentences'])

df['posTagged'] = df['tokenizedSentences'].apply(
    lambda tokenizedSentences:
    [pos_tag(sentence) for sentence in tokenizedSentences])
df['nounPhrases'] = df['posTagged'].apply(lambda posTagged: [
    np.lower() for np in flatten([extract_NP(tag) for tag in posTagged])
])
df['uniqueNounPhrases'] = df['nounPhrases'].apply(set).apply(list)

import gensim

# NP summary per review.
# Note: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size`.
word2vec_model = gensim.models.Word2Vec(cleanedTokenizedSentences,
                                        seed=42,
                                        workers=10,
                                        size=150,
                                        min_count=2,
                                        window=10)
# word2vec_model.train(sentences=cleanedTokenizedSentences,
#                      total_examples=len(cleanedTokenizedSentences), epochs=10)
# word2vec_model.save("word2vec_model1.w2v")
# print("Model saved")
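# A quick sanity check on the trained embeddings; the probe token is a
# placeholder assumption about this corpus, guarded against vocabulary misses
# (min_count=2 above drops rare words).
probe = 'battery'
if probe in word2vec_model.wv:
    print(word2vec_model.wv.most_similar(probe, topn=5))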
# (fragment) Collect space-joined phrase strings from the two chunked term
# lists. The first loop header is reconstructed: the fragment began mid-loop,
# and `terms1` is implied by the parallel `terms2` loop.
for term in terms1:
    noun_phrases.append(' '.join(term))
for term in terms2:
    noun_phrases.append(' '.join(term))

# Lowercase and refine every phrase before scoring.
refined = []
for np in noun_phrases:
    refined.extend(refine_noun_phrase(np.lower()))
noun_phrases = list(refined)

# `left` counts the gold phrases that were not recovered; left / L is the
# fraction missed for this file.
left = len(set(phrase_list) - set(noun_phrases))
overall_recall += L - left
print(L, '\t', left / L)
# if left / L > 0.3 and L > 10:
#     print(left)
#     print(cp_file)
#     print(phrase_list)
#     print()
#     print(set(phrase_list) - set(noun_phrases))
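# refine_noun_phrase is not shown in the fragment; a hypothetical stand-in
# (NOT the original implementation) illustrating the assumed contract of
# mapping one phrase to a list of cleaned sub-phrases:
def refine_noun_phrase_sketch(phrase):
    tokens = phrase.split()
    while tokens and tokens[0] in ('a', 'an', 'the'):
        tokens = tokens[1:]  # drop leading determiners
    return [' '.join(tokens)] if tokens else []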
def main(data_file, seed):
    # Seed numpy for reproducible sampling.
    np.random.seed(seed)

    # Load the line-delimited JSON reviews into a DataFrame.
    data = [json.loads(line) for line in data_file]
    df = pd.DataFrame.from_dict(data)

    # Make directories for images and representative words.
    if not os.path.exists(IMAGES_DIRECTORY):
        os.mkdir(IMAGES_DIRECTORY)
    if not os.path.exists(REP_DIRECTORY):
        os.mkdir(REP_DIRECTORY)

    print_header('3.2.1 Popular Products and Frequent Reviewers', 50)

    ## 3.2.1 Get top 10 products
    top_10_products = df['asin'].value_counts().head(10).reset_index().rename(
        columns={
            'index': 'productID',
            'asin': 'reviewCount'
        })
    print_header('Top 10 products', char='-')
    print(top_10_products)
    #     productID  reviewCount
    # 0  B005SUHPO6          836
    # 1  B0042FV2SI          690
    # 2  B008OHNZI0          657
    # 3  B009RXU59C          634
    # 4  B000S5Q9CA          627
    # 5  B008DJIIG8          510
    # 6  B0090YGJ4I          448
    # 7  B009A5204K          434
    # 8  B00BT7RAPG          431
    # 9  B0015RB39O          424

    ## 3.2.1 Get top 10 reviewers
    top_10_reviewers = df['reviewerID'].value_counts().head(
        10).reset_index().rename(columns={
            'index': 'reviewerID',
            'reviewerID': 'reviewCount'
        })
    print_header('Top 10 reviewers', char='-')
    print(top_10_reviewers)
    #        reviewerID  reviewCount
    # 0  A2NYK9KWFMJV4Y          152
    # 1  A22CW0ZHY3NJH8          138
    # 2  A1EVV74UQYVKRY          137
    # 3  A1ODOGXEYECQQ8          133
    # 4  A2NOW4U7W3F7RI          132
    # 5  A36K2N527TXXJN          124
    # 6  A1UQBFCERIP7VJ          112
    # 7   A1E1LEVQ9VQNK          109
    # 8  A18U49406IPPIJ          109
    # 9   AYB4ELCS5AM8P          107

    ## 3.2.2 Sentence segmentation
    print_header('3.2.2 Sentence Segmentation', 50)
    df['sentences'] = df['reviewText'].apply(segment_sent)
    df['sentenceCount'] = df['sentences'].apply(len)

    # Plot the distribution of sentence counts per review.
    plot_bar(df['sentenceCount'],
             title='Distribution of Number of Sentences for Each Review',
             x_label="Sentence Count", y_label="Review Count", countplot=False)
    plot_bar(df['sentenceCount'].clip(0, 50),
             title='Distribution of Number of Sentences for Each Review (Clipped)',
             x_label="Sentence Count (Clipped)", y_label="Review Count",
             countplot=True)

    # Segment 5 random reviews and display the results.
    reviews = df['reviewText']
    _seed = 43  # gives an interesting result
    random_reviews = reviews.sample(5, random_state=_seed)
    random_reviews = pd.DataFrame(
        random_reviews,
        columns=['reviewText']).reset_index().drop(columns=['index'])
    random_reviews['segmentedSentences'] = random_reviews['reviewText'].apply(
        segment_sent)
    print("5 randomly selected reviews before and after sentence segmentation:")
    print(random_reviews)

    ## 3.2.3 Tokenization and Stemming
    print_header('3.2.3 Tokenization and Stemming', 50)
    df['tokenizedSentences'] = df['sentences'].apply(
        lambda sentences: [tokenize(sentence) for sentence in sentences])
    df['tokens'] = df['tokenizedSentences'].apply(flatten)

    ### No Stemming
    print_header('No Stemming', char='-')
    # token = {normal_word, emoji, stopword, punctuation}
    # word  = {normal_word, emoji}
    df['words'] = df['tokens'].apply(
        lambda tokens: [token.lower() for token in tokens])
    df['words'] = df['words'].apply(
        lambda tokens: [token for token in tokens if is_word(token)])
    df['uniqueWords'] = df['words'].apply(set)
    df['wordCount'] = df['uniqueWords'].apply(len)

    plot_bar(df['wordCount'],
             title='Distribution of Number of Words for Each Review Without Stemming',
             x_label="Word Count", y_label="Review Count", countplot=False)
    plot_bar(df['wordCount'].clip(0, 300),
             title='Distribution of Number of Words for Each Review Without Stemming (Clipped)',
             x_label="Word Count (Clipped)", y_label="Review Count",
             countplot=False)

    words = flatten(df['words'])
    words_unique = flatten(df['uniqueWords'])
    top_20_words = pd.DataFrame.from_dict(Counter(words), orient='index').\
        reset_index().rename(columns={'index': 'Word', 0: 'Count'}).\
        sort_values(['Count'], ascending=False).head(20).\
        reset_index().drop(columns=['index'])
    print_header('Top 20 Words Without Stemming', char='-')
    print(top_20_words)

    ### With Stemming
    print_header('With Stemming', char='-')
    stemmer = SnowballStemmer("english")
    df['stemmedWords'] = df['words'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    df['uniqueStemmedWords'] = df['stemmedWords'].apply(set)
    df['stemmedWordCount'] = df['uniqueStemmedWords'].apply(len)

    plot_bar(df['stemmedWordCount'],
             title='Distribution of Number of Words for Each Review With Stemming',
             x_label="Stemmed Word Count", y_label="Review Count",
             countplot=False)
    plot_bar(df['stemmedWordCount'].clip(0, 300),
             title='Distribution of Number of Words for Each Review With Stemming (Clipped)',
             x_label="Word Count (Clipped)", y_label="Review Count",
             countplot=False)
    plot_bar_overlap(df, ['wordCount', 'stemmedWordCount'],
                     title='Distribution of Number of Words for Each Review',
                     x_label="Word Count", y_label="Review Count",
                     countplot=False)
    plot_bar_overlap(df[['wordCount', 'stemmedWordCount']].clip(0, 300),
                     ['wordCount', 'stemmedWordCount'],
                     title='Distribution of Number of Words for Each Review (Clipped)',
                     x_label="Word Count", y_label="Review Count",
                     countplot=False)

    stemmed_words = flatten(df['stemmedWords'])
    stemmed_words_unique = flatten(df['uniqueStemmedWords'])
    top_20_stemmed_words = pd.DataFrame.from_dict(Counter(stemmed_words), orient='index').\
        reset_index().rename(columns={'index': 'Word', 0: 'Count'}).\
        sort_values(['Count'], ascending=False).head(20).\
        reset_index().drop(columns=['index'])
    print_header('Top 20 Words with Stemming', char='-')
    print(top_20_stemmed_words)

    ## 3.2.4 POS Tagging
    print_header('3.2.4 POS Tagging', 50)
    tokenized_sentences = pd.Series(flatten(df['tokenizedSentences']))
    print('Total Number of Sentences: ' + str(len(tokenized_sentences)))
    random_5_sentences = tokenized_sentences.sample(5, random_state=seed)
    random_5_df = pd.DataFrame(
        random_5_sentences,
        columns=['sentence']).reset_index().drop(columns=['index'])
    random_5_df['posTagged'] = random_5_df['sentence'].apply(pos_tag)
    print('=' * 30)
    print(random_5_df)
    print('=' * 30)

    ## 3.3 Development of a Noun Phrase Summarizer
    print_header('3.3 Development of a Noun Phrase Summarizer', 50)
    df['posTagged'] = df['tokenizedSentences'].apply(
        lambda tokenizedSentences:
        [pos_tag(sentence) for sentence in tokenizedSentences])
    df['nounPhrases'] = df['posTagged'].apply(lambda posTagged: [
        np.lower() for np in flatten([extract_NP(tag) for tag in posTagged])
    ])
    df[['reviewText', 'posTagged', 'nounPhrases']].head()

    # Including single-word noun phrases
    print_header('Including single noun phrases', char='-')
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrases'])), orient='index').\
        reset_index().rename(columns={'index': 'Noun Phrase', 0: 'Count'}).\
        sort_values(['Count'], ascending=False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])
    print_header('Top 20 Noun Phrases Including Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    # Excluding single-word noun phrases
    df['nounPhrasesExcludeSingle'] = df['nounPhrases'].apply(
        lambda noun_phrases: [
            noun_phrase for noun_phrase in noun_phrases
            if len(noun_phrase.split()) > 1
        ])
    noun_phrases = pd.DataFrame.from_dict(Counter(flatten(df['nounPhrasesExcludeSingle'])), orient='index').\
        reset_index().rename(columns={'index': 'Noun Phrase', 0: 'Count'}).\
        sort_values(['Count'], ascending=False)
    top_20_noun_phrases = noun_phrases.head(20).reset_index().drop(
        columns=['index'])
    print_header('Top 20 Noun Phrases Excluding Single Noun Phrases', char='-')
    print(top_20_noun_phrases)

    # Representative noun phrases for the three most-reviewed products.
    products = df['asin'].value_counts().head(3).index
    products_np_top1 = df[df['asin'] == products[0]]
    products_np_top2 = df[df['asin'] == products[1]]
    products_np_top3 = df[df['asin'] == products[2]]
    print_representative_np(products_np_top1, product=products[0], n=30)
    print_representative_np(products_np_top2, product=products[1], n=30)
    print_representative_np(products_np_top3, product=products[2], n=30)

    random_5_reviews = df[['reviewText', 'posTagged',
                           'nounPhrases']].sample(5, random_state=seed)
    random_5_reviews['nounPhrasesLen'] = random_5_reviews['nounPhrases'].apply(
        len)
    print_header('Noun Phrase Detector Evaluation for Random 5 Reviews',
                 char='-')
    print(random_5_reviews)

    ## 3.4 Sentiment Word Detection
    print(str(datetime.datetime.now()).split('.')[0] +
          ': Start sentiment word detection')
    # Without stemming, without negation
    sentiment_score(df, "./rep_words/ns_nn.csv")
    # With stemming, without negation
    sentiment_score(df, "./rep_words/s_nn.csv", stemmer=stemmer)
    # Without stemming, with negation
    sentiment_score(df, "./rep_words/ns_n.csv", convert_neg=True)
    # With stemming, with negation
    sentiment_score(df, "./rep_words/s_n.csv", stemmer=stemmer,
                    convert_neg=True)
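# The pipeline assumes an extract_NP(tagged_sentence) helper; a minimal sketch
# built on NLTK's RegexpParser (the chunk grammar here is an assumption, not
# necessarily the original pattern):
import nltk

_NP_GRAMMAR = 'NP: {<DT>?<JJ.*>*<NN.*>+}'  # optional determiner, adjectives, nouns
_NP_CHUNKER = nltk.RegexpParser(_NP_GRAMMAR)

def extract_NP_sketch(tagged_sentence):
    """Return the noun-phrase strings found in one POS-tagged sentence."""
    tree = _NP_CHUNKER.parse(tagged_sentence)
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees() if subtree.label() == 'NP']

# e.g. extract_NP_sketch([('the', 'DT'), ('battery', 'NN'), ('life', 'NN')])
# -> ['the battery life']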