import json

from langdetect import detect
from nltk.tokenize import sent_tokenize

import loader

# TRAIN_SIZE_REVIEWS (the cap on reviews used for training) is defined
# elsewhere in the project.


def sentiment_train_prep(in_name, out_name):
    # Count, for every vocabulary token, how often it appears in negative,
    # neutral and positive reviews (indices 0, 1 and 2 of each count list).
    vocab, n_vocab = loader.load_corpora_vocab()
    vocab_freq = {}
    with open('yelp/' + in_name + '.json') as in_file:
        # cycle neutral -> negative -> positive so the counts stay balanced
        # across the three sentiment classes
        sent_order = [1, 0, 2]
        i = 0
        j = 0
        for line in in_file:
            review = json.loads(line)
            stars = int(review['stars'])
            sent = 0  # 1-2 stars: negative
            if stars == 3:
                sent = 1  # neutral
            elif stars == 4 or stars == 5:
                sent = 2  # positive
            text = review['text']
            tokk_sentences = my_tokkenize(text.lower(), vocab)[0]
            # only consume the review if it matches the next expected class
            if sent == sent_order[j]:
                j = (j + 1) % 3
                for sentence in tokk_sentences:
                    for tokk in sentence:
                        if tokk not in vocab_freq:
                            vocab_freq[tokk] = [0, 0, 0]
                        vocab_freq[tokk][sent] += 1
                if i % 10000 == 0:
                    print(i, TRAIN_SIZE_REVIEWS)
                if i >= TRAIN_SIZE_REVIEWS:
                    break
                i += 1
    loader.serialize_structure(vocab_freq, 'yelp/' + out_name)
    return vocab_freq
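

# my_tokkenize is not defined in this section. A minimal sketch inferred from
# its call sites, which take element [0] of the result and iterate it as a
# list of per-sentence lemma lists filtered by `vocab`; the tokenizer and
# lemmatizer choices below are assumptions, not the project's actual helper.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

_lemmatizer = WordNetLemmatizer()


def my_tokkenize(text, vocab):
    sentences = []
    for sentence in sent_tokenize(text):
        lemmas = [_lemmatizer.lemmatize(tok) for tok in word_tokenize(sentence)]
        sentences.append([lemma for lemma in lemmas if lemma in vocab])
    return sentences,  # callers index the returned tuple with [0]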


def tfidf_train_prep(in_name, out_name):
    # Build a per-business lemma frequency table and append each business as
    # one JSON line to the output file.
    with open('yelp/' + in_name + '.json') as in_file:
        i = 0
        for line in in_file:
            business_id = ''
            buss = json.loads(line)
            for b in buss:  # each line holds a single {business_id: reviews} object
                business_id = b
            business = buss[business_id]
            business_vocab_freq = {}
            for review_id in business:
                review = business[review_id]
                sentences_lemmas = review['lemmas']
                for sentence in sentences_lemmas:
                    for tokk in sentence:
                        if tokk not in business_vocab_freq:
                            business_vocab_freq[tokk] = 0
                        business_vocab_freq[tokk] += 1
            loader.serialize_structure({business_id: business_vocab_freq},
                                       'yelp/' + out_name, 'a+')
            if i % 10000 == 0:
                print(i, TRAIN_SIZE_REVIEWS)
            if i >= TRAIN_SIZE_REVIEWS:
                break
            i += 1
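

# The `loader` module is external to this section. The commented sketch below
# shows a plausible loader.py consistent with the call sites in this file:
# serialize_structure writes one JSON object per line (its third parameter,
# spelled `agregation` at the keyword call sites, is the file mode, 'a+'
# meaning append) and load merges such a file back into one dict. Every
# detail here is an assumption.
#
#   # loader.py (hypothetical)
#   import json
#
#   def serialize_structure(structure, name, agregation='w'):
#       with open(name + '.json', agregation) as out_file:
#           out_file.write(json.dumps(structure) + '\n')
#
#   def load(name):
#       structure = {}
#       with open(name + '.json') as in_file:
#           for line in in_file:
#               structure.update(json.loads(line))
#       return structure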


def data_prep(in_name, out_name):
    # Select up to ~20 usable English reviews per business, keep only
    # businesses with at least 10 of them, and return per-class review counts.
    vocab, n_vocab = loader.load_corpora_vocab()
    sentiment_count = [0, 0, 0]
    with open('yelp/' + in_name + '.json') as file:
        k = 0
        for line in file:
            business_dict = json.loads(line)
            business_id = ''
            for b in business_dict:
                business_id = b
            business = business_dict[business_id]
            to_use_reviews = {}
            delta_sentiment_count = [0, 0, 0]
            for review_id in business:
                if len(to_use_reviews) <= 20:
                    review = business[review_id]
                    text = review['text']
                    try:
                        if detect(text) == 'en':
                            lemmas = my_tokkenize(text.lower(), vocab)[0]
                            if len(lemmas) > 4:
                                stars = int(review['stars'])
                                useful = int(review['useful'])
                                if stars == 1 or stars == 2:
                                    sentiment = 0
                                elif stars == 3:
                                    sentiment = 1
                                else:
                                    sentiment = 2
                                delta_sentiment_count[sentiment] += 1
                                to_use_reviews[review_id] = {
                                    'sentiment': sentiment,
                                    'text': text,
                                    'useful': useful,
                                    'lemmas': lemmas
                                }
                    except Exception as e:
                        # langdetect raises on empty or undetectable text
                        print(e)
                else:
                    break
            if len(to_use_reviews) >= 10:
                for i in range(len(sentiment_count)):
                    sentiment_count[i] += delta_sentiment_count[i]
                buss = {business_id: to_use_reviews}
                loader.serialize_structure(buss, 'yelp/' + out_name, 'a+')
            k += 1
            print(k)
    return sentiment_count
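

# Typical ordering of the preparation steps: data_prep writes the per-review
# 'lemmas' field that tfidf_train_prep then consumes. The file names below
# are made up for illustration; the real ones are not shown in this section.
#
#   counts = data_prep('reviews_by_business', 'prepared_businesses')
#   print('kept reviews per class (neg, neu, pos):', counts)
#   tfidf_train_prep('prepared_businesses', 'business_vocab_freq')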


def compute_max_tfidf(Wd, limit, serialize=True):
    # Reduce each business's {lemma: tf-idf} map to its `limit` highest-scoring lemmas.
    for business in Wd:
        max_tfidf = []
        words = []
        for lemma in Wd[business]:
            if len(max_tfidf) < limit:
                max_tfidf.append(Wd[business][lemma])
                words.append(lemma)
            else:
                # replace the smallest kept score if this lemma beats it
                index, value = min(enumerate(max_tfidf), key=lambda pair: pair[1])
                if Wd[business][lemma] > value:
                    max_tfidf[index] = Wd[business][lemma]
                    words[index] = lemma
        Wd[business] = words
    if serialize:
        loader.serialize_structure(Wd, 'yelp/max_tfidf')
    return Wd


def compute_vocab_tf_idf(in_file, serialize=True):
    # Fwd: per-business lemma frequencies; FwD: number of businesses
    # (documents) containing each lemma; D: total number of documents.
    Fwd = loader.load('yelp/' + in_file)
    FwD = {}
    D = len(Fwd)
    print("Processing lemma frequencies per document")
    for business in Fwd:
        for lemma in Fwd[business]:
            if lemma in FwD:
                FwD[lemma] += 1
            else:
                FwD[lemma] = 1
    print("Processing tf-idf")
    for business in Fwd:
        for lemma in Fwd[business]:
            Fwd[business][lemma] = tf_idf(Fwd[business][lemma], FwD[lemma], D)
    if serialize:
        loader.serialize_structure(Fwd, 'yelp/tf-idf')
    return Fwd
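

import math


# tf_idf is referenced above but not defined in this section. A minimal
# sketch assuming the classic weighting tf * log(D / df); the real helper may
# smooth or normalize differently.
def tf_idf(fwd, fwD, D):
    # fwd: frequency of the lemma in this business's reviews
    # fwD: number of businesses whose reviews contain the lemma
    # D:   total number of businesses (documents)
    return fwd * math.log(float(D) / fwD)


# The two functions above chain naturally (the input name is made up):
#
#   Wd = compute_vocab_tf_idf('business_vocab_freq')
#   top_lemmas = compute_max_tfidf(Wd, 50)  # keep the 50 best lemmas per business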


def sentiment_post_processing(in_name, out_name):
    # Keep only words seen at least 5 times whose frequency mass is
    # concentrated (>= 0.73) in a single sentiment class, and store their
    # class probability distribution.
    with open('yelp/' + in_name + '.json') as in_file:
        for line in in_file:
            word = json.loads(line)
            word_lemma = None
            word_freq = None
            for w in word:  # each line holds a single {lemma: [neg, neu, pos]} object
                word_lemma = w
                word_freq = word[w]
            total_freq = sum(word_freq)
            if total_freq >= 5:
                use_word = False
                prob = []
                for i in range(len(word_freq)):
                    prob.append(float(word_freq[i]) / total_freq)
                    if prob[i] >= 0.73:
                        use_word = True
                if use_word:
                    loader.serialize_structure({word_lemma: prob},
                                               'yelp/' + out_name, 'a+')
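

# A toy check of the thresholds above (the counts are made up): a lemma seen
# 50 times with per-class counts [40, 3, 7] yields probabilities
# [0.8, 0.06, 0.14]; since 0.8 >= 0.73 the lemma is kept as a (negative)
# sentiment word, whereas an even split like [17, 16, 17] would be discarded.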


def compute_business_sentence_aspect_sentiment(NeH_dict, NH, business_relevant_aspects,
                                               business_reviews_sentences, serialize=True):
    # NH is the prior sentiment distribution; NeH_dict maps an adjective to
    # the evidence consumed by compute_adj_posteriors. Each sentence entry is
    # [sentence_index, aspects, adjectives].
    total = len(business_reviews_sentences)
    i = 0
    business_aspect_sentiment_overall = {}
    for business in business_reviews_sentences:
        print(str(i) + '/' + str(total))
        relevant_aspects = business_relevant_aspects[business]
        # every aspect starts from the same prior object; this is safe as long
        # as compute_adj_posteriors returns a new list instead of mutating NH
        relevant_aspects_prob = dict.fromkeys(relevant_aspects, NH)
        for review in business_reviews_sentences[business]:
            business_reviews_sentences_prob = []
            for sentence in business_reviews_sentences[business][review]:
                review_sentence_aspects_prob = {}
                for aspect in sentence[1]:
                    if aspect in relevant_aspects:
                        review_sentence_aspects_prob[aspect] = NH
                        for adj in sentence[2]:
                            if adj in NeH_dict:
                                # update both the business-level and the
                                # sentence-level posteriors with this adjective
                                relevant_aspects_prob[aspect] = compute_adj_posteriors(
                                    NeH_dict[adj], relevant_aspects_prob[aspect])
                                review_sentence_aspects_prob[aspect] = compute_adj_posteriors(
                                    NeH_dict[adj], review_sentence_aspects_prob[aspect])
                if len(relevant_aspects_prob) > 0 and len(review_sentence_aspects_prob) > 0:
                    business_reviews_sentences_prob.append(
                        [sentence[0], review_sentence_aspects_prob])
            business_reviews_sentences[business][review] = business_reviews_sentences_prob
        business_aspect_sentiment_overall[business] = {}
        for aspect in relevant_aspects_prob:
            print(aspect, relevant_aspects_prob[aspect])
            # index of the class with the highest posterior probability
            classification = max((val, idx)
                                 for (idx, val) in enumerate(relevant_aspects_prob[aspect]))[1]
            business_aspect_sentiment_overall[business][aspect] = [
                classification, relevant_aspects_prob[aspect]]
        i += 1
    if serialize:
        loader.serialize_structure(business_reviews_sentences,
                                   'business_review_sentence_aspect_sentiment',
                                   agregation='a+')
        loader.serialize_structure(business_aspect_sentiment_overall,
                                   'business_aspect_sentiment', agregation='a+')
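

# compute_adj_posteriors is not defined in this section. The way it is
# threaded through the loop above (each call's result becomes the next call's
# prior) suggests a naive-Bayes-style update. A minimal sketch, assuming NeH
# holds the adjective's class probabilities (such as those produced by
# sentiment_post_processing) and NH the current prior:
def compute_adj_posteriors(NeH, NH):
    # multiply prior and evidence element-wise, then renormalize; returning a
    # new list matters because dict.fromkeys above shares one prior object
    unnormalized = [neh * nh for neh, nh in zip(NeH, NH)]
    total = sum(unnormalized)
    if total == 0:
        return list(NH)  # no usable evidence: keep the prior
    return [p / total for p in unnormalized]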


def choose_business_simmilar_sentiment_sentences():
    # For every business, pick for each aspect the sentence whose sentiment
    # distribution is closest to the business-level one, then recover its text.
    with open('business_aspect_sentiment.json', 'r') as file:
        i = 0
        for line in file:
            # load the business and its overall aspect sentiment
            business_general_info = json.loads(line)
            business_name = ''
            smaller_error_sentences = {}
            for business in business_general_info:
                business_name = business
            business_general_info = business_general_info[business_name]
            # locate the matching per-sentence sentiment record
            business_review_sentence_info = None
            with open('business_review_sentence_aspect_sentiment.json', 'r') as file2:
                for line2 in file2:
                    new_business_info = json.loads(line2)
                    buss = ''
                    for business in new_business_info:
                        buss = business
                    if buss == business_name:
                        business_review_sentence_info = new_business_info[buss]
                        break
            # business-level aspect sentiment distributions
            general_aspects_sentiment = {}
            smaller_error = {}
            for aspect in business_general_info:
                general_aspects_sentiment[aspect] = business_general_info[aspect][1]
                smaller_error_sentences[aspect] = []
                smaller_error[aspect] = 99.
            # iterate over reviews and sentences, keeping per aspect the
            # sentence with the smallest sentiment error
            for review in business_review_sentence_info:
                for sentence in business_review_sentence_info[review]:
                    for aspect in sentence[1]:
                        new_error = compute_error(general_aspects_sentiment[aspect],
                                                  sentence[1][aspect])
                        if new_error < smaller_error[aspect]:
                            smaller_error_sentences[aspect] = [review, sentence[0]]
                            smaller_error[aspect] = new_error
            # look the winning sentence up in the raw review dump and store it
            best_sentence = {}
            for aspect in business_general_info:
                try:
                    to_find_review = smaller_error_sentences[aspect][0]
                    to_find_sentence_index = smaller_error_sentences[aspect][1]
                    with open('yelp/yelp_academic_dataset_review.json', 'r') as file3:
                        for line3 in file3:
                            entry = json.loads(line3)
                            if entry['business_id'] == business_name:
                                if entry['review_id'] == to_find_review:
                                    line_sentences = sent_tokenize(entry['text'])
                                    best_sentence[aspect] = line_sentences[to_find_sentence_index]
                                    break
                except Exception:
                    # aspect had no candidate sentence, so the lookup list is empty
                    pass
            to_store = {business_name: best_sentence}
            loader.serialize_structure(to_store, 'business_best_sentences', 'a+')
            i += 1
            print(i)
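

# compute_error is also a project helper not shown here. Any distance between
# two probability vectors fits the `99.` initialization above; a minimal
# sketch using the sum of squared differences (an assumption, the actual
# metric may differ):
def compute_error(general_prob, sentence_prob):
    return sum((g - s) ** 2 for g, s in zip(general_prob, sentence_prob))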