Code example #1
def sentiment_train_prep(in_name, out_name):
    # Build per-sentiment token counts over a class-balanced sample of
    # reviews. Assumes json, loader, my_tokkenize and TRAIN_SIZE_REVIEWS
    # come from the surrounding module.
    vocab, _ = loader.load_corpora_vocab()  # vocabulary size is unused here
    vocab_freq = {}
    with open('yelp/' + in_name + '.json') as in_file:
        # Cycle through sentiments in this order to balance the classes.
        sent_order = [1, 0, 2]
        i = 0
        j = 0
        for line in in_file:
            review = json.loads(line)
            stars = int(review['stars'])
            # Map stars to sentiment: 1-2 negative, 3 neutral, 4-5 positive.
            if stars <= 2:
                sent = 0
            elif stars == 3:
                sent = 1
            else:
                sent = 2
            text = review['text']
            tokk_sentences = my_tokkenize(text.lower(), vocab)[0]
            if sent == sent_order[j]:
                j = (j + 1) % len(sent_order)
                for sentence in tokk_sentences:
                    for tokk in sentence:
                        if tokk not in vocab_freq:
                            vocab_freq[tokk] = [0, 0, 0]
                        vocab_freq[tokk][sent] += 1
            if i % 10000 == 0:
                print(i, TRAIN_SIZE_REVIEWS)
            if i >= TRAIN_SIZE_REVIEWS:
                break
            i += 1
    loader.serialize_structure(vocab_freq, 'yelp/' + out_name)
    return vocab_freq
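
A minimal usage sketch; the file names are assumptions, not part of the original:

# Hypothetical call: reads yelp/review.json, writes yelp/sentiment_freq.
vocab_freq = sentiment_train_prep('review', 'sentiment_freq')
# vocab_freq maps each token to its [negative, neutral, positive] counts.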
Code example #2
def tfidf_train_prep(in_name, out_name):
    # Per business, count how often each lemma appears across its reviews
    # and append one {business_id: {lemma: count}} line to the output.
    with open('yelp/' + in_name + '.json') as in_file:
        i = 0
        for line in in_file:
            buss = json.loads(line)
            business_id = next(iter(buss))  # the single key on this line
            business = buss[business_id]
            business_vocab_freq = {}
            for review_id in business:
                review = business[review_id]
                sentences_lemmas = review['lemmas']
                for sentence in sentences_lemmas:
                    for tokk in sentence:
                        business_vocab_freq[tokk] = business_vocab_freq.get(tokk, 0) + 1
            loader.serialize_structure({business_id: business_vocab_freq},
                                       'yelp/' + out_name, 'a+')
            if i % 10000 == 0:
                print(i, TRAIN_SIZE_REVIEWS)
            if i >= TRAIN_SIZE_REVIEWS:  # i counts businesses here
                break
            i += 1
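
A hedged usage sketch, assuming the input is the per-business file produced by data_prep below (one JSON object per line); the file names are assumptions:

# Hypothetical call: appends {business_id: {lemma: count}} lines.
tfidf_train_prep('businesses_prepared', 'business_vocab_freq')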
Code example #3
def data_prep(in_name, out_name):
    # Keep, per business, up to ~20 usable English reviews (and only
    # businesses with at least 10 of them), appending one business per
    # line to the output. Assumes langdetect's detect() is imported.
    vocab, _ = loader.load_corpora_vocab()
    sentiment_count = [0, 0, 0]
    with open('yelp/' + in_name + '.json') as file:
        k = 0
        for line in file:
            business_dict = json.loads(line)
            business_id = next(iter(business_dict))  # single key per line
            business = business_dict[business_id]
            to_use_reviews = {}
            delta_sentiment_count = [0, 0, 0]
            for review_id in business:
                if len(to_use_reviews) <= 20:
                    review = business[review_id]
                    text = review['text']
                    try:
                        if detect(text) == 'en':
                            # list of tokenized sentences for this review
                            lemmas = my_tokkenize(text.lower(), vocab)[0]
                            if len(lemmas) > 4:
                                stars = int(review['stars'])
                                useful = int(review['useful'])
                                if stars <= 2:
                                    sentiment = 0
                                elif stars == 3:
                                    sentiment = 1
                                else:
                                    sentiment = 2
                                delta_sentiment_count[sentiment] += 1
                                to_use_reviews[review_id] = {
                                    'sentiment': sentiment,
                                    'text': text,
                                    'useful': useful,
                                    'lemmas': lemmas
                                }
                    except Exception as e:
                        print(e)
                else:
                    break
            if len(to_use_reviews) >= 10:
                for i in range(len(sentiment_count)):
                    sentiment_count[i] += delta_sentiment_count[i]
                buss = {business_id: to_use_reviews}
                loader.serialize_structure(buss, 'yelp/' + out_name, 'a+')
            k += 1
            print(k)
    return sentiment_count
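
A usage sketch; the file names are assumptions:

# Hypothetical call: expects yelp/grouped_reviews.json with one
# {business_id: {review_id: review}} object per line.
counts = data_prep('grouped_reviews', 'businesses_prepared')
print(counts)  # [negative, neutral, positive] totals over kept reviews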
Code example #4
def compute_max_tfidf(Wd, limit, serialize=True):
    # For each business keep only the `limit` highest-scoring lemmas,
    # replacing its {lemma: tf-idf} dict with a plain list of words.
    for business in Wd:
        max_tfidf = []
        words = []
        for lemma in Wd[business]:
            if len(max_tfidf) < limit:
                max_tfidf.append(Wd[business][lemma])
                words.append(lemma)
            else:
                # Evict the current minimum if this lemma scores higher.
                index, value = min(enumerate(max_tfidf), key=lambda iv: iv[1])
                if Wd[business][lemma] > value:
                    max_tfidf[index] = Wd[business][lemma]
                    words[index] = lemma
        Wd[business] = words
    if serialize:
        loader.serialize_structure(Wd, 'yelp/max_tfidf')
    return Wd
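
The trimming behavior on a toy input:

# Keep the 2 highest-scoring lemmas per business.
Wd = {'b1': {'pizza': 3.2, 'cold': 0.4, 'friendly': 1.9}}
print(compute_max_tfidf(Wd, 2, serialize=False))
# -> {'b1': ['pizza', 'friendly']}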
Code example #5
def compute_vocab_tf_idf(in_file, serialize=True):
    # Fwd: per-business term frequencies; FwD: document frequency of each
    # lemma (number of businesses containing it); D: number of businesses.
    Fwd = loader.load('yelp/' + in_file)
    FwD = {}
    D = len(Fwd)
    print("Processing lemma frequencies per document")
    for business in Fwd:
        for lemma in Fwd[business]:
            FwD[lemma] = FwD.get(lemma, 0) + 1
    print("Processing tf-idf")
    for business in Fwd:
        for lemma in Fwd[business]:
            Fwd[business][lemma] = tf_idf(Fwd[business][lemma], FwD[lemma], D)
    if serialize:
        loader.serialize_structure(Fwd, 'yelp/tf-idf')
    return Fwd
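
The tf_idf helper is not shown in this collection; a minimal sketch of a standard formulation it could follow (the exact weighting is an assumption):

import math

def tf_idf(fwd, fwD, D):
    # raw term frequency weighted by inverse document frequency
    return fwd * math.log(float(D) / fwD)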
Code example #6
def sentiment_post_processing(in_name, out_name):
    # Keep only lemmas seen at least 5 times whose frequency mass is
    # concentrated (>= 0.73) in one sentiment class, and append their
    # per-class probabilities to the output.
    with open('yelp/' + in_name + '.json') as in_file:
        for line in in_file:
            word = json.loads(line)
            # each line holds a single {lemma: [neg, neutral, pos]} entry
            word_lemma, word_freq = next(iter(word.items()))
            total_freq = sum(word_freq)
            if total_freq >= 5:
                use_word = False
                prob = []
                for freq in word_freq:
                    p = float(freq) / total_freq
                    prob.append(p)
                    if p >= 0.73:
                        use_word = True
                if use_word:
                    loader.serialize_structure({word_lemma: prob},
                                               'yelp/' + out_name, 'a+')
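
A usage sketch; the thresholds (minimum total frequency 5, class probability 0.73) are hard-coded above, and the file names are assumptions:

# Hypothetical call: appends {lemma: [p_neg, p_neutral, p_pos]} lines.
sentiment_post_processing('sentiment_freq', 'sentiment_vocab')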
Code example #7
def compute_business_sentence_aspect_sentiment(NeH_dict, NH, business_relevant_aspects, business_reviews_sentences, serialize=True):
    # For every business, update the sentiment posterior of each relevant
    # aspect with the adjectives that co-occur with it, both per sentence
    # and aggregated over the whole business. NH is the prior distribution;
    # compute_adj_posteriors is assumed to return a new posterior rather
    # than mutate its input.
    total = len(business_reviews_sentences)
    i = 0
    business_aspect_sentiment_overall = {}
    for business in business_reviews_sentences:
        print(str(i) + '/' + str(total))
        relevant_aspects = business_relevant_aspects[business]
        relevant_aspects_prob = dict.fromkeys(relevant_aspects, NH)
        for review in business_reviews_sentences[business]:
            business_reviews_sentences_prob = []
            for sentence in business_reviews_sentences[business][review]:
                # sentence: [sentence index, aspect tokens, adjective tokens]
                review_sentence_aspects_prob = {}
                for aspect in sentence[1]:
                    if aspect in relevant_aspects:
                        review_sentence_aspects_prob[aspect] = NH
                        for adj in sentence[2]:
                            if adj in NeH_dict:
                                relevant_aspects_prob[aspect] = compute_adj_posteriors(NeH_dict[adj], relevant_aspects_prob[aspect])
                                review_sentence_aspects_prob[aspect] = compute_adj_posteriors(NeH_dict[adj], review_sentence_aspects_prob[aspect])
                if relevant_aspects_prob and review_sentence_aspects_prob:
                    business_reviews_sentences_prob.append([sentence[0], review_sentence_aspects_prob])
            business_reviews_sentences[business][review] = business_reviews_sentences_prob
        business_aspect_sentiment_overall[business] = {}
        for aspect in relevant_aspects_prob:
            print(aspect, relevant_aspects_prob[aspect])
            # argmax over the posterior gives the overall classification
            classification = max((val, idx) for (idx, val) in enumerate(relevant_aspects_prob[aspect]))[1]
            business_aspect_sentiment_overall[business][aspect] = [classification, relevant_aspects_prob[aspect]]
        i += 1
    if serialize:
        loader.serialize_structure(business_reviews_sentences, 'business_review_sentence_aspect_sentiment', agregation='a+')
        loader.serialize_structure(business_aspect_sentiment_overall, 'business_aspect_sentiment', agregation='a+')
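
Toy input shapes, inferred from the indexing above; the shapes and values are assumptions, and compute_adj_posteriors must be defined:

NH = [1 / 3., 1 / 3., 1 / 3.]                        # prior over classes
NeH_dict = {'great': [0.1, 0.2, 0.7]}                # adjective likelihoods
aspects = {'b1': ['pizza']}                          # relevant aspects
sents = {'b1': {'r1': [[0, ['pizza'], ['great']]]}}  # [idx, aspects, adjs]
compute_business_sentence_aspect_sentiment(NeH_dict, NH, aspects, sents,
                                           serialize=False)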
Code example #8
def choose_business_simmilar_sentiment_sentences():
    # For each business, find the review sentence whose per-aspect sentiment
    # is closest to the business-wide sentiment, then look the sentence text
    # up in the raw Yelp dump. The nested file scans make this quadratic in
    # the number of businesses and reviews.
    with open('business_aspect_sentiment.json', 'r') as file:
        i = 0
        for line in file:
            # load the business-wide aspect sentiment
            business_general_info = json.loads(line)
            business_name = next(iter(business_general_info))
            smaller_error_sentences = {}
            business_general_info = business_general_info[business_name]
            # find the matching per-sentence sentiment record
            business_review_sentence_info = None
            with open('business_review_sentence_aspect_sentiment.json', 'r') as file2:
                for line2 in file2:
                    new_business_info = json.loads(line2)
                    buss = next(iter(new_business_info))
                    if buss == business_name:
                        business_review_sentence_info = new_business_info[buss]
                        break
            # business-wide sentiment distribution per aspect
            general_aspects_sentiment = {}
            smaller_error = {}
            for aspect in business_general_info:
                general_aspects_sentiment[aspect] = business_general_info[aspect][1]
                smaller_error_sentences[aspect] = []
                smaller_error[aspect] = float('inf')
            # iterate over reviews and sentences to find, per aspect,
            # the sentence with the smallest sentiment error
            for review in business_review_sentence_info:
                for sentence in business_review_sentence_info[review]:
                    for aspect in sentence[1]:
                        new_error = compute_error(general_aspects_sentiment[aspect], sentence[1][aspect])
                        if new_error < smaller_error[aspect]:
                            smaller_error_sentences[aspect] = [review, sentence[0]]
                            smaller_error[aspect] = new_error
            # locate each winning sentence's text in the raw review dump
            best_sentence = {}
            for aspect in business_general_info:
                try:
                    to_find_review = smaller_error_sentences[aspect][0]
                    to_find_sentence_index = smaller_error_sentences[aspect][1]
                    with open('yelp/yelp_academic_dataset_review.json', 'r') as file3:
                        for line3 in file3:
                            entry = json.loads(line3)
                            if (entry['business_id'] == business_name
                                    and entry['review_id'] == to_find_review):
                                line_sentences = sent_tokenize(entry['text'])
                                best_sentence[aspect] = line_sentences[to_find_sentence_index]
                                break
                except IndexError:
                    # no matching sentence was found for this aspect
                    pass
            to_store = {business_name: best_sentence}
            loader.serialize_structure(to_store, 'business_best_sentences', 'a+')
            i += 1
            print(i)
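
The function takes no arguments and reads its inputs from fixed paths, so usage is simply:

# Requires business_aspect_sentiment.json,
# business_review_sentence_aspect_sentiment.json and the raw Yelp review
# dump to exist at the paths hard-coded above.
choose_business_simmilar_sentiment_sentences()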