def main(): binary_label = True exclude_stopwords = True data_nosw, data_positive_nosw, data_negative_nosw = (scan.scan('finemedium.txt', exclude_stopwords, binary_label)) data = [] for datum in data_nosw: new_datum = datum[0].split() new_datum.append(datum[1]) data.append(new_datum) #print data positive_review_nosw = ' '.join([row[0] for row in data_positive_nosw]) dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0] positive_words = [x[0] for x in sorted(dict_positive_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] negative_review_nosw = ' '.join([row[0] for row in data_negative_nosw]) dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0] negative_words = [x[0] for x in sorted(dict_negative_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] all_words = positive_words all_words.extend(x for x in negative_words if x not in positive_words) length = len(data) train_data = data[:int(length*.8)] test_data = data[int(length*.8):] decision_tree = dt.train(train_data, all_words) test_results = dt.test(decision_tree, test_data) print test_results
def main(): binary_label = True exclude_stopwords = True data_nosw, data_positive_nosw, data_negative_nosw = (scan.scan('finefoods.txt', exclude_stopwords, binary_label)) data = [] for datum in data_nosw: new_datum = datum[0].split() new_datum.append(datum[1]) data.append(new_datum) positive_review_nosw = ' '.join([row[0] for row in data_positive_nosw]) dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0] positive_words = [x[0] for x in sorted(dict_positive_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] negative_review_nosw = ' '.join([row[0] for row in data_negative_nosw]) dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0] negative_words = [x[0] for x in sorted(dict_negative_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] all_words = positive_words all_words.extend(x for x in negative_words if x not in positive_words) length = len(data) train_data = data[:int(length*.8)] test_data = data[int(length*.8):] decision_tree = dt.train(train_data, all_words) test_results = dt.test(decision_tree, test_data) print test_results
def main(): # default is binary outcome and no stopwords binary_label = True exclude_stopwords = True data_nosw, data_positive_nosw, data_negative_nosw = (scan.scan( 'finefoods.txt', exclude_stopwords, binary_label)) # format data into 2 dimensional array data = [] for datum in data_nosw: # first part of array is each review with words splitted new_datum = datum[0].split() # last item in each array is the label new_datum.append(datum[1]) data.append(new_datum) # get a list of 500 most frequent positive words, ignoring the <br> positive_review_nosw = ' '.join([row[0] for row in data_positive_nosw]) dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0] positive_words = [ x[0] for x in sorted(dict_positive_nosw.items(), key=operator.itemgetter(1), reverse=True)[1:501] ] # get a list of 500 most frequent negative words, ignoring the <br> negative_review_nosw = ' '.join([row[0] for row in data_negative_nosw]) dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0] negative_words = [ x[0] for x in sorted(dict_negative_nosw.items(), key=operator.itemgetter(1), reverse=True)[1:501] ] # create non duplicate list of all frequent words from the two lists all_words = positive_words all_words.extend(x for x in negative_words if x not in positive_words) # split training and testing data length = len(data) train_data = data[:int(length * .8)] test_data = data[int(length * .8):] # using a dicision tree utilizing dictionaries decision_tree_dict = dtd.train(train_data, all_words) test_results_dict = dtd.test(decision_tree_dict, test_data) print test_results_dict # the same disicion tree utilizing binary tree decision_tree = dt.train(train_data, all_words) test_results = dt.test(decision_tree, test_data) print test_results
def __init__(self, datum):
    """Build a review record from *datum*: unigram counts, the set of
    distinct words, the total word count, and the true rating.  The
    predicted rating starts unset."""
    stats = utils.get_unigram(datum[config.REVIEW_INDEX])
    self.unigrams = stats[config.UNIGRAMS_INDEX]
    self.word_count = stats[config.WORD_COUNT_INDEX]
    # Iterating a dict yields its keys, so this equals frozenset(keys()).
    self.word_set = frozenset(self.unigrams)
    self.rating = datum[config.SCORE_INDEX]
    self.predicted_rating = None
def main():
    """Scan fineshort.txt (stopwords kept), print the unigram table sorted
    ascending by frequency, and prepare an 80/20 train/test split."""
    binary_label = True
    exclude_stopwords = False
    data = scan.scan('fineshort.txt', exclude_stopwords, binary_label)
    length = len(data)

    combined = ' '.join([entry[0] for entry in data])
    counts = utils.get_unigram(combined)[0]
    # least frequent first
    print(sorted(counts.items(), key=operator.itemgetter(1)))

    boundary = int(length * .8)
    train_data = data[:boundary]
    test_data = data[boundary:]
def main(): # default is binary outcome and no stopwords binary_label = True exclude_stopwords = True data_nosw, data_positive_nosw, data_negative_nosw = (scan.scan('finefoods.txt', exclude_stopwords, binary_label)) # format data into 2 dimensional array data = [] for datum in data_nosw: # first part of array is each review with words splitted new_datum = datum[0].split() # last item in each array is the label new_datum.append(datum[1]) data.append(new_datum) # get a list of 500 most frequent positive words, ignoring the <br> positive_review_nosw = ' '.join([row[0] for row in data_positive_nosw]) dict_positive_nosw = utils.get_unigram(positive_review_nosw)[0] positive_words = [x[0] for x in sorted(dict_positive_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] # get a list of 500 most frequent negative words, ignoring the <br> negative_review_nosw = ' '.join([row[0] for row in data_negative_nosw]) dict_negative_nosw = utils.get_unigram(negative_review_nosw)[0] negative_words = [x[0] for x in sorted(dict_negative_nosw.items(), key=operator.itemgetter(1), reverse = True)[1:501]] # create non duplicate list of all frequent words from the two lists all_words = positive_words all_words.extend(x for x in negative_words if x not in positive_words) # split training and testing data length = len(data) train_data = data[:int(length*.8)] test_data = data[int(length*.8):] # using a dicision tree utilizing dictionaries decision_tree_dict = dtd.train(train_data, all_words) test_results_dict = dtd.test(decision_tree_dict, test_data) print test_results_dict # the same disicion tree utilizing binary tree decision_tree = dt.train(train_data, all_words) test_results = dt.test(decision_tree, test_data) print test_results
def main():
    """Print the frequency-sorted unigram table for fineshort.txt and
    split the data 80/20 for later training and testing."""
    binary_label = True
    exclude_stopwords = False
    data = scan.scan('fineshort.txt', exclude_stopwords, binary_label)

    dict_all = utils.get_unigram(' '.join([row[0] for row in data]))[0]
    # ascending by count — least frequent words print first
    print(sorted(dict_all.items(), key=operator.itemgetter(1)))

    split_at = int(len(data) * .8)
    train_data = data[:split_at]
    test_data = data[split_at:]
def main():
    """Write 'word count' frequency tables for all / positive / negative
    reviews of finefoods.txt, once without stopwords and once with them.

    Fix: the original opened all six output files by hand and leaked the
    handles if any write raised; each dump now runs inside a ``with``
    block, and the six copy-pasted join/count/sort/write sequences are
    factored into one helper.
    """
    # defaults are binary outcome and exclude stopwords
    binary_label = True
    exclude_stopwords = True

    def dump_counts(filename, rows):
        # Join the review texts (column 0), count unigrams, and write
        # "word count" lines, most frequent word first.  The file lives
        # next to this script (sys.path[0]) and is always closed.
        counts = utils.get_unigram(' '.join([row[0] for row in rows]))[0]
        with open(os.path.join(sys.path[0], filename), "wb") as out:
            out.write('\n'.join('%s %s' % x for x in sorted(
                counts.items(), key=operator.itemgetter(1), reverse=True)))

    # a modified scan that returns three lists
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finefoods.txt', exclude_stopwords, binary_label)
    dump_counts("top_all_nosw.txt", data_nosw)
    dump_counts("top_positive_nosw.txt", data_positive_nosw)
    dump_counts("top_negative_nosw.txt", data_negative_nosw)

    # same set of routines, but this time allowing stopwords
    data_sw, data_positive_sw, data_negative_sw = scan.scan(
        'finefoods.txt', not exclude_stopwords, binary_label)
    dump_counts("top_all_sw.txt", data_sw)
    dump_counts("top_positive_sw.txt", data_positive_sw)
    dump_counts("top_negative_sw.txt", data_negative_sw)
def mostPopularPositiveNegative(data, n = 500): allUnigrams = defaultdict(int) positiveUnigrams = defaultdict(int) negativeUnigrams = defaultdict(int) for review in data: for unigram, freq in utils.get_unigram(review[0])[0].items(): allUnigrams[unigram] += freq positiveUnigrams[unigram] += freq if review[1] == 1 else 0 negativeUnigrams[unigram] += freq if review[1] == 0 else 0 topAllUnigrams = sorted(allUnigrams, key=allUnigrams.get, reverse=True) topPositiveUnigrams = sorted(positiveUnigrams, key=positiveUnigrams.get, reverse=True) topNegativeUnigrams = sorted(negativeUnigrams, key=negativeUnigrams.get, reverse=True) print ("Top 30 Popular Unigrams", "Top 30 Positive Unigrams", "Top 30 Negative Unigrams") for i in xrange(30): print repr(topAllUnigrams[i]).center(30), repr(topPositiveUnigrams[i]).center(30), repr(topNegativeUnigrams[i]).center(30) return topPositiveUnigrams + topNegativeUnigrams
def main():
    """Write 'word count' frequency tables for all / positive / negative
    reviews of fineshort.txt, without and then with stopwords.

    Fixes: the six output file handles are now closed via ``with`` even if
    a write raises, the copy-pasted join/count/sort/write sequences share
    one helper, and the unused trailing ``length``/``train_data``/
    ``test_data`` locals were removed.
    """
    binary_label = True
    exclude_stopwords = True

    def dump_counts(filename, rows):
        # Join review texts (column 0), count unigrams, and write
        # "word count" lines, most frequent first, next to this script.
        counts = utils.get_unigram(' '.join([row[0] for row in rows]))[0]
        with open(os.path.join(sys.path[0], filename), "wb") as out:
            out.write('\n'.join('%s %s' % x for x in sorted(
                counts.items(), key=operator.itemgetter(1), reverse=True)))

    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'fineshort.txt', exclude_stopwords, binary_label)
    dump_counts("top_all_nosw.txt", data_nosw)
    dump_counts("top_positive_nosw.txt", data_positive_nosw)
    dump_counts("top_negative_nosw.txt", data_negative_nosw)

    # same dumps, this time keeping stopwords
    data_sw, data_positive_sw, data_negative_sw = scan.scan(
        'fineshort.txt', not exclude_stopwords, binary_label)
    dump_counts("top_all_sw.txt", data_sw)
    dump_counts("top_positive_sw.txt", data_positive_sw)
    dump_counts("top_negative_sw.txt", data_negative_sw)
def create_vocabs_from_trainset(trainset,
                                threshold=2,
                                fn_dictionary=None,
                                save_vocabs=True,
                                fn_vocabs=None,
                                oov=OOV,
                                pad=PAD):
    """Build the vocabularies needed for training from *trainset*.

    Args:
        trainset: path to the training corpus; one "word<TAB>POStag" line
            per token, with sentences terminated by a bare 'EOS' line.
        threshold: minimum frequency for a key to survive pruning.
        fn_dictionary: path to a "word<TAB>POStag" dictionary file.
        save_vocabs: when True, dump the vocab list to *fn_vocabs*.
        fn_vocabs: output path used when save_vocabs is True.
        oov: out-of-vocabulary symbol (reserved index 0).
        pad: padding symbol (reserved index 1).

    Returns:
        [uni2id, bi2id, word2id, pos2id, word2postags]
    """
    # Create a word-to-POStags dictionary from the external dictionary file.
    word2postags = {}
    with open(fn_dictionary, 'r') as texts:
        for text in texts:
            text = utils.utf8rstrip(text)
            word, postag = text.split('\t')
            word = utils.normalize(word)
            # lower setting: 1
            word = word.lower()
            if word in word2postags:
                # A word may map to several POS tags; keep them all
                # (duplicates are removed via set() at the end).
                word2postags[word].append(postag)
            else:
                word2postags[word] = [postag]

    # Create a word-to-index dictionary and an index-to-word dictionary.
    # Indices 0 and 1 are reserved for the OOV and PAD symbols.
    dictionary = {oov: 0, pad: 1}
    for word in word2postags.keys():
        dictionary[word] = len(dictionary)
    id2word = {i: w for w, i in dictionary.items()}

    # Create a unigram-to-index dictionary and a bigram-to-index dictionary.
    # Reconstruct a word-to-index dictionary from the training corpus.
    words = []   # token buffer for the sentence currently being read
    uni2id = {}
    bi2id = {}
    word2id = {}
    pos2id = {oov: 0}
    with open(trainset, 'r') as texts:
        for text in texts:
            text = utils.utf8rstrip(text)
            if text == 'EOS':
                # Sentence boundary: count character n-grams and dictionary
                # word matches over the joined sentence, then reset buffer.
                sent = ''.join(words)
                unis = utils.get_unigram(sent)
                for uni in unis:
                    uni2id = update_dict(uni, uni2id)
                bis = utils.get_bigram(sent)
                for bi in bis:
                    bi2id = update_dict(bi, bi2id)
                words_at_i = utils.get_words_starting_at_i(sent, dictionary)
                words_at_i += utils.get_words_ending_at_i(sent, dictionary)
                # NOTE(review): the loop variable deliberately clobbers the
                # outer `words` buffer; it is re-initialized right after.
                for words in words_at_i:
                    for wid in words:
                        word = id2word[wid]
                        word2id = update_dict(word, word2id)
                words = []
            else:
                word, pos = text.split('\t')
                word = utils.normalize(word)
                # NOTE(review): both replace() arguments render as a plain
                # space here — possibly a full-width space lost in transit;
                # confirm against the original file.
                word = word.replace(' ', ' ')
                # lower setting: 2
                word = word.lower()
                words.append(word)
                word2id = update_dict(word, word2id)
                pos2id = update_dict(pos, pos2id)

    # Cut keys by frequency threshold.
    uni2id = cut_by_threshold(uni2id, oov, pad, threshold)
    bi2id = cut_by_threshold(bi2id, oov, pad, threshold)
    word2id = cut_by_threshold(word2id, oov, pad, threshold)

    # Create a POStag-to-index dictionary (re-number tags densely).
    pos2id = {k: i for i, k in enumerate(pos2id.keys())}

    # Map each dictionary word to the de-duplicated list of its tag ids.
    word2postags = {
        k: [pos2id[p] for p in list(set(v))]
        for k, v in word2postags.items()
    }

    vocabs = [uni2id, bi2id, word2id, pos2id, word2postags]

    if save_vocabs is True:
        utils.dump_data(vocabs, fn_vocabs)

    return vocabs
def main():
    """Write unigram frequency tables ("word count" lines, most frequent
    first) for all / positive / negative reviews of finefoods.txt, first
    without stopwords and then with them.

    Fix: the original held six file handles open by hand and leaked them
    if any write raised; each dump now uses a ``with`` block, and the
    repeated join/count/sort/write sequence lives in one helper.
    """
    # defaults are binary outcome and exclude stopwords
    binary_label = True
    exclude_stopwords = True

    def dump(filename, rows):
        # Count unigrams over the joined review text and write
        # "word count" lines, most frequent first; the handle is closed
        # even if writing fails.
        counts = utils.get_unigram(' '.join([row[0] for row in rows]))[0]
        with open(os.path.join(sys.path[0], filename), "wb") as out:
            out.write('\n'.join('%s %s' % x for x in sorted(
                counts.items(), key=operator.itemgetter(1), reverse=True)))

    # a modified scan that returns three lists
    data_nosw, data_positive_nosw, data_negative_nosw = scan.scan(
        'finefoods.txt', exclude_stopwords, binary_label)
    dump("top_all_nosw.txt", data_nosw)
    dump("top_positive_nosw.txt", data_positive_nosw)
    dump("top_negative_nosw.txt", data_negative_nosw)

    # same set of routines, but this time allowing stopwords
    data_sw, data_positive_sw, data_negative_sw = scan.scan(
        'finefoods.txt', not exclude_stopwords, binary_label)
    dump("top_all_sw.txt", data_sw)
    dump("top_positive_sw.txt", data_positive_sw)
    dump("top_negative_sw.txt", data_negative_sw)