# review word length, average weighted score, maximum word score, minimum # word score, and pure mathematical average. # From group result of the yelp_review_flat and selected category # categories table: the categories user ever commented. # # Notice: guest database account will not have right to modify the database. import psycopg2 import sys import re import scoresentence TEST = False # True if generating test data. False for training data. if __name__ == '__main__': connection = scoresentence.dbConnect() conn = connection[1] cursor = connection[0] cursor.execute("SELECT * FROM categories;") cat = [] for item1 in cursor.fetchall(): cat.append(item1[1]) if (TEST): end = "_test" else: end = "" cursor.execute("SELECT distinct(uid) FROM features" + end + ";") uid_set = cursor.fetchall() count = 0 for uid in uid_set:
if OUTTOFILE: print >> out, str(uid) + ", " + str(bid) + ", " + str(word_len) + ", " + str(weighted_score) + ", " + str( maximum ) + ", " + str(minimum) + ", " + str(avg) else: print str(word_len) + ", " + str(weighted_score) total = total + weighted_score if weighted_score > maximum: maximum = weighted_score if weighted_score < minimum: minimum = weighted_score print "Average score is: " + str(total / len(score[4])) + "\nMaximum score is: " + str( maximum ) + "\nMinimum score is: " + str(minimum) if __name__ == "__main__": connect = scoresentence.dbConnect() cursor = connect[0] conn = connect[1] st = LancasterStemmer() cursor.execute("SELECT avg(star_sum/number), avg((star_sum / number) * (star_sum / number)) from word_star;") stat_data = cursor.fetchall() avg_score = stat_data[0][0] avg_score_square = stat_data[0][1] std_dev = (float(avg_score_square) - (float(avg_score) ** (2))) ** (1 / 2.0) score = scoreSentence(cursor, st, avg_score, std_dev) printscore(score, OUTTOFILE, END) cursor.close() conn.close()
# This script inserts the files generated by getDataDistribution.py into the
# features_test table of the database.
#
# Each input line is expected to hold at least seven ", "-separated fields:
# two text fields followed by five numeric fields.
#
# Notice: a guest database account does not have the right to modify the
# database.

import psycopg2
import sys
import re
import scoresentence

START = 1  # number of the first input file to load
END = 2  # one past the number of the last input file (range() excludes it)

# Number of fields consumed from every input line.
FIELD_COUNT = 7

# One placeholder per column: the driver quotes and escapes the values itself,
# so text containing quotes can neither break nor alter the statement.
INSERT_ROW_SQL = "INSERT INTO features_test VALUES (%s, %s, %s, %s, %s, %s, %s);"


def _parse_number(text):
    """Return *text* as an int when possible, otherwise as a float.

    Raises ValueError when *text* is not a number, so a malformed field is
    rejected here instead of being spliced into the SQL statement.
    """
    try:
        return int(text)
    except ValueError:
        return float(text)


if __name__ == "__main__":
    ret = scoresentence.dbConnect()
    cursor = ret[0]
    conn = ret[1]
    try:
        for x in range(START, END):
            file_name = "scoredistributiontop" + str(x) + "000.txt"
            count = 0
            # The with-block closes the file even if an insert fails.
            with open(file_name, 'r') as f:
                for line_number, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines, e.g. a trailing newline.
                        continue
                    fields = line.split(", ")
                    if len(fields) < FIELD_COUNT:
                        raise ValueError(
                            "%s line %d: expected at least %d fields, got %d"
                            % (file_name, line_number, FIELD_COUNT,
                               len(fields)))
                    count += 1
                    params = fields[:2] + [
                        _parse_number(value)
                        for value in fields[2:FIELD_COUNT]
                    ]
                    # Single-argument parenthesised form prints the same text
                    # under both the Python 2 statement and Python 3 function.
                    print("Insert into the row " + str(count))
                    cursor.execute(INSERT_ROW_SQL, params)
            # Commit once per file so every file is loaded all-or-nothing.
            conn.commit()
    finally:
        # Closing the connection without a commit discards any partial file.
        try:
            cursor.close()
        finally:
            conn.close()