def getUserListfromDB(DB_NAME):
    """Return the cursor over (fbuser_id, user_id) rows from user_mapping.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('select fbuser_id,user_id from user_mapping;')
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: the query result was previously assigned and then discarded;
    # return it so callers can iterate the rows.
    return results
def activityFileToDB(log_file_name, DB_NAME):
    """Parse an activity log file and insert one row per matching line into
    the `logs` table of the given database.

    Expected line shape:
        [Www Mmm dd HH:MM:SS yyyy] [ACTIVITY_NAME] username ... ID{objectid}
    Lines that do not match are reported (prints None) and skipped.
    Commits once after the whole file is processed.
    """
    log_expr = re.compile(
        "\[(\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4})\] \[(\w+)\] (\w+) .+ ID\{(\d+)\}"
    )
    (cur, con) = getDBConnection(DB_NAME)
    # Continue numbering after the rows already present in `logs`.
    ACTIVITY_ID = cur.execute("select count(*) from logs;").next()[0]
    # FIX: context manager guarantees the log file is closed (it was opened
    # and never closed); also removed the unused `ACTIVITY_STRING = {""}` set.
    with open(log_file_name, "r") as log_file:
        for line in log_file:
            pattern_match = log_expr.match(line)
            if pattern_match is not None:
                date = pattern_match.group(1)
                activity = pattern_match.group(2)
                username = pattern_match.group(3)
                object_id = pattern_match.group(4)
                # Object type is the activity prefix, e.g. PETREPORT_VIEW -> PETREPORT.
                object_type = re.split("_", activity)[0]
                ACTIVITY_ID += 1
                cur.execute(
                    "INSERT INTO logs values(?,?,?,?,?,?)",
                    [
                        ACTIVITY_ID,
                        username,
                        activity,
                        object_id,
                        object_type,
                        convertToTimestamp(date, type="activity-log"),
                    ],
                )
            else:
                print(str(pattern_match))
    con.commit()
def getPetReportsLikesListfromDB(DB_NAME):
    """Return the cursor over distinct (userid, petreport_id, created_time)
    rows for likes on non-duplicate posts.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('SELECT distinct a.userid, b.petreport_id, b.created_time from user_likes a,petreport_mapping b where a.post_id=b.fbpost_id and post_id not in (select * from duplicate_posts)')
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: the query result was previously discarded; return it.
    return results
def getPetreportsListFromDB(DB_NAME):
    """Return the cursor over (fbpost_id, petreport_id, created_time) rows
    from petreport_mapping.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('select fbpost_id,petreport_id,created_time from petreport_mapping;')
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: the query result was previously discarded; return it.
    return results
def createTables(DB_NAME):
    """Ensure the `logs` and `logs_petmatch` activity tables exist."""
    (cur, con) = getDBConnection(DB_NAME)
    for ddl in (
        "CREATE TABLE IF NOT EXISTS logs(activityid INT,userid TEXT, activity TEXT, objectid TEXT, object_type TEXT, activity_timestamp TIMESTAMP);",
        "CREATE TABLE IF NOT EXISTS logs_petmatch(activityid INT,userid TEXT, activity TEXT, objectid TEXT, objectid2 TEXT, activity_timestamp TIMESTAMP);",
    ):
        cur.execute(ddl)
def getSubmittedPetReportsListfromDB(DB_NAME):
    """Return [[author_id, fbpost_id], ...] for every distinct submitted
    pet report (post_info joined to userpost_mapping).

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('SELECT distinct a.author_id,b.fbpost_id from post_info a, userpost_mapping b where b.userpostid=a.postid;')
        list_submittedpetreports = [[result[0].encode('ascii', 'ignore'), result[1].encode('ascii', 'ignore')] for result in results]
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: the computed list was previously discarded; return it.
    return list_submittedpetreports
def getPetMatchViewsListfromDB(DB_NAME):
    """Return (list_petmatchviews, list_petmatchmaps) where:

    - list_petmatchviews: [[fbcomment_id_petmatch, userid, created_time], ...]
      for every distinct pet-match view comment;
    - list_petmatchmaps: dict fbcomment_id -> petmatch_id.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('SELECT DISTINCT b.fbcomment_id_petmatch, a.userid, a.created_time FROM user_comments a, user_comments_petmatchviews b where a.commentid = b.fbcomment_id_viewer;')
        list_petmatchviews = [[result[0].encode('ascii', 'ignore'), result[1].encode('ascii', 'ignore'), result[2].encode('ascii', 'ignore')] for result in results]
        results = cur.execute('SELECT distinct fbcomment_id,petmatch_id from petmatch_mapping;')
        list_petmatchmaps = dict([[result[0].encode('ascii', 'ignore'), result[1]] for result in results])
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: both computed collections were previously discarded; return them.
    return (list_petmatchviews, list_petmatchmaps)
def getPetMatchListsFromDB(DB_NAME):
    """Return (list_petmatchcomments, list_petmatchmaps) where:

    - list_petmatchcomments: [[commentid, post_id, userid, created_time], ...]
      for pet-match comments on non-duplicate posts;
    - list_petmatchmaps: dict fbcomment_id -> petmatch_id.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('SELECT distinct a.commentid,a.post_id,a.userid,a.created_time FROM user_comments a,petmatch_mapping b WHERE a.commentid=b.fbcomment_id and a.post_id not in (select post_id from duplicate_posts);')
        list_petmatchcomments = [[result[0].encode('ascii', 'ignore'), result[1].encode('ascii', 'ignore'), result[2].encode('ascii', 'ignore'), result[3].encode('ascii', 'ignore')] for result in results]
        results = cur.execute('SELECT fbcomment_id,petmatch_id from petmatch_mapping;')
        list_petmatchmaps = dict([[result[0].encode('ascii', 'ignore'), result[1]] for result in results])
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: both computed collections were previously discarded; return them.
    return (list_petmatchcomments, list_petmatchmaps)
def getPetReportsCommentersListfromDB(DB_NAME):
    """Return [[userid, post_id, created_time], ...] for distinct commenters
    on non-duplicate posts authored by account 403922739676650 (presumably
    the page account — confirm against the data).

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        # change: exclude pet match creators
        results = cur.execute('SELECT distinct userid,post_id,a.created_time from user_comments a,post_info b where post_id not in (select * from duplicate_posts) and post_id=postid and b.author_id="403922739676650"')
        list_postcommenters = [[result[0].encode('ascii', 'ignore'), result[1].encode('ascii', 'ignore'), result[2]] for result in results]
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: the computed list was previously discarded; return it.
    return list_postcommenters
def getPetMatchVotersListfromDB(DB_NAME):
    """Return (list_petmatchvoters, list_petmatchmaps) where:

    - list_petmatchvoters: [[fbcomment_id_petmatch, fbuser_id, vote_time], ...]
      one row per distinct pet-match vote;
    - list_petmatchmaps: dict fbcomment_id -> petmatch_id.

    Exits the process with status 1 on any sqlite error.
    """
    (cur, con) = getDBConnection(DB_NAME)
    try:
        results = cur.execute('SELECT distinct fbcomment_id_petmatch, fbuser_id, vote_time FROM user_comments_petmatchvotes')
        list_petmatchvoters = [[result[0].encode('ascii', 'ignore'), result[1].encode('ascii', 'ignore'), result[2].encode('ascii', 'ignore')] for result in results]
        results = cur.execute('SELECT fbcomment_id,petmatch_id from petmatch_mapping;')
        list_petmatchmaps = dict([[result[0].encode('ascii', 'ignore'), result[1]] for result in results])
    except sql.Error as e:
        print("Error %s:" % e.args[0])
        sys.exit(1)
    # FIX: both computed collections were previously discarded; return them.
    return (list_petmatchvoters, list_petmatchmaps)
def apacheFileToDB(log_file_name, DB_NAME):
    """Parse an Apache access log and insert activity rows into the `logs`
    and `logs_petmatch` tables.

    Request paths are expected to look like
    /<app>/<object_type>/<object_id>[/<object_id2>]. PROPOSE_PETMATCH
    requests go to logs_petmatch; everything else is mapped through
    ACTIVITY_STRING (currently only GETs of petmatch/petreport pages) and
    goes to logs. Non-matching lines are reported (prints None) and skipped.
    Commits once after the whole file is processed.
    """
    # Maps "<OBJECT_TYPE>:<HTTP_METHOD>" to the activity name stored in `logs`.
    ACTIVITY_STRING = {"PETMATCH:GET": "PETMATCH_VIEW", "PETREPORT:GET": "PETREPORT_VIEW"}
    # make this work for other requests as get petmatch
    log_expr = re.compile('\d+\.\d+\.\d+\.\d+ - - \[([^\[\]:]+:\d+:\d+:\d+) \+(\d{4})\] (".+") (\d{3}) (.+) (\d{1,10})')
    (cur, con) = getDBConnection(DB_NAME)
    # Continue numbering after the rows already present in each table.
    PETMATCH_ACTIVITY_ID = cur.execute("SELECT count(*) from logs_petmatch;").next()[0]
    ACTIVITY_ID = cur.execute("SELECT count(*) from logs;").next()[0]
    # FIX: context manager guarantees the log file is closed (it was opened
    # and never closed).
    with open(log_file_name, "r") as log_file:
        for line in log_file:
            pattern_match = log_expr.match(line)
            if pattern_match is not None:
                date = pattern_match.group(1)
                time = pattern_match.group(2)
                request_string = pattern_match.group(3)
                status_code = pattern_match.group(4)  # TODO: check status code before inserting
                username = pattern_match.group(5)
                userid = pattern_match.group(6)
                # Strip the surrounding quotes, then split 'METHOD /path HTTP/x'.
                request_string = (re.split('"', request_string)[1]).split()
                method = request_string[0]
                request = re.split("/", request_string[1])
                object_type = request[2].upper()
                object_id = request[3]
                if object_type == "PROPOSE_PETMATCH":
                    activity = object_type
                    objectid2 = request[4]
                    PETMATCH_ACTIVITY_ID += 1
                    cur.execute(
                        "INSERT INTO logs_petmatch values(?,?,?,?,?,?)",
                        [PETMATCH_ACTIVITY_ID, username, activity, object_id, objectid2, convertToTimestamp(date)],
                    )
                else:
                    ACTIVITY_ID += 1
                    cur.execute(
                        "INSERT INTO logs values(?,?,?,?,?,?)",
                        [
                            ACTIVITY_ID,
                            username,
                            ACTIVITY_STRING[object_type + ":" + method],
                            object_id,
                            object_type,
                            convertToTimestamp(date),
                        ],
                    )
            else:
                print(str(pattern_match))
    con.commit()
def classify(DB_NAME):
    """Train a Naive Bayes classifier separating highly-active users from the
    rest, report held-out accuracy, and classify a few probe users.

    The 89 most active users (by comment + post count over non-duplicate
    posts, excluding account 403922739676650) are labelled: the top 28 form
    the "highly active" class. Feature vectors are also dumped to
    feature_file629.txt for inspection.
    """
    (cur, con) = getDBConnection(DB_NAME)
    # Top-89 users by activity, mapped from facebook ids to internal user ids.
    results = cur.execute(
        'SELECT user_id from user_mapping a, (SELECT userid,sum(count) from (select userid, count(comment) as count from user_comments where post_id not in (select * from duplicate_posts) group by userid union select author_id as userid, count(postid) as count from post_info where postid not in (select * from duplicate_posts) group by author_id) where userid!="403922739676650" group by userid order by sum(count) desc limit 89) b where a.fbuser_id=b.userid;'
    )
    list_users = ["user" + str(result[0]) for result in results]
    class1_features = []  # top 28 users: "highly active" class (label True)
    class2_features = []  # remaining users: label False
    # FIX: close the feature dump file when done (it was left open); also
    # removed the unused `feature_vector` and `highly_active` locals.
    feature_file629 = open("feature_file629.txt", "w")
    try:
        for i in range(len(list_users)):
            user_features = [constructFeatures(DB_NAME, list_users[i]), i < 28]
            if i < 28:
                class1_features.append(user_features)
            else:
                class2_features.append(user_features)
            feature_file629.write(str(user_features) + "\n")
    finally:
        feature_file629.close()
    # Roughly 80/20 train/test split within each class.
    class1_cutoff = 22  # 80% 22 70%: 20 80.77 acc with 70%
    class2_cutoff = 49  # 80% 49 70%: 43
    trainfeats = class1_features[:class1_cutoff] + class2_features[:class2_cutoff]
    testfeats = class1_features[class1_cutoff:] + class2_features[class2_cutoff:]
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = DecisionTreeClassifier.train(trainfeats)
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    # FIX: accuracy was computed but never reported.
    print("held-out accuracy: %s" % str(accuracy))
    # FIX: probe users 3-7 were all mislabelled "test_userid2" in the output;
    # classify each probe user with a correct, uniform message instead.
    for test_userid in ["user24147", "user7973", "user18095", "user28724", "user6173", "user11398", "user13954"]:
        print("class of %s is : %s" % (test_userid, str(classifier.classify(constructFeatures(DB_NAME, test_userid)))))
def _consecutive_diff_stats(timestamps):
    """Mean/stddev (in seconds) of gaps between consecutive timestamps.

    `timestamps` are "%Y-%m-%d %H:%M:%S" strings, assumed sorted ascending
    (callers order by activity_timestamp). Returns (0, 0) when fewer than
    two timestamps are available.
    """
    if len(timestamps) < 2:
        return (0, 0)
    diffs = []
    # FIX: the original loops used `if i == 1: continue`, which both skipped
    # a valid consecutive pair and, at i == 0, compared the first timestamp
    # against the LAST one (list[-1]) — a wrap-around off-by-one.
    for i in range(1, len(timestamps)):
        delta = datetime.strptime(timestamps[i], "%Y-%m-%d %H:%M:%S") - datetime.strptime(
            timestamps[i - 1], "%Y-%m-%d %H:%M:%S"
        )
        diffs.append(abs(delta.total_seconds()))
    return (mean_list(diffs), stddev_list(diffs))


def constructFeatures(DB_NAME, userid):
    """Build the per-user feature dict (keys "f1".."f15") used by classify().

    f1/f2:   mean/stddev seconds between any two activities
    f3/f4:   mean/stddev seconds between pet matches proposed
    f5:      number of pet matches proposed
    f6/f7:   mean/stddev bookmarks per day
    f8:      total bookmarks; f9: total votes; f10: total pet-match views
    f11-f14: mean/stddev of daily bookmark (11/12) and pet-report-view
             (13/14) counts, each divided by the pet reports created that day
    f15:     number of submitted pet reports

    SECURITY/FIX: all queries are parameterized instead of concatenating
    `userid` into SQL strings.
    """
    (cur, con) = getDBConnection(DB_NAME)
    user_features = {}
    uid = userid.strip()

    # Feature 1 & 2: mean/stddev of time between any two activities.
    results = cur.execute(
        "SELECT activity_timestamp from logs where userid=? order by activity_timestamp;", [uid]
    )
    all_times = [result[0] for result in results]
    (user_features["f1"], user_features["f2"]) = _consecutive_diff_stats(all_times)
    print("Feature 1&2 complete")

    # Feature 3, 4 & 5: mean/stddev of time between proposed pet matches,
    # and the number of pet matches proposed.
    results = cur.execute(
        'SELECT activity_timestamp from logs where userid=? and activity="PETMATCH_PROPOSED" order by activity_timestamp;',
        [uid],
    )
    proposed_times = [result[0] for result in results]
    (user_features["f3"], user_features["f4"]) = _consecutive_diff_stats(proposed_times)
    user_features["f5"] = len(proposed_times)
    print("Feature 3,4&5 complete")

    # Feature 6 & 7: mean/stddev of bookmarks per day.
    results = cur.execute(
        'SELECT strftime(\'%d/%m/%Y\',activity_timestamp) as "time", count("time") from logs where activity="PETREPORT_ADD_BOOKMARK" AND USERID=? group by time;',
        [uid],
    )
    daily_counts = [result[1] for result in results]
    if len(daily_counts) > 1:
        user_mean = mean_list(daily_counts)
        user_stddev = stddev_list(daily_counts)
    else:
        user_mean = 0
        user_stddev = 0
    # FIX: every daily count participates (the original arbitrarily skipped
    # index 1), and f6 is stored as a scalar — it was accidentally written as
    # a one-element tuple `(user_mean,)`.
    user_features["f6"] = user_mean
    user_features["f7"] = user_stddev
    print("Feature 6&7 complete")

    # Feature x (dropped): time between viewing and proposing a match for the
    # same pet — INSUFFICIENT DATA: only 1 instance each of 7 users available.

    # Feature 8: total number of bookmarks.
    results = cur.execute(
        'SELECT userid, count(userid) from logs where activity="PETREPORT_ADD_BOOKMARK" AND USERID=? group by userid;',
        [uid],
    )
    rows = [[result[0], result[1]] for result in results]
    user_features["f8"] = rows[0][1] if rows else 0
    print("Feature 8 complete")

    # Feature 9: total number of votes.
    # FIX: the original condition tested PETMATCH_UPVOTE twice; the second
    # term is presumably meant to be PETMATCH_DOWNVOTE — confirm against the
    # activity names actually written to `logs`.
    results = cur.execute(
        'SELECT userid, count(userid) from logs where (activity="PETMATCH_UPVOTE" or activity="PETMATCH_DOWNVOTE") AND USERID=? group by userid;',
        [uid],
    )
    rows = [[result[0], result[1]] for result in results]
    user_features["f9"] = rows[0][1] if rows else 0
    print("Feature 9 complete")

    # Feature 10: total number of pet-match views.
    results = cur.execute(
        'SELECT userid, count(userid) from logs where activity="PETMATCH_VIEW" AND USERID=? group by userid;',
        [uid],
    )
    rows = [[result[0], result[1]] for result in results]
    user_features["f10"] = rows[0][1] if rows else 0
    print("Feature 10 complete")

    # Features 11-14: per-day bookmark / view counts, each divided by the
    # number of pet reports created that day.
    results = cur.execute(
        'SELECT strftime("%m-%d",created_time) as date, count(petreport_id) from PETREPORT_MAPPING where created_time!="" group by date '
    )
    reports_per_day = dict([[result[0].encode("ascii", "ignore"), result[1]] for result in results])

    def _daily_ratio_stats(activity):
        """Mean/stddev of (user's daily count of `activity`) / (reports that day)."""
        day_rows = cur.execute(
            'SELECT strftime("%m-%d",activity_timestamp) as date, count(objectid) from logs where activity=? AND USERID=? group by userid,date;',
            [activity, uid],
        )
        ratios = []
        for (date, count) in day_rows:
            date = date.encode("ascii", "ignore")
            if date in reports_per_day:
                # FIX: float division — under Python 2 the original integer
                # division truncated most ratios to 0.
                ratios.append(float(count) / reports_per_day[date])
            else:
                ratios.append(count)
        return (mean_list(ratios), stddev_list(ratios))

    (user_features["f11"], user_features["f12"]) = _daily_ratio_stats("PETREPORT_ADD_BOOKMARK")
    (user_features["f13"], user_features["f14"]) = _daily_ratio_stats("PETREPORT_VIEW")
    print("Features 11,12,13,14 complete")

    # Feature 15: number of submitted pet reports.
    results = cur.execute(
        'SELECT count(userid) from logs where activity="PETREPORT_SUBMITTED" AND USERID=? group by userid;',
        [uid],
    )
    counts = [result[0] for result in results]
    user_features["f15"] = counts[0] if counts else 0
    print("Feature 15 complete")
    return user_features