def main(argv):
    """Run a SQL query and write one counterfactual-classification row per result.

    Command line (read from ``sys.argv``, not from ``argv``):
        python Counterfactual_Job.py sql_query.sql outfile.csv

    The query text is read from the first file and executed against the
    ``randomTwitterByCounty`` database on 127.0.0.1:3306.  Each result row is
    indexed as ``result[0]`` (message id), ``result[1]`` (message text) and
    ``result[2]`` (stored as ``cnty`` -- presumably a county identifier;
    confirm against the query).  For every row the message is passed through
    ``format_tweet``, tagged, and classified by ``CounterfactualMethods``; a
    line ``message_id, cnty, cf, cf_form`` is written to the output file,
    where ``cf`` is 0 when ``cf_form`` is 0 and 1 otherwise.

    Args:
        argv: Unused.  Kept for interface compatibility; the argument count
            and file names are taken from ``sys.argv`` because the caller may
            pass a sliced list.

    Raises:
        SystemExit: With status 1 when the wrong number of command-line
            arguments is given.
    """
    if len(sys.argv) != 3:
        print("Usage:> python Counterfactual_Job.py sql_query.sql outfile.csv")
        # sys.exit is always available (the bare exit() helper is injected by
        # the site module), and a usage error should report failure.
        sys.exit(1)

    print('Starting process...')
    sql_file_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    # Read the query before touching the output file, so a missing or
    # unreadable SQL file does not leave a truncated output file behind.
    with open(sql_file_name, 'r') as sql_file:
        query = sql_file.read()

    tagger = PerceptronTagger()

    print("Executing query...")
    db = 'randomTwitterByCounty'
    username = '******'
    cnx = mysql.connector.connect(host='127.0.0.1', port=3306, user=username, db=db)
    try:
        cursor = cnx.cursor()
        cursor.execute(query)

        print('Retrieving counterfactuals...')
        with open(outfile_name, 'w') as outfile:
            for i, result in enumerate(cursor):
                message_id = result[0]
                message = format_tweet(result[1])
                cnty = result[2]

                tagged_message = CounterfactualMethods.get_tagged_message(message, tagger)
                cf_form = CounterfactualMethods.get_cf_form(tagged_message)
                # Collapse the form number into a binary flag.
                cf = 0 if cf_form == 0 else 1

                # Progress indicator, rewritten in place every 10000 rows.
                if i % 10000 == 0:
                    sys.stdout.write("\r%d results finished..." % i)
                    sys.stdout.flush()

                outfile.write(
                    str(message_id) + ", " + str(cnty) + ", " + str(cf) + ", " + str(cf_form) + "\n"
                )

        # Only reached once every row has been consumed, so closing the
        # cursor here cannot trip over unread results.
        cursor.close()
    finally:
        # Always release the server connection, even if tagging or writing
        # failed part-way through the result set.
        cnx.close()
def main(argv):
    """Classify tweets as counterfactual or not and write an evaluation report.

    Command line (read from ``sys.argv``, not from ``argv``):
        python ClassifyTweets.py tweets.txt labels.txt predicted.txt tagged.txt report.txt

    Inputs:
        tweets.txt  -- one tweet per line.
        labels.txt  -- one integer label per line, parallel to tweets.txt.

    Outputs:
        predicted.txt -- ``0`` or ``1`` per tweet (0 when the detected form
                         is 0, otherwise 1).
        tagged.txt    -- whatever ``CounterfactualMethods.get_tagged_message``
                         returns for each tweet, written back to back.
        report.txt    -- overall accuracy, Pearson correlation and p-value,
                         precision/recall, per-form counts, and a list of the
                         tweets whose prediction differs from the label.

    Args:
        argv: Unused.  Kept for interface compatibility; the argument count
            and file names are taken from ``sys.argv`` because the caller may
            pass a sliced list.

    Raises:
        SystemExit: With status 1 when the wrong number of command-line
            arguments is given.
        ValueError: When the number of tweets read differs from the number
            of labels.
    """
    if len(sys.argv) != 6:
        print("Usage:> python ClassifyTweets.py tweets.txt labels.txt predicted.txt tagged.txt report.txt")
        # sys.exit is always available (the bare exit() helper is injected by
        # the site module), and a usage error should report failure.
        sys.exit(1)

    tweets_path, labels_path, predicted_path, tagged_path, report_path = sys.argv[1:6]

    tagger = PerceptronTagger()
    form_num = 7
    # cf_count[form] == [tweets assigned that form, of those how many have label 1]
    cf_count = [[0, 0] for _ in range(form_num)]

    with open(labels_path, 'r') as labelsFile:
        y_true = [int(line) for line in labelsFile.read().splitlines()]

    y_pred = []
    # Detected form per tweet, kept so the report can list it for each
    # incorrect prediction.
    form_vec = []

    print("Reading file...")
    with open(tweets_path, 'r') as tweetsFile, \
            open(predicted_path, 'w') as predictedFile, \
            open(tagged_path, 'w') as taggedFile:
        for i, tweet in enumerate(tweetsFile):
            # Get counterfactual form
            taggedTweet = CounterfactualMethods.get_tagged_message(tweet, tagger)
            taggedFile.write(taggedTweet)
            form = int(CounterfactualMethods.get_cf_form(taggedTweet))
            form_vec.append(form)

            # Increase counters
            cf_count[form][0] += 1
            if y_true[i] == 1:
                cf_count[form][1] += 1

            prediction = 0 if form == 0 else 1
            predictedFile.write('%d\n' % prediction)
            y_pred.append(prediction)

    if len(y_pred) != len(y_true):
        # Fail with an explicit message instead of an opaque length error
        # from the metric functions below.
        raise ValueError(
            "Read %d tweets but %d labels; the files must be parallel."
            % (len(y_pred), len(y_true))
        )

    ## Report
    accuracy = accuracy_score(y_true, y_pred)
    # NOTE(review): index 1 of the curve arrays is used below as "the"
    # precision/recall; presumably that is the threshold == 1 operating point
    # when both classes are predicted -- confirm for degenerate predictions.
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    pearson_corr = scipy.stats.pearsonr(y_true, y_pred)

    # Form 0 is the "no counterfactual" bucket, so it is excluded from the total.
    count = sum(cf_count[form][0] for form in range(1, form_num))

    with open(report_path, 'w') as reportFile:
        reportFile.write("Identified " + str(count) + " Counterfactuals \n")
        reportFile.write("Accuracy: %0.4f \n" % accuracy)
        reportFile.write("pearson corr: %0.4f \n" % pearson_corr[0])
        reportFile.write("p-val: %0.4f \n" % pearson_corr[1])
        reportFile.write("Precision: %0.4f \n" % precision[1])
        reportFile.write("Recall: %0.4f \n" % recall[1])

        for form in range(1, form_num):
            total, positives = cf_count[form]
            # Fraction of tweets assigned this form whose label is 1;
            # reported as 1 when the form never occurred.
            c = 1
            if total != 0:
                c = float(positives) / float(total)
            reportFile.write("Form %d Count: %d, Accuracy: %0.4f \n" % (form, total, c))

        reportFile.write("Incorrect predictions\n")
        reportFile.write("idx, pred, label, form\n")
        # idx is the 1-based line number of the tweet.
        for idx, (label, pred, form) in enumerate(zip(y_true, y_pred, form_vec), 1):
            if label != pred:
                reportFile.write(
                    str(idx) + " " + str(pred) + " " + str(label) + " " + str(form) + "\n"
                )

    print("Finished tagging... Closing files.")
    print("Identified " + str(count) + " Counterfactuals")