def main(argv):
    """Run a SQL query and write one counterfactual classification per row.

    Command-line arguments are read from ``sys.argv`` (the ``argv``
    parameter is accepted for interface compatibility but is not consulted):

    1. path of a file containing the SQL query to execute
    2. path of the output file to create

    Each result row is expected to provide ``(message_id, message, cnty)`` in
    its first three columns.  For every row one line is written to the output
    file in the form ``message_id, cnty, cf, cf_form`` where ``cf_form`` is
    the value returned by ``CounterfactualMethods.get_cf_form`` and ``cf`` is
    0 when ``cf_form == 0`` and 1 otherwise.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.
    """
    if len(sys.argv) != 3:
        print("Usage:> python Counterfactual_Job.py sql_query.sql outfile.csv")
        # Non-zero status so shells/schedulers can detect the misuse.
        sys.exit(1)

    print('Starting process...')

    sql_file_name = str(sys.argv[1])
    outfile_name = str(sys.argv[2])

    # Read the query before touching the output path, so a missing or
    # unreadable query file does not truncate an existing output file.
    with open(sql_file_name, 'r') as sql_file:
        query = sql_file.read()

    tagger = PerceptronTagger()

    print("Executing query...")

    db = 'randomTwitterByCounty'
    username = '******'

    cnx = mysql.connector.connect(host='127.0.0.1', port=3306, user=username, db=db)
    try:
        cursor = cnx.cursor()
        cursor.execute(query)

        print('Retrieving counterfactuals...')

        with open(outfile_name, 'w') as outfile:
            for i, result in enumerate(cursor):
                message_id = result[0]
                message = format_tweet(result[1])
                cnty = result[2]

                tagged_message = CounterfactualMethods.get_tagged_message(message, tagger)
                cf_form = CounterfactualMethods.get_cf_form(tagged_message)
                cf = 0 if cf_form == 0 else 1

                # Progress indicator, rewritten in place every 10000 rows.
                if i % 10000 == 0:
                    sys.stdout.write("\r%d results finished..." % i)
                    sys.stdout.flush()

                fields = (message_id, cnty, cf, cf_form)
                outfile.write(", ".join(str(field) for field in fields) + "\n")

        # Closed only on the success path: closing a cursor that still has
        # unread rows can itself raise and would mask the original error.
        cursor.close()
    finally:
        # Always release the database connection, even if processing failed.
        cnx.close()
# Example no. 2
# 0
def main(argv):
    """Classify tweets as counterfactual or not and write an evaluation report.

    Command-line arguments are read from ``sys.argv`` (the ``argv``
    parameter is accepted for interface compatibility but is not consulted):

    1. tweets file   - input, one tweet per line
    2. labels file   - input, one integer label per line, aligned with tweets
    3. predicted file - output, one ``0``/``1`` prediction per tweet
    4. tagged file   - output, the tagged form of every tweet
    5. report file   - output, summary metrics and the misclassified rows

    A tweet is predicted ``1`` when ``CounterfactualMethods.get_cf_form``
    returns a non-zero form and ``0`` otherwise.

    Exits with status 1 after printing a usage line when the argument count
    is wrong.

    Raises:
        ValueError: if the tweets file and the labels file do not contain
            the same number of lines.
    """
    if len(sys.argv) != 6:
        print("Usage:> python ClassifyTweets.py tweets.txt labels.txt predicted.txt tagged.txt report.txt")
        # Non-zero status so shells/schedulers can detect the misuse.
        sys.exit(1)

    tagger = PerceptronTagger()
    form_num = 7

    # Per form: [tweets assigned to this form, of those the ones labelled 1].
    cf_count = [[0, 0] for _ in range(form_num)]

    with open(sys.argv[2], 'r') as labelsFile:
        y_true = [int(label) for label in labelsFile.read().splitlines()]

    y_pred = []

    # Form detected for each tweet, reported next to every wrong prediction.
    form_vec = []

    # The context manager closes every file even if tagging or scoring fails.
    with open(sys.argv[1], 'r') as tweetsFile, \
            open(sys.argv[3], 'w') as predictedFile, \
            open(sys.argv[4], 'w') as taggedFile, \
            open(sys.argv[5], 'w') as reportFile:

        print("Reading file...")

        for i, tweet in enumerate(tweetsFile):
            if i >= len(y_true):
                raise ValueError(
                    "tweets file has more lines than the %d labels provided"
                    % len(y_true))

            # Get Counterfactual form
            taggedTweet = CounterfactualMethods.get_tagged_message(tweet, tagger)
            taggedFile.write(taggedTweet)
            form = int(CounterfactualMethods.get_cf_form(taggedTweet))
            form_vec.append(form)

            cf_count[form][0] += 1
            if y_true[i] == 1:
                cf_count[form][1] += 1

            prediction = 0 if form == 0 else 1
            predictedFile.write('%d\n' % prediction)
            y_pred.append(prediction)

        if len(y_pred) != len(y_true):
            raise ValueError(
                "tweets file has %d lines but labels file has %d"
                % (len(y_pred), len(y_true)))

        ## Report
        accuracy = accuracy_score(y_true, y_pred)
        pearson_corr = scipy.stats.pearsonr(y_true, y_pred)

        # Precision/recall of the positive class from the confusion counts.
        # Indexing precision_recall_curve(...)[1] is only correct when both
        # classes are predicted; otherwise index 1 is the curve's fixed
        # (precision=1, recall=0) end point rather than the real metric.
        true_pos = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
        pred_pos = sum(y_pred)
        actual_pos = sum(1 for t in y_true if t == 1)
        precision = float(true_pos) / pred_pos if pred_pos else 0.0
        recall = float(true_pos) / actual_pos if actual_pos else 0.0

        # Form 0 means "not counterfactual", so it is left out of the total.
        count = sum(cf_count[form][0] for form in range(1, form_num))

        reportFile.write("Identified " + str(count) + " Counterfactuals \n")
        reportFile.write("Accuracy: %0.4f \n" % accuracy)
        reportFile.write("pearson corr: %0.4f \n" % pearson_corr[0])
        reportFile.write("p-val: %0.4f \n" % pearson_corr[1])
        reportFile.write("Precision: %0.4f \n" % precision)
        reportFile.write("Recall: %0.4f \n" % recall)

        for form in range(1, form_num):
            # Fraction of this form's tweets labelled 1; 1 when the form
            # never occurred.
            c = 1
            if cf_count[form][0] != 0:
                c = float(cf_count[form][1]) / float(cf_count[form][0])
            reportFile.write("Form %d   Count: %d,  Accuracy: %0.4f \n" % (form, cf_count[form][0], c))

        reportFile.write("Incorrect predictions\n")
        reportFile.write("idx, pred, label, form\n")

        rows = zip(y_pred, y_true, form_vec)
        for idx, (pred, label, form) in enumerate(rows):
            if pred != label:
                # idx is reported 1-based to match line numbers in the inputs.
                reportFile.write("%d  %d  %d  %d\n" % (idx + 1, pred, label, form))

        print("Finished tagging... Closing files.")
        print("Identified " + str(count) + " Counterfactuals")