Example #1
0
def main(arg):
    """Run the whole analysis pipeline for the table selected by ``arg``.

    Steps, in order: configure a timestamped log file under ``OUTPUT``,
    load the table with ``read(arg)``, emit the summary and plots, pass the
    table through ``discretize_score`` with ``bins=3``, then run the KNN
    (k=5), Naive Bayes and forest (maxN=10) tests on the result.

    Args:
        arg: Forwarded unchanged to ``read``.
    """
    # Create log file
    if not os.path.exists(OUTPUT):
        os.makedirs(OUTPUT)
    # os.path.join adds a separator only when OUTPUT lacks one, so the log
    # file ends up inside OUTPUT whether or not the constant ends in a slash.
    log_name = 'output-' + str(time.time()) + '.txt'
    logging.basicConfig(filename=os.path.join(OUTPUT, log_name),
                        level=logging.INFO)

    # Read table from MySQL
    table = read(arg)

    # Generate summary data (both run on the raw, non-discretized table)
    summary(table)
    genPlots(table)

    table = discretize_score(table, bins=3)

    # Test a KNN Classifier
    test_KNN(table, 5)

    # Test a Naive Bayes Classifier
    test_bayes(table)

    # Test forest
    test_forest(table, 10)

    # Presumably blanks the previous status text before the final message;
    # confirm against output.update's implementation.
    output.update(" " * 20)
    output.update("---> Finished.")
Example #2
0
def populateAnswers(db, data_folder):
    """Insert every answer post found under ``data_folder`` into the database.

    Each sub-directory of ``data_folder`` is treated as one site. The site's
    ``Posts.xml`` is parsed and every ``row`` element whose ``PostTypeId``
    equals ``POST_TYPE['answer']`` is handed to ``toMySQL.insert_answer``.
    Sites without a ``Posts.xml`` are reported on stdout and skipped.

    Args:
        db: Database handle, forwarded unchanged to ``toMySQL.insert_answer``.
        data_folder: Directory containing one sub-directory per site. It may
            be given with or without a trailing path separator.
    """
    for sitename in os.listdir(data_folder):
        # os.path.join keeps this correct when data_folder has no trailing
        # separator; plain concatenation would make every site look missing.
        site_dir = os.path.join(data_folder, sitename)

        # Avoid weird .DS_store's on macs
        if not os.path.isdir(site_dir):
            continue

        posts_path = os.path.join(site_dir, 'Posts.xml')
        if not os.path.isfile(posts_path):
            # Single-argument form prints the same text whether print is the
            # Python 2 statement or the Python 3 function.
            print('ERROR: Posts.xml not found in %s' % sitename)
            continue

        root = ET.parse(posts_path).getroot()

        # Per-site running number, used only in the progress message.
        count = 0
        for post in root.iter('row'):
            if int(post.get('PostTypeId')) != POST_TYPE['answer']:
                continue

            stackexchange_id = post.get('Id')
            score = post.get('Score')
            question_id = post.get('ParentId')
            body = post.get('Body')

            output.update("...Popluating Answer %s" % count)
            toMySQL.insert_answer(db, sitename, question_id,
                                  stackexchange_id, score, body)
            count += 1
Example #3
0
def test_KNN(table, k):
    """Run the KNN classifier once for the given ``k`` and log the outcome.

    Emits a status message, runs ``run_KNN(table, k)``, passes the resulting
    labels to ``confusion_matrix`` with the ``'score'`` tag, and logs one
    line with ``k``, the accuracy and the elapsed wall-clock time.

    Args:
        table: Data set forwarded to ``run_KNN``.
        k: Neighbour count forwarded to ``run_KNN`` and echoed in the log.
    """
    output.update("... Testing KNN")
    logging.info("KNN report")

    began = time.time()
    predicted = run_KNN(table, k)
    confusion_matrix(predicted, 'score')

    # Accuracy is computed before the clock is read, so the reported time
    # covers the run, the confusion matrix and the accuracy calculation.
    score = accuracy(predicted)
    elapsed = str(time.time() - began)
    logging.info('KNN at k=%s has %s accuracy in %s seconds' %
                 (k, score, elapsed))
Example #4
0
def genFrequencyGraph(filename, table, index, label, title):
    """Save a 100-bin histogram of one table column to ``PDFs + filename``.

    Args:
        filename: File name appended to the ``PDFs`` prefix.
        table: Data set from which the column is extracted via ``getCol``.
        index: Column selector passed to ``getCol``.
        label: X-axis label; also shown in the progress message.
        title: Figure title (set with ``pyplot.suptitle``).
    """
    output.update("... Plot %s" % label)

    fig = pyplot.figure()
    try:
        xs = getCol(table, index)
        pyplot.hist(xs, bins=100)
        pyplot.suptitle(title)
        pyplot.xlabel(label)

        pyplot.savefig(PDFs + filename)
    finally:
        # pyplot keeps every figure it creates until it is closed explicitly;
        # without this, each call leaves one more figure held in memory.
        pyplot.close(fig)
Example #5
0
def test_bayes(table):
    """Run the Naive Bayes classifier on ``table`` and log time and accuracy.

    The table is first passed through ``discretize_table`` with the pairs
    ``(1, 10)`` .. ``(5, 10)`` (presumably column index and bin count --
    confirm against ``discretize_table``). The classifier's labels are then
    passed to ``confusion_matrix`` with the ``'score'`` tag and the accuracy
    is logged.

    Args:
        table: Data set to discretize and classify.
    """
    logging.info('\n# Testing Naive Bayes')

    discrete_table = discretize_table(table,
                                      [(col, 10) for col in range(1, 6)])

    # Announce the run before it starts, as test_KNN and test_forest do, so
    # the status is visible while the classifier works and the status update
    # is not counted in the timed region.
    output.update("... Running Naive Bayes")

    start = time.time()
    labels = run_bayes(discrete_table)
    logging.info("Time in Seconds:" + str(time.time() - start) + "s")
    confusion_matrix(labels, 'score')
    logging.info("\nAccuracy:" + str(accuracy(labels)))
Example #6
0
def genScatterPlot(filename, table, xIndex, yIndex, xLabel, yLabel, title):
    """Save a scatter plot of two table columns to ``PDFs + filename``.

    Points are drawn as blue dots at 20% opacity.

    Args:
        filename: File name appended to the ``PDFs`` prefix.
        table: Data set from which both columns are extracted via ``getCol``.
        xIndex: Column selector for the x values.
        yIndex: Column selector for the y values.
        xLabel: X-axis label; also shown in the progress message.
        yLabel: Y-axis label.
        title: Figure title (set with ``pyplot.suptitle``).
    """
    output.update("... Plot %s" % xLabel)
    fig = pyplot.figure()
    try:
        ys = getCol(table, yIndex)
        xs = getCol(table, xIndex)

        pyplot.plot(xs, ys, 'b.', alpha=0.2)
        pyplot.xlabel(xLabel)
        pyplot.ylabel(yLabel)
        pyplot.suptitle(title)

        pyplot.savefig(PDFs + filename)
    finally:
        # pyplot keeps every figure it creates until it is closed explicitly;
        # without this, each call leaves one more figure held in memory.
        pyplot.close(fig)
Example #7
0
def populateSites(db, data_folder):
    """Register every site directory under ``data_folder`` in the database.

    Collects the names of all sub-directories of ``data_folder`` and passes
    them to ``toMySQL.bulk_insert_site`` in a single call.

    Args:
        db: Database handle, forwarded unchanged to
            ``toMySQL.bulk_insert_site``.
        data_folder: Directory containing one sub-directory per site. It may
            be given with or without a trailing path separator.
    """
    folders = []
    for folder in os.listdir(data_folder):

        # Avoid weird .DS_store's on macs. os.path.join keeps the check
        # correct when data_folder has no trailing separator; plain
        # concatenation would make every entry look like a non-directory.
        if not os.path.isdir(os.path.join(data_folder, folder)):
            continue

        # The number of sites collected so far doubles as the progress count.
        output.update("...Popluating site %s" % len(folders))
        folders.append(folder)

    toMySQL.bulk_insert_site(db, folders)
Example #8
0
def test_forest(table, maxN):
    """Sweep ``run_forest`` over a grid of (n, m, f) values and log a table.

    Runs ``run_forest(table, n, m, f)`` for every ``n`` in ``1..maxN-1``,
    every ``m`` in ``1..n-1`` and every ``f`` in ``1..3``. Accuracy and
    elapsed time are recorded for each run, and the collected rows are
    logged as a single table rendered by ``tabulate``.

    Args:
        table: Data set forwarded to ``run_forest``.
        maxN: Exclusive upper bound for ``n``.
    """
    logging.info('\n# Testing Random Forest')

    header = ["Attempt Number", "N", "M", "F", "Accuracy", "Time in Seconds"]
    rows = [header]

    for n in range(1, maxN):
        output.update("... Running Trees for n=%s" % n)
        for m in range(1, n):
            for f in range(1, 4):
                began = time.time()
                predicted = run_forest(table, n, m, f)
                # Accuracy is computed before the clock is read, so its cost
                # is part of the reported time.
                score = accuracy(predicted)
                elapsed = str(time.time() - began) + 's'
                # The header occupies one slot, so the current row count is
                # exactly the 1-based attempt number.
                rows.append([len(rows), n, m, f, score, elapsed])

    # NOTE(review): "fancy" does not appear among tabulate's documented
    # formats (cf. "fancy_grid") -- confirm which layout was intended.
    report = tabulate(rows, headers="firstrow", tablefmt="fancy")
    logging.info('\n' + str(report))