def main(arg):
    """Run the whole analysis pipeline for ``arg``.

    Sets up a timestamped log file under ``OUTPUT``, loads the table via
    ``read(arg)``, produces the summary and plots, discretizes the score
    into 3 bins and then evaluates the KNN, Naive Bayes and random-forest
    classifiers in that order.

    ``arg`` is forwarded unchanged to ``read``.
    """
    # The log directory must exist before logging is pointed at a file in it.
    log_dir_missing = not os.path.exists(OUTPUT)
    if log_dir_missing:
        os.makedirs(OUTPUT)

    # One log file per run, named after the current timestamp.
    log_name = 'output-' + str(time.time()) + '.txt'
    logging.basicConfig(filename=OUTPUT + log_name, level=logging.INFO)

    # Read table from MySQL
    records = read(arg)

    # Summary statistics and plots are generated from the raw table,
    # before the score column is discretized.
    summary(records)
    genPlots(records)

    records = discretize_score(records, bins=3)

    # Classifier evaluations: KNN with k=5, Naive Bayes, forest with maxN=10.
    test_KNN(records, 5)
    test_bayes(records)
    test_forest(records, 10)

    # Final status updates: a run of blanks, then the completion message.
    for status in (" " * 20, "---> Finished."):
        output.update(status)
def populateAnswers(db, data_folder):
    """Insert every answer post found under ``data_folder`` into ``db``.

    Each sub-directory of ``data_folder`` is treated as one site; the
    directory name is passed to ``toMySQL.insert_answer`` as the site name.
    For every ``row`` element of the site's ``Posts.xml`` whose
    ``PostTypeId`` equals ``POST_TYPE['answer']``, the ``Id``, ``Score``,
    ``ParentId`` and ``Body`` attributes are inserted.

    Sites without a ``Posts.xml`` are reported on stdout and skipped.

    Paths are built with ``os.path.join`` so ``data_folder`` works with or
    without a trailing path separator.
    """
    for sitename in os.listdir(data_folder):
        site_dir = os.path.join(data_folder, sitename)

        # Avoid weird .DS_store's on macs
        if not os.path.isdir(site_dir):
            continue

        if 'Posts.xml' not in os.listdir(site_dir):
            # Single formatted string so the output is the same text under
            # both the print statement and the print function.
            print('ERROR: Posts.xml not found in %s' % sitename)
            continue

        root = ET.parse(os.path.join(site_dir, 'Posts.xml')).getroot()

        # Number of answers inserted so far for this site (progress display).
        count = 0
        for post in root.iter('row'):
            if int(post.get('PostTypeId')) != POST_TYPE['answer']:
                continue

            stackexchange_id = post.get('Id')
            score = post.get('Score')
            question_id = post.get('ParentId')
            body = post.get('Body')

            output.update("...Popluating Answer %s" % count)
            toMySQL.insert_answer(db, sitename, question_id,
                                  stackexchange_id, score, body)
            count += 1
def test_KNN(table, k):
    """Run the KNN classifier once with the given ``k`` and log the result.

    Logs a confusion matrix for the ``'score'`` label and a one-line
    summary containing ``k``, the accuracy and the elapsed wall-clock time.
    """
    output.update("... Testing KNN")
    logging.info("KNN report")

    began = time.time()
    predictions = run_KNN(table, k)
    confusion_matrix(predictions, 'score')

    # The elapsed time is taken last, so it covers the KNN run, the
    # confusion matrix and the accuracy computation.
    report = 'KNN at k=%s has %s accuracy in %s seconds' % (
        k, accuracy(predictions), str(time.time() - began))
    logging.info(report)
def genFrequencyGraph(filename, table, index, label, title):
    """Save a 100-bin histogram of one table column to ``PDFs + filename``.

    Args:
        filename: File name appended to the module-level ``PDFs`` prefix.
        table: Table handed to ``getCol``.
        index: Column selector handed to ``getCol``.
        label: X-axis label; also shown in the progress message.
        title: Figure title.
    """
    output.update("... Plot %s" % label)

    figure = pyplot.figure()
    try:
        xs = getCol(table, index)
        pyplot.hist(xs, bins=100)
        pyplot.suptitle(title)
        pyplot.xlabel(label)
        pyplot.savefig(PDFs + filename)
    finally:
        # pyplot keeps every figure it creates until it is explicitly
        # closed; release it so repeated plotting does not accumulate
        # open figures.
        pyplot.close(figure)
def test_bayes(table):
    """Run the Naive Bayes classifier on a discretized copy of ``table``.

    Logs the time spent in ``run_bayes``, a confusion matrix for the
    ``'score'`` label and the resulting accuracy.
    """
    logging.info('\n# Testing Naive Bayes')

    # Announce the step before any work starts, and outside the timed
    # section, so the progress display is accurate and the measurement
    # covers only run_bayes.
    output.update("... Running Naive Bayes")

    # Pairs (1, 10) .. (5, 10) handed to discretize_table.
    # NOTE(review): presumably (column index, number of bins) - confirm
    # against discretize_table.
    discretization = [(column, 10) for column in range(1, 6)]
    discrete_table = discretize_table(table, discretization)

    start = time.time()
    labels = run_bayes(discrete_table)
    elapsed = time.time() - start

    logging.info("Time in Seconds:" + str(elapsed) + "s")
    confusion_matrix(labels, 'score')
    logging.info("\nAccuracy:" + str(accuracy(labels)))
def genScatterPlot(filename, table, xIndex, yIndex, xLabel, yLabel, title):
    """Save a scatter plot of two table columns to ``PDFs + filename``.

    Points are drawn as blue dots with alpha 0.2.

    Args:
        filename: File name appended to the module-level ``PDFs`` prefix.
        table: Table handed to ``getCol``.
        xIndex: Column selector for the x values.
        yIndex: Column selector for the y values.
        xLabel: X-axis label; also shown in the progress message.
        yLabel: Y-axis label.
        title: Figure title.
    """
    output.update("... Plot %s" % xLabel)

    figure = pyplot.figure()
    try:
        ys = getCol(table, yIndex)
        xs = getCol(table, xIndex)
        pyplot.plot(xs, ys, 'b.', alpha=0.2)
        pyplot.xlabel(xLabel)
        pyplot.ylabel(yLabel)
        pyplot.suptitle(title)
        pyplot.savefig(PDFs + filename)
    finally:
        # pyplot keeps every figure it creates until it is explicitly
        # closed; release it so repeated plotting does not accumulate
        # open figures.
        pyplot.close(figure)
def populateSites(db, data_folder):
    """Register every sub-directory of ``data_folder`` as a site in ``db``.

    Directory names are collected in ``os.listdir`` order and passed to
    ``toMySQL.bulk_insert_site`` in a single call. Non-directory entries
    are ignored.

    Paths are built with ``os.path.join`` so ``data_folder`` works with or
    without a trailing path separator.
    """
    folders = []
    for folder in os.listdir(data_folder):
        # Avoid weird .DS_store's on macs
        if not os.path.isdir(os.path.join(data_folder, folder)):
            continue

        # len(folders) is the number of sites collected so far, i.e. the
        # zero-based index of the site about to be added.
        output.update("...Popluating site %s" % len(folders))
        folders.append(folder)

    toMySQL.bulk_insert_site(db, folders)
def test_forest(table, maxN):
    """Sweep ``run_forest`` over a grid of (n, m, f) and log a results table.

    The grid is ``n`` in ``1 .. maxN - 1``, ``m`` in ``1 .. n - 1`` and
    ``f`` in ``1 .. 3``; consequently ``n = 1`` produces no runs. For each
    combination the accuracy and elapsed wall-clock time are recorded, and
    the full table is written to the log once the sweep is done.
    """
    logging.info('\n# Testing Random Forest')

    header = ["Attempt Number", "N", "M", "F", "Accuracy", "Time in Seconds"]
    rows = [header]

    for n in range(1, maxN):
        output.update("... Running Trees for n=%s" % n)
        for m in range(1, n):
            for f in range(1, 4):
                began = time.time()
                labels = run_forest(table, n, m, f)
                score = accuracy(labels)
                # Taken after accuracy(), so the timing covers both the
                # forest run and the accuracy computation.
                elapsed = str(time.time() - began) + 's'
                # The header sits at index 0, so len(rows) equals the
                # 1-based number of the attempt being recorded.
                rows.append([len(rows), n, m, f, score, elapsed])

    # NOTE(review): "fancy" does not look like a documented tabulate format
    # name (the documented one is "fancy_grid") - confirm what is intended.
    rendered = tabulate(rows, headers="firstrow", tablefmt="fancy")
    logging.info('\n' + str(rendered))