def main(follow, followed, test_file, submission_file, data_file,
         validation_file, max_suggestion):
    """
    Train the ranking model and write friend suggestions for the test nodes.

    Steps, in order:
      1. read the test nodes from ``test_file``;
      2. train a classifier with ``rank.train(data_file, validation_file)``;
      3. fetch the popular people with
         ``get_popular_people(followed, max_suggestion)``;
      4. call ``suggest_friends`` once per test node;
      5. write everything with ``utilities.write_submission_file``.

    ``follow`` and ``followed`` are passed through untouched to the helpers
    (presumably the following / follower adjacency structures -- confirm
    against the caller).
    """
    print('Reading graph...')
    test_nodes = utilities.read_nodes_list(test_file)

    print('Training with logistic regression...')
    clf = rank.train(data_file, validation_file)

    print('Getting popular people...')
    popular_people = get_popular_people(followed, max_suggestion)

    print('Predicting...')
    predictions = []
    # Start counting at 1 so the progress line fires after every 100th node.
    for done, node in enumerate(test_nodes, 1):
        predictions.append(
            suggest_friends(follow, followed, clf, node, popular_people,
                            max_suggestion))
        if done % 100 == 0:
            print('Suggested %d friends.' % done)

    print('Writing submission files...')
    utilities.write_submission_file(submission_file, test_nodes, predictions)
def main(training_file, test_file, submission_file, ratio):
    """
    Run the full train / predict pipeline and write the submission file.

    The training and test files are read with ``utilities.read_file``,
    turned into feature matrices by ``preprocess`` and
    ``feature_extraction`` (keeping the 300 best feature indices), split
    into a training part and a held-out part (``test_size=.3``,
    ``random_state=0``), down-sampled with ``ratio`` and fed to
    ``classification.random_forest``.  The classifier's ``predict_proba``
    output for the test features is written next to the ids returned by
    ``preprocess.prepare_test_data``.
    """
    train_rows = utilities.read_file(training_file)
    test_rows = utilities.read_file(test_file)

    print('Preparing data...')
    features, labels = preprocess.prepare_data(train_rows)
    refid, test_features = preprocess.prepare_test_data(test_rows)
    features, test_features = preprocess.preprocess_features(
        features, test_features)

    print('Feature extracting...')
    features, test_features = feature_extraction.create_feature(
        features, labels, test_features)
    best_indices = feature_extraction.get_best_k_feature_indices(
        features, labels, 300)
    # The same column selection is applied to both matrices.
    features = feature_extraction.get_best_k_features(features, best_indices)
    test_features = feature_extraction.get_best_k_features(
        test_features, best_indices)
    print('Get %s features.' % len(features[0]))

    split = cross_validation.train_test_split(
        features, labels, test_size=.3, random_state=0)
    x_train, x_cv, y_train, y_cv = split
    # Only the training part is down-sampled; the held-out part is not.
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)
    clf = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    utilities.write_submission_file(
        submission_file, refid, clf.predict_proba(test_features))
def top_k_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the top k benchmark: every test node gets the same prediction.

    The prediction is whatever ``get_top_k_nodes(train_file,
    num_predictions)`` returns; it is repeated once per node read from
    ``test_file`` and written to ``submission_file``.
    """
    shared_prediction = get_top_k_nodes(train_file, num_predictions)
    targets = utilities.read_nodes_list(test_file)
    # Every row refers to the same object; nothing is copied per node.
    rows = [shared_prediction for _ in targets]
    utilities.write_submission_file(submission_file, targets, rows)
def communicative_basic(train_file, test_file, submission_file, num_predictions): ''' main function ''' print ">>> reading the graph from file ...", graph = {} graph = utilities.read_graph(train_file) print " done!" print ">> the graph contains %d ndoes" % len(graph) print ">>> building the edge set ...", edgeSet = set() nodeCredit = {} for node in graph.keys(): nodeCredit[node] = 0 for frdNode in graph[node]: edgeSet.add((node,frdNode)) print "done!" def compareCredit(key): ''' utility function to comapre the two credits given the key ''' return nodeCredit[key] missingEdgeSet = set() print ">>> reversing the edge set, computing the credicts of each node and finding missing edges ...", for edge in edgeSet: if (edge[1], edge[0]) not in edgeSet: missingEdgeSet.add((edge[1], edge[0])) nodeCredit[edge[1]]+=1 print " done!" testResult = {} testNodeList = utilities.read_nodes_list(test_file) testNodeSet = set(testNodeList) print ">> %d test Nodes read." % len(testNodeList) print ">>> making the missing edge dictionary for test nodes ...", for testNode in testNodeList: # pre-build the dictionary testResult[testNode] = [] for edge in missingEdgeSet: if (edge[0] in testNodeSet): testResult[edge[0]].append(edge[1]) print " done!" print ">>> sorting the final results according to node credits ...", for testNode in testNodeList: testResult[testNode].sort(key=compareCredit, reverse=True) print " done!" print ">>> outputing the final result ...", utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList]) print " done!"
def random_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the random benchmark.

    For each node read from ``test_file``, draws ``num_predictions`` nodes
    with ``random.choice`` from the nodes returned by
    ``read_nodes_from_training(train_file)``.  Draws are independent, so the
    same node may appear more than once in a prediction list.
    """
    nodes = read_nodes_from_training(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    test_predictions = []
    for _node in test_nodes:
        picks = []
        for _draw in range(num_predictions):
            picks.append(random.choice(nodes))
        test_predictions.append(picks)

    utilities.write_submission_file(submission_file, test_nodes,
                                    test_predictions)
def bfs_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.

    Reads the graph from ``train_file`` and the nodes from ``test_file``,
    calls ``breadth_first_search(graph, node, num_predictions)`` for each
    test node and writes the results to ``submission_file``.
    """
    graph = utilities.read_graph(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    test_predictions = []
    for node in test_nodes:
        found = breadth_first_search(graph, node, num_predictions)
        test_predictions.append(found)

    utilities.write_submission_file(submission_file, test_nodes,
                                    test_predictions)
def spring_brother(training_file, test_file, submission_file):
    """
    Running on the test file.

    Reads labels and meta data from ``training_file`` and ids and meta data
    from ``test_file``, builds features for both with
    ``feature_selection.generate_features``, trains
    ``classification.random_forest`` and writes the values returned by
    ``classification.get_prob`` for the test set to ``submission_file``.
    """
    labels, train_meta = utilities.read_training_file(training_file)
    ids, test_meta = utilities.read_test_file(test_file)

    x_train, x_test = feature_selection.generate_features(
        train_meta, labels, test_meta)

    # The last two arguments are passed as None, exactly as before; what
    # they mean is defined by classification.random_forest.
    clf = classification.random_forest(x_train, labels, None, None)
    probabilities = classification.get_prob(clf, x_test)

    utilities.write_submission_file(submission_file, ids, probabilities)
def spring_brother(training_file, test_file, submission_file):
    """
    Running on the test file.

    Pipeline: read the training and test files, generate features for both
    sets, fit ``classification.random_forest`` on the training features and
    write the output of ``classification.get_prob`` for the test features,
    keyed by the test ids, to ``submission_file``.
    """
    targets, meta_train = utilities.read_training_file(training_file)
    test_ids, meta_test = utilities.read_test_file(test_file)

    feature_sets = feature_selection.generate_features(
        meta_train, targets, meta_test)
    train_features, test_features = feature_sets

    # No validation data is supplied here: both trailing arguments are None.
    model = classification.random_forest(train_features, targets, None, None)

    utilities.write_submission_file(
        submission_file, test_ids,
        classification.get_prob(model, test_features))
def run_recs(train_file, test_file, submission_file, num_workers=8,
             chunksize=10000):
    """
    Compute recommendations for every test node in parallel and write them.

    train_file      -- file handed to utilities.read_graph_and_inverse
    test_file       -- file handed to utilities.read_nodes_list
    submission_file -- destination passed to utilities.write_submission_file
    num_workers     -- number of worker processes; change it to match
                       cpu/memory usage, allow for ~1.2G of ram per cpu,
                       swap will kill performance
    chunksize       -- chunksize given to imap_unordered; can be
                       experimented with

    The module-level globals ``graph`` and ``graph_inverse`` are (re)bound
    before the pool is created; presumably ``make_recs`` reads them inside
    the workers -- confirm against ``make_recs``.
    """
    global graph, graph_inverse
    graph, graph_inverse = utilities.read_graph_and_inverse(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    pool = multiprocessing.Pool(num_workers)
    predictions = {}
    try:
        # Results arrive in arbitrary order, so they are keyed by node and
        # re-ordered afterwards.
        results = pool.imap_unordered(make_recs, test_nodes,
                                      chunksize=chunksize)
        for target_node, recs in results:
            predictions[target_node] = recs
    finally:
        # Always release the worker processes, even when a worker raised;
        # previously the pool was never shut down.
        pool.terminate()
        pool.join()

    test_predictions = [predictions[node] for node in test_nodes]
    utilities.write_submission_file(submission_file, test_nodes,
                                    test_predictions)
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions): """ Runs the breadth-first search benchmark. """ start_time = time.time() (graph, reversegraph) = utilities.read_graph(train_file) print "Graph forming time = ", time.time() - start_time, "seconds" start_time = time.time() test_nodes = utilities.read_nodes_list(test_file) test_predictions = [jaccard_search(graph, reversegraph, node, num_predictions) for node in test_nodes] print "Prediction time = ", time.time() - start_time, "seconds" utilities.write_submission_file(submission_file, test_nodes, test_predictions)
def main_entrance(train_data_file, test_data_file, submit_data_file): ''' the main entrance of the program ''' ###############Configs################# minMutualFrd = 2 ############End of Configs############# print ">>> reading the graph from file ...", following_graph = utilities.read_graph(train_data_file) print " done!" print ">> the graph contains %d ndoes" % len(following_graph) print ">>> reading test nodes ...", testNodeList = utilities.read_nodes_list(test_data_file) print " done!" edgeSet = get_edge_set(following_graph) nodeCredit = get_node_credit(edgeSet, following_graph.keys()) commu_missingEdgeDict = get_commu_missing_edge(edgeSet, testNodeList) mutual_missingEdgeDict = get_mutual_missing_edge(following_graph, testNodeList, edgeSet, following_graph.keys(), minMutualFrd) # union two edge dicts finalPrediction = {} for node in testNodeList: finalPrediction[node] = list(set(mutual_missingEdgeDict[node]) | set(commu_missingEdgeDict[node])) # customized comparator for final prediction def compareCredit(key): ''' utility function to comapre the two credits given the key ''' return nodeCredit[key] # rank the predictions print ">>> sorting the final results according to node credits ...", for testNode in testNodeList: finalPrediction[testNode].sort(key=compareCredit, reverse=True) print " done!" # write prediction to file print ">>> outputing the final result ...", utilities.write_submission_file(submit_data_file, testNodeList, [finalPrediction[testNode] for testNode in testNodeList]) print " done!"
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions): """ Runs the breadth-first search benchmark. """ start_time = time.time() (graph, reversegraph) = utilities.read_graph(train_file) print "Graph forming time = ", time.time() - start_time, "seconds" start_time = time.time() test_nodes = utilities.read_nodes_list(test_file) test_predictions = [ jaccard_search(graph, reversegraph, node, num_predictions) for node in test_nodes ] print "Prediction time = ", time.time() - start_time, "seconds" utilities.write_submission_file(submission_file, test_nodes, test_predictions)
def communicative_basic(train_file, test_file, submission_file, num_predictions): ''' main function ''' print ">>> reading the graph from file ...", graph = {} graph = utilities.read_graph(train_file) print " done!" print ">> the graph contains %d ndoes" % len(graph) print ">>> building the edge set ...", edgeSet = set() for node in graph.keys(): for frdNode in graph[node]: edgeSet.add((node,frdNode)) print "done!" missingEdgeSet = set() print ">>> reversing the edge set, finding missing edges ...", for edge in edgeSet: if (edge[1], edge[0]) not in edgeSet: missingEdgeSet.add((edge[1], edge[0])) print " done!" testResult = {} testNodeList = utilities.read_nodes_list(test_file) testNodeSet = set(testNodeList) print ">> %d test Nodes read." % len(testNodeList) print ">>> making the missing edge dictionary for test nodes ...", for testNode in testNodeList: # pre-build the dictionary testResult[testNode] = [] for edge in missingEdgeSet: if (edge[0] in testNodeSet): testResult[edge[0]].append(edge[1]) print " done!" print ">>> outputing the final result ...", utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList]) print " done!"