def main(follow, followed, test_file, submission_file, data_file,
         validation_file, max_suggestion):
    """Produce friend suggestions for every test node and write them out.

    The test nodes are read from ``test_file``, a classifier is obtained
    from ``rank.train(data_file, validation_file)``, and ``suggest_friends``
    is called once per test node.  The collected suggestions are written to
    ``submission_file`` in the same order as the test nodes.
    """
    print('Reading graph...')
    test_nodes = utilities.read_nodes_list(test_file)

    print('Training with logistic regression...')
    model = rank.train(data_file, validation_file)

    print('Getting popular people...')
    popular = get_popular_people(followed, max_suggestion)

    print('Predicting...')
    predictions = []
    for processed, node in enumerate(test_nodes, 1):
        predictions.append(
            suggest_friends(follow, followed, model, node,
                            popular, max_suggestion))

        # Report progress after every 100 processed test nodes.
        if processed % 100 == 0:
            print('Suggested %d friends.' % processed)

    print('Writing submission files...')
    utilities.write_submission_file(submission_file, test_nodes, predictions)
Exemple #2
0
def main(training_file, test_file, submission_file, ratio):
    """Train a random forest on the training file and score the test file.

    Both files are loaded with ``utilities.read_file``, run through the
    ``preprocess`` and ``feature_extraction`` helpers, and reduced to the
    300 features chosen by ``get_best_k_feature_indices``.  30% of the
    training rows are held out (``random_state=0``) and passed to
    ``classification.random_forest`` together with the remaining rows,
    which are first down-sampled using ``ratio``.  The output of
    ``predict_proba`` on the test features is written to
    ``submission_file``.
    """
    raw_train = utilities.read_file(training_file)
    raw_test = utilities.read_file(test_file)

    print('Preparing data...')
    x, y = preprocess.prepare_data(raw_train)
    refid, x_test = preprocess.prepare_test_data(raw_test)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # The same selected column indices are applied to train and test data.
    num_features = 300
    selected = feature_extraction.get_best_k_feature_indices(
        x, y, num_features)
    x = feature_extraction.get_best_k_features(x, selected)
    x_test = feature_extraction.get_best_k_features(x_test, selected)
    print('Get %s features.' % len(x[0]))

    split = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    x_train, x_cv, y_train, y_cv = split
    # Only the training part is down-sampled; the held-out part is untouched.
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    model = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    probabilities = model.predict_proba(x_test)
    utilities.write_submission_file(submission_file, refid, probabilities)
Exemple #3
0
def main(follow, followed, test_file, submission_file, data_file,
         validation_file, max_suggestion):
    """Suggest friends for each node listed in ``test_file``.

    A classifier is trained via ``rank.train(data_file, validation_file)``
    and handed, together with the result of ``get_popular_people``, to
    ``suggest_friends`` for every test node.  Results are written to
    ``submission_file`` aligned with the test-node order.
    """
    print('Reading graph...')
    nodes_to_predict = utilities.read_nodes_list(test_file)

    print('Training with logistic regression...')
    classifier = rank.train(data_file, validation_file)

    print('Getting popular people...')
    popular_people = get_popular_people(followed, max_suggestion)

    print('Predicting...')
    all_suggestions = []
    for position, node in enumerate(nodes_to_predict, 1):
        suggestion = suggest_friends(follow, followed, classifier, node,
                                     popular_people, max_suggestion)
        all_suggestions.append(suggestion)

        # Emit a progress line once per 100 nodes handled.
        if position % 100 == 0:
            print('Suggested %d friends.' % position)

    print('Writing submission files...')
    utilities.write_submission_file(submission_file, nodes_to_predict,
                                    all_suggestions)
Exemple #4
0
def main(training_file, test_file, submission_file, ratio):
    """Run the full train / predict pipeline and write the submission.

    Steps, in order: read both files, prepare and preprocess the features,
    create extra features, keep the best 300 feature columns, split off
    30% of the training rows as a hold-out set (``random_state=0``),
    down-sample the remaining training rows using ``ratio``, fit
    ``classification.random_forest`` and write ``predict_proba`` of the
    test features to ``submission_file``.
    """
    train_rows = utilities.read_file(training_file)
    test_rows = utilities.read_file(test_file)

    print('Preparing data...')
    x, y = preprocess.prepare_data(train_rows)
    refid, x_test = preprocess.prepare_test_data(test_rows)
    x, x_test = preprocess.preprocess_features(x, x_test)

    print('Feature extracting...')
    x, x_test = feature_extraction.create_feature(x, y, x_test)

    # Pick the feature columns once, then project both matrices onto them.
    best_indices = feature_extraction.get_best_k_feature_indices(x, y, 300)
    x = feature_extraction.get_best_k_features(x, best_indices)
    x_test = feature_extraction.get_best_k_features(x_test, best_indices)
    feature_count = len(x[0])
    print('Get %s features.' % feature_count)

    x_train, x_cv, y_train, y_cv = cross_validation.train_test_split(
        x, y, test_size=.3, random_state=0)
    # Down-sampling is applied to the training split only.
    x_train, y_train = preprocess.down_sample(x_train, y_train, ratio)

    forest = classification.random_forest(x_train, y_train, x_cv, y_cv)

    print('Predicting...')
    utilities.write_submission_file(submission_file, refid,
                                    forest.predict_proba(x_test))
def top_k_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the top k benchmark.

    The list returned by ``get_top_k_nodes(train_file, num_predictions)``
    is used, unchanged, as the prediction for every node read from
    ``test_file``; the result is written to ``submission_file``.
    """
    shared_prediction = get_top_k_nodes(train_file, num_predictions)
    test_nodes = utilities.read_nodes_list(test_file)

    # Every test node receives a reference to the very same list object.
    test_predictions = []
    for _ in test_nodes:
        test_predictions.append(shared_prediction)

    utilities.write_submission_file(
        submission_file, test_nodes, test_predictions)
def communicative_basic(train_file, test_file, submission_file, num_predictions):
    '''
    main function
    '''
    
    print ">>> reading the graph from file ...",
    graph = {}
    graph = utilities.read_graph(train_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(graph)
    
    print ">>> building the edge set ...",
    edgeSet = set()
    nodeCredit = {}
    for node in graph.keys():
        nodeCredit[node] = 0
        for frdNode in graph[node]:
            edgeSet.add((node,frdNode))
    print "done!"
    
    def compareCredit(key):
        '''
        utility function to comapre the two credits given the key
        '''
        return nodeCredit[key]
    
    
    missingEdgeSet = set()
    print ">>> reversing the edge set, computing the credicts of each node and finding missing edges ...",
    for edge in edgeSet:
        if (edge[1], edge[0]) not in edgeSet:
            missingEdgeSet.add((edge[1], edge[0]))
        nodeCredit[edge[1]]+=1
    print " done!"
    
    testResult = {}
    testNodeList = utilities.read_nodes_list(test_file)
    testNodeSet = set(testNodeList)
    print ">> %d test Nodes read." % len(testNodeList)
    print ">>> making the missing edge dictionary for test nodes ...",
    for testNode in testNodeList: # pre-build the dictionary
        testResult[testNode] = []
    
    for edge in missingEdgeSet:
        if (edge[0] in testNodeSet):
            testResult[edge[0]].append(edge[1])
    print " done!"
    
    print ">>> sorting the final results according to node credits ...",
    for testNode in testNodeList:
        testResult[testNode].sort(key=compareCredit, reverse=True)
    print " done!"
    
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList])
    print " done!"
Exemple #7
0
def random_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the random benchmark.

    For each node read from ``test_file``, ``num_predictions`` nodes are
    drawn with ``random.choice`` (so repeats are possible) from the result
    of ``read_nodes_from_training(train_file)``.  The draws are written to
    ``submission_file``.
    """
    candidates = read_nodes_from_training(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    test_predictions = []
    for _ in test_nodes:
        draws = []
        for _unused in range(num_predictions):
            draws.append(random.choice(candidates))
        test_predictions.append(draws)

    utilities.write_submission_file(
        submission_file, test_nodes, test_predictions)
def top_k_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the top k benchmark.

    Every node listed in ``test_file`` is given the same prediction: the
    list produced by ``get_top_k_nodes(train_file, num_predictions)``.
    """
    top_nodes = get_top_k_nodes(train_file, num_predictions)
    test_nodes = utilities.read_nodes_list(test_file)

    # The same list object is repeated once per test node.
    repeated = []
    for _ in test_nodes:
        repeated.append(top_nodes)

    utilities.write_submission_file(submission_file, test_nodes, repeated)
Exemple #9
0
def bfs_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.

    The graph is loaded from ``train_file`` and ``breadth_first_search`` is
    called once for every node read from ``test_file``; the per-node
    results are written to ``submission_file``.
    """
    graph = utilities.read_graph(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    test_predictions = []
    for start_node in test_nodes:
        found = breadth_first_search(graph, start_node, num_predictions)
        test_predictions.append(found)

    utilities.write_submission_file(
        submission_file, test_nodes, test_predictions)
def random_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the random benchmark.

    Each test node gets ``num_predictions`` independent ``random.choice``
    draws from the nodes returned by ``read_nodes_from_training``; the same
    node may therefore appear more than once in a prediction.
    """
    pool_of_nodes = read_nodes_from_training(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    predictions = []
    for _ in test_nodes:
        sample = []
        for _unused in range(num_predictions):
            sample.append(random.choice(pool_of_nodes))
        predictions.append(sample)

    utilities.write_submission_file(submission_file, test_nodes, predictions)
def bfs_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.

    Calls ``breadth_first_search(graph, node, num_predictions)`` for each
    node in ``test_file`` against the graph read from ``train_file`` and
    writes the collected results to ``submission_file``.
    """
    training_graph = utilities.read_graph(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    predictions = []
    for node in test_nodes:
        predictions.append(
            breadth_first_search(training_graph, node, num_predictions))

    utilities.write_submission_file(submission_file, test_nodes, predictions)
Exemple #12
0
def spring_brother(training_file, test_file, submission_file):
    """Train on the training file and write predictions for the test file.

    Features for both sets come from ``feature_selection.generate_features``.
    The forest is fitted without a validation set (``None`` is passed for
    both validation arguments), and the output of
    ``classification.get_prob`` is written next to the test ids.
    """
    labels, train_meta = utilities.read_training_file(training_file)
    test_ids, test_meta = utilities.read_test_file(test_file)

    features = feature_selection.generate_features(
        train_meta, labels, test_meta)
    x_train, x_test = features

    forest = classification.random_forest(x_train, labels, None, None)

    probabilities = classification.get_prob(forest, x_test)
    utilities.write_submission_file(submission_file, test_ids, probabilities)
Exemple #13
0
def spring_brother(training_file, test_file, submission_file):
    """Fit a random forest on the training data and score the test data.

    Reads labels and metadata from ``training_file`` and ids and metadata
    from ``test_file``, derives features for both with
    ``feature_selection.generate_features``, trains with no validation
    data, and writes the values returned by ``classification.get_prob`` to
    ``submission_file``.
    """
    targets, meta_train = utilities.read_training_file(training_file)
    ids, meta_test = utilities.read_test_file(test_file)

    x_train, x_test = feature_selection.generate_features(
        meta_train, targets, meta_test)

    # No validation split is supplied to the trainer.
    model = classification.random_forest(x_train, targets, None, None)

    utilities.write_submission_file(
        submission_file, ids, classification.get_prob(model, x_test))
def run_recs(train_file, test_file, submission_file, num_processes=8,
             chunksize=10000):
    """Compute recommendations for all test nodes in parallel and write them.

    ``make_recs`` is mapped over the test nodes with a
    ``multiprocessing.Pool``; each call is expected to yield a
    ``(target_node, recs)`` pair.  Results arrive in arbitrary order and are
    re-aligned with the order of the test nodes before being written to
    ``submission_file``.

    Args:
        train_file: passed to ``utilities.read_graph_and_inverse``.
        test_file: passed to ``utilities.read_nodes_list``.
        submission_file: destination handed to
            ``utilities.write_submission_file``.
        num_processes: number of worker processes.  Tune to match CPU and
            memory; allow for ~1.2G of RAM per worker, since swapping will
            kill performance.
        chunksize: number of test nodes sent to a worker per task; can be
            experimented with.
    """
    # The module-level graphs are assigned BEFORE the pool is created.
    # Presumably make_recs reads them inside the workers -- verify against
    # make_recs before reordering these statements.
    global graph, graph_inverse
    graph, graph_inverse = utilities.read_graph_and_inverse(train_file)
    test_nodes = utilities.read_nodes_list(test_file)

    pool = multiprocessing.Pool(num_processes)
    predictions = {}
    try:
        results = pool.imap_unordered(make_recs, test_nodes,
                                      chunksize=chunksize)
        for target_node, recs in results:
            predictions[target_node] = recs
    finally:
        # Always release the worker processes.  On the success path every
        # result has already been received, so terminating loses no work.
        pool.terminate()
        pool.join()

    test_predictions = [predictions[node] for node in test_nodes]
    utilities.write_submission_file(submission_file,
                                    test_nodes,
                                    test_predictions)
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.
    """

    start_time = time.time()
    (graph, reversegraph) = utilities.read_graph(train_file)
    print "Graph forming time = ", time.time() - start_time, "seconds"
    start_time = time.time()
    test_nodes = utilities.read_nodes_list(test_file)
    test_predictions = [jaccard_search(graph, reversegraph, node, num_predictions) for node in test_nodes]

    print "Prediction time = ", time.time() - start_time, "seconds"

    utilities.write_submission_file(submission_file, test_nodes, test_predictions)
def main_entrance(train_data_file, test_data_file, submit_data_file):
    '''
    the main entrance of the program
    '''
    ###############Configs#################
    minMutualFrd = 2
    ############End of Configs#############
    
    print ">>> reading the graph from file ...",
    following_graph = utilities.read_graph(train_data_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(following_graph)
    
    print ">>> reading test nodes ...",
    testNodeList = utilities.read_nodes_list(test_data_file)
    print " done!"
    
    edgeSet = get_edge_set(following_graph)
    nodeCredit = get_node_credit(edgeSet, following_graph.keys())
    commu_missingEdgeDict = get_commu_missing_edge(edgeSet, testNodeList)
    mutual_missingEdgeDict = get_mutual_missing_edge(following_graph, testNodeList, edgeSet, following_graph.keys(), minMutualFrd)
    
    # union two edge dicts
    finalPrediction = {}
    for node in testNodeList:
        finalPrediction[node] = list(set(mutual_missingEdgeDict[node]) | set(commu_missingEdgeDict[node]))
    
    # customized comparator for final prediction
    def compareCredit(key):
        '''
        utility function to comapre the two credits given the key
        '''
        return nodeCredit[key]
    
    # rank the predictions
    print ">>> sorting the final results according to node credits ...",
    for testNode in testNodeList:
        finalPrediction[testNode].sort(key=compareCredit, reverse=True)
    print " done!"
    
    # write prediction to file
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submit_data_file, testNodeList, [finalPrediction[testNode] for testNode in testNodeList])
    print " done!"
def jaccard_benchmark(train_file, test_file, submission_file, num_predictions):
    """
    Runs the breadth-first search benchmark.
    """

    start_time = time.time()
    (graph, reversegraph) = utilities.read_graph(train_file)
    print "Graph forming time = ", time.time() - start_time, "seconds"
    start_time = time.time()
    test_nodes = utilities.read_nodes_list(test_file)
    test_predictions = [
        jaccard_search(graph, reversegraph, node, num_predictions)
        for node in test_nodes
    ]

    print "Prediction time = ", time.time() - start_time, "seconds"

    utilities.write_submission_file(submission_file, test_nodes,
                                    test_predictions)
def communicative_basic(train_file, test_file, submission_file, num_predictions):
    '''
    main function
    '''
    
    print ">>> reading the graph from file ...",
    graph = {}
    graph = utilities.read_graph(train_file)
    print " done!"
    print ">> the graph contains %d ndoes" % len(graph)
    
    print ">>> building the edge set ...",
    edgeSet = set()
    for node in graph.keys():
        for frdNode in graph[node]:
            edgeSet.add((node,frdNode))
    print "done!"
    
    missingEdgeSet = set()
    print ">>> reversing the edge set, finding missing edges ...",
    for edge in edgeSet:
        if (edge[1], edge[0]) not in edgeSet:
            missingEdgeSet.add((edge[1], edge[0]))
    print " done!"
    
    testResult = {}
    testNodeList = utilities.read_nodes_list(test_file)
    testNodeSet = set(testNodeList)
    print ">> %d test Nodes read." % len(testNodeList)
    print ">>> making the missing edge dictionary for test nodes ...",
    for testNode in testNodeList: # pre-build the dictionary
        testResult[testNode] = []
    
    for edge in missingEdgeSet:
        if (edge[0] in testNodeSet):
            testResult[edge[0]].append(edge[1])
    print " done!"
    
    print ">>> outputing the final result ...",
    utilities.write_submission_file(submission_file, testNodeList, [testResult[testNode] for testNode in testNodeList])
    print " done!"