Example #1
def main():
    print("Getting features for valid papers from the database")
    #data = data_io.get_features_db("ValidPaper")
    data = data_io.get_precomputed_features("ValidFeatures")

    # Each row of the feature table is (author_id, paper_id, feature_1, ..., feature_n).
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    # Column 1 of predict_proba is the probability of the positive class,
    # i.e. that the paper is confirmed for the author.
    predictions = classifier.predict_proba(features)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    # For each author, rank paper ids by descending predicted probability.
    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
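
All of the snippets on this page omit their imports and the competition-specific data_io helper they call into. The sketch below lists the imports the code appears to rely on and stubs out the three data_io calls that are used; the signatures are inferred from the usage above, and the bodies are hypothetical placeholders, not the real module.

# Imports the snippets appear to rely on (inferred from usage).
import csv
import pickle
from collections import defaultdict

import numpy
import ml_metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import linear_model, neighbors, preprocessing, svm
# The cross-validation examples further down also reference
# cross_validation.train_test_split; in scikit-learn >= 0.20 that module
# was replaced by sklearn.model_selection.

# Hypothetical stand-in for the competition's data_io module, showing only the
# call signatures the snippets use; the bodies are placeholders, not the real code.
def get_precomputed_features(table_name):
    # Would return rows shaped like (author_id, paper_id, feature_1, ..., feature_n).
    raise NotImplementedError

def load_model(path="model.pickle"):
    # Presumably unpickles a previously trained classifier.
    with open(path, "rb") as f:
        return pickle.load(f)

def write_submission(paper_predictions, path="submission.csv"):
    # Presumed submission format: one row per author, paper ids space-separated
    # in ranked order (this matches how ValidSolution.csv is parsed below).
    with open(path, "w") as f:
        f.write("AuthorId,PaperIds\n")
        for author_id, paper_ids in sorted(paper_predictions.items()):
            f.write("%s,%s\n" % (author_id, " ".join(str(p) for p in paper_ids)))
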
Example #2
def main():
    print("Getting features for valid papers from the database")
    #data = data_io.get_features_db("ValidPaper")
    data = data_io.get_precomputed_features("ValidFeatures")

    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)
Example #3
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeatures")

    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeatures")

    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features(
        "ValidDeletedFeatures")

    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features(
        "ValidConfirmedFeatures")

    features = [
        x[2:] for x in features_deleted + features_conf +
        valid_features_deleted + valid_features_conf
    ]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] \
          + [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]

    print("Training the Classifier")
    # Note: the compute_importances argument has been removed from scikit-learn;
    # feature_importances_ is always available after fitting.
    clfRF = RandomForestClassifier(n_estimators=100,
                                   verbose=2,
                                   n_jobs=10,
                                   min_samples_split=10,
                                   random_state=1)

    clfGBM = GradientBoostingClassifier(n_estimators=100,
                                        verbose=2,
                                        min_samples_split=10,
                                        random_state=1)

    classifier = clfRF
    classifier.fit(features, target)

    print("Saving the classifier")
    #data_io.save_model(clfier)
    print "Feature importance", classifier.feature_importances_
Example #4
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeatures")

    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeatures")

    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features("ValidDeletedFeatures")

    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features("ValidConfirmedFeatures")

    features = [x[2:] for x in features_deleted + features_conf + valid_features_deleted + valid_features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] \
          + [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]


    print("Training the Classifier")
    clfRF = RandomForestClassifier(n_estimators=100,
                                   verbose=2,
                                   n_jobs=10,
                                   min_samples_split=10,
                                   random_state=1)
    
    clfGBM = GradientBoostingClassifier(n_estimators=100, 
                                        verbose=2,
                                        min_samples_split=10,
                                        random_state=1)
    
    classifier = clfRF
    classifier.fit(features, target)

    print("Saving the classifier")
    #data_io.save_model(clfier)
    print "Feature importance", classifier.feature_importances_ 
Example #5
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeatures")
    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeatures")
    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features("ValidDeletedFeatures")
    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features("ValidConfirmedFeatures")

    features = [x[2:] for x in features_deleted + features_conf] #+ valid_features_deleted + valid_features_conf
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] 
          #+ [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]

    print("Training the Classifier")
    RF = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                n_jobs=1,
                                min_samples_split=10,
                                random_state=1)
    
    GBM = GradientBoostingClassifier(n_estimators=100,
                                     verbose=2,
                                     min_samples_split=10,
                                     random_state=1)
    classifier = RF
    classifier.fit(features, target)

    # Validation
    author_paper_ids = [x[:2] for x in valid_features_conf+valid_features_deleted]
    features = [x[2:] for x in valid_features_conf+valid_features_deleted]

    print("Making predictions")
    predictions = classifier.predict_proba(features)[:,1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[str(a_id)].append((pred,str(p_id)))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
   
    predicted = sorted(paper_predictions.items())
    #Now I have sorted predictions for each author_id
    #Need to get the ground truth for the validation set:

    valid_confirmed_data = [row for row in csv.reader(open("ValidSolution.csv"))] #TrainConfirmed.csv
    valid_confirmed_papers = [(row[0],row[1].split()) for row in valid_confirmed_data[1:]]
    valid_confirmed_papers.sort()

    print(predicted[0])
    print(valid_confirmed_papers[0])

    import ml_metrics as metrics
    print(metrics.mapk([row[1] for row in valid_confirmed_papers], [row[1] for row in predicted], 10000))
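
The final line scores the per-author rankings with mean average precision at k via ml_metrics.mapk. As a rough illustration of what that call computes (a pure-Python sketch of the metric with made-up paper ids, not necessarily the library's exact code):

def apk(actual, predicted, k=10):
    # Average precision at k for one author: mean of precision@i taken at each
    # rank i where a relevant (confirmed) paper appears in the predicted ordering.
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    # Mean of apk over all authors.
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

# Toy check:
truth = [["p1", "p2"], ["p9"]]
ranked = [["p2", "p7", "p1"], ["p3", "p9"]]
print(mapk(truth, ranked, k=10))  # (5/6 + 1/2) / 2 = 2/3 ~= 0.667
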
Example #6
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeaturester")
    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeaturester")
    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features("DeletedValidFeaturester")
    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features("ConfirmedValidFeaturester")

    all_features = [x for x in features_deleted + features_conf + valid_features_deleted + valid_features_conf]
    all_target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] \
          + [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]

    print "Load ground truth"
    valid_confirmed_data = [row for row in csv.reader(open("ValidSolution.csv"))]
    valid_confirmed_papers = [(row[0],row[1].split()) for row in valid_confirmed_data[1:]]
    valid_confirmed_papers.sort()
    train_confirmed_data = [row for row in csv.reader(open("Train.csv"))]
    train_confirmed_papers = [(row[0],row[1].split()) for row in train_confirmed_data[1:]]
    train_confirmed_papers.sort()
    ground_truth = valid_confirmed_papers + train_confirmed_papers
    authors = [row[0] for row in ground_truth]
    
    scaling = False 
    mp = []
    for k in range(10):
        # Now split the authors any way you like.
        print("Split data")
        authors_train, authors_test = cross_validation.train_test_split(authors, test_size=0.1, random_state=k)
    
        print "Build training set"
        train_indices = [i for (i,x) in enumerate(all_features) if str(x[0]) in authors_train]
        test_indices = [i for (i,x) in enumerate(all_features) if str(x[0]) in authors_test]
        train_features = [map(float,all_features[i][2:]) for i in train_indices]
        print len(train_features)
        print train_features[0]
        if scaling:
            scaler = preprocessing.StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
        train_targets = [all_target[i] for i in train_indices]
        print(len(train_targets))
    
        print "Build test set"
        author_paper_ids = [all_features[i][:2] for i in test_indices]
        test_features = [list(map(float, all_features[i][2:])) for i in test_indices]
        if scaling:
            test_features = scaler.transform(test_features)
        test_targets  = [all_target[i] for i in test_indices]
        test_ground_truth = [row for row in ground_truth if row[0] in authors_test]
        test_ground_truth.sort()
    
        print("Training the Classifier")
        RF = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=10, min_samples_split=10, random_state=1)
        SVM = svm.SVC(cache_size=1000, verbose=True)
        knn = neighbors.KNeighborsClassifier()
        GBM = GradientBoostingClassifier(n_estimators=100, verbose=1, min_samples_split=10, random_state=1)
        log = linear_model.LogisticRegression(random_state=1)
        classifier = log 
        classifier.fit(train_features, train_targets)
        if classifier is RF or classifier is GBM:
            print("Feature importance", classifier.feature_importances_)

        print("Making predictions")
        predictions = classifier.predict_proba(test_features)[:,1]
        predictions = list(predictions)
    
        author_predictions = defaultdict(list)
        paper_predictions = {}
    
        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[str(a_id)].append((pred,str(p_id)))
    
        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]
       
        predicted = sorted(paper_predictions.items())

        print([x[0] for x in predicted[:5]])
Example #7
def main():
    print("Getting features for deleted papers from the database")
    #features_deleted = data_io.get_features_db("TrainDeleted")
    features_deleted = data_io.get_precomputed_features("DeletedFeaturester")
    print("Getting features for confirmed papers from the database")
    #features_conf = data_io.get_features_db("TrainConfirmed")
    features_conf = data_io.get_precomputed_features("ConfirmedFeaturester")
    print("Getting features for deleted papers from the database")
    #valid_features_deleted = data_io.get_features_db("ValidDeleted")
    valid_features_deleted = data_io.get_precomputed_features(
        "DeletedValidFeaturester")
    print("Getting features for confirmed papers from the database")
    #valid_features_conf = data_io.get_features_db("ValidConfirmed")
    valid_features_conf = data_io.get_precomputed_features(
        "ConfirmedValidFeaturester")

    all_features = [
        x for x in features_deleted + features_conf + valid_features_deleted +
        valid_features_conf
    ]
    all_target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] \
          + [0 for x in range(len(valid_features_deleted))] + [1 for x in range(len(valid_features_conf))]

    print "Load ground truth"
    valid_confirmed_data = [
        row for row in csv.reader(open("ValidSolution.csv"))
    ]
    valid_confirmed_papers = [(row[0], row[1].split())
                              for row in valid_confirmed_data[1:]]
    valid_confirmed_papers.sort()
    train_confirmed_data = [row for row in csv.reader(open("Train.csv"))]
    train_confirmed_papers = [(row[0], row[1].split())
                              for row in train_confirmed_data[1:]]
    train_confirmed_papers.sort()
    ground_truth = valid_confirmed_papers + train_confirmed_papers
    authors = [row[0] for row in ground_truth]

    scaling = False
    mp = []
    for k in range(10):
        # Now split the authors any way you like.
        print("Split data")
        authors_train, authors_test = cross_validation.train_test_split(
            authors, test_size=0.1, random_state=k)

        print "Build training set"
        train_indices = [
            i for (i, x) in enumerate(all_features)
            if str(x[0]) in authors_train
        ]
        test_indices = [
            i for (i, x) in enumerate(all_features)
            if str(x[0]) in authors_test
        ]
        train_features = [
            list(map(float, all_features[i][2:])) for i in train_indices
        ]
        print(len(train_features))
        print(train_features[0])
        if scaling:
            scaler = preprocessing.StandardScaler().fit(train_features)
            train_features = scaler.transform(train_features)
        train_targets = [all_target[i] for i in train_indices]
        print(len(train_targets))

        print "Build test set"
        author_paper_ids = [all_features[i][:2] for i in test_indices]
        test_features = [list(map(float, all_features[i][2:])) for i in test_indices]
        if scaling:
            test_features = scaler.transform(test_features)
        test_targets = [all_target[i] for i in test_indices]
        test_ground_truth = [
            row for row in ground_truth if row[0] in authors_test
        ]
        test_ground_truth.sort()

        print("Training the Classifier")
        RF = RandomForestClassifier(n_estimators=100,
                                    verbose=1,
                                    n_jobs=10,
                                    min_samples_split=10,
                                    random_state=1)
        SVM = svm.SVC(cache_size=1000, verbose=True)
        knn = neighbors.KNeighborsClassifier()
        GBM = GradientBoostingClassifier(n_estimators=100,
                                         verbose=1,
                                         min_samples_split=10,
                                         random_state=1)
        log = linear_model.LogisticRegression(random_state=1)
        classifier = log
        classifier.fit(train_features, train_targets)
        if classifier is RF or classifier is GBM:
            print("Feature importance", classifier.feature_importances_)

        print("Making predictions")
        predictions = classifier.predict_proba(test_features)[:, 1]
        predictions = list(predictions)

        author_predictions = defaultdict(list)
        paper_predictions = {}

        for (a_id, p_id), pred in zip(author_paper_ids, predictions):
            author_predictions[str(a_id)].append((pred, str(p_id)))

        for author_id in sorted(author_predictions):
            paper_ids_sorted = sorted(author_predictions[author_id],
                                      reverse=True)
            paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

        predicted = sorted(paper_predictions.items())

        print([x[0] for x in predicted[:5]])
        print([x[0] for x in test_ground_truth[:5]])
        mp.append(
            metrics.mapk([row[1] for row in test_ground_truth],
                         [row[1] for row in predicted], 10000))
        print(mp[k])

    print(numpy.mean(mp))
    print(numpy.std(mp))
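
The splitting loop in the cross-validation examples goes through sklearn's old cross_validation module, which was removed in scikit-learn 0.20. The same author-level split works with sklearn.model_selection; a minimal sketch with a made-up author list:

from sklearn.model_selection import train_test_split

# Hypothetical author ids, just to show the call; in the examples above the
# list comes from the ground-truth CSV files.
authors = ["100", "101", "102", "103", "104", "105", "106", "107", "108", "109"]
authors_train, authors_test = train_test_split(authors, test_size=0.1, random_state=0)
print(len(authors_train), len(authors_test))  # 9 1
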