def train_models(columns, rf=True, svm=False, logit=False):
    """Train one or more classifiers on the configured training columns.

    Parameters:
        columns -- collection of column names to use as features
                   (the 'TF' target column is always loaded).
        rf, svm, logit -- booleans selecting which models to fit
                          (random forest, SVM, logistic regression).

    Returns a dict:
        {'models': {algorithm_name: fitted estimator},
         'colnames': feature column names (excluding 'TF')}.
    """
    # Prepare the data: load each configured column that was requested,
    # plus the 'TF' target column.
    dataCols = dict(
        [(colName,
          read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS
         if colName in columns or colName == 'TF']
        )
    data, colnames = create_matrix(dataCols)
    cimt = colnames.index('TF')
    y = data[:, cimt]
    # Features: every column except the target.
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]

    print >> sys.stderr, '\n>> Training model with cols:', colnames
    print >> sys.stderr, x

    # Fit the models
    models = {}
    if rf:
        rfModel = ensemble.RandomForestClassifier(
            n_estimators=100,
            verbose=1,
            n_jobs=-1,  # use all cores
            )
        rfModel.fit(x, y)
        models['rf'] = rfModel

    if svm:
        # BUG FIX: the boolean parameter 'svm' shadows the sklearn 'svm'
        # module, so the original 'svm.SVC(...)' raised AttributeError
        # whenever this branch ran.  Re-import the module under a local
        # alias (the keyword parameter name is kept for callers).
        from sklearn import svm as svm_module
        svmModel = svm_module.SVC(kernel='rbf')
        svmModel.fit(x, y)
        models['svm'] = svmModel

    if logit:
        logitModel = linear_model.LogisticRegression()
        logitModel.fit(x, y)
        models['logit'] = logitModel

    return {'models' : models, 'colnames' : [c for c in colnames if c!='TF']}
# Exemple #2 (scraped snippet-site marker; commented out -- not Python)
def cross_validation(logit=False):
    """Hold-out validation of the ranking models.

    Splits the confirmed authors into train/validation sets (fraction F
    for training), trains the selected models, and scores each one by
    the mean average precision (MAP) of its per-author paper ranking.

    Returns a dict {algorithm_name: mean MAP over validation authors}.
    """
    # Split authors in train/validation
    confirmed, deleted = get_train()
    aids = confirmed.keys()
    shuffle(aids)
    cutPoint = int(len(aids) * F)
    aidsTrain = aids[:cutPoint]
    aidsValid = aids[cutPoint:]

    # Prepare the training data
    dataCols = dict(
        [(colName,
          read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS]
        )
    data, colnames = create_matrix(dataCols, exclude_aids=aidsValid)
    cimt = colnames.index('TF')
    ytrain = data[:, cimt]
    xtrain = data[:, [c for c in range(len(colnames)) if c != cimt]]
    # Validation matrix; return_pairs gives the (aid, pid) pair for
    # each row so predictions can be grouped by author below.
    data, colnames2, pairs = create_matrix(dataCols,
                                           exclude_aids=aidsTrain,
                                           return_pairs=True)
    if colnames2 != colnames:
        # BUG FIX: the original 'raise WTF' raised an undefined name,
        # producing a NameError that masked the real problem.
        raise ValueError('train/validation column mismatch: %r != %r'
                         % (colnames, colnames2))
    yvalid = data[:, cimt]
    xvalid = data[:, [c for c in range(len(colnames)) if c != cimt]]

    print >> sys.stderr, '==> %s train; %s valid <==' % (str(xtrain.shape),
                                                         str(xvalid.shape))

    # Train the model
    # NOTE(review): this passes (xtrain, ytrain) positionally, but the
    # train_models() defined earlier in this file expects
    # (columns, rf, svm, logit) and returns a {'models', 'colnames'}
    # dict rather than a plain models dict -- confirm which
    # train_models() this snippet was written against.
    models = train_models(xtrain, ytrain, logit=logit)

    # Calculate performance of each algorithm
    performance = {}
    for algorithm in models:
        print >> sys.stderr, '\n\n--> %s <--\n' % algorithm
        print >> sys.stderr, models[algorithm]

        # make the predictions
        predBin = models[algorithm].predict(xvalid)
        pred = models[algorithm].predict_proba(xvalid)
        print >> sys.stderr, ''
        print >> sys.stderr, yvalid
        print >> sys.stderr, predBin
        print >> sys.stderr, pred

        # Group (score, pid, truth) triples by author.
        # score is pred[n, 0] -- presumably P(class 0), so an ascending
        # sort ranks likely-positive papers first -- TODO confirm.
        decorated = {}
        for n in range(len(pairs)):
            aid, pid = pairs[n]
            score = pred[n, 0]
            decorated.setdefault(aid, []).append((score, pid, yvalid[n]))

        # Average precision per author: walk the ranked papers and
        # average precision-at-k over the true positives.
        MAPs = []
        for aid in decorated:
            sortedPapers = decorated[aid]
            sortedPapers.sort()
            ntrue, ntot, MAPTerms = 0, 0, []
            for s, p, tf in sortedPapers:
                ntot += 1
                if tf == 1:
                    ntrue += 1
                    MAPTerms.append(float(ntrue)/float(ntot))
            # NOTE(review): an author with no positive papers yields an
            # empty MAPTerms list, so np.mean() returns nan and
            # propagates into the overall mean -- confirm intended.
            MAPs.append(np.mean(MAPTerms))

        # output results
        performance[algorithm] = np.mean(MAPs)
        print >> sys.stderr, '\n>> %s: %f' % (algorithm, np.mean(MAPs))

    # Done
    print >> sys.stderr, '\n'
    return performance
# Exemple #3 (scraped snippet-site marker; commented out -- not Python)
# NOTE(review): the opening of this tuple was lost in extraction; header
# reconstructed from the closing paren at the end and the
# (colName, fileName, valCol) unpacking in __main__ -- confirm.
COLUMNS = (
    ## ('nameinit', '../name.train.dat', 4),
#    ('nname', '../nname.train.dat', 3),
    ('npapers', '../npapers.train.dat', 3),
    ('nauthors', '../nauthors.train.dat', 3),
#    ('coauthors', '../coauthors_diff.train.dat', 3),
    ## ('zcoauthors', '../coauthors_diff.train.dat', 4),
    ('affiliation', '../affiliation.train.dat', 3),
    ('year', '../year.train.dat', 3),
    ('nvalidated', '../nvalidated.train.dat', 3),
    ('sumcoauthors', '../sumcoauthors.train.dat', 3),
    )

if __name__ == '__main__':
    dataCols = dict([(colName, read_data_col(fileName, valCol=valCol))
                     for colName, fileName, valCol in COLUMNS])
    data, colnames = create_matrix(dataCols)
    print colnames
    
    cimt = colnames.index('TF')
    y = data[:, cimt]
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]

    N = len(y)
    ntrain = int(N * F)
    ttindices = range(N)
    shuffle(ttindices)
    yTrain = y[ttindices[:ntrain], :]
    xTrain = x[ttindices[:ntrain], :]
    yTest = y[ttindices[ntrain+1:], :]
    xTest = x[ttindices[ntrain+1:], :]