Example #1
0
def train_models(columns, rf=True, svm=False, logit=False):
    """Train the requested classifiers on the selected training columns.

    Reads '<fileName>.train.dat' for every entry of COLUMNS whose name is
    in `columns` (the 'TF' column is always read, because it is used as the
    target), assembles the columns with create_matrix and fits one model
    per enabled flag.

    Arguments:
      columns -- collection of column names to use as features
      rf      -- if true, fit a random forest (stored under key 'rf')
      svm     -- if true, fit an RBF-kernel SVC (stored under key 'svm')
      logit   -- if true, fit a logistic regression (stored under 'logit')

    Returns a dict with two keys:
      'models'   -- dict mapping the algorithm key to the fitted model
      'colnames' -- the feature column names (everything but 'TF'), in the
                    same order as the columns of the training matrix
    """
    # Prepare the data
    dataCols = dict(
        (colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
        for colName, fileName, valCol in COLUMNS
        if colName in columns or colName == 'TF'
        )
    data, colnames = create_matrix(dataCols)
    cimt = colnames.index('TF')
    featureIdx = [c for c in range(len(colnames)) if c != cimt]
    y = data[:, cimt]
    x = data[:, featureIdx]

    print >> sys.stderr, '\n>> Training model with cols:', colnames
    print >> sys.stderr, x

    # Fit the models
    models = {}
    if rf:
        rfModel = ensemble.RandomForestClassifier(
            n_estimators=100,
            ## criterion='entropy',
            verbose=1,
            n_jobs=-1,
            ## oob_score=True,
            ## min_samples_leaf=1,
            )
        rfModel.fit(x, y)
        models['rf'] = rfModel

    if svm:
        # The boolean `svm` argument shadows the scikit-learn `svm` module
        # inside this function, so the module has to be re-imported under
        # another name; calling svm.SVC here would hit the bool instead.
        from sklearn import svm as svmModule
        svmModel = svmModule.SVC(kernel='rbf')
        svmModel.fit(x, y)
        models['svm'] = svmModel

    if logit:
        logitModel = linear_model.LogisticRegression()
        logitModel.fit(x, y)
        models['logit'] = logitModel

    return {
        'models': models,
        'colnames': [colnames[c] for c in featureIdx],
        }
Example #2
0
def cross_validation(logit=False):
    """Estimate the mean average precision (MAP) of each model on
    held-out authors.

    The authors returned by get_train() are shuffled; the first fraction F
    (module-level constant) is used for training and the rest for
    validation.  For every model returned by train_models, the validation
    author-paper pairs are ranked per author by ascending value of the
    first predict_proba column, and the average precision of the rows with
    TF == 1 is computed.

    Arguments:
      logit -- forwarded to train_models

    Returns a dict mapping each algorithm name to its MAP over the
    validation authors.  Authors without any TF == 1 row are skipped,
    since their average precision is undefined.

    NOTE(review): train_models is called here as
    train_models(xtrain, ytrain, logit=...) and its result is used as a
    mapping from algorithm name to fitted model -- confirm this matches the
    train_models in scope.
    """
    # Split authors in train/validation
    confirmed, _ = get_train()
    aids = list(confirmed)
    shuffle(aids)
    cutPoint = int(len(aids) * F)
    aidsTrain = aids[:cutPoint]
    aidsValid = aids[cutPoint:]

    # Prepare the training data
    dataCols = dict(
        (colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
        for colName, fileName, valCol in COLUMNS
        )
    data, colnames = create_matrix(dataCols, exclude_aids=aidsValid)
    cimt = colnames.index('TF')
    featureIdx = [c for c in range(len(colnames)) if c != cimt]
    ytrain = data[:, cimt]
    xtrain = data[:, featureIdx]
    # validation
    data, colnames2, pairs = create_matrix(dataCols,
                                           exclude_aids=aidsTrain,
                                           return_pairs=True)
    if colnames2 != colnames:
        # The column indices computed above are reused for the validation
        # matrix, so both matrices must have the same layout.
        # NOTE(review): WTF is not defined in the visible source; if it is
        # not defined elsewhere this surfaces as a NameError -- consider
        # raising a ValueError with a message instead.
        raise WTF
    yvalid = data[:, cimt]
    xvalid = data[:, featureIdx]

    print >> sys.stderr, '==> %s train; %s valid <==' % (str(xtrain.shape),
                                                         str(xvalid.shape))

    # Train the model
    models = train_models(xtrain, ytrain, logit=logit)

    # Calculate performance of each algorithm
    performance = {}
    for algorithm in models:
        model = models[algorithm]
        print >> sys.stderr, '\n\n--> %s <--\n' % algorithm
        print >> sys.stderr, model

        # make the predictions
        predBin = model.predict(xvalid)
        pred = model.predict_proba(xvalid)
        print >> sys.stderr, ''
        print >> sys.stderr, yvalid
        print >> sys.stderr, predBin
        print >> sys.stderr, pred

        # extract author-paper scores, grouped by author
        decorated = {}
        for n, (aid, pid) in enumerate(pairs):
            decorated.setdefault(aid, []).append((pred[n, 0], pid, yvalid[n]))

        # get the MAP
        MAPs = []
        for aid in decorated:
            # ascending score: lowest first-column probability ranks first
            sortedPapers = sorted(decorated[aid])
            ntrue, MAPTerms = 0, []
            for ntot, (s, p, tf) in enumerate(sortedPapers, 1):
                if tf == 1:
                    ntrue += 1
                    MAPTerms.append(float(ntrue) / float(ntot))
            # An author without any true paper has no defined average
            # precision; np.mean([]) would be NaN and poison the overall
            # mean, so such authors are left out.
            if MAPTerms:
                MAPs.append(np.mean(MAPTerms))

        # output results
        meanMAP = np.mean(MAPs) if MAPs else float('nan')
        performance[algorithm] = meanMAP
        print >> sys.stderr, '\n>> %s: %f' % (algorithm, meanMAP)

    # Done
    print >> sys.stderr, '\n'
    return performance
Example #3
0
def make_submission():
    """Score the validation author-paper pairs and write a submission.

    For every author-paper pair returned by get_valid(), the set of
    columns that have a value for that pair is determined.  One set of
    models is trained (via train_models) per distinct set of available
    columns, and the pair is scored with the 'rf' model of its set.

    Two files are written to the current directory, sharing one timestamp:
      submit_<timestamp>.csv  -- 'AuthorId,PaperIds' header followed by one
                                 line per author, paper ids ordered by
                                 ascending first predict_proba column
      submit_<timestamp>.info -- one line per entry of COLUMNS

    Raises ValueError if a column's data cannot be queried for a pair.
    """
    # Read validation file
    validation = get_valid()

    # Read scores for validation author-paper pairs
    validDataCols = dict(
        (colName, read_data_col('%s.valid.dat' % fileName, valCol=valCol))
        for colName, fileName, valCol in COLUMNS
        if colName != 'TF'
        )

    # Train the models and prepare the validation sets
    models, xValid, pairsAP = {}, {}, {}
    for aid in validation:
        for pid in validation[aid]:
            # the available columns for this author-paper pair
            cols = []
            for col in validDataCols:
                colData = validDataCols[col]
                try:
                    if aid in colData and pid in colData[aid]:
                        cols.append(col)
                except (AttributeError, TypeError):
                    # report which column/pair is malformed instead of
                    # hiding the cause behind a bare ValueError
                    raise ValueError(
                        'cannot look up author %r, paper %r in column %r'
                        % (aid, pid, col))
            cols.sort()
            # train the model if necessary (i.e. if this is the first
            # time we see exactly these columns)
            modelName = '_'.join(cols)
            if modelName not in models:
                models[modelName] = train_models(cols)
                xValid[modelName] = []
                pairsAP[modelName] = []
            # validation columns, in the order the model was trained with
            xValid[modelName].append(
                np.array([validDataCols[col][aid][pid]
                          for col in models[modelName]['colnames']])
                )
            pairsAP[modelName].append((aid, pid))

    # Make the predictions
    decorated = {}
    for modelName in models:
        print >> sys.stderr, '\n>> Making predictions for cols:', modelName
        # make all predictions for this model
        xMatrix = np.array(xValid[modelName])
        print >> sys.stderr, xMatrix
        print >> sys.stderr, xMatrix.shape
        pred = models[modelName]['models']['rf'].predict_proba(xMatrix)
        print >> sys.stderr, pred
        # extract author-paper scores, grouped by author
        for n, (aid, pid) in enumerate(pairsAP[modelName]):
            decorated.setdefault(aid, []).append((pred[n, 0], pid))

    # Use a single timestamp so that the .csv and the .info file of one
    # run always carry the same name.
    stamp = strftime("%Y%m%d_%H:%M:%S", localtime())

    # Create the file
    with open('submit_%s.csv' % stamp, 'w') as outf:
        print >> outf, 'AuthorId,PaperIds'
        for aid in decorated:
            sortedPapers = sorted(decorated[aid])
            print >> outf, \
                  str(aid) + ',' + ' '.join([str(pid)
                                             for s, pid in sortedPapers])

    # Create the info file
    with open('submit_%s.info' % stamp, 'w') as outf:
        print >> outf, '\n'.join([str(c) for c in COLUMNS])
0
    ('nconference', '../nvenue.train.dat', 4),
    ('name', '../name.train.dat', 3),
    ## ('nameinit', '../name.train.dat', 4),
#    ('nname', '../nname.train.dat', 3),
    ('npapers', '../npapers.train.dat', 3),
    ('nauthors', '../nauthors.train.dat', 3),
#    ('coauthors', '../coauthors_diff.train.dat', 3),
    ## ('zcoauthors', '../coauthors_diff.train.dat', 4),
    ('affiliation', '../affiliation.train.dat', 3),
    ('year', '../year.train.dat', 3),
    ('nvalidated', '../nvalidated.train.dat', 3),
    ('sumcoauthors', '../sumcoauthors.train.dat', 3),
    )

if __name__ == '__main__':
    dataCols = dict([(colName, read_data_col(fileName, valCol=valCol))
                     for colName, fileName, valCol in COLUMNS])
    data, colnames = create_matrix(dataCols)
    print colnames
    
    cimt = colnames.index('TF')
    y = data[:, cimt]
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]

    N = len(y)
    ntrain = int(N * F)
    ttindices = range(N)
    shuffle(ttindices)
    yTrain = y[ttindices[:ntrain], :]
    xTrain = x[ttindices[:ntrain], :]
    yTest = y[ttindices[ntrain+1:], :]