def fit_models(x, y, rf=True, use_svm=False, logit=False):
    """Fit the requested classifiers on an already-built feature matrix.

    Factored out of train_models so that cross_validation can fit on the
    train/validation split it builds itself.  The flag is named use_svm
    rather than svm so it does not shadow the sklearn svm module.
    """
    models = {}
    if rf:
        rfModel = ensemble.RandomForestClassifier(
            n_estimators=100,
            ## criterion='entropy',
            verbose=1,
            n_jobs=-1,
            ## oob_score=True,
            ## min_samples_leaf=1,
            )
        rfModel.fit(x, y)
        models['rf'] = rfModel
    if use_svm:
        # probability=True is required for the predict_proba calls made later
        svmModel = svm.SVC(kernel='rbf', probability=True)
        svmModel.fit(x, y)
        models['svm'] = svmModel
    if logit:
        logitModel = linear_model.LogisticRegression()
        logitModel.fit(x, y)
        models['logit'] = logitModel
    return models


def train_models(columns, rf=True, use_svm=False, logit=False):
    # Prepare the data: keep only the requested columns, plus the TF
    # (true/false) target column
    dataCols = dict(
        [(colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS
         if colName in columns or colName == 'TF']
        )
    data, colnames = create_matrix(dataCols)
    cimt = colnames.index('TF')
    y = data[:, cimt]
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]
    print >> sys.stderr, '\n>> Training model with cols:', colnames
    print >> sys.stderr, x

    # Fit the models
    models = fit_models(x, y, rf=rf, use_svm=use_svm, logit=logit)
    return {'models' : models,
            'colnames' : [c for c in colnames if c != 'TF']}
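
# A hedged smoke test for fit_models on synthetic data (hypothetical helper,
# not part of the pipeline; the real matrices come from create_matrix).
# Run it by hand to check that the requested models come back fitted.
def _smoke_test_fit_models():
    rng = np.random.RandomState(0)
    xs = rng.rand(50, 3)
    ys = (xs[:, 0] > 0.5).astype(int)
    fitted = fit_models(xs, ys, logit=True)
    print >> sys.stderr, sorted(fitted.keys())  # prints ['logit', 'rf']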
def cross_validation(logit=False):
    # Split authors into train/validation sets
    confirmed, deleted = get_train()
    aids = confirmed.keys()
    shuffle(aids)
    cutPoint = int(len(aids) * F)
    aidsTrain = aids[:cutPoint]
    aidsValid = aids[cutPoint:]

    # Prepare the training data
    dataCols = dict(
        [(colName, read_data_col('%s.train.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS]
        )
    data, colnames = create_matrix(dataCols, exclude_aids=aidsValid)
    cimt = colnames.index('TF')
    ytrain = data[:, cimt]
    xtrain = data[:, [c for c in range(len(colnames)) if c != cimt]]

    # Validation data
    data, colnames2, pairs = create_matrix(dataCols, exclude_aids=aidsTrain,
                                           return_pairs=True)
    if colnames2 != colnames:
        raise ValueError('column mismatch between train and validation matrices')
    yvalid = data[:, cimt]
    xvalid = data[:, [c for c in range(len(colnames)) if c != cimt]]
    print >> sys.stderr, '==> %s train; %s valid <==' % (str(xtrain.shape),
                                                         str(xvalid.shape))

    # Train the models on the training fold only
    models = fit_models(xtrain, ytrain, logit=logit)

    # Calculate the performance of each algorithm
    performance = {}
    for algorithm in models:
        print >> sys.stderr, '\n\n--> %s <--\n' % algorithm
        print >> sys.stderr, models[algorithm]

        # make the predictions
        predBin = models[algorithm].predict(xvalid)
        pred = models[algorithm].predict_proba(xvalid)
        print >> sys.stderr, ''
        print >> sys.stderr, yvalid
        print >> sys.stderr, predBin
        print >> sys.stderr, pred

        # extract author-paper scores
        decorated = {}
        for n in range(len(pairs)):
            aid, pid = pairs[n]
            score = pred[n, 0]
            try:
                decorated[aid].append((score, pid, yvalid[n]))
            except KeyError:
                decorated[aid] = [(score, pid, yvalid[n])]

        # get the MAP: papers are ranked by P(class 0) ascending, so the
        # papers most likely to be confirmed come first
        MAPs = []
        for aid in decorated:
            sortedPapers = decorated[aid]
            sortedPapers.sort()
            ntrue, ntot, MAPTerms = 0, 0, []
            for s, p, tf in sortedPapers:
                ntot += 1
                if tf == 1:
                    ntrue += 1
                    MAPTerms.append(float(ntrue) / float(ntot))
            MAPs.append(np.mean(MAPTerms))

        # output results
        performance[algorithm] = np.mean(MAPs)
        print >> sys.stderr, '\n>> %s: %f' % (algorithm, np.mean(MAPs))

    # Done
    print >> sys.stderr, '\n'
    return performance
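
# Worked example of the per-author average-precision term computed above
# (hypothetical helper, for illustration only).  For papers ranked [1, 0, 1]
# (1 = confirmed), precision is taken at each confirmed paper: 1/1 at rank 1
# and 2/3 at rank 3, so AP = (1.0 + 2/3) / 2, about 0.833.
def _example_average_precision(ranked_tf):
    ntrue, terms = 0, []
    for ntot, tf in enumerate(ranked_tf, start=1):
        if tf == 1:
            ntrue += 1
            terms.append(float(ntrue) / float(ntot))
    return np.mean(terms)

assert abs(_example_average_precision([1, 0, 1]) - (1.0 + 2.0 / 3.0) / 2) < 1e-9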
def make_submission():
    # Read validation file
    validation = get_valid()

    # Read scores for validation author-paper pairs
    validDataCols = dict(
        [(colName, read_data_col('%s.valid.dat' % fileName, valCol=valCol))
         for colName, fileName, valCol in COLUMNS
         if colName != 'TF']
        )

    # Train the models and prepare the validation sets
    models, xValid, pairsAP = {}, {}, {}
    for aid in validation:
        for pid in validation[aid]:
            # the available columns for this author-paper pair
            try:
                cols = [col for col in validDataCols
                        if aid in validDataCols[col]
                        and pid in validDataCols[col][aid]]
            except Exception:
                # col leaks from the comprehension (Python 2), so this
                # shows the last column checked before the failure
                print aid, pid, aid in validDataCols[col]
                raise
            cols.sort()

            # train the model if necessary (i.e. if this is the first
            # time we see exactly this combination of columns)
            modelName = '_'.join(cols)
            if modelName not in models:
                models[modelName] = train_models(cols)
                xValid[modelName] = []
                pairsAP[modelName] = []

            # validation columns
            xValid[modelName].append(
                np.array([validDataCols[col][aid][pid]
                          for col in models[modelName]['colnames']])
                )
            pairsAP[modelName].append((aid, pid))

    # Make the predictions
    decorated = {}
    for modelName in models:
        print >> sys.stderr, '\n>> Making predictions for cols:', modelName

        # make all predictions for this model
        print >> sys.stderr, np.array(xValid[modelName])
        print >> sys.stderr, np.array(xValid[modelName]).shape
        pred = models[modelName]['models']['rf'].predict_proba(
            np.array(xValid[modelName])
            )
        print >> sys.stderr, pred

        # extract author-paper scores
        for n in range(len(pairsAP[modelName])):
            aid, pid = pairsAP[modelName][n]
            score = pred[n, 0]
            try:
                decorated[aid].append((score, pid))
            except KeyError:
                decorated[aid] = [(score, pid)]

    # Create the submission file; compute the timestamp once so the .csv
    # and .info files are guaranteed to share the same name
    stamp = strftime("%Y%m%d_%H:%M:%S", localtime())
    outf = open('submit_%s.csv' % stamp, 'w')
    print >> outf, 'AuthorId,PaperIds'
    for aid in decorated:
        sortedPapers = decorated[aid]
        sortedPapers.sort()
        print >> outf, \
            str(aid) + ',' + ' '.join([str(pid) for s, pid in sortedPapers])
    outf.close()

    # Create the info file describing which columns were used
    outf = open('submit_%s.info' % stamp, 'w')
    print >> outf, '\n'.join([str(c) for c in COLUMNS])
    outf.close()
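
# Submission layout implied by make_submission (the ids below are made up):
#
#   AuthorId,PaperIds
#   123,4567 89 1011
#
# one line per author, with the paper ids space-separated in ranked order
# (best candidate first, since pairs are sorted by P(class 0) ascending).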
COLUMNS = (
    # (colName, fileName, valCol) triples.  NOTE: train_models() and
    # make_submission() format these names as '%s.train.dat' / '%s.valid.dat',
    # which assumes bare prefixes (e.g. '../nvenue'), while the __main__
    # block below uses them verbatim; the two call styles are inconsistent.
    ('nconference', '../nvenue.train.dat', 4),
    ('name', '../name.train.dat', 3),
    ## ('nameinit', '../name.train.dat', 4),
    # ('nname', '../nname.train.dat', 3),
    ('npapers', '../npapers.train.dat', 3),
    ('nauthors', '../nauthors.train.dat', 3),
    # ('coauthors', '../coauthors_diff.train.dat', 3),
    ## ('zcoauthors', '../coauthors_diff.train.dat', 4),
    ('affiliation', '../affiliation.train.dat', 3),
    ('year', '../year.train.dat', 3),
    ('nvalidated', '../nvalidated.train.dat', 3),
    ('sumcoauthors', '../sumcoauthors.train.dat', 3),
    )


if __name__ == '__main__':
    dataCols = dict([(colName, read_data_col(fileName, valCol=valCol))
                     for colName, fileName, valCol in COLUMNS])
    data, colnames = create_matrix(dataCols)
    print colnames
    cimt = colnames.index('TF')
    y = data[:, cimt]
    x = data[:, [c for c in range(len(colnames)) if c != cimt]]

    # Random train/test split over rows
    N = len(y)
    ntrain = int(N * F)
    ttindices = range(N)
    shuffle(ttindices)
    yTrain = y[ttindices[:ntrain], :]
    xTrain = x[ttindices[:ntrain], :]
    # ntrain: rather than ntrain+1:, which would silently drop one row
    yTest = y[ttindices[ntrain:], :]
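    # A minimal sketch of finishing the split and scoring the held-out rows.
    # xTest mirrors the three symmetric assignments above; the fit/score
    # lines are an assumption for illustration, not part of the original run.
    xTest = x[ttindices[ntrain:], :]
    fitted = fit_models(xTrain, np.asarray(yTrain).ravel())
    print fitted['rf'].score(xTest, np.asarray(yTest).ravel())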