def Step2(): print '================================================================' print 'Step 3/6: Generating features' print '================================================================' trainLabels = [] trainFeatures = [] validFeatures = [] print 'Generating features for train data...' for trainRow in trainData: trainLabels.append(trainRow['label']) trainFeatures.append( GenerateFeatures.allFeatures(trainRow['authorId'], trainRow['paperId'])) print 'Generating features for validation data...' for validRow in validData: validFeatures.append( GenerateFeatures.allFeatures(validRow['authorId'], validRow['paperId'])) print '================================================================' print 'Step 4/6: Training models' print '================================================================' rfClassifier = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=-1, min_samples_split=10, random_state=1) rfClassifier.fit(trainFeatures, trainLabels) gbClassifier = GradientBoostingClassifier(n_estimators=100, verbose=1, learning_rate=1.0, max_depth=3, random_state=0) gbClassifier.fit(trainFeatures, trainLabels) print '================================================================' print 'Step 5/6: Applying models' print '================================================================' rfPredictions = list(rfClassifier.predict_proba(validFeatures)[:, 1]) gbPredictions = list(gbClassifier.predict_proba(validFeatures)[:, 1]) print 'Taking the weighted average of the two models' predictions = [] for prediction in zip(rfPredictions, gbPredictions): # Take the weighted average of the two models predictions.append(0.5 * prediction[0] + 0.5 * prediction[1]) print '================================================================' print 'Step 6/6: Saving results and calculate performance' print '================================================================' IO.writePredictions(predictions, validData) evaluate.calculate_map()
def test_df_nosuchpath(self):
    """df on a non-existent path must report TSI_FAILED."""
    path = "/x_y_z"
    msg = """#TSI_DF
#TSI_FILE %s
ENDOFMESSAGE
""" % path
    # Wrap the message as a text stream, the way the TSI reads control input.
    control_in = io.TextIOWrapper(
        io.BufferedReader(io.BytesIO(msg.encode("UTF-8"))))
    conn = MockConnector.MockConnector(control_in, None, None, None,
                                       self.LOG)
    IO.df(msg, conn, {}, self.LOG)
    reply = conn.control_out.getvalue()
    self.assertTrue("TSI_FAILED" in reply)
    print(reply)
def Step2(): print '================================================================' print 'Step 3/6: Generating features' print '================================================================' trainLabels = [] trainFeatures = [] validFeatures = [] print 'Generating features for train data...' for trainRow in trainData: trainLabels.append(trainRow['label']) trainFeatures.append(GenerateFeatures.allFeatures(trainRow['authorId'], trainRow['paperId'])) print 'Generating features for validation data...' for validRow in validData: validFeatures.append(GenerateFeatures.allFeatures(validRow['authorId'], validRow['paperId'])) print '================================================================' print 'Step 4/6: Training models' print '================================================================' rfClassifier = RandomForestClassifier(n_estimators=100, verbose=1, n_jobs=-1, min_samples_split=10, random_state=1) rfClassifier.fit(trainFeatures, trainLabels) gbClassifier = GradientBoostingClassifier(n_estimators=100, verbose=1, learning_rate=1.0, max_depth=3, random_state=0) gbClassifier.fit(trainFeatures, trainLabels) print '================================================================' print 'Step 5/6: Applying models' print '================================================================' rfPredictions = list(rfClassifier.predict_proba(validFeatures)[:,1]) gbPredictions = list(gbClassifier.predict_proba(validFeatures)[:,1]) print 'Taking the weighted average of the two models' predictions = [] for prediction in zip(rfPredictions, gbPredictions): # Take the weighted average of the two models predictions.append(0.5 * prediction[0] + 0.5 * prediction[1]) print '================================================================' print 'Step 6/6: Saving results and calculate performance' print '================================================================' IO.writePredictions(predictions, validData) evaluate.calculate_map()
def test_ls(self):
    """ls on the current working directory succeeds and emits a listing."""
    path = os.getcwd()
    msg = """#TSI_LS
#TSI_FILE %s
#TSI_LS_MODE N
ENDOFMESSAGE
""" % path
    # Feed the control message to the handler as a buffered text stream.
    control_in = io.TextIOWrapper(
        io.BufferedReader(io.BytesIO(msg.encode("UTF-8"))))
    conn = MockConnector.MockConnector(control_in, None, None, None,
                                       self.LOG)
    IO.ls(msg, conn, {}, self.LOG)
    reply = conn.control_out.getvalue()
    self.assertFalse("TSI_FAILED" in reply)
    self.assertTrue("START_LISTING" in reply)
    self.assertTrue("END_LISTING" in reply)
def Step1(): print '================================================================' print 'KDD Cup 2013 - Track 1' print 'Model by Bart Jeukendrup' print '================================================================' print 'Step 1/6: Reading CSV files to memory & pre-process data' print '================================================================' # Read all the data from the files or the memory cache IO.readAuthors() IO.readVenues() IO.readPapers() IO.readAuthorPaper() IO.readTrainData() IO.readValidData() # Preprocess string values (strip HTML, lower, character encoding) __builtin__.authors = PreProcess.authors(__builtin__.authors) __builtin__.papers = PreProcess.papers(__builtin__.papers) __builtin__.venues = PreProcess.venues(__builtin__.venues) _builtin__.paperauthor = PreProcess.paperauthors(__builtin__.paperauthor) print '================================================================' print 'Step 2/6: Initial feature calculation' print '================================================================' # Calculate adjecency and probability matrixes of HeteSim InitialCalculation.calculate()
def test_list(self):
    """Smoke-test list_directory on the current working directory."""
    cwd = os.getcwd()
    conn = MockConnector.MockConnector(None, None, None, None, self.LOG)
    IO.list_directory(conn, cwd, False)
    # No assertion on content here; just show what was written.
    print(conn.control_out.getvalue())
def test_stat_file(self):
    """get_info on /tmp/ reports a directory entry for /tmp."""
    info = IO.get_info("/tmp/")
    # "DRWX" marks a directory with the expected permission string.
    self.assertTrue("DRWX" in info)
    self.assertTrue("/tmp" in info)
from __future__ import division from docopt import docopt from os.path import isfile, abspath, dirname import lib.IO as IO import lib.bless as bless if __name__ == '__main__': main_dir = dirname(__file__) #print data_dir args = docopt(__doc__) print args vcf_f = args['--vcf'] region_arg = args['--region'] out_f = args['--out'] # Deal with regions ... region_f = None region_list = [] if isfile(region_arg): region_f = abspath(region_arg) region_list = IO.parse_list(region_f) else: region_list.append(region_arg) bless.region_list(region_list)