Example #1
# scikit-learn classifiers used below; IO, GenerateFeatures and evaluate are
# project-local modules assumed importable from the surrounding codebase.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def Step2():
    print '================================================================'
    print 'Step 3/6: Generating features'
    print '================================================================'

    trainLabels = []
    trainFeatures = []
    validFeatures = []

    print 'Generating features for train data...'

    for trainRow in trainData:
        trainLabels.append(trainRow['label'])
        trainFeatures.append(
            GenerateFeatures.allFeatures(trainRow['authorId'],
                                         trainRow['paperId']))

    print 'Generating features for validation data...'
    for validRow in validData:
        validFeatures.append(
            GenerateFeatures.allFeatures(validRow['authorId'],
                                         validRow['paperId']))

    print '================================================================'
    print 'Step 4/6: Training models'
    print '================================================================'

    rfClassifier = RandomForestClassifier(n_estimators=100,
                                          verbose=1,
                                          n_jobs=-1,
                                          min_samples_split=10,
                                          random_state=1)
    rfClassifier.fit(trainFeatures, trainLabels)

    gbClassifier = GradientBoostingClassifier(n_estimators=100,
                                              verbose=1,
                                              learning_rate=1.0,
                                              max_depth=3,
                                              random_state=0)
    gbClassifier.fit(trainFeatures, trainLabels)

    print '================================================================'
    print 'Step 5/6: Applying models'
    print '================================================================'

    rfPredictions = list(rfClassifier.predict_proba(validFeatures)[:, 1])
    gbPredictions = list(gbClassifier.predict_proba(validFeatures)[:, 1])

    print 'Taking the weighted average of the two models'
    predictions = []
    for rfPrediction, gbPrediction in zip(rfPredictions, gbPredictions):
        # Take the weighted average of the two models
        predictions.append(0.5 * rfPrediction + 0.5 * gbPrediction)

    print '================================================================'
    print 'Step 6/6: Saving results and calculating performance'
    print '================================================================'

    IO.writePredictions(predictions, validData)
    evaluate.calculate_map()
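Note: the blending loop at the end of Step2 is equivalent to a vectorized weighted average. A minimal sketch with numpy (the array values are illustrative; the equal 0.5/0.5 weights match the average used in Step2):

import numpy as np

# Hypothetical positive-class probabilities from the two models
rf = np.array([0.2, 0.9, 0.6])
gb = np.array([0.4, 0.7, 0.8])

# Equal-weight blend: the vectorized form of the loop in Step2
predictions = 0.5 * rf + 0.5 * gb
print(predictions)  # [0.3 0.8 0.7]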
Example #2
    def test_df_nosuchpath(self):
        # Querying free disk space (df) for a nonexistent path must fail
        path = "/x_y_z"
        msg = """#TSI_DF
#TSI_FILE %s
ENDOFMESSAGE
""" % path
        control_in = io.TextIOWrapper(
            io.BufferedReader(io.BytesIO(msg.encode("UTF-8"))))
        conn = MockConnector.MockConnector(control_in, None, None, None,
                                           self.LOG)
        IO.df(msg, conn, {}, self.LOG)
        out = conn.control_out.getvalue()
        self.assertTrue("TSI_FAILED" in out)
        print(out)
Example #3
    def test_ls(self):
        # Listing the current working directory in normal ('N') mode should succeed
        path = os.getcwd()
        msg = """#TSI_LS
#TSI_FILE %s
#TSI_LS_MODE N
ENDOFMESSAGE
""" % path
        control_in = io.TextIOWrapper(
            io.BufferedReader(io.BytesIO(msg.encode("UTF-8"))))
        conn = MockConnector.MockConnector(control_in, None, None, None,
                                           self.LOG)
        IO.ls(msg, conn, {}, self.LOG)
        out = conn.control_out.getvalue()
        self.assertFalse("TSI_FAILED" in out)
        self.assertTrue("START_LISTING" in out)
        self.assertTrue("END_LISTING" in out)
Example #4
import __builtin__

# IO, PreProcess and InitialCalculation are project-local modules assumed
# importable from the surrounding codebase.
def Step1():
    print '================================================================'
    print 'KDD Cup 2013 - Track 1'
    print 'Model by Bart Jeukendrup'
    print '================================================================'
    print 'Step 1/6: Reading CSV files into memory & pre-processing data'
    print '================================================================'

    # Read all the data from the files or the memory cache
    IO.readAuthors()
    IO.readVenues()
    IO.readPapers()
    IO.readAuthorPaper()

    IO.readTrainData()
    IO.readValidData()

    # Preprocess string values (strip HTML, lower, character encoding)
    __builtin__.authors = PreProcess.authors(__builtin__.authors)
    __builtin__.papers = PreProcess.papers(__builtin__.papers)
    __builtin__.venues = PreProcess.venues(__builtin__.venues)
    __builtin__.paperauthor = PreProcess.paperauthors(__builtin__.paperauthor)

    print '================================================================'
    print 'Step 2/6: Initial feature calculation'
    print '================================================================'

    # Calculate adjacency and probability matrices for HeteSim
    InitialCalculation.calculate()
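The adjacency-to-probability step behind InitialCalculation.calculate() is not shown in this snippet. A minimal sketch of the general idea, row-normalizing an adjacency matrix into a transition-probability matrix (the function and variable names below are illustrative, not from the repository):

import numpy as np

def transition_probabilities(adjacency):
    # Divide each row by its sum so entry (i, j) becomes the probability of
    # stepping from node i to node j; all-zero rows are left untouched.
    adjacency = np.asarray(adjacency, dtype=float)
    row_sums = adjacency.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0  # avoid division by zero for isolated nodes
    return adjacency / row_sums

# Example: two authors, three papers (1 = author wrote paper)
author_paper = np.array([[1, 1, 0],
                         [0, 1, 1]])
print(transition_probabilities(author_paper))  # each row sums to 1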
Example #5
    def test_list(self):
        path = os.getcwd()
        conn = MockConnector.MockConnector(None, None, None, None, self.LOG)
        IO.list_directory(conn, path, False)
        out = conn.control_out.getvalue()
        print(out)
Example #6
    def test_stat_file(self):
        path = "/tmp/"
        info = IO.get_info(path)
        self.assertTrue("DRWX" in info)
        self.assertTrue("/tmp" in info)
Example #7
from __future__ import division
from docopt import docopt
from os.path import isfile, abspath, dirname
import lib.IO as IO
import lib.bless as bless

if __name__ == '__main__':
    main_dir = dirname(__file__)
    args = docopt(__doc__)
    print args
    vcf_f = args['--vcf']
    region_arg = args['--region']
    out_f = args['--out']

    # Deal with regions ...
    region_f = None
    region_list = []
    if isfile(region_arg):
        # The region argument is a file: read the regions it lists
        region_f = abspath(region_arg)
        region_list = IO.parse_list(region_f)
    else:
        # Otherwise treat the argument as a single region
        region_list.append(region_arg)
    bless.region_list(region_list)
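IO.parse_list above is project code whose implementation is not shown here. A plausible minimal equivalent, assuming the region file simply lists one region per line (this helper is a hypothetical sketch, not the repository's implementation):

def parse_list(path):
    # Read non-empty, stripped lines from a text file into a list
    with open(path) as fh:
        return [line.strip() for line in fh if line.strip()]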