Example #1
0
 def test_save_model(self):
     '''
     Fits the SVC on the training data, checks that it produces 20 
     predictions for the novel data and that save_model writes the 
     configured model file. The file is removed afterwards.
     '''
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.svc.predict(newX)), 20)
     modelfile = config.get('sklearn_SVC_test', 'modelfile')
     self.svc.save_model(modelfile)
     # Report a missing model file as a test failure, not as an error 
     # raised by os.remove
     self.assertTrue(os.path.isfile(modelfile))
     os.remove(modelfile)
Example #2
0
 def test_predict(self):
     '''
     Fits the random forest on the training data and checks that 
     predict returns 20 results for the novel data.
     '''
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.rf.predict(newX)), 20)
Example #3
0
 def test_decision_function(self):
     '''
     Fits the random forest on the training data and checks that 
     decision_function returns 20 values for the novel data.
     '''
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.rf.decision_function(newX)), 20)
Example #4
0
 def __init__(self):
     '''
     Constructor. Reads the submit_url, report_url and metadata_url 
     options from the 'pdfrateproxy' configuration section.
     '''
     section = 'pdfrateproxy'
     self.submit_url = config.get(section, 'submit_url')
     self.report_url = config.get(section, 'report_url')
     self.metadata_url = config.get(section, 'metadata_url')
Example #5
0
 def __init__(self):
     '''
     Constructor. Reads the submit_url, report_url and metadata_url 
     options from the 'pdfrateproxy' configuration section.
     '''
     section = 'pdfrateproxy'
     self.submit_url = config.get(section, 'submit_url')
     self.report_url = config.get(section, 'report_url')
     self.metadata_url = config.get(section, 'metadata_url')
 def test_save_model(self):
     '''
     Fits the random forest on the training data, checks that it 
     produces 20 predictions for the novel data and that save_model 
     writes the configured model file. The file is removed afterwards.
     '''
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.rf.predict(newX)), 20)
     modelfile = config.get('RandomForest_test', 'modelfile')
     self.rf.save_model(modelfile)
     # Report a missing model file as a test failure, not as an error 
     # raised by os.remove
     self.assertTrue(os.path.isfile(modelfile))
     os.remove(modelfile)
 def test_load_model(self):
     '''
     Checks that a RandomForest restored with load_model gives the 
     same predictions on the novel data as the fitted model it was 
     saved from. The model file is always removed afterwards.
     '''
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     prediction = self.rf.predict(newX)
     modelfile = config.get('RandomForest_test', 'modelfile')
     self.rf.save_model(modelfile)
     try:
         newrf = RandomForest()
         newrf.load_model(modelfile)
         self.assertTrue(numpy.array_equal(prediction, newrf.predict(newX)))
     finally:
         # Clean up even when loading or the comparison fails, so a 
         # failed run does not leave the model file behind
         os.remove(modelfile)
Example #8
0
 def test_load_model(self):
     '''
     Checks that an sklearn_SVC restored with load_model gives the 
     same predictions on the novel data as the fitted model it was 
     saved from. The model file is always removed afterwards.
     '''
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     prediction = self.svc.predict(newX)
     modelfile = config.get('sklearn_SVC_test', 'modelfile')
     self.svc.save_model(modelfile)
     try:
         newsvc = sklearn_SVC()
         newsvc.load_model(modelfile)
         self.assertTrue(numpy.array_equal(prediction, newsvc.predict(newX)))
     finally:
         # Clean up even when loading or the comparison fails, so a 
         # failed run does not leave the model file behind
         os.remove(modelfile)
Example #9
0
 def test_load_model(self):
     '''
     Checks that a RandomForest restored with load_model gives the 
     same predictions on the novel data as the fitted model it was 
     saved from. The model file is always removed afterwards.
     '''
     X, y, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(
         config.get('RandomForest_test', 'noveldata'))
     prediction = self.rf.predict(newX)
     modelfile = config.get('RandomForest_test', 'modelfile')
     self.rf.save_model(modelfile)
     try:
         newrf = RandomForest()
         newrf.load_model(modelfile)
         self.assertTrue(
             numpy.array_equal(prediction, newrf.predict(newX)))
     finally:
         # Clean up even when loading or the comparison fails, so a 
         # failed run does not leave the model file behind
         os.remove(modelfile)
Example #10
0
 def test_fit(self):
     '''
     Smoke test: fitting the random forest on the configured training 
     data must complete without raising.
     '''
     train_path = config.get('RandomForest_test', 'traindata')
     features, labels, _ = datasets.csv2numpy(train_path)
     self.rf.fit(features, labels)
Example #11
0
 def test_decision_function(self):
     '''
     Fits the SVC on the training data and checks that 
     decision_function returns 20 values for the novel data.
     '''
     X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
     self.svc.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.svc.decision_function(newX)), 20)
 def __init__(self):
     '''
     Constructor. Reads the query_dir and reply_dir options from the 
     'pdfratequeryscheduler' configuration section.
     '''
     section = 'pdfratequeryscheduler'
     self.query_dir = config.get(section, 'query_dir')
     self.reply_dir = config.get(section, 'reply_dir')
Example #13
0
 def test_decision_function(self):
     '''
     Fits the random forest on the training data and checks that 
     decision_function returns 20 values for the novel data.
     '''
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.rf.decision_function(newX)), 20)
def main():
    '''
    Runs one pass of the PDFrate query scheduler.

    Loads every '.json' query file found in the configured query 
    directory and selects the next query: if any query has a priority 
    above 0, only the queries with the highest priority are considered; 
    among the candidates the one with the smallest 'datetime' wins. 
    After sleeping for a random number of seconds (bounded by the 
    configured sleep_time) the report for the query's file is requested 
    through PdfrateProxy; the file is submitted if the proxy answers 
    'noreport'. When the query asks for metadata and the reply status is 
    'success', the metadata is fetched as well. The reply is written as 
    JSON into the configured reply directory under the query file's name 
    and the query file is deleted.

    Returns None. Returns early if no query could be loaded.
    '''
    sys.stdout.write('PDFrate Query Scheduler running! [{0}]\n'.format(datetime.datetime.now()))
    # No options are defined; parsing still provides -h/--help and 
    # rejects unexpected command-line arguments
    parser = ArgumentParser(description = 'PDFrate Query Scheduler')
    parser.parse_args()
    
    QUERY_DIR = config.get('pdfratequeryscheduler', 'query_dir')
    REPLY_DIR = config.get('pdfratequeryscheduler', 'reply_dir')
    
    queries = []
    max_priority = 0
    sys.stdout.write('Queries found: ')
    # Process all files in the QUERY_DIR
    for f in os.listdir(QUERY_DIR):
        f = os.path.join(QUERY_DIR, f)
        if not os.path.isfile(f) or os.path.splitext(f)[1] != '.json':
            continue
        try:
            # The context manager closes the handle even if parsing fails
            with open(f, 'r') as query_file:
                queries.append(json.load(query_file))
        except Exception as ex:
            # Unreadable query files are reported and skipped
            sys.stderr.write('Error reading query file \'{f}\': {ex}\n'.format(f=f, ex=ex))
            continue
        # Remember where the query came from and keep track of max priority
        queries[-1]['queryfile'] = f
        if queries[-1]['priority'] > max_priority:
            max_priority = queries[-1]['priority']
    
    # In case of no queries
    if not queries:
        sys.stdout.write("None\nExiting.\n")
        return
    sys.stdout.write('{}\n'.format(len(queries)))
    
    # Filter for max priority queries
    sys.stdout.write('Max priority: {0}\n'.format(max_priority))
    if max_priority != 0:
        queries = [q for q in queries if q['priority'] == max_priority]
    # The oldest one is next
    top_query = min(queries, key=itemgetter('datetime'))
    del queries
    sys.stdout.write('Next query: {0}\n'.format(top_query))
    
    # Submit query to PDFrate and save the reply
    proxy = pdfrateproxy.PdfrateProxy()
    sleep_time = randint(0, int(config.get('pdfratequeryscheduler', 'sleep_time')))
    sys.stdout.write('Sleeping for {0} seconds...\n'.format(sleep_time))
    sleep(sleep_time)
    sys.stdout.write('Getting report...\n')
    reply = proxy.get_report(utility.file_sha256_hash(top_query['filename']))
    if reply['status'] == 'noreport':
        sys.stdout.write('No report, submitting file...\n')
        reply = proxy.submit_file(top_query['filename'])
    
    if top_query['get_metadata'] == True and reply['status'] == 'success':
        # Also get metadata; the hash passed on is the query file's base 
        # name without its extension
        file_hash = os.path.splitext(os.path.basename(top_query['queryfile']))[0]
        sys.stdout.write('Getting metadata...\n')
        metadata_reply = proxy.get_metadata(file_hash)
        reply['metadata'] = metadata_reply['metadata']
        reply['status'] = metadata_reply['status']
    
    reply_filename = os.path.join(REPLY_DIR, os.path.basename(top_query['queryfile']))
    reply['filename'] = top_query['filename']
    sys.stdout.write('Writing reply to disk...\n')
    # Close (and thereby flush) the reply before the query file is removed
    with open(reply_filename, 'w+') as reply_file:
        json.dump(reply, reply_file)
    # Remove query file
    sys.stdout.write('Removing query file...\n')
    os.remove(top_query['queryfile'])
    sys.stdout.write('Exiting.\n')
Example #15
0
 def test_standardize_csv(self):
     '''
     Smoke test: standardizes the configured input CSV into a temporary 
     CSV file and deletes that file again.
     '''
     csv_in = config.get('datasets_test', 'csv_in')
     csv_temp = config.get('datasets_test', 'csv_temp')
     datasets.standardize_csv(csv_in, csv_temp)
     os.remove(csv_temp)
Example #16
0
 def test_standardize_csv(self):
     '''
     Smoke test: standardizes the configured input CSV into a temporary 
     CSV file and deletes that file again.
     '''
     source_csv = config.get("datasets_test", "csv_in")
     temp_csv = config.get("datasets_test", "csv_temp")
     datasets.standardize_csv(source_csv, temp_csv)
     os.remove(temp_csv)
Example #17
0
 def test_fit(self):
     '''
     Smoke test: fitting the random forest on the configured training 
     data must complete without raising.
     '''
     train_path = config.get('RandomForest_test', 'traindata')
     features, labels, _ = datasets.csv2numpy(train_path)
     self.rf.fit(features, labels)
Example #18
0
 def __init__(self):
     '''
     Constructor. Reads the query_dir and reply_dir options from the 
     'pdfratequeryscheduler' configuration section.
     '''
     section = 'pdfratequeryscheduler'
     self.query_dir = config.get(section, 'query_dir')
     self.reply_dir = config.get(section, 'reply_dir')
Example #19
0
 def test_predict(self):
     '''
     Fits the random forest on the training data and checks that 
     predict returns 20 results for the novel data.
     '''
     X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
     self.rf.fit(X, y)
     newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
     # assertEqual reports the actual length when the check fails
     self.assertEqual(len(self.rf.predict(newX)), 20)
Example #20
0
def main():
    '''
    Runs one pass of the PDFrate query scheduler.

    Loads every '.json' query file found in the configured query
    directory and selects the next query: if any query has a priority
    above 0, only the queries with the highest priority are considered;
    among the candidates the one with the smallest 'datetime' wins.
    After sleeping for a random number of seconds (bounded by the
    configured sleep_time) the report for the query's file is requested
    through PdfrateProxy; the file is submitted if the proxy answers
    'noreport'. When the query asks for metadata and the reply status is
    'success', the metadata is fetched as well. The reply is written as
    JSON into the configured reply directory under the query file's name
    and the query file is deleted.

    Returns None. Returns early if no query could be loaded.
    '''
    sys.stdout.write('PDFrate Query Scheduler running! [{0}]\n'.format(
        datetime.datetime.now()))
    # No options are defined; parsing still provides -h/--help and
    # rejects unexpected command-line arguments
    parser = ArgumentParser(description='PDFrate Query Scheduler')
    parser.parse_args()

    QUERY_DIR = config.get('pdfratequeryscheduler', 'query_dir')
    REPLY_DIR = config.get('pdfratequeryscheduler', 'reply_dir')

    queries = []
    max_priority = 0
    sys.stdout.write('Queries found: ')
    # Process all files in the QUERY_DIR
    for f in os.listdir(QUERY_DIR):
        f = os.path.join(QUERY_DIR, f)
        if not os.path.isfile(f) or os.path.splitext(f)[1] != '.json':
            continue
        try:
            # The context manager closes the handle even if parsing fails
            with open(f, 'r') as query_file:
                queries.append(json.load(query_file))
        except Exception as ex:
            # Unreadable query files are reported and skipped
            sys.stderr.write('Error reading query file \'{f}\': {ex}\n'.format(
                f=f, ex=ex))
            continue
        # Remember where the query came from and keep track of max priority
        queries[-1]['queryfile'] = f
        if queries[-1]['priority'] > max_priority:
            max_priority = queries[-1]['priority']

    # In case of no queries
    if not queries:
        sys.stdout.write("None\nExiting.\n")
        return
    sys.stdout.write('{}\n'.format(len(queries)))

    # Filter for max priority queries
    sys.stdout.write('Max priority: {0}\n'.format(max_priority))
    if max_priority != 0:
        queries = [q for q in queries if q['priority'] == max_priority]
    # The oldest one is next
    top_query = min(queries, key=itemgetter('datetime'))
    del queries
    sys.stdout.write('Next query: {0}\n'.format(top_query))

    # Submit query to PDFrate and save the reply
    proxy = pdfrateproxy.PdfrateProxy()
    sleep_time = randint(
        0, int(config.get('pdfratequeryscheduler', 'sleep_time')))
    sys.stdout.write('Sleeping for {0} seconds...\n'.format(sleep_time))
    sleep(sleep_time)
    sys.stdout.write('Getting report...\n')
    reply = proxy.get_report(utility.file_sha256_hash(top_query['filename']))
    if reply['status'] == 'noreport':
        sys.stdout.write('No report, submitting file...\n')
        reply = proxy.submit_file(top_query['filename'])

    if top_query['get_metadata'] == True and reply['status'] == 'success':
        # Also get metadata; the hash passed on is the query file's base
        # name without its extension
        file_hash = os.path.splitext(os.path.basename(
            top_query['queryfile']))[0]
        sys.stdout.write('Getting metadata...\n')
        metadata_reply = proxy.get_metadata(file_hash)
        reply['metadata'] = metadata_reply['metadata']
        reply['status'] = metadata_reply['status']

    reply_filename = os.path.join(REPLY_DIR,
                                  os.path.basename(top_query['queryfile']))
    reply['filename'] = top_query['filename']
    sys.stdout.write('Writing reply to disk...\n')
    # Close (and thereby flush) the reply before the query file is removed
    with open(reply_filename, 'w+') as reply_file:
        json.dump(reply, reply_file)
    # Remove query file
    sys.stdout.write('Removing query file...\n')
    os.remove(top_query['queryfile'])
    sys.stdout.write('Exiting.\n')
Example #21
0
 def test_fit(self):
     '''
     Smoke test: fitting the SVC on the configured training data must 
     complete without raising.
     '''
     train_path = config.get('sklearn_SVC_test', 'traindata')
     features, labels, _ = datasets.csv2numpy(train_path)
     self.svc.fit(features, labels)