def test_save_model(self):
    X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
    self.svc.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
    self.assertEqual(len(self.svc.predict(newX)), 20)
    self.svc.save_model(config.get('sklearn_SVC_test', 'modelfile'))
    os.remove(config.get('sklearn_SVC_test', 'modelfile'))
def test_predict(self):
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
    self.assertEqual(len(self.rf.predict(newX)), 20)
def test_decision_function(self):
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
    self.assertEqual(len(self.rf.decision_function(newX)), 20)
def __init__(self):
    '''
    Constructor.
    '''
    self.submit_url = config.get('pdfrateproxy', 'submit_url')
    self.report_url = config.get('pdfrateproxy', 'report_url')
    self.metadata_url = config.get('pdfrateproxy', 'metadata_url')
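# Hedged usage sketch (not part of the original class): how PdfrateProxy is
# driven by the scheduler's main() below -- get_report() with a SHA-256 hash,
# then submit_file() when no report exists yet. Replies are assumed to be
# dicts carrying a 'status' key, which is how main() treats them; the helper
# name is hypothetical.
def _example_proxy_roundtrip(pdf_path, file_hash):
    proxy = PdfrateProxy()
    reply = proxy.get_report(file_hash)
    if reply['status'] == 'noreport':
        # No cached report for this hash yet; submit the file itself
        reply = proxy.submit_file(pdf_path)
    return reply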
def test_save_model(self):
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
    self.assertEqual(len(self.rf.predict(newX)), 20)
    self.rf.save_model(config.get('RandomForest_test', 'modelfile'))
    os.remove(config.get('RandomForest_test', 'modelfile'))
def test_load_model(self):
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('RandomForest_test', 'noveldata'))
    prediction = self.rf.predict(newX)
    self.rf.save_model(config.get('RandomForest_test', 'modelfile'))
    # A freshly loaded model must reproduce the original predictions
    newrf = RandomForest()
    newrf.load_model(config.get('RandomForest_test', 'modelfile'))
    self.assertTrue(numpy.array_equal(prediction, newrf.predict(newX)))
    os.remove(config.get('RandomForest_test', 'modelfile'))
def test_load_model(self):
    X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
    self.svc.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
    prediction = self.svc.predict(newX)
    self.svc.save_model(config.get('sklearn_SVC_test', 'modelfile'))
    # A freshly loaded model must reproduce the original predictions
    newsvc = sklearn_SVC()
    newsvc.load_model(config.get('sklearn_SVC_test', 'modelfile'))
    self.assertTrue(numpy.array_equal(prediction, newsvc.predict(newX)))
    os.remove(config.get('sklearn_SVC_test', 'modelfile'))
def test_fit(self):
    X, y, _ = datasets.csv2numpy(config.get('RandomForest_test', 'traindata'))
    self.rf.fit(X, y)
def test_decision_function(self):
    X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
    self.svc.fit(X, y)
    newX, _, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'noveldata'))
    self.assertEqual(len(self.svc.decision_function(newX)), 20)
def __init__(self):
    '''
    Constructor.
    '''
    self.query_dir = config.get('pdfratequeryscheduler', 'query_dir')
    self.reply_dir = config.get('pdfratequeryscheduler', 'reply_dir')
def main():
    sys.stdout.write('PDFrate Query Scheduler running! [{0}]\n'
                     .format(datetime.datetime.now()))
    parser = ArgumentParser(description='PDFrate Query Scheduler')
    parser.parse_args()
    QUERY_DIR = config.get('pdfratequeryscheduler', 'query_dir')
    REPLY_DIR = config.get('pdfratequeryscheduler', 'reply_dir')
    queries = []
    max_priority = 0
    sys.stdout.write('Queries found: ')
    # Process all .json query files in QUERY_DIR
    for f in os.listdir(QUERY_DIR):
        f = os.path.join(QUERY_DIR, f)
        if not os.path.isfile(f) or os.path.splitext(f)[1] != '.json':
            continue
        try:
            with open(f, 'r') as query_file:
                queries.append(json.load(query_file))
        except Exception as ex:
            sys.stderr.write('Error reading query file \'{f}\': {ex}\n'
                             .format(f=f, ex=ex))
            continue
        # Keep track of the maximum priority seen so far
        queries[-1]['queryfile'] = f
        if queries[-1]['priority'] > max_priority:
            max_priority = queries[-1]['priority']
    # In case of no queries
    if not queries:
        sys.stdout.write('None\nExiting.\n')
        return
    sys.stdout.write('{0}\n'.format(len(queries)))
    # Keep only the queries with maximum priority
    sys.stdout.write('Max priority: {0}\n'.format(max_priority))
    if max_priority != 0:
        queries = [q for q in queries if q['priority'] == max_priority]
    # The oldest one is served next
    top_query = min(queries, key=itemgetter('datetime'))
    del queries
    sys.stdout.write('Next query: {0}\n'.format(top_query))
    # Sleep for a random interval before querying PDFrate
    proxy = pdfrateproxy.PdfrateProxy()
    sleep_time = randint(
        0, int(config.get('pdfratequeryscheduler', 'sleep_time')))
    sys.stdout.write('Sleeping for {0} seconds...\n'.format(sleep_time))
    sleep(sleep_time)
    # Fetch an existing report, or submit the file if there is none yet
    sys.stdout.write('Getting report...\n')
    reply = proxy.get_report(utility.file_sha256_hash(top_query['filename']))
    if reply['status'] == 'noreport':
        sys.stdout.write('No report, submitting file...\n')
        reply = proxy.submit_file(top_query['filename'])
    if top_query['get_metadata'] and reply['status'] == 'success':
        # Also get metadata; query files are named after the file hash
        file_hash = os.path.splitext(
            os.path.basename(top_query['queryfile']))[0]
        sys.stdout.write('Getting metadata...\n')
        metadata_reply = proxy.get_metadata(file_hash)
        reply['metadata'] = metadata_reply['metadata']
        reply['status'] = metadata_reply['status']
    # Save the reply under the same name as the query file
    reply_filename = os.path.join(REPLY_DIR,
                                  os.path.basename(top_query['queryfile']))
    reply['filename'] = top_query['filename']
    sys.stdout.write('Writing reply to disk...\n')
    with open(reply_filename, 'w') as reply_file:
        json.dump(reply, reply_file)
    # Remove the processed query file
    sys.stdout.write('Removing query file...\n')
    os.remove(top_query['queryfile'])
    sys.stdout.write('Exiting.\n')
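# Hedged sketch (not in the original module): producing a query file that
# main() above can consume. The key set -- 'filename', 'priority',
# 'datetime', 'get_metadata' -- is inferred from how main() reads the JSON,
# and naming the file after the PDF's SHA-256 hash matches how main()
# recovers file_hash from the query file name. The helper name is
# hypothetical; it assumes it lives in this module, next to the imports
# main() already uses.
def _example_enqueue_query(pdf_path, priority=0, get_metadata=False):
    query = {
        'filename': pdf_path,
        'priority': priority,
        # ISO-8601 timestamps compare chronologically as strings, which is
        # what min(queries, key=itemgetter('datetime')) relies on
        'datetime': datetime.datetime.now().isoformat(),
        'get_metadata': get_metadata,
    }
    query_dir = config.get('pdfratequeryscheduler', 'query_dir')
    query_file = os.path.join(
        query_dir, utility.file_sha256_hash(pdf_path) + '.json')
    with open(query_file, 'w') as fp:
        json.dump(query, fp)
    return query_file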
def test_standardize_csv(self):
    datasets.standardize_csv(config.get('datasets_test', 'csv_in'),
                             config.get('datasets_test', 'csv_temp'))
    os.remove(config.get('datasets_test', 'csv_temp'))
def test_standardize_csv(self): datasets.standardize_csv(config.get("datasets_test", "csv_in"), config.get("datasets_test", "csv_temp")) os.remove(config.get("datasets_test", "csv_temp"))
def test_fit(self):
    X, y, _ = datasets.csv2numpy(config.get('sklearn_SVC_test', 'traindata'))
    self.svc.fit(X, y)
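# Hedged sketch (not in the original tests): the methods above reference
# self.rf and self.svc fixtures, and the load_model tests construct
# RandomForest() and sklearn_SVC() with no arguments, so minimal setUp()
# methods would look like this. The TestCase class names are assumptions
# mirroring the 'RandomForest_test' and 'sklearn_SVC_test' config sections;
# RandomForest and sklearn_SVC are assumed importable as in the tests above.
import unittest

class RandomForest_test(unittest.TestCase):
    def setUp(self):
        self.rf = RandomForest()

class sklearn_SVC_test(unittest.TestCase):
    def setUp(self):
        self.svc = sklearn_SVC()

if __name__ == '__main__':
    unittest.main()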