def run_csv_splitter(data, input_reads, igrec_rcm_file, output, filename,
                     model_path='/home/ndurasov/ig_cluster_splitter/data/models/pipe_model'):
    """Select clusters to split with a pickled pipeline and write new .rcm/.fa files.

    Progress messages go to stdout and to the module-level ``log`` object.

    Parameters:
        data: DataFrame-like table with the columns ``value1``, ``value2``,
            ``value3`` and ``size``.  NOTE: the three ``second_vote_abs*``
            columns are added to this object in place before it is narrowed
            down, so the caller's frame gains those columns.
        input_reads: forwarded to ``func_tools.id_to_read``.
        igrec_rcm_file: forwarded to ``func_tools.read_rcm``.
        output: prefix of the output paths; it is concatenated with
            ``filename`` as-is, so it must already end with a path separator.
        filename: base name of the outputs (``.rcm`` and ``.fa`` are appended).
        model_path: location of the pickled pipeline.  The default is the
            deployment path this function has always used.

    Returns:
        None.  Results are written through ``func_tools.to_rcm`` and
        ``func_tools.to_fa``.
    """
    # Every print below takes ONE pre-built string: that form emits the same
    # bytes under the Python 2 print statement and the Python 3 function.
    cluster_size = data['size']
    for suffix in ('1', '2', '3'):
        data['second_vote_abs' + suffix] = data['value' + suffix] * cluster_size

    feature_columns = ['second_vote_abs1', 'second_vote_abs2',
                       'second_vote_abs3', 'size']
    data = data[feature_columns]
    # Joining with ' ' reproduces the separator of the comma-style print.
    print(' '.join(['All clusters: ', str(len(data))]))

    data = data[data['size'] > 10]
    print(' '.join(['Clusters with sizes more than 10: ', str(len(data))]))
    log.write('Clusters with sizes more than 10: ' + str(len(data)) + '\n')

    # The transform below returns a plain array, so remember the cluster ids
    # now to re-attach them to the predictions.
    target_index = data.index

    # SECURITY: unpickling runs arbitrary code; only point model_path at a
    # trusted file.  The context manager closes the handle even when
    # loading fails.
    with open(model_path, 'rb') as model_file:
        pp = pickle.load(model_file)

    data = pd.DataFrame(pp.steps[0][1].transform(data))
    data.columns = ['0', '1', '2', '3']

    # Two first-level models each produce one prediction column; the last
    # pipeline step combines those columns into the final 0/1 answer.
    first_level = pp.steps[1][1].models['first_lvl']
    logreg = pd.Series(first_level[0].predict(data), index=target_index)
    xgb = pd.Series(first_level[1].predict(data), index=target_index)
    ans = pp.steps[2][1].predict(pd.concat([logreg, xgb], axis=1))
    ans = pd.Series(ans, index=target_index)
    target_clusters = ans[ans == 1]

    found_message = ('Find ' + str(len(target_clusters))
                     + ' clusters for splitting in ' + str(len(ans))
                     + ' clusters')
    print(found_message)
    log.write(found_message + '\n')

    id_dict = func_tools.id_to_read(input_reads)
    igrec_rcm = func_tools.read_rcm(igrec_rcm_file)
    clusters = func_tools.construct_clusters(igrec_rcm, id_dict)
    clusters = func_tools.clusters_splitting(clusters,
                                             list(target_clusters.index))

    result_message = 'New .rcm file consists of ' + str(len(clusters))
    print(result_message)
    log.write(result_message + '\n\n')

    func_tools.to_rcm(output + filename + '.rcm', clusters, id_dict)
    func_tools.to_fa(output + filename + '.fa', clusters)
def setUpClass(self):
    """Load the shared fixtures once and store them on the receiver.

    Sets ``rcm``, ``input_reads`` and ``clusters`` from the test data files
    found under ``test_base``.

    NOTE(review): unittest calls ``setUpClass`` on the class itself, so this
    presumably sits under a ``@classmethod`` decorator that is outside the
    visible snippet -- confirm.
    """
    rcm_path = test_base + test_rcm_file
    self.rcm = func_tools.read_rcm(rcm_path)

    reads_path = test_base + test_input_reads_file
    self.input_reads = func_tools.id_to_read(reads_path)

    # Built from the two fixtures loaded just above.
    self.clusters = func_tools.construct_clusters(self.rcm,
                                                  self.input_reads)
def test_construct_clusters(self):
    # Build clusters from the .rcm and reads fixtures and compare them with
    # the expected ``test_clusters`` value.
    rcm_path = test_base + test_rcm_file
    parsed_rcm = func_tools.read_rcm(rcm_path)

    reads_path = test_base + test_input_reads_file
    reads = func_tools.id_to_read(reads_path)

    built_clusters = func_tools.construct_clusters(parsed_rcm, reads)
    self.assertEqual(built_clusters, test_clusters)
def test_read_rcm(self):
    # Parsing the .rcm fixture must reproduce the expected ``test_rcm`` value.
    rcm_path = test_base + test_rcm_file
    parsed_rcm = func_tools.read_rcm(rcm_path)
    self.assertEqual(parsed_rcm, test_rcm)
# Combine the two first-level prediction columns with the final pipeline
# step; a prediction of 1 marks a cluster that should be split.
ans = pp.steps[2][1].predict(pd.concat([logreg, xgb], axis=1))
ans = pd.Series(ans, index=target_index)
target_clusters = ans[ans == 1]

# Each message is built once and sent to both the console and the log so the
# two can never drift apart.  Single-argument print(...) calls behave the
# same under the Python 2 print statement and the Python 3 function.
found_message = ('Find ' + str(len(target_clusters))
                 + ' clusters for splitting in ' + str(len(ans))
                 + ' clusters')
print(found_message)
log_file.write(found_message + '\n')

id_dict = func_tools.id_to_read(args.s)
igrec_rcm = func_tools.read_rcm(args.r)
clusters = func_tools.construct_clusters(igrec_rcm, id_dict)

igrec_message = 'Clusters in IgReC output: ' + str(len(clusters))
print(igrec_message + '\n')  # the extra newline keeps the blank separator line
log_file.write(igrec_message + '\n')

clusters = func_tools.clusters_splitting(clusters,
                                         list(target_clusters.index),
                                         threshold=5)

splitted_message = 'Clusters in splitted IgReC output: ' + str(len(clusters))
print(splitted_message + '\n')
log_file.write(splitted_message + '\n')

# Concatenate instead of passing several print arguments, so the file name
# is not separated from its '.rcm' extension by a space.
result_message = ('New ' + args.f + '.rcm file consists of '
                  + str(len(clusters)))
print(result_message)
# Terminate the last record too, so later writes start on a fresh line.
log_file.write(result_message + '\n')