コード例 #1
0
def run_csv_splitter(data, input_reads, igrec_rcm_file, output, filename,
                     model_path='/home/ndurasov/ig_cluster_splitter/data/models/pipe_model'):
    """Select clusters to split with a pickled model pipeline and write the result.

    The function derives three size-weighted vote columns, keeps only rows
    whose ``size`` exceeds 10, runs them through the pickled pipeline and
    passes the indices of rows predicted as ``1`` to
    ``func_tools.clusters_splitting``.  The resulting clusters are written
    to ``output + filename + '.rcm'`` and ``output + filename + '.fa'``.
    Progress is printed and also written to the module-level ``log`` object.

    Args:
        data: table with ``value1``, ``value2``, ``value3`` and ``size``
            columns (presumably a pandas DataFrame -- confirm with caller).
            NOTE: three ``second_vote_abs*`` columns are added to this
            object in place, exactly as before.
        input_reads: argument forwarded to ``func_tools.id_to_read``.
        igrec_rcm_file: argument forwarded to ``func_tools.read_rcm``.
        output: prefix of both output paths; it is concatenated verbatim,
            so it must already end with a path separator if one is wanted.
        filename: base name of the output files, without extension.
        model_path: location of the pickled pipeline.  Defaults to the
            path that used to be hard-coded here.
    """
    # Weight each vote value by the cluster size.
    for suffix in ('1', '2', '3'):
        data['second_vote_abs' + suffix] = data['value' + suffix] * data['size']

    data = data[['second_vote_abs1', 'second_vote_abs2', 'second_vote_abs3', 'size']]
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # double space reproduces what the former two-item print statement emitted.
    print('All clusters:  ' + str(len(data)))

    data = data[data['size'] > 10]
    large_count = str(len(data))
    print('Clusters with sizes more than 10:  ' + large_count)
    log.write('Clusters with sizes more than 10: ' + large_count + '\n')
    # Remember the original row labels: the transform below discards them.
    target_index = data.index

    # SECURITY: unpickling executes arbitrary code -- only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        pp = pickle.load(model_file)

    features = pd.DataFrame(pp.steps[0][1].transform(data))
    features.columns = ['0', '1', '2', '3']

    first_level = pp.steps[1][1].models['first_lvl']
    logreg = pd.Series(first_level[0].predict(features), index=target_index)
    xgb = pd.Series(first_level[1].predict(features), index=target_index)

    # The last pipeline step predicts from the two first-level outputs
    # placed side by side.
    ans = pd.Series(pp.steps[2][1].predict(pd.concat([logreg, xgb], axis=1)),
                    index=target_index)
    target_clusters = ans[ans == 1]

    found_msg = ('Find ' + str(len(target_clusters)) +
                 ' clusters for splitting in ' + str(len(ans)) + ' clusters')
    print(found_msg)
    log.write(found_msg + '\n')

    id_dict = func_tools.id_to_read(input_reads)
    igrec_rcm = func_tools.read_rcm(igrec_rcm_file)

    clusters = func_tools.construct_clusters(igrec_rcm, id_dict)
    clusters = func_tools.clusters_splitting(clusters, list(target_clusters.index))

    rcm_msg = 'New .rcm file consists of ' + str(len(clusters))
    print(rcm_msg)
    log.write(rcm_msg + '\n\n')

    func_tools.to_rcm(output + filename + '.rcm', clusters, id_dict)
    func_tools.to_fa(output + filename + '.fa', clusters)
コード例 #2
0
 def setUpClass(self):
    """Load the shared fixtures: the parsed rcm, the input reads and the
    clusters constructed from the two.

    NOTE(review): unittest calls ``setUpClass`` as a classmethod; no
    decorator is visible in this excerpt -- confirm ``@classmethod`` is
    applied above this definition.
    """
    rcm_path = test_base + test_rcm_file
    self.rcm = func_tools.read_rcm(rcm_path)

    reads_path = test_base + test_input_reads_file
    self.input_reads = func_tools.id_to_read(reads_path)

    self.clusters = func_tools.construct_clusters(
        self.rcm,
        self.input_reads,
    )
コード例 #3
0
 def test_construct_clusters(self):
    """Clusters built from the fixture rcm and reads must equal ``test_clusters``."""
    parsed_rcm = func_tools.read_rcm(test_base + test_rcm_file)
    reads = func_tools.id_to_read(test_base + test_input_reads_file)
    built = func_tools.construct_clusters(parsed_rcm, reads)
    self.assertEqual(built, test_clusters)
コード例 #4
0
 def test_read_rcm(self):
     """Reading the fixture rcm file must yield ``test_rcm``."""
     rcm_path = test_base + test_rcm_file
     parsed = func_tools.read_rcm(rcm_path)
     self.assertEqual(parsed, test_rcm)
コード例 #5
0
# Put the two first-level predictions side by side and let the last pipeline
# step produce the final label for every row, keyed by the original index.
# (An earlier variant used the xgb predictions alone: ``ans = xgb``.)
ans = pd.Series(pp.steps[2][1].predict(pd.concat([logreg, xgb], axis=1)),
                index=target_index)
# Rows labelled 1 are the ones handed to clusters_splitting below.
target_clusters = ans[ans == 1]

# Every print below takes a single pre-built string so that it runs
# unchanged on Python 2 and Python 3.  The doubled spaces and the ' \n'
# tails reproduce exactly what the former multi-item print statements wrote.
found_msg = ('Find ' + str(len(target_clusters)) +
             ' clusters for splitting in ' + str(len(ans)) + ' clusters')
print(found_msg)
log_file.write(found_msg + '\n')

id_dict = func_tools.id_to_read(args.s)
igrec_rcm = func_tools.read_rcm(args.r)

clusters = func_tools.construct_clusters(igrec_rcm, id_dict)
igrec_count = str(len(clusters))
print('Clusters in IgReC output:  ' + igrec_count + ' \n')
log_file.write('Clusters in IgReC output: ' + igrec_count + '\n')

clusters = func_tools.clusters_splitting(clusters,
                                         list(target_clusters.index),
                                         threshold=5)
split_count = str(len(clusters))
print('Clusters in splitted IgReC output:  ' + split_count + ' \n')
log_file.write('Clusters in splitted IgReC output: ' + split_count + '\n')

print('New  %s .rcm file consists of %s' % (args.f, split_count))
# No trailing newline here, matching the original log output.
log_file.write('New ' + args.f + '.rcm file consists of ' + split_count)