Example no. 1
0
def deprecated_join_run( run_id, csv_bucket, mask_id, strains, alleles, description,
        column_label, row_label, network_desc ):
    """Join per-mask p-value CSVs for one run into a single TSV and register it.

    Queries TruthGPUDiracModel for all results of ``run_id``, keeps those whose
    mask range (a string like ``"[4,14)"``) appears in ``mask_id``, downloads
    each p-value CSV from ``csv_bucket``, joins them column-wise by network
    name, writes the combined table to a timestamped local TSV, uploads it
    back to ``csv_bucket`` and records display metadata in DataForDisplay.

    NOTE(review): deprecated — kept for reference; presumably superseded
    elsewhere in the project.
    """
    rs = TruthGPUDiracModel.query( run_id )
    res_list = []
    for result in rs:
        temp = {}
        temp['csv'] = result.pval_file
        temp['range'] = list(result.mask)[0]
        # Ranges look like "[start,end)".  Skip records that do not match
        # instead of crashing on m.group() (the original assumed a match).
        m = re.match(r'\[(\d+),(\d+)\)', temp['range'])
        if m is None:
            continue
        temp['start'] = int(m.group(1))
        if temp['range'] in mask_id:
            res_list.append( temp )
    # Order the joined columns by the start of each mask range.
    res_list.sort(key=lambda x: x['start'])
    s3 = boto.connect_s3()
    b = s3.create_bucket( csv_bucket )
    # One {network: pval-string} dict per CSV, i.e. per output column.
    pv_list = []
    for res in res_list:
        k = b.get_key( res['csv'] )
        csv = k.get_contents_as_string()
        temp = {}
        # Skip the header row, then parse "network,pval" lines.
        for line in csv.split('\n')[1:]:
            if line.strip():
                network, pval = line.split(',')
                temp[network] = pval
        pv_list.append(temp)
    # network -> list of p-values, one entry per mask column.
    master = defaultdict(list)
    for col in pv_list:
        for k, v in col.iteritems():
            master[k].append(v)
    table = [['networks'] + [r['range'] for r in res_list]]
    for k, v in master.iteritems():
        table.append([k] + v)
    # join() instead of repeated += — avoids quadratic string building.
    my_table = '\n'.join('\t'.join(row) for row in table) + '\n'
    ts = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H.%M')
    tsv_name = "%s-joined-%s.tsv" % (run_id, ts)
    with open(tsv_name, 'w') as joined:
        joined.write( my_table )
    k = Key(b)
    k.key = tsv_name
    k.set_contents_from_filename(tsv_name)
    if not DataForDisplay.exists():
        DataForDisplay.create_table( wait=True, read_capacity_units=2, write_capacity_units=1)
    dfd_item = DataForDisplay( run_id, ts )
    dfd_item.strains = strains
    dfd_item.alleles = alleles
    dfd_item.description = description
    dfd_item.data_bucket = csv_bucket
    dfd_item.column_label = column_label
    dfd_item.row_label = row_label
    dfd_item.data_file = tsv_name
    dfd_item.network = network_desc
    dfd_item.save()
Example no. 2
0
 def _get_truth( self):
     """Load the stored truth accuracy data for this run.

     Queries TruthGPUDiracModel for ``self.run_id`` and, for every record
     whose ``strain_id`` is one of ``self.mask_ids``, loads its accuracy
     file via ``self._load_np`` (presumably a numpy array — TODO confirm).

     Returns:
         dict mapping strain_id -> loaded accuracy data.  Records whose
         accuracy file cannot be loaded are silently skipped (best-effort,
         matching the original intent).
     """
     truth = {}
     for item in TruthGPUDiracModel.query(self.run_id):
         if item.strain_id not in self.mask_ids:
             continue
         try:
             truth[item.strain_id] = self._load_np( item.accuracy_file )
         except Exception:
             # Best-effort: skip unloadable records, but do not swallow
             # KeyboardInterrupt/SystemExit the way the original bare
             # ``except`` did.
             pass
     return truth
Example no. 3
0
 def _handle_truth( self, rs ):
     """Compute masked accuracies for ``rs`` and persist them.

     For every (mask id, mask) pair, computes the accuracy of the masked
     result set, uploads it to S3 and records a TruthGPUDiracModel row
     inside a single batch write.  Also caches each accuracy in
     ``self._truth`` keyed by mask id.

     Returns ``self.truth``.
     """
     if self._truth is None:
         self._truth = {}  # hacky, make a proper setter
     key_template = '%s/truth-accuracy/%s'
     with TruthGPUDiracModel.batch_write() as batch:
         for mask_id, mask in zip(self.mask_ids, self.masks):
             masked = resultset.Masked( rs, mask )
             accuracy = masked.accuracy
             self._truth[mask_id] = accuracy
             result_key = self._save_result_to_s3( key_template, accuracy )
             encoded_files = base64.b64encode(
                     json.dumps( rs.get_result_files() ) )
             stamp = datetime.datetime.utcnow().strftime('%Y.%m.%d-%H:%M:%S')
             batch.save(TruthGPUDiracModel( masked.run_id, mask_id,
                     accuracy_file = result_key,
                     result_files = encoded_files,
                     bucket = self.results_bucket_name,
                     timestamp = stamp
                     ))
             self.logger.debug("Writing %s" % (result_key) )
     return self.truth
Example no. 4
0
def deprecated_run_once(comm, mask_id, sqs_data_to_agg,  sqs_truth_to_agg, sqs_recycling_to_agg, s3_from_gpu, s3_results, run_truth_table, s3_csvs ):
    by_network = True
  
    rec = None
    if comm.rank == 0:
        sqs = boto.connect_sqs()
        d2a = sqs.create_queue( sqs_data_to_agg )
        d2a_bak =  sqs.create_queue( sqs_recycling_to_agg )
        print "Num data %i in %s" %  (d2a.count(), sqs_data_to_agg)
        print "Num data %i in %s" %  (d2a_bak.count(), sqs_recycling_to_agg)
        if d2a.count() > d2a_bak.count():
            rec = False 
        else:
            assert d2a_bak.count() > 0, "both queues empty"
            rec = True
    rec = comm.bcast(rec)
    if rec:
        sqs_data_to_agg, sqs_recycling_to_agg = sqs_recycling_to_agg, sqs_data_to_agg
    if comm.rank == 0:
        print "I want the truth!!!"
        a = Truthiness( sqs_truth_to_agg, sqs_truth_to_agg, s3_from_gpu, 
                s3_results, run_truth_table, by_network, mask_id)
        rs =a.get_result_set()
        if rs:
            while not a.handle_result_set(rs):
                print "not the truth", ctr
                rs =a.get_result_set()
                if rs is None:
                    break 
    comm.Barrier()
    #print "Aggregating", mask_id, sqs_data_to_agg,  sqs_truth_to_agg, sqs_recycling_to_agg, s3_from_gpu, s3_results, run_truth_table, s3_csvs
    a = Aggregator( sqs_data_to_agg, sqs_recycling_to_agg, s3_from_gpu, 
            s3_results, run_truth_table, by_network, mask_id)
    rs =a.get_result_set()
    
    if comm.rank == 0:
        rid = rs.get_run_id()
        st = rs.spec_string
    ctr = 0
    while rs:
        ctr += 1
        a.handle_result_set(rs) 
        rs =a.get_result_set()
    comm.Barrier()
    acc_pre = "acc-k-11-%i-%i" %(ctr, comm.rank)
    a.save_acc( '/scratch/sgeadmin', acc_pre)
    strains = a.acc_acc.keys()
    strains.sort()
    strains = comm.bcast(strains)
    zero = None
    for mat in a.acc_acc.itervalues():
        zero = np.zeros_like(mat, dtype = np.int)
    for k in strains:
        if k in a.acc_acc:
            curr = a.acc_acc[k]
        else:
            curr = zero
        total = np.zeros_like(curr)
        comm.Reduce([curr, MPI.INT],[total, MPI.INT])
        if comm.rank == 0:
            a.acc_acc[k] = total
        total_count = 0
        print "acc", a.acc_count[k]
        total_count = comm.reduce(a.acc_count[k])
        if comm.rank == 0:
            print "total obs. %i" % total_count
            divisor = float(total_count)
            pv_table = a.acc_acc[k]/divisor
            file_loc = '/scratch/sgeadmin/pvals-%s-%s-%s.csv' % (
                a.run_config['run_settings']['k'], a.run_id, mask_id) 
            a.generate_csv( pv_table, column_names = a.get_mask_labels(), 
                index=a.networks,  filename=file_loc)
            a.write_csv(s3_csvs, file_loc)
            try:
                res = TruthGPUDiracModel.query(rid, strain_id__eq=st)
                for r in res:
                    r.pval_file = os.path.split(file_loc)[1]
                    r.mask = a.get_mask_labels()
                    r.save()
            except Exception as e:
                print "Unable to store in dynamo"
                print "%r" % e
    if comm.rank==0:
        a.save_acc( '/scratch/sgeadmin', 'acc-k-11-combined-total' )
    comm.Barrier()