def deprecated_join_run( run_id, csv_bucket, mask_id, strains, alleles, description, column_label, row_label, network_desc ):
    """Join the per-mask p-value CSVs of a run into one TSV and publish it.

    Queries TruthGPUDiracModel for *run_id*, keeps the results whose mask
    range string is in *mask_id*, downloads each per-mask CSV from
    *csv_bucket*, joins them column-wise into a single tab-separated table
    (rows = networks, columns = mask ranges), uploads the TSV back to the
    same bucket and records a DataForDisplay item pointing at it.

    Parameters
    ----------
    run_id : run identifier used for the Dynamo query and the TSV name.
    csv_bucket : S3 bucket name holding the per-mask CSVs; TSV written here.
    mask_id : container of mask range strings (e.g. "[4,14)") to include.
    strains, alleles, description, column_label, row_label, network_desc :
        display metadata copied verbatim onto the DataForDisplay item.
    """
    rs = TruthGPUDiracModel.query( run_id )
    res_list = []
    for result in rs:
        temp = {}
        temp['csv'] = result.pval_file
        temp['range'] = list(result.mask)[0]
        # Mask ranges look like "[start,end)".  Skip records that do not
        # parse instead of crashing on m.group() when m is None (original
        # bug: unchecked re.match result).
        m = re.match(r'\[(\d+),(\d+)\)', temp['range'])
        if m is None:
            continue
        temp['start'] = int(m.group(1))
        if temp['range'] in mask_id:
            res_list.append( temp )
    # order the columns by the start of each mask range
    res_list.sort(key=lambda x: x['start'])
    s3 = boto.connect_s3()
    b = s3.create_bucket( csv_bucket )
    pv_list = []
    for res in res_list:
        k = b.get_key( res['csv'] )
        csv = k.get_contents_as_string()
        temp = {}
        # skip the header row, then map network name -> p-value string
        for line in csv.split('\n')[1:]:
            if line.strip():
                # maxsplit=1 so a comma inside the network name cannot
                # make the unpack raise ValueError (original bug)
                network, pval = line.split(',', 1)
                temp[network] = pval
        pv_list.append(temp)
    # NOTE(review): networks missing from some columns get fewer values
    # appended, which shifts their row left; preserved from the original.
    master = defaultdict(list)
    for col in pv_list:
        for k, v in col.iteritems():
            master[k].append(v)
    table = [['networks'] + [r['range'] for r in res_list]]
    for k, v in master.iteritems():
        table.append([k] + v)
    # single join instead of quadratic += string concatenation
    my_table = '\n'.join('\t'.join(row) for row in table) + '\n'
    ts = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H.%M')
    tsv_name = "%s-joined-%s.tsv" % (run_id, ts)
    with open(tsv_name, 'w') as joined:
        joined.write( my_table )
    k = Key(b)
    k.key = tsv_name
    k.set_contents_from_filename(tsv_name)
    if not DataForDisplay.exists():
        DataForDisplay.create_table( wait=True, read_capacity_units=2,
                                     write_capacity_units=1)
    dfd_item = DataForDisplay( run_id, ts )
    dfd_item.strains = strains
    dfd_item.alleles = alleles
    dfd_item.description = description
    dfd_item.data_bucket = csv_bucket
    dfd_item.column_label = column_label
    dfd_item.row_label = row_label
    dfd_item.data_file = tsv_name
    dfd_item.network = network_desc
    dfd_item.save()
def _get_truth( self):
    """Load the accuracy matrix for each of this run's selected masks.

    Queries TruthGPUDiracModel for self.run_id and, for every item whose
    strain_id is in self.mask_ids, loads its accuracy file via
    self._load_np.

    Returns:
        dict mapping strain_id -> loaded accuracy array.

    Loading stays best-effort: an item whose accuracy file cannot be
    loaded is skipped.  The original bare ``except: pass`` also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``except Exception``.
    """
    truth = {}
    for item in TruthGPUDiracModel.query(self.run_id):
        try:
            if item.strain_id in self.mask_ids:
                truth[item.strain_id] = self._load_np( item.accuracy_file )
        except Exception:
            # missing/corrupt accuracy file must not abort the whole load
            pass
    return truth
def deprecated_run_once(comm, mask_id, sqs_data_to_agg, sqs_truth_to_agg, sqs_recycling_to_agg, s3_from_gpu, s3_results, run_truth_table, s3_csvs ): by_network = True rec = None if comm.rank == 0: sqs = boto.connect_sqs() d2a = sqs.create_queue( sqs_data_to_agg ) d2a_bak = sqs.create_queue( sqs_recycling_to_agg ) print "Num data %i in %s" % (d2a.count(), sqs_data_to_agg) print "Num data %i in %s" % (d2a_bak.count(), sqs_recycling_to_agg) if d2a.count() > d2a_bak.count(): rec = False else: assert d2a_bak.count() > 0, "both queues empty" rec = True rec = comm.bcast(rec) if rec: sqs_data_to_agg, sqs_recycling_to_agg = sqs_recycling_to_agg, sqs_data_to_agg if comm.rank == 0: print "I want the truth!!!" a = Truthiness( sqs_truth_to_agg, sqs_truth_to_agg, s3_from_gpu, s3_results, run_truth_table, by_network, mask_id) rs =a.get_result_set() if rs: while not a.handle_result_set(rs): print "not the truth", ctr rs =a.get_result_set() if rs is None: break comm.Barrier() #print "Aggregating", mask_id, sqs_data_to_agg, sqs_truth_to_agg, sqs_recycling_to_agg, s3_from_gpu, s3_results, run_truth_table, s3_csvs a = Aggregator( sqs_data_to_agg, sqs_recycling_to_agg, s3_from_gpu, s3_results, run_truth_table, by_network, mask_id) rs =a.get_result_set() if comm.rank == 0: rid = rs.get_run_id() st = rs.spec_string ctr = 0 while rs: ctr += 1 a.handle_result_set(rs) rs =a.get_result_set() comm.Barrier() acc_pre = "acc-k-11-%i-%i" %(ctr, comm.rank) a.save_acc( '/scratch/sgeadmin', acc_pre) strains = a.acc_acc.keys() strains.sort() strains = comm.bcast(strains) zero = None for mat in a.acc_acc.itervalues(): zero = np.zeros_like(mat, dtype = np.int) for k in strains: if k in a.acc_acc: curr = a.acc_acc[k] else: curr = zero total = np.zeros_like(curr) comm.Reduce([curr, MPI.INT],[total, MPI.INT]) if comm.rank == 0: a.acc_acc[k] = total total_count = 0 print "acc", a.acc_count[k] total_count = comm.reduce(a.acc_count[k]) if comm.rank == 0: print "total obs. 
%i" % total_count divisor = float(total_count) pv_table = a.acc_acc[k]/divisor file_loc = '/scratch/sgeadmin/pvals-%s-%s-%s.csv' % ( a.run_config['run_settings']['k'], a.run_id, mask_id) a.generate_csv( pv_table, column_names = a.get_mask_labels(), index=a.networks, filename=file_loc) a.write_csv(s3_csvs, file_loc) try: res = TruthGPUDiracModel.query(rid, strain_id__eq=st) for r in res: r.pval_file = os.path.split(file_loc)[1] r.mask = a.get_mask_labels() r.save() except Exception as e: print "Unable to store in dynamo" print "%r" % e if comm.rank==0: a.save_acc( '/scratch/sgeadmin', 'acc-k-11-combined-total' ) comm.Barrier()