def gather_and_stitch(seq, tfile):
    """
    Merge the non-refined top hits with the refined ``rtx_*.gzip`` results,
    re-rank the union and write the ranked data to ``tfile``.

    The number of hits to keep is read from the module-level ``args.numhits``
    (``args`` is not defined in this function; it must exist in the module).

    :param seq: index used to build the ``<seq>_tophits.gzip`` input file name
    :param tfile: path of the gzip pickle that receives the ranked data
    :return: always True
    """
    total_data = []

    in_file = str(seq) + "_tophits.gzip"
    if os.path.isfile(in_file):
        print("Extracting data from non_refined file: %s" % in_file)
        total_data.extend(io.readGzipPickle(in_file))
    else:
        # Best effort: a missing input is reported, the refined data is still used.
        print("Something is wrong in the datafile:  %s" % in_file)

    # combine_data() returns False when there is nothing to merge.
    refined_data = combine_data()
    if refined_data:
        total_data.extend(refined_data)

    # Imported locally, exactly where the original code imported it.
    from ranking.NoeStageRank import rank_assembly_with_clustering
    ranked_data = rank_assembly_with_clustering(total_data, args.numhits)
    io.dumpGzipPickle(str(tfile), ranked_data)

    # Delete the intermediate files.  The previous implementation shelled out
    # to "rm rtx_*.gzip" and guarded it with "except NameError", which can
    # never be raised by os.system, so the "no files" branch was unreachable.
    stale_files = glob.glob("rtx_*.gzip")
    if stale_files:
        print("Deleting old rtx_* files!")
        for stale_file in stale_files:
            try:
                os.remove(stale_file)
            except OSError as err:
                print("Could not delete %s: %s" % (stale_file, err))
    else:
        print("No rtx_* files exist!")
    return True
def combine_data():
    """
    Concatenate the hits stored in every ``rtx_*.gzip`` file of the current
    working directory.

    :return: False when no ``rtx_*.gzip`` file exists, otherwise a list with
        the hits of all files, in the order the files were returned by glob
    """
    file_list = glob.glob("rtx_*.gzip")
    if not file_list:
        # Callers test the result for truthiness; keep the False sentinel.
        return False

    hits = []
    for f in file_list:
        print("Extracting data from file.. %s" % f)
        hits.extend(io.readGzipPickle(f))
    return hits
def _iter_old_ranked_candidates(hits):
    """
    Yield ``(hit, rank_labels)`` pairs in ranking order for makeTopPickle2Old.

    Ranking: number of NOEs (``hit[4][2]``) descending, then NOE energy
    (``hit[4][3]`` rounded to 2 decimals) ascending, then the sum of
    ``tensor[0]`` over ``hit[5][1]`` ascending.  ``rank_labels`` holds the bin
    keys that are printed next to the selected hit.
    """
    by_noe_count = collections.defaultdict(list)
    for hit in hits:
        # Hits whose fifth record is not the NOE filter are ignored.
        if hit[4][0] == 'NOE_filter':
            by_noe_count[hit[4][2]].append(hit)

    for noe_count in sorted(by_noe_count, reverse=True):
        entries = by_noe_count[noe_count]
        if len(entries) == 1:
            # A lone entry in this bin needs no further tie-breaking.
            yield entries[0], (noe_count,)
            continue

        by_noe_energy = collections.defaultdict(list)
        for hit in entries:
            by_noe_energy[round(hit[4][3], 2)].append(hit)

        for noe_energy in sorted(by_noe_energy):
            by_rdc_score = collections.defaultdict(list)
            for hit in by_noe_energy[noe_energy]:
                # Only hits carrying an RDC record at index 5 can be ranked here.
                if hit[5][0] == 'RDC_filter':
                    rdc_score = sum(tensor[0] for tensor in hit[5][1])
                    by_rdc_score[rdc_score].append(hit)
            for rdc_score in sorted(by_rdc_score):
                for hit in by_rdc_score[rdc_score]:
                    yield hit, (noe_count, noe_energy, rdc_score)


def makeTopPickle2Old(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies,
    rank and extract top hits as defined.

    Hits are read from every ``<previous_smotif_index>_*_*.gzip`` file, ranked
    (see _iter_old_ranked_candidates), de-duplicated on ``hit[3][1]`` and
    written to ``<previous_smotif_index>_tophits.gzip``.

    Selection now stops as soon as ``num_hits`` hits are kept.  The previous
    version had only two ``break`` statements for four nested loops (and none
    in the single-entry branch), so it could keep more hits than requested.

    :param previous_smotif_index: prefix of the input and output file names
    :param num_hits: maximum number of hits to keep
    :param stage: unused, kept for interface compatibility
    :return: ``range(n)`` where n is the number of hits written
    """
    hits = []
    for f in glob.glob(str(previous_smotif_index) + "_*_*.gzip"):
        hits.extend(io.readGzipPickle(f))

    # identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter,
    # qcp_rmsd, Evofilter, RDC_filter, NOE_filter

    reduced_dump_log = []
    seqs = []  # kept as a list: the sequence objects may not be hashable
    for hit, rank_labels in _iter_old_ranked_candidates(hits):
        smotif_seq = hit[3][1]
        if smotif_seq in seqs:
            continue
        seqs.append(smotif_seq)
        reduced_dump_log.append(hit)
        print("final sele %s %s" % (hit[0][1][0][0],
                                    " ".join(str(label) for label in rank_labels)))
        if len(reduced_dump_log) >= num_hits:
            break

    count_hits = len(reduced_dump_log)
    if count_hits < num_hits:
        print("could only extract  %s %s" % (count_hits, count_hits))

    io.dumpGzipPickle(str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print("actual number in top hits  %s" % count_hits)
    return range(count_hits)
def makeTopPickle2(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies,
    rank and extract top hits as defined.

    Hits are read from every ``<previous_smotif_index>_*_*.gzip`` file and
    binned on the NOE energy ``hit[5][3]`` (rounded to 4 decimals, ascending).
    Bins with several hits are ordered by a secondary score taken from
    ``hit[6]``: the sum of ``tensor[0]`` for an 'RDC_filter' record or the sum
    of ``tensor[1]`` for a 'PCS_filter' record.  Hits are de-duplicated on
    ``hit[4][1]`` and at most ``num_hits`` of them are written to
    ``<previous_smotif_index>_tophits.gzip``.

    :param previous_smotif_index: prefix of the input and output file names
    :param num_hits: maximum number of hits to keep
    :param stage: unused, kept for interface compatibility
    :return: ``range(n)`` where n is the number of hits written
    """
    hits = []
    for f in glob.glob(str(previous_smotif_index) + "_*_*.gzip"):
        hits.extend(io.readGzipPickle(f))

    # identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter,
    # qcp_rmsd, Evofilter, RDC_filter, NOE_filter

    by_noe_energy = collections.defaultdict(list)
    for hit in hits:
        by_noe_energy[round(hit[5][3], 4)].append(hit)

    # Rank based on NOE energy
    reduced_dump_log = []
    seqs = []  # kept as a list: the sequence objects may not be hashable
    for noe_energy in sorted(by_noe_energy):
        if len(reduced_dump_log) >= num_hits:
            break
        entries = by_noe_energy[noe_energy]

        if len(entries) == 1:
            candidates = [(entries[0], (noe_energy,))]
        else:
            # Decide RDC vs PCS from the bin being processed.  The previous
            # code inspected a stale ``hit`` left over from an earlier loop.
            filter_kind = entries[0][6][0]
            if filter_kind == 'RDC_filter':
                column = 0
            elif filter_kind == 'PCS_filter':
                print("Working on PCS filter instead of RDC")
                column = 1
            else:
                # Nothing can be ranked in this bin; previously this path hit
                # an undefined (or stale) ``rdc_score_bins``.
                print("Something is wrong with your PCS logic")
                continue

            by_score = collections.defaultdict(list)
            for hit in entries:
                score = sum(tensor[column] for tensor in hit[6][1])
                by_score[score].append(hit)
            candidates = [(hit, (noe_energy, score))
                          for score in sorted(by_score)
                          for hit in by_score[score]]

        for hit, rank_labels in candidates:
            smotif_seq = hit[4][1]
            if smotif_seq in seqs:
                continue
            seqs.append(smotif_seq)
            reduced_dump_log.append(hit)
            print("final sele %s %s" % (hit[0][1][0][0],
                                        " ".join(str(label) for label in rank_labels)))
            if len(reduced_dump_log) >= num_hits:
                break

    count_hits = len(reduced_dump_log)
    if count_hits < num_hits:
        print("could only extract  %s %s" % (count_hits, count_hits))

    io.dumpGzipPickle(str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print("actual number in top hits  %s" % count_hits)
    return range(count_hits)
def makeTopPickle(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies,
    rank and extract top hits as defined.

    Every hit read from ``<previous_smotif_index>_*_*.gzip`` is scored as
    ``noe_score + rdc_score + rdc_constant`` (lower is better); hits scoring
    999.999 or more are discarded.  The remaining hits are de-duplicated on
    the payload of their 'seq_filter' record and at most ``num_hits`` of them
    are written, best first, to ``<previous_smotif_index>_tophits.gzip``.

    The final selection is now a single capped pass.  The previous dump loop
    only broke out of its innermost loop, so it could write more than
    ``num_hits`` entries.

    :param previous_smotif_index: prefix of the input and output file names
    :param num_hits: maximum number of hits to keep
    :param stage: forwarded to getNchiSum for 'PCS_filter' records
    :return: ``range(n)`` where n is the number of hits written
    :raises KeyError: if a hit with at least one recognised record lacks the
        'NOE_filter' or the 'RDC_filter' record
    """
    hits = []
    for f in glob.glob(str(previous_smotif_index) + "_*_*.gzip"):
        hits.extend(io.readGzipPickle(f))

    # identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter,
    # qcp_rmsd, Evofilter, RDC_filter, NOE_filter

    scored_keys = ('noe_score', 'rdc_score')
    new_dict = collections.defaultdict(list)
    # NOTE(review): rdc_constant is never reset between hits, so a tiny
    # (scaled by 1e-10) remainder of earlier hits leaks into later scores.
    # Left unchanged to keep the scores identical -- confirm whether a
    # per-hit reset was intended.
    rdc_constant = 0.0
    for hit in hits:
        # thread_data contains data from each search and filter thread.
        total_score = {}
        for data_filter in hit:
            tag = data_filter[0]
            if tag == 'PCS_filter':
                # Computed (and may raise) even though it is not summed below.
                total_score['pcs_score'] = getNchiSum(data_filter, stage)
            if tag == 'Ref_RMSD':
                total_score['rmsd_score'] = data_filter[1]
            if tag == 'RDC_filter':
                for tensor in data_filter[1]:
                    rdc_constant = rdc_constant + tensor[0]
                rdc_constant = rdc_constant * 1e-10
                total_score['rdc_score'] = data_filter[2]
            if tag == 'NOE_filter':
                total_score['noe_score'] = -1 * (math.log(data_filter[1]))

        # calculate the total score and append the hit
        if total_score:
            tscore = 0
            for key in scored_keys:
                tscore = tscore + total_score[key]
            tscore = tscore + rdc_constant
            if tscore < 999.999:
                new_dict[tscore].append(hit)

    # ************************************************
    # Exclude the redundant entries and rank top hits
    # ************************************************
    print("Dumping data to disk")
    dump_pickle = []
    seqs = []  # kept as a list: the sequence objects may not be hashable
    # An entry without a 'seq_filter' record keeps the previous entry's
    # sequence (initially ''), exactly as before.
    smotif_seq = ''
    ranked_entries = ((tscore, entry)
                      for tscore in sorted(new_dict)
                      for entry in new_dict[tscore])
    for tscore, entry in ranked_entries:
        for ent in entry:
            if ent[0] == 'seq_filter':
                smotif_seq = ent[1]
        if smotif_seq in seqs:
            continue
        seqs.append(smotif_seq)
        dump_pickle.append(entry)
        print("final sele %s %s" % (entry[0][1][0][0], tscore))
        if len(dump_pickle) >= num_hits:
            break

    count_top_hits = len(dump_pickle)
    if count_top_hits < num_hits:
        print("could only extract  %s" % count_top_hits)

    io.dumpGzipPickle(str(previous_smotif_index) + "_tophits.gzip", dump_pickle)
    print("actual number in top hits  %s" % len(dump_pickle))
    return range(count_top_hits)
import sys, os, copy sys.path.append('../../main/') __author__ = 'kalabharath' import utility.io_util as io seq = int(sys.argv[1]) top_result = [] t_file = str(seq) + "_refined_tophits.gzip" if os.path.isfile(t_file): top_result = io.readGzipPickle(t_file) else: t_file = str(seq) + "_tophits.gzip" if os.path.isfile(t_file): top_result = io.readGzipPickle(t_file) else: print "Somethis is terrribly wrong !" exit() for p in range(0, len(top_result)): print 'model_', p, top_struct = top_result[p] top_struct = copy.copy(top_struct) for entry in top_struct: if entry[0] == 'cathcodes':
help='Top number of hits to be selected') args = parser.parse_args() # ********************* Define cmd line argument parser ********************* # Rank '0' specifies the master process if rank == 0: # ********************* Extract top hits ********************* in_file = str(args.infile) + "_tophits.gzip" print "infile ", in_file try: tasks = io.readGzipPickle(in_file) print "len of tasks", len(tasks) except: traceback.print_exc() print "There are no entries in the tophits file, nothing to refine" killall(size) exit() # ********************* Generate and distribute job index array ********************* stime = time.time() try: if len(tasks): pass except: