def gather_and_stitch(seq, tfile): total_data = [] in_file = str(seq) + "_tophits.gzip" if os.path.isfile(in_file): print "Extracting data from non_refined file:", in_file tasks = io.readGzipPickle(in_file) for entry in tasks: total_data.append(entry) else: print "Something is wrong in the datafile: ", in_file refined_data = combine_data() if refined_data: for entry in refined_data: total_data.append(entry) else: pass from ranking.NoeStageRank import rank_assembly_with_clustering ranked_data = rank_assembly_with_clustering(total_data, args.numhits) io.dumpGzipPickle(str(tfile), ranked_data) # delete files try: print "Deleting old rtx_* files!" rm_files = "rm rtx_*.gzip" os.system(rm_files) except NameError: print "No rtx_* files exist!" return True
def altSmotifSearch(job): # send_job = [tasks[t_job[0]], alt_sse_profile[t_job[1]], args.stage, task_index, lowest_noe_energy] all_log = [] task = (job[0])[:] refine_pair = task[8][1] index_array = job[3] print "task_index", index_array for pair in refine_pair: tdump_log = perform_alt_search(job, pair) if tdump_log: for t in tdump_log: all_log.append(t) # Dump data to the disk if all_log: io.dumpGzipPickle("rtx_" + str(index_array) + ".gzip", all_log) return False else: return False
def makeTopPickle2Old(previous_smotif_index, num_hits, stage): """ Concatenate data from all of the threads, organize, remove redundancies, rank and extract top hits as defined :param previous_smotif_index: :param num_hits: :param stage: :return: """ hits = [] # regex = str(previous_smotif_index) + "_*_*.pickle" regex = str(previous_smotif_index) + "_*_*.gzip" file_list = glob.glob(regex) for f in file_list: t_hits = io.readGzipPickle(f) for t_hit in t_hits: hits.append(t_hit) """ identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter RDC_filter, NOE_filter """ new_dict = collections.defaultdict(list) for hit in hits: # thread_data contains data from each search and filter thread. if hit[4][0] == 'NOE_filter': no_of_noes = hit[4][2] new_dict[no_of_noes].append(hit) keys = new_dict.keys() keys.sort() keys.reverse() # Rank based on NOE energy non_redundant = collections.defaultdict(list) reduced_dump_log = [] seqs = [] count_hits = 0 for i in range(len(keys)): entries = new_dict[keys[i]] if len( entries ) == 1: # There is only one entry in this no_of_noes bin just check of existing sequences and move on smotif_seq = entries[0][3][1] if smotif_seq not in seqs: seqs.append(smotif_seq) reduced_dump_log.append(entries[0]) print "final sele", entries[0][0][1][0][0], keys[i] count_hits += 1 else: t_log = collections.defaultdict(list) for hit in entries: # filter on noe_energy if hit[4][0] == 'NOE_filter': noe_energy = hit[4][3] noe_energy = round(noe_energy, 2) t_log[noe_energy].append(hit) noe_energy_bins = t_log.keys() noe_energy_bins.sort() for j in range(len(noe_energy_bins)): # filter on RDC score t2_log = collections.defaultdict(list) hits = t_log[noe_energy_bins[j]] for hit in hits: if hit[5][0] == 'RDC_filter': rdc_tensors = hit[5][1] rdc_score = 0 for tensor in rdc_tensors: rdc_score = rdc_score + tensor[0] t2_log[rdc_score].append(hit) rdc_score_bins = t2_log.keys() rdc_score_bins.sort() for k in range(len(rdc_score_bins)): hits = t2_log[rdc_score_bins[k]] for hit in hits: smotif_seq = hit[3][1] if smotif_seq not in seqs: seqs.append(smotif_seq) reduced_dump_log.append(hit) print "final sele", hit[0][1][0][0], keys[ i], noe_energy_bins[j], rdc_score_bins[k] count_hits += 1 if count_hits >= num_hits: break if count_hits >= num_hits: break if count_hits >= num_hits: pass else: print "could only extract ", len(reduced_dump_log), count_hits # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle) io.dumpGzipPickle( str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log) print "actual number in top hits ", len(reduced_dump_log) return range(len(reduced_dump_log))
def makeTopPickle2(previous_smotif_index, num_hits, stage): """ Concatenate data from all of the threads, organize, remove redundancies, rank and extract top hits as defined :param previous_smotif_index: :param num_hits: :param stage: :return: """ hits = [] # regex = str(previous_smotif_index) + "_*_*.pickle" regex = str(previous_smotif_index) + "_*_*.gzip" file_list = glob.glob(regex) for f in file_list: t_hits = io.readGzipPickle(f) for t_hit in t_hits: hits.append(t_hit) """ identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter RDC_filter, NOE_filter """ new_dict = collections.defaultdict(list) for hit in hits: # thread_data contains data from each search and filter thread. noe_energy = hit[5][3] noe_energy = round(noe_energy, 4) new_dict[noe_energy].append(hit) keys = new_dict.keys() keys.sort() # Rank based on NOE energy reduced_dump_log = [] seqs = [] count_hits = 0 for i in range(len(keys)): entries = new_dict[keys[i]] if count_hits >= num_hits: break if len(entries) == 1: smotif_seq = entries[0][4][1] if smotif_seq not in seqs: seqs.append(smotif_seq) reduced_dump_log.append(entries[0]) print "final sele", entries[0][0][1][0][0], keys[i] count_hits += 1 if count_hits >= num_hits: break else: t2_log = collections.defaultdict(list) if hit[6][0] == 'RDC_filter': for hit in entries: # if hit[5][0] == 'RDC_filter': rdc_tensors = hit[6][1] rdc_score = 0 for tensor in rdc_tensors: rdc_score = rdc_score + tensor[0] t2_log[rdc_score].append(hit) rdc_score_bins = t2_log.keys() rdc_score_bins.sort() elif hit[6][0] == 'PCS_filter': print "Working on PCS filter instead of RDC" for hit in entries: # if hit[5][0] == 'RDC_filter': rdc_tensors = hit[6][1] rdc_score = 0 for tensor in rdc_tensors: rdc_score = rdc_score + tensor[1] t2_log[rdc_score].append(hit) rdc_score_bins = t2_log.keys() rdc_score_bins.sort() else: print "Something is wrong with your PCS logic" for k in range(len(rdc_score_bins)): hits = t2_log[rdc_score_bins[k]] for hit in hits: smotif_seq = hit[4][1] if smotif_seq not in seqs: seqs.append(smotif_seq) reduced_dump_log.append(hit) count_hits += 1 print "final sele", hit[0][1][0][0], keys[ i], rdc_score_bins[k] if count_hits >= num_hits: break if count_hits >= num_hits: break if count_hits >= num_hits: break else: pass if count_hits >= num_hits: pass else: print "could only extract ", len(reduced_dump_log), count_hits io.dumpGzipPickle( str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log) print "actual number in top hits ", len(reduced_dump_log) return range(len(reduced_dump_log))
def makeTopPickle(previous_smotif_index, num_hits, stage): """ Concatenate data from all of the threads, organize, remove redundancies, rank and extract top hits as defined :param previous_smotif_index: :param num_hits: :param stage: :return: """ hits = [] # regex = str(previous_smotif_index) + "_*_*.pickle" regex = str(previous_smotif_index) + "_*_*.gzip" file_list = glob.glob(regex) for f in file_list: t_hits = io.readGzipPickle(f) for t_hit in t_hits: hits.append(t_hit) """ identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter RDC_filter, NOE_filter """ new_dict = collections.defaultdict(list) rdc_constant = 0.0 for hit in hits: # thread_data contains data from each search and filter thread. # initialize total score array total_score = {} for data_filter in hit: if data_filter[0] == 'PCS_filter': pcs_data = data_filter pcsscore = getNchiSum(pcs_data, stage) total_score['pcs_score'] = pcsscore if data_filter[0] == 'Ref_RMSD': total_score['rmsd_score'] = data_filter[1] if data_filter[0] == 'RDC_filter': rdc_data = data_filter #Nchi = rdcSumChi(rdc_data, stage) log_likelihood = data_filter[2] rdc_tensors = data_filter[1] for tensor in rdc_tensors: rdc_constant = rdc_constant + tensor[0] rdc_constant = rdc_constant * 1e-10 total_score['rdc_score'] = log_likelihood if data_filter[0] == 'NOE_filter': noe_probability = data_filter[1] log_likelihood = -1 * (math.log(noe_probability)) total_score['noe_score'] = log_likelihood # calculate the total score and append the hit if total_score: keys = total_score.keys() keys = ['noe_score', 'rdc_score'] #keys = ['rmsd_score'] tscore = 0 for key in keys: tscore = tscore + total_score[key] tscore = tscore + rdc_constant if tscore < 999.999: new_dict[tscore].append(hit) # ************************************************ # Exclude the redundant entries and rank top hits # ************************************************ keys = new_dict.keys() keys.sort() # Exclude the redundant data. # non_redundant = {} non_redundant = collections.defaultdict(list) seqs = [] smotif_seq = '' count_hits = 0 for i in range(0, len(keys)): entries = new_dict[keys[i]] for entry in entries: for ent in entry: if ent[0] == 'seq_filter': seq_filter = ent smotif_seq = seq_filter[1] if smotif_seq not in seqs: seqs.append(smotif_seq) non_redundant[keys[i]].append(entry) count_hits += 1 if count_hits >= num_hits: break # Rank top hits and dump the data keys = non_redundant.keys() keys.sort() dump_pickle = [] print "Dumping data to disk" count_top_hits = 0 while (True): for key in keys: entries = non_redundant[key] for entry in entries: dump_pickle.append(entry) # print "final selected Smotif: ", entry[0][1][0][0], "with score: ", key print "final sele", entry[0][1][0][0], key count_top_hits += 1 if count_top_hits >= num_hits: break if count_top_hits >= num_hits: break else: print "could only extract ", count_top_hits break # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle) io.dumpGzipPickle( str(previous_smotif_index) + "_tophits.gzip", dump_pickle) print "actual number in top hits ", len(dump_pickle) return range(count_top_hits)
tasks), "Smotifs, Elapsed", round((elapsed) / (60), 2), "mins" elif tag == tags.EXIT: closed_workers += 1 # consolidate top_hits and dump files here if args.stage == 1: tasks, sse_index = srank.getRunSeq( args.numhits, args.stage) # TODO change this for new alt_smotifs exit() print "Total number of hits found are : ", len(total_data) # ranked_data = rank_assembly(total_data, args.numhits) ranked_data = rank_assembly_with_clustering_pcs2(total_data, args.numhits) print len(ranked_data) if args.stage == 1: sse_index = 0 io.dumpGzipPickle(str(sse_index) + "_tophits.gzip", ranked_data) # Rename temprary files util.rename_pickle(sse_index) print "All Done, Master exiting" exit() # On the worker processes else: while True: # initiate infinite loop comm.send(None, dest=0, tag=tags.READY) # Signal the master process that you are READY task = comm.recv(source=0, tag=MPI.ANY_SOURCE, status=status) tag = status.Get_tag() if tag == tags.START:
def S1SmotifSearch(task): """ Main () :param task: :return: """ index_array = task[0] stage = task[1] s1_def, s2_def, sse_route = mutil.getSSdef(index_array) smotif_def = sm.getSmotif(s1_def, s2_def) exp_data = io.readPickle("exp_data.pickle") exp_data_types = exp_data.keys( ) # ['ss_seq', 'pcs_data', 'aa_seq', 'contacts'] smotif_data = sm.readSmotifDatabase(smotif_def, exp_data['database_cutoff']) if not smotif_data: # If the smotif library doesn't exist, terminate further execution. return False dump_log = [] # ************************************************************************************************ # Main # The 'for' loop below iterates over all of the Smotifs and applies various filters # This is the place to add new filters as you desire. For starters, look at Sequence filter. # ************************************************************************************************ for i in range(0, len(smotif_data)): # ************************************************ # Excluding the natives # ************************************************ natives = exp_data['natives'] tpdbid = smotif_data[i][0][0] pdbid = tpdbid[0:4] if 'natives' in exp_data_types: if pdbid in natives: continue # Stop further execution, but, iterate. else: pass if 'homologs' in exp_data_types: # Smotif assembly only from the specified pdb files homologs = exp_data['homologs'] if pdbid not in homologs: # Stop further execution, but, iterate. continue else: pass # ************************************************ # Applying different filters to Smotifs # Prepare temp log array to save data at the end # ************************************************ tlog, pcs_tensor_fits, rdc_tensor_fits, = [], [], [] ref_rmsd, noe_probability = 0.0, 0.0 tlog.append(['smotif', smotif_data[i], sse_route]) tlog.append(['smotif_def', [s1_def, s2_def]]) tlog.append(['qcp_rmsd']) tlog.append(['cathcodes', [smotif_data[i][0]], [sse_route]]) # ************************************************ # Sequence filter # Aligns the smotif seq to target seq and calculates # sequence identity and the alignment score # ************************************************ smotif_seq, seq_identity = Sfilter.getS1SeqIdentity( s1_def, s2_def, smotif_data[i], exp_data) tlog.append(['seq_filter', smotif_seq, seq_identity]) # ************************************************ # Unambiguous NOE score filter # uses experimental ambiguous noe data to filter Smotifs # scoring based on f-measure? # ************************************************ if 'ilva_noes' in exp_data_types: noe_probability, no_of_noes, noe_energy, noe_data, cluster_protons, cluster_sidechains = noepdf.s1ILVApdf( s1_def, s2_def, smotif_data[i], exp_data, stage) if noe_probability >= exp_data['expected_noe_prob'][stage - 1]: tlog.append([ 'NOE_filter', noe_probability, no_of_noes, noe_energy, noe_data, cluster_protons, cluster_sidechains ]) else: continue # ************************************************ # Residual dipolar coupling filter # uses experimental RDC data to filter Smotifs # scoring based on normalised chisqr. # ************************************************ if 'rdc_data' in exp_data_types: rdc_tensor_fits, log_likelihood, rdc_energy = Rfilter.RDCAxRhFit( s1_def, s2_def, smotif_data[i], exp_data) if rdc_tensor_fits: tlog.append([ 'RDC_filter', rdc_tensor_fits, log_likelihood, rdc_energy ]) else: continue # ************************************************ # Pseudocontact Shift filter # uses experimental PCS data to filter Smotifs # scoring based on normalised chisqr # ************************************************ if 'pcs_data' in exp_data_types: pcs_tensor_fits = Pfilter.PCSAxRhFit(s1_def, s2_def, smotif_data[i], exp_data) tlog.append(['PCS_filter', pcs_tensor_fits]) # ************************************************ # Calc RMSD of the reference structure. # Used to identify the lowest possible RMSD # structure for the target, from the Smotif library. # ************************************************ if 'reference_ca' in exp_data_types: ref_rmsd = ref.calcRefRMSD(exp_data['reference_ca'], s1_def, s2_def, smotif_data[i], rmsd_cutoff=100.0) tlog.append(['Ref_RMSD', ref_rmsd, seq_identity]) # Dump the data to the disk if pcs_tensor_fits or noe_probability: dump_log.append(tlog) # Save all of the hits in pickled arrays if dump_log: if 'rank_top_hits' in exp_data_types: rank_top_hits = exp_data['rank_top_hits'] num_hits = rank_top_hits[stage - 1] dump_log = rank.rank_assembly(dump_log, num_hits) print "Reducing the amount of data to:", rank_top_hits[ stage - 1], len(dump_log) print "num of hits", len(dump_log) io.dumpGzipPickle( '0_' + str(index_array[0]) + "_" + str(index_array[1]) + ".gzip", dump_log) return dump_log else: return False