def generate_dict_of_hmmscan_result(self):
    """Return the cached hmmscan dictionary, building and caching it if needed.

    The cache lives at ``intermediates/hmmscan_dict`` (relative to the
    current working directory) and holds the ``str()`` form of the
    dictionary.  When the cache cannot be read or parsed, the dictionary is
    rebuilt by parsing ``self.hmmscan_result`` with ``hparser`` (1) and the
    string form is written back to the cache file (2).

    Returns:
        dict mapping a query id (the text after the last ':' of the query
        name; the original notes call these UniProt ids) to a list of the
        Pfam names hit by that query.
    """
    # Local import: only this function needs it, and the file's import
    # block is not visible from here.
    import ast

    dict_stored_at = os.path.join('intermediates', 'hmmscan_dict')
    try:
        with open(dict_stored_at) as cache_file:
            # literal_eval only accepts Python literals, so a tampered or
            # corrupted cache file cannot execute code the way eval() would.
            hmmscan_dict = ast.literal_eval(cache_file.read())
        if not isinstance(hmmscan_dict, dict):
            raise ValueError('cached hmmscan data is not a dictionary')
    except (IOError, OSError, SyntaxError, ValueError, TypeError):
        # Missing, unreadable or malformed cache: rebuild it from scratch.
        hmmscan_dict = {}
        #1.
        hmmscan_results = hparser.parse(self.hmmscan_result, 0.001, 1, 1)
        hits_by_query = hmmscan_results.hit_result_of_name_of_query
        for query in hits_by_query:
            query_id = query.split(':')[-1]
            hmmscan_dict[query_id] = list(hits_by_query[query])
        #2.
        with open(dict_stored_at, 'w') as cache_file:
            cache_file.write(str(hmmscan_dict))
        print('New hmm dictionary created.')
    else:
        print('Previous hmm dictionary found.')
    return hmmscan_dict
def insertPDBPredictionsIntoDB(hmm, tree_node, basename): hmmsearch_filename = basename + "_vs_PDB.hmmsearch.out" hmmsearch_results = parse_results_of_hmmsearch_or_hmmscan.parse( hmmsearch_filename, 0.001, 1, 1) # There should only be one query here - the family HMM for query in hmmsearch_results.hit_result_of_name_of_query: for pdb_chain_id in hmmsearch_results.hit_result_of_name_of_query[ query]: pdb_id, chain_id = pdb_chain_id.split('_') pdb_chain_objects = PDB_Chain.objects.filter( pdb__id__exact=pdb_id, chain_id__exact=chain_id) if pdb_chain_objects: pdb_chain = pdb_chain_objects[0] else: print "Unrecognized PDB chain %s in hmmsearch results." \ % pdb_chain_id, print "The PDB_Chain table may be out of date." continue hit_result \ = hmmsearch_results.hit_result_of_name_of_query[query][pdb_chain_id] for match_number in hit_result.matches: match_result = hit_result.matches[match_number] aligned_seguid = CheckSum.seguid(match_result.aligned_hit) aligned_sequence_objects = AlignedSequence.objects.filter( seguid__exact=aligned_seguid) if aligned_sequence_objects: aligned_sequence = aligned_sequence_objects[0] else: # Because multiple versions of this fun run simultaneously, it is # possible this was just created moments ago. So, get_or_create # just in case. aligned_sequence, is_created \ = AlignedSequence.objects.get_or_create( chars = match_result.aligned_hit, seguid = aligned_seguid) sequence_hmm = SequenceHMM.objects.create( hmm=hmm, sequence=pdb_chain.full_sequence, aligned_sequence=aligned_sequence, bit_score=match_result.bit_score, e_value=match_result.i_evalue, sequence_type='query', hmm_start=match_result.hmm_from, hmm_end=match_result.hmm_to, sequence_start=match_result.seq_from, sequence_end=match_result.seq_to, match_type=match_result.match_type, n_aligned_chars=match_result.num_aligned_chars) TreeNodePDB.objects.create(sequence_hmm=sequence_hmm, tree_node=tree_node, pdb_chain=pdb_chain)
def _write_region_if_long_enough(query, seq, range_start, range_end):
    """Write seq[range_start..range_end] (0-based, inclusive) as a FASTA record.

    Nothing is written unless range_end - range_start >= 30, i.e. the region
    holds at least 31 residues.  The header uses 1-based coordinates.
    """
    if range_end - range_start >= 30:
        sys.stdout.write(
            '>%s/%s-%s\n' % (query, range_start + 1, range_end + 1))
        sys.stdout.write('%s\n' % (seq[range_start:range_end + 1]))


def main():
    """Write the regions of each target sequence not covered by a Pfam match.

    Changes into the hard-coded working directory, reads the target
    sequences from 'initial_target_microbial_genomes.fa' and the matching
    hmmscan-vs-Pfam-A output, and marks a residue as covered when it is
    aligned as an uppercase character in any match.  Every maximal run of
    uncovered residues spanning at least 31 positions is written to stdout
    as a FASTA record named '<query>/<start>-<end>' (1-based, inclusive).

    A sequence with no entry in the hmmscan results is treated as having no
    coverage, instead of raising KeyError on the lookup.
    """
    work_dir = '/clusterfs/ohana/bpg/InitialMicrobialTargetSet/Pfams/'
    os.chdir(work_dir)
    with open('initial_target_microbial_genomes.fa') as f:
        target_seqs = SeqIO.to_dict(SeqIO.parse(f, "fasta"))
    hmmscan_filename = "initial_target_microbial_genomes_vs_Pfam-A.hmmscan.out"
    hmmscan_results = parse_results_of_hmmsearch_or_hmmscan.parse(
        hmmscan_filename, 0.001, 1, 1)
    hits_by_query = hmmscan_results.hit_result_of_name_of_query

    for query in target_seqs:
        seq = target_seqs[query].seq.tostring()
        qlen = len(seq)

        try:
            pfam_hits = hits_by_query[query]
        except KeyError:
            # No Pfam hit was reported for this sequence.
            pfam_hits = {}

        # 0-based positions of residues aligned as uppercase in some match.
        covered = set()
        for pfam_name in pfam_hits:
            hit_result = pfam_hits[pfam_name]
            for match_number in hit_result.matches:
                match_result = hit_result.matches[match_number]
                i = match_result.seq_from - 1
                for aligned_char in match_result.aligned_hit:
                    if aligned_char.isupper():
                        covered.add(i)
                    # Only letters advance the sequence position; other
                    # alignment characters (e.g. gaps) do not.
                    if aligned_char.isalpha():
                        i += 1

        # Scan for maximal uncovered runs; -1 means "no run in progress".
        range_start = -1
        range_end = -1
        for i in xrange(qlen):
            if i in covered:
                if range_end >= 0:
                    _write_region_if_long_enough(
                        query, seq, range_start, range_end)
                    range_start = -1
                    range_end = -1
            else:
                if range_start < 0:
                    range_start = i
                range_end = i
        # Flush a run that extends to the end of the sequence.
        if range_end >= 0:
            _write_region_if_long_enough(query, seq, range_start, range_end)
def insertPFAMPredictionsIntoDB(consensus_sequence, basename): # This file is one of the results of running hmmscan during buildFamily. hmmscan_filename = basename + "_vs_Pfam-A.hmmscan.out" hmmscan_results = parse_results_of_hmmsearch_or_hmmscan.parse( hmmscan_filename, 0.001, 1, 1) # There should only be one query here - the family consensus sequence for query in hmmscan_results.hit_result_of_name_of_query: for pfam_name in hmmscan_results.hit_result_of_name_of_query[query]: hit_result = hmmscan_results.hit_result_of_name_of_query[query][ pfam_name] pfam_objects = Pfam.objects.filter( name__exact=pfam_name, overall_pfam_version__exact=current_pfam_version) if pfam_objects: hmm_objects = HMM.objects.filter(pfam__exact=pfam_objects[0]) if hmm_objects: hmm_object = hmm_objects[0] else: print "No HMM entry for Pfam entry %s. Reload Pfam" % pfam_name continue for match_number in hit_result.matches: match_result = hit_result.matches[match_number] SequenceHMM.objects.create( hmm=hmm_object, sequence=consensus_sequence, bit_score=match_result.bit_score, e_value=match_result.i_evalue, sequence_type='consensus sequence (different family)', hmm_start=match_result.hmm_from, hmm_end=match_result.hmm_to, sequence_start=match_result.seq_from, sequence_end=match_result.seq_to, match_type=match_result.match_type, n_aligned_chars=match_result.num_aligned_chars) else: print "Unrecognized Pfam name %s for Pfam version %0.1f." \ % (pfam_name, current_pfam_version) print "Check that this is the current Pfam version", print "and whether the pfam table is up-to-date."
def main():
    """Dump every match of an hmmscan result file to stdout as CSV.

    Takes exactly one command-line argument, the hmmscan output file.
    After a header row, one row is printed per match with the query name,
    Pfam name, match number, sequence start/end and the aligned hit with
    '.' and '-' removed and lowercase letters converted to uppercase.
    """
    cli = OptionParser(
        usage="%prog [options] results_of_hmmscan_of_single_seq_vs_pfam")
    (options, args) = cli.parse_args()
    if len(args) != 1:
        cli.error('Incorrect number of arguments')

    parsed = parse_results_of_hmmsearch_or_hmmscan.parse(
        args[0], 0.001, 1, 1)
    hits_by_query = parsed.hit_result_of_name_of_query

    # One translate() call both uppercases and strips the gap characters.
    to_upper = string.maketrans(string.lowercase, string.uppercase)
    gap_chars = '.-'
    row_template = "%s,%s,%s,%s,%s,%s"

    print("Query,Pfam,Match#,SeqFrom,SeqTo,UnalignedHit")
    for query in hits_by_query:
        pfam_hits = hits_by_query[query]
        for pfam_name in pfam_hits:
            matches = pfam_hits[pfam_name].matches
            for match_number in matches:
                match = matches[match_number]
                ungapped = match.aligned_hit.translate(to_upper, gap_chars)
                row = (query, pfam_name, match_number,
                       match.seq_from, match.seq_to, ungapped)
                print(row_template % row)
def run_and_parse_hmmscan(family_type_str, family_hmm_file):
    """Run hmmscan against `family_hmm_file` and keep the best match per family.

    Relies on names defined outside this function: `base_name`,
    `PHOG_BLAST_DIR`, `e_value_cutoff`, `hmmscan_input`, `record`,
    `getTimeStr`, `Family` and `HMM`.

    Steps:
      1. Run hmmscan, writing '<base_name>_<type>_hmmscan.out' and
         '<base_name>_<type>_hmmscan.domtblout' under PHOG_BLAST_DIR.
      2. Parse the main output.
      3. Read the domtblout file to get family names in order of first
         appearance.
      4. For the first query in the parsed results, pick the match with the
         highest bit score for every family.
      5. Pickle the best matches to '<base_name>_<type>_output.pkl'.

    Args:
        family_type_str: family type label used in file names and logging;
            the value "ghg" switches on the global-alignment length
            requirement.
        family_hmm_file: HMM database path passed to hmmscan.

    Returns:
        (ordered_families, best_match): the family names from the domtblout
        file in order of first appearance, and a dict mapping family id to
        its highest-scoring match result.
    """
    hmmscan_results_filename = '_'.join(
        [base_name, family_type_str, "hmmscan.out"])
    hmmscan_results_table_filename = '_'.join(
        [base_name, family_type_str, "hmmscan.domtblout"])
    hmmscan_output = PHOG_BLAST_DIR + hmmscan_results_filename
    hmmscan_tbloutput = PHOG_BLAST_DIR + hmmscan_results_table_filename

    starttime = time.time()
    logging.info('Running %s hmmscan' % family_type_str)
    sys.stdout.flush()
    p = subprocess.Popen([
        '/clusterfs/ohana/software/bin/hmmscan',
        '-o', hmmscan_output,
        '-E', str(e_value_cutoff),
        '--incE', str(e_value_cutoff),
        '--domtblout', hmmscan_tbloutput,
        family_hmm_file,
        hmmscan_input,
    ])
    # No pipes are attached, so communicate() only waits for the process.
    p.communicate(input=None)
    if p.returncode != 0:
        # Report the failure but carry on, as the original code did.
        logging.warning('hmmscan exited with status %s' % p.returncode)
    endtime = time.time()
    logging.info('Done. %s' % getTimeStr(starttime, endtime))

    starttime = time.time()
    logging.info('Parsing results')
    sys.stdout.flush()
    # For GHG families, require that the hit align globally to the query
    if family_type_str == "ghg":
        min_sequence_length = int(.7 * len(record.seq.tostring()))
    else:
        min_sequence_length = 10
    hmmscan_results_by_query = parse_results_of_hmmsearch_or_hmmscan.parse(
        hmmscan_output, e_value_cutoff, min_sequence_length, 10)
    endtime = time.time()
    logging.info('Done. %s' % getTimeStr(starttime, endtime))

    starttime = time.time()
    logging.info('Parsing domtbl')
    sys.stdout.flush()
    ordered_families = []
    included_families = set()
    with open(hmmscan_tbloutput) as f:
        for line in f:
            fields = line.split()
            # Skip comment lines, and blank lines (which used to raise
            # IndexError on the empty split result).
            if not fields or line[0] == '#':
                continue
            family = fields[0]
            if family not in included_families:
                ordered_families.append(family)
                included_families.add(family)
    endtime = time.time()
    logging.info('Done. %s' % getTimeStr(starttime, endtime))

    # Only the first query's hits are used.  Default to no hits so an empty
    # parse result does not leave the name unbound.
    hmmscan_results = {}
    for query in hmmscan_results_by_query.hit_result_of_name_of_query:
        hmmscan_results = hmmscan_results_by_query.hit_result_of_name_of_query[
            query]
        break

    highest_scoring_match_number = {}
    best_bit_score = {}
    best_match = {}
    starttime = time.time()
    logging.info('Finding best matches')
    sys.stdout.flush()
    for family_name in ordered_families:
        if family_name not in hmmscan_results:
            continue
        # The numeric family id follows a three-character name prefix.
        family = Family.objects.get(id__exact=int(family_name[3:]))
        # NOTE(review): family_type is never read afterwards - confirm
        # whether this lookup can be dropped.
        if family.is_global_homology():
            family_type = 1
        else:
            family_type = 0
        highest_scoring_match_number[family.id] = -1
        best_bit_score[family.id] = -10000.0
        matches = hmmscan_results[family_name].matches
        for match_number in matches:
            match_result = matches[match_number]
            if match_result.bit_score > best_bit_score[family.id]:
                best_bit_score[family.id] = match_result.bit_score
                highest_scoring_match_number[family.id] = match_number
        if highest_scoring_match_number[family.id] >= 0:
            best_match[family.id] = matches[
                highest_scoring_match_number[family.id]]
            # NOTE(review): root and hmm are never read afterwards; the
            # lookups are kept because they may raise for a family without
            # a HMMER3 HMM - confirm whether that check is intended.
            root = family.canonical_root_node()
            hmm = HMM.objects.get(tree_node=root, hmm_type='HMMER3')
    endtime = time.time()
    logging.info('Done. %s' % getTimeStr(starttime, endtime))

    output_file = os.path.join(
        PHOG_BLAST_DIR, base_name + '_%s_output.pkl' % family_type_str)
    # Pickle data is written in binary mode.
    with open(output_file, 'wb') as f:
        cPickle.dump(best_match, f)
    return (ordered_families, best_match)