def generate_dict_of_hmmscan_result(self):
        '''Return the cached hmmscan dictionary, building it when unavailable.

        The dictionary maps a query id (the text after the last ':' of each
        query name reported by the parser) to the list of hit names found for
        that query.  It is cached as the string form of the dictionary in
        intermediates/hmmscan_dict, relative to the current directory.

        If the cache file is missing, unreadable, or does not hold a plain
        dictionary literal, the dictionary is rebuilt from
        self.hmmscan_result with hparser.parse (1) and written back to the
        cache file (2).

        Raises:
            IOError/OSError: if the rebuilt dictionary cannot be written
                (for example when the intermediates directory is missing).
        '''
        # Local import: the file's import block is not part of this change.
        import ast

        dict_stored_at = os.path.join('intermediates', 'hmmscan_dict')
        try:
            with open(dict_stored_at) as cache_file:
                # literal_eval only accepts Python literals, so a tampered
                # or corrupted cache file cannot execute code the way
                # eval() would.
                hmmscan_dict = ast.literal_eval(cache_file.read())
            if not isinstance(hmmscan_dict, dict):
                raise ValueError('cached hmmscan result is not a dictionary')
            print('Previous hmm dictionary found.')
            return hmmscan_dict
        except (IOError, OSError, SyntaxError, TypeError, ValueError):
            # No usable cache: fall through and rebuild it.  Any other
            # exception is a real problem and is allowed to propagate.
            pass

        #1.
        hmmscan_results = hparser.parse(self.hmmscan_result, 0.001, 1, 1)
        hits_of_query = hmmscan_results.hit_result_of_name_of_query
        hmmscan_dict = {}
        for query in hits_of_query:
            hmmscan_dict[query.split(':')[-1]] = list(hits_of_query[query])
        #2.
        with open(dict_stored_at, 'w') as cache_file:
            cache_file.write(str(hmmscan_dict))
        print('New hmm dictionary created.')
        return hmmscan_dict
def insertPDBPredictionsIntoDB(hmm, tree_node, basename):
    """Store the PDB-chain hits of a family HMM search in the database.

    Parses ``<basename>_vs_PDB.hmmsearch.out`` and, for every match of every
    hit whose name resolves to a PDB_Chain row, creates one SequenceHMM row
    and one TreeNodePDB row.

    Args:
        hmm: value stored in the ``hmm`` field of each SequenceHMM created.
        tree_node: value stored in the ``tree_node`` field of each
            TreeNodePDB created.
        basename: prefix of the hmmsearch results file name.
    """
    parsed = parse_results_of_hmmsearch_or_hmmscan.parse(
        basename + "_vs_PDB.hmmsearch.out", 0.001, 1, 1)
    hits_of_query = parsed.hit_result_of_name_of_query
    # Only one query is expected here: the family HMM.
    for query in hits_of_query:
        chain_hits = hits_of_query[query]
        for pdb_chain_id in chain_hits:
            # Hit names have the form <pdb id>_<chain id>.
            pdb_id, chain_id = pdb_chain_id.split('_')
            candidates = PDB_Chain.objects.filter(
                pdb__id__exact=pdb_id, chain_id__exact=chain_id)
            if not candidates:
                complaint = "Unrecognized PDB chain %s in hmmsearch results." \
                    % pdb_chain_id
                print(complaint + " "
                      + "The PDB_Chain table may be out of date.")
                continue
            pdb_chain = candidates[0]

            matches = chain_hits[pdb_chain_id].matches
            for match_number in matches:
                match = matches[match_number]
                aligned_seguid = CheckSum.seguid(match.aligned_hit)
                known = AlignedSequence.objects.filter(
                    seguid__exact=aligned_seguid)
                if known:
                    aligned_sequence = known[0]
                else:
                    # Other copies of this function may be running at the
                    # same time and may have created the row a moment ago,
                    # hence get_or_create rather than create.
                    aligned_sequence = AlignedSequence.objects.get_or_create(
                        chars=match.aligned_hit,
                        seguid=aligned_seguid)[0]

                fields = {
                    'hmm': hmm,
                    'sequence': pdb_chain.full_sequence,
                    'aligned_sequence': aligned_sequence,
                    'bit_score': match.bit_score,
                    'e_value': match.i_evalue,
                    'sequence_type': 'query',
                    'hmm_start': match.hmm_from,
                    'hmm_end': match.hmm_to,
                    'sequence_start': match.seq_from,
                    'sequence_end': match.seq_to,
                    'match_type': match.match_type,
                    'n_aligned_chars': match.num_aligned_chars,
                }
                sequence_hmm = SequenceHMM.objects.create(**fields)
                TreeNodePDB.objects.create(
                    sequence_hmm=sequence_hmm,
                    tree_node=tree_node,
                    pdb_chain=pdb_chain)
def main():
    """Write the regions of each target sequence not covered by Pfam matches.

    Reads initial_target_microbial_genomes.fa and the corresponding hmmscan
    output from a fixed directory.  A residue counts as covered when it
    appears as an uppercase character in the aligned hit of any match.
    Every maximal run of uncovered residues whose end index exceeds its
    start index by at least 30 is written to stdout as a FASTA record named
    ``<query>/<start>-<end>`` with 1-based inclusive coordinates.
    """
    data_dir = '/clusterfs/ohana/bpg/InitialMicrobialTargetSet/Pfams/'
    os.chdir(data_dir)
    # The context manager closes the file even if FASTA parsing raises.
    with open('initial_target_microbial_genomes.fa') as fasta_file:
        target_seqs = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
    hmmscan_filename = "initial_target_microbial_genomes_vs_Pfam-A.hmmscan.out"
    hmmscan_results = parse_results_of_hmmsearch_or_hmmscan.parse(
        hmmscan_filename, 0.001, 1, 1)
    hits_of_query = hmmscan_results.hit_result_of_name_of_query

    for query in target_seqs:
        seq = target_seqs[query].seq.tostring()
        qlen = len(seq)

        # 0-based positions of the query that some match covers.
        # NOTE(review): the lookup below raises KeyError if the parser has
        # no entry for a sequence -- confirm it reports every query.
        covered = set()
        pfam_hits = hits_of_query[query]
        for pfam_name in pfam_hits:
            matches = pfam_hits[pfam_name].matches
            for match_number in matches:
                match_result = matches[match_number]
                position = match_result.seq_from - 1
                for char in match_result.aligned_hit:
                    if char.isupper():
                        covered.add(position)
                    # Only letters consume a query residue; other characters
                    # in the aligned hit do not advance the position.
                    if char.isalpha():
                        position += 1

        range_start = -1
        range_end = -1
        # Index qlen is a sentinel that closes a run reaching the end of
        # the sequence, so the emit logic exists only once.
        for i in xrange(qlen + 1):
            if i < qlen and i not in covered:
                if range_start < 0:
                    range_start = i
                range_end = i
                continue
            if range_end >= 0 and range_end - range_start >= 30:
                sys.stdout.write(
                    '>%s/%s-%s\n' % (query, range_start + 1, range_end + 1))
                sys.stdout.write('%s\n' % (seq[range_start:range_end + 1]))
            range_start = -1
            range_end = -1
def insertPFAMPredictionsIntoDB(consensus_sequence, basename):
    """Store the Pfam hits of a family consensus sequence in the database.

    Parses ``<basename>_vs_Pfam-A.hmmscan.out`` (one of the files produced by
    running hmmscan during buildFamily) and creates one SequenceHMM row per
    match of every hit whose name resolves to a Pfam row of the current Pfam
    version that has an HMM row.  Hits that cannot be resolved are reported
    on stdout and skipped.

    Args:
        consensus_sequence: value stored in the ``sequence`` field of each
            SequenceHMM created.
        basename: prefix of the hmmscan results file name.
    """
    parsed = parse_results_of_hmmsearch_or_hmmscan.parse(
        basename + "_vs_Pfam-A.hmmscan.out", 0.001, 1, 1)
    hits_of_query = parsed.hit_result_of_name_of_query
    # Only one query is expected here: the family consensus sequence.
    for query in hits_of_query:
        pfam_hits = hits_of_query[query]
        for pfam_name in pfam_hits:
            pfam_objects = Pfam.objects.filter(
                name__exact=pfam_name,
                overall_pfam_version__exact=current_pfam_version)
            if not pfam_objects:
                print("Unrecognized Pfam name %s for Pfam version %0.1f."
                      % (pfam_name, current_pfam_version))
                advice = "Check that this is the current Pfam version"
                print(advice + " "
                      + "and whether the pfam table is up-to-date.")
                continue

            hmm_objects = HMM.objects.filter(pfam__exact=pfam_objects[0])
            if not hmm_objects:
                print("No HMM entry for Pfam entry %s.  Reload Pfam"
                      % pfam_name)
                continue
            hmm_object = hmm_objects[0]

            matches = pfam_hits[pfam_name].matches
            for match_number in matches:
                match = matches[match_number]
                fields = {
                    'hmm': hmm_object,
                    'sequence': consensus_sequence,
                    'bit_score': match.bit_score,
                    'e_value': match.i_evalue,
                    'sequence_type': 'consensus sequence (different family)',
                    'hmm_start': match.hmm_from,
                    'hmm_end': match.hmm_to,
                    'sequence_start': match.seq_from,
                    'sequence_end': match.seq_to,
                    'match_type': match.match_type,
                    'n_aligned_chars': match.num_aligned_chars,
                }
                SequenceHMM.objects.create(**fields)
# Example no. 5
# 0
def main():
    """Print a CSV listing of every match in an hmmscan results file.

    Takes exactly one command-line argument, the hmmscan results file.  One
    row is printed per match, giving the query, the Pfam name, the match
    number, the start and end positions in the sequence, and the aligned hit
    with '.' and '-' removed and lowercase letters converted to uppercase.
    """
    cli = OptionParser(
        usage="%prog [options] results_of_hmmscan_of_single_seq_vs_pfam")
    _options, positional = cli.parse_args()
    if len(positional) != 1:
        cli.error('Incorrect number of arguments')

    parsed = parse_results_of_hmmsearch_or_hmmscan.parse(
        positional[0], 0.001, 1, 1)
    # Python 2 str.translate: the table maps lowercase to uppercase and the
    # second argument lists the characters to delete.
    to_uppercase = string.maketrans(string.lowercase, string.uppercase)
    gap_characters = '.-'
    row_format = "%s,%s,%s,%s,%s,%s"

    print("Query,Pfam,Match#,SeqFrom,SeqTo,UnalignedHit")
    hits_of_query = parsed.hit_result_of_name_of_query
    for query in hits_of_query:
        pfam_hits = hits_of_query[query]
        for pfam_name in pfam_hits:
            matches = pfam_hits[pfam_name].matches
            for match_number in matches:
                match = matches[match_number]
                unaligned_hit = match.aligned_hit.translate(
                    to_uppercase, gap_characters)
                row = (query, pfam_name, match_number,
                       match.seq_from, match.seq_to, unaligned_hit)
                print(row_format % row)
 def run_and_parse_hmmscan(family_type_str, family_hmm_file):
     """Run hmmscan for one family type and keep each family's best match.

     Uses names that are not defined in this block and must come from the
     enclosing scope: base_name, e_value_cutoff, hmmscan_input, record,
     PHOG_BLAST_DIR and getTimeStr.

     Steps:
       1. Run hmmscan on hmmscan_input against family_hmm_file, writing
          <base_name>_<family_type_str>_hmmscan.out and the matching
          .domtblout file under PHOG_BLAST_DIR.
       2. Parse the text output.  The minimum sequence length passed to the
          parser is 70% of the length of record's sequence when
          family_type_str is "ghg", and 10 otherwise.
       3. Read the domtblout file to list family names in first-seen order.
       4. Using the hits of the first query in the parsed results, keep for
          every listed family the match with the highest bit score.
       5. Pickle the best matches to
          <base_name>_<family_type_str>_output.pkl under PHOG_BLAST_DIR.

     Args:
         family_type_str: label used in file names and log messages; the
             value "ghg" selects the stricter length requirement.
         family_hmm_file: HMM database file handed to hmmscan.

     Returns:
         (ordered_families, best_match): the family names in the order they
         first appear in the domtblout file, and a dictionary mapping
         family id to that family's best match result.
     """
     hmmscan_output = PHOG_BLAST_DIR + '_'.join(
         [base_name, family_type_str, "hmmscan.out"])
     hmmscan_tbloutput = PHOG_BLAST_DIR + '_'.join(
         [base_name, family_type_str, "hmmscan.domtblout"])

     starttime = time.time()
     logging.info('Running %s hmmscan', family_type_str)
     sys.stdout.flush()
     p = subprocess.Popen([
         '/clusterfs/ohana/software/bin/hmmscan', '-o', hmmscan_output,
         '-E',
         str(e_value_cutoff), '--incE',
         str(e_value_cutoff), '--domtblout', hmmscan_tbloutput,
         family_hmm_file, hmmscan_input
     ])
     # No pipes are attached, so communicate() only waits for completion.
     p.communicate()
     if p.returncode != 0:
         # Previously a failed run went unnoticed until parsing; keep going
         # as before but leave a trace in the log.
         logging.warning('hmmscan exited with status %s', p.returncode)
     logging.info('Done. %s', getTimeStr(starttime, time.time()))

     starttime = time.time()
     logging.info('Parsing results')
     sys.stdout.flush()
     # For GHG families, require that the hit align globally to the query
     if family_type_str == "ghg":
         min_sequence_length = int(.7 * len(record.seq.tostring()))
     else:
         min_sequence_length = 10
     hmmscan_results_by_query = parse_results_of_hmmsearch_or_hmmscan.parse(
         hmmscan_output, e_value_cutoff, min_sequence_length, 10)
     logging.info('Done. %s', getTimeStr(starttime, time.time()))

     starttime = time.time()
     logging.info('Parsing domtbl')
     sys.stdout.flush()
     ordered_families = []
     included_families = set()
     with open(hmmscan_tbloutput) as domtbl_file:
         for line in domtbl_file:
             fields = line.split()
             # Skip comment lines, and blank lines, which used to raise
             # IndexError on fields[0].
             if not fields or line[0] == '#':
                 continue
             family = fields[0]
             if family not in included_families:
                 ordered_families.append(family)
                 included_families.add(family)
     logging.info('Done. %s', getTimeStr(starttime, time.time()))

     # Only the first query's hits are used.  Default to no hits so that an
     # empty parse result cannot leave the name unbound below.
     hits_of_query = hmmscan_results_by_query.hit_result_of_name_of_query
     hmmscan_results = {}
     for query in hits_of_query:
         hmmscan_results = hits_of_query[query]
         break

     best_match = {}
     starttime = time.time()
     logging.info('Finding best matches')
     sys.stdout.flush()
     for family_name in ordered_families:
         if family_name not in hmmscan_results:
             continue
         # The numeric family id follows a three-character prefix.
         family = Family.objects.get(id__exact=int(family_name[3:]))
         # NOTE(review): the results of is_global_homology() and of the
         # root/HMM lookups at the end of this loop are not used in this
         # block.  The calls are kept because they may query the database
         # or raise for a missing HMM -- confirm before removing them.
         family.is_global_homology()

         matches = hmmscan_results[family_name].matches
         best_number = -1
         best_score = -10000.0
         for match_number in matches:
             score = matches[match_number].bit_score
             if score > best_score:
                 best_score = score
                 best_number = match_number
         if best_number >= 0:
             best_match[family.id] = matches[best_number]

         root = family.canonical_root_node()
         HMM.objects.get(tree_node=root, hmm_type='HMMER3')
     logging.info('Done. %s', getTimeStr(starttime, time.time()))

     output_file = os.path.join(
         PHOG_BLAST_DIR, base_name + '_%s_output.pkl' % family_type_str)
     with open(output_file, 'w') as pickle_file:
         cPickle.dump(best_match, pickle_file)
     return (ordered_families, best_match)