def annotate(params, proteins):
    """
    Runs TMHMM and parses the output file.

    Takes a standard 'inmembrane' params dictionary and a global proteins
    dictionary which it populates with results.

    The command '<tmhmm_bin> <fasta>' is built from `params` and handed to
    `run` together with the output file name 'tmhmm.out'. The whole FASTA
    file named by params['fasta'] is passed in a single invocation. The
    text of 'tmhmm.out' is then given to `parse_tmhmm`.

    According to the plugin's documentation, these keys are added to the
    proteins dictionary (by `parse_tmhmm`):

      - 'tmhmm_helices', a list of tuples describing the first and last
        residue number of each transmembrane segment;

      - 'tmhmm_scores', a list of confidence scores (floats) for each
        predicted tm segment;

      - 'tmhmm_inner_loops', a list of tuples describing the first and last
        residue number of each predicted internal loop segment;

      - 'tmhmm_outer_loops', a list of tuples describing the first and last
        residue number of each predicted outer loop segment;

    Returns whatever `parse_tmhmm` returns.

    Raises KeyError if 'tmhmm_bin' or 'fasta' is missing from `params`.
    """
    tmhmm_out = 'tmhmm.out'
    run('%(tmhmm_bin)s %(fasta)s' % params, tmhmm_out)

    # Read through a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(tmhmm_out) as tmhmm_file:
        tmhmm_text = tmhmm_file.read()

    return parse_tmhmm(tmhmm_text, proteins)
def annotate(params, proteins):
    """
    Annotates proteins with SignalP results.

    Every entry of `proteins` first receives the defaults
    'is_signalp' = False and 'signalp_cleave_position' = None. The command
    '<signalp4_bin> -t <signalp4_organism> <fasta>' is then built from
    `params` and handed to `run` together with the output file name
    'signalp.out'. The opened output file is passed to `parse_signalp`,
    whose return value is returned to the caller.
    """
    # Reset the SignalP fields before anything is run or parsed.
    for record in proteins.values():
        record['is_signalp'] = False
        record['signalp_cleave_position'] = None

    output_path = 'signalp.out'
    command = '%(signalp4_bin)s -t %(signalp4_organism)s %(fasta)s' % params
    run(command, output_path)

    # Parsing is delegated entirely to parse_signalp, which receives the
    # open file object rather than its text.
    with open(output_path) as handle:
        return parse_signalp(handle, proteins)
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename of the .hmm files found in the
         directory indicated by the 'hmm_profiles_dir' field of 'params'.

    For every '*.hmm' file in params['hmm_profiles_dir'] the command
    '<hmmsearch3_bin> -Z 2000 -E 10 <hmm_profile> <fasta>' is handed to
    `run` with the output file name 'hmm.<name>.out'. That file is then
    scanned: a line starting with '>>' selects the current sequence, and
    each following line containing 'conditional E-value' is a candidate
    hit. A hit is recorded when its E-value is at most
    params['hmm_evalue_max'] and its score is at least
    params['hmm_score_min'].

    Side effect: params['hmm_profile'] is overwritten with the path of each
    profile in turn.

    Raises KeyError if a sequence id found in the output is not a key of
    `proteins`, or if a required entry is missing from `params`.
    """
    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])

    # Initialise the field up front so that every protein carries a
    # 'hmmsearch' list even when the profile directory has no .hmm files.
    for seqid in proteins:
        if 'hmmsearch' not in proteins[seqid]:
            proteins[seqid]['hmmsearch'] = []

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        # The command template below reads the profile path from params.
        params['hmm_profile'] = hmm_profile
        hmm_name = os.path.basename(hmm_profile).replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name
        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % \
              params
        run(cmd, hmmsearch3_out)

        # Parse the hmmsearch output file; the context manager closes the
        # handle for each profile instead of leaking one per iteration.
        seqid = None
        with open(hmmsearch3_out) as hmm_output:
            for line in hmm_output:
                if line.startswith(">>"):
                    seqid = parse_fasta_header(line[3:])[0]
                    continue
                if seqid is None:
                    # Still in the preamble before the first sequence.
                    continue
                if 'conditional E-value' in line:
                    words = line.split()
                    # E-value is the last token and the score sits five
                    # tokens from the end of the line.
                    evalue = float(words[-1])
                    score = float(words[-5])
                    if (evalue <= params['hmm_evalue_max']
                            and score >= params['hmm_score_min']):
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename of the .hmm files found in the
         directory indicated by the 'hmm_profiles_dir' field of 'params'.

    For every '*.hmm' file in params['hmm_profiles_dir'] the command
    '<hmmsearch3_bin> -Z 2000 -E 10 <hmm_profile> <fasta>' is handed to
    `run` with the output file name 'hmm.<name>.out'. That file is then
    scanned: a line starting with '>>' selects the current sequence, and
    each following line containing 'conditional E-value' is a candidate
    hit. A hit is recorded when its E-value is at most
    params['hmm_evalue_max'] and its score is at least
    params['hmm_score_min'].

    Side effect: params['hmm_profile'] is overwritten with the path of each
    profile in turn.

    Raises KeyError if a sequence id found in the output is not a key of
    `proteins`, or if a required entry is missing from `params`.
    """
    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])

    # Initialise the field up front so that every protein carries a
    # 'hmmsearch' list even when the profile directory has no .hmm files.
    for seqid in proteins:
        if 'hmmsearch' not in proteins[seqid]:
            proteins[seqid]['hmmsearch'] = []

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        # The command template below reads the profile path from params.
        params['hmm_profile'] = hmm_profile
        hmm_name = os.path.basename(hmm_profile).replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name
        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % \
              params
        run(cmd, hmmsearch3_out)

        # Parse the hmmsearch output file; the context manager closes the
        # handle for each profile instead of leaking one per iteration.
        seqid = None
        with open(hmmsearch3_out) as hmm_output:
            for line in hmm_output:
                if line.startswith(">>"):
                    seqid = parse_fasta_header(line[3:])[0]
                    continue
                if seqid is None:
                    # Still in the preamble before the first sequence.
                    continue
                if 'conditional E-value' in line:
                    words = line.split()
                    # E-value is the last token and the score sits five
                    # tokens from the end of the line.
                    evalue = float(words[-1])
                    score = float(words[-5])
                    if (evalue <= params['hmm_evalue_max']
                            and score >= params['hmm_score_min']):
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
def annotate(params, proteins):
    """
    Uses LipoP to identify lipo-attachment signals in the protein.

    The command '<lipop1_bin> <fasta>' is built from `params` and handed to
    `run` together with the output file name 'lipop.out'; the text of that
    file is then given to `parse_lipop`.

    According to the plugin's documentation, the 'proteins' dictionary is
    annotated (by `parse_lipop`) with two fields:
      - 'is_lipop' is a boolean indicating whether a signal is found or not
      - 'lipop_cleave_position' gives the position where the signal is
        cleaved and the protein is attached to a lipid

    Returns the value returned by `parse_lipop` (documented as a reference
    to the proteins data structure).

    Raises KeyError if 'lipop1_bin' or 'fasta' is missing from `params`.
    """
    lipop1_out = 'lipop.out'
    run('%(lipop1_bin)s %(fasta)s' % params, lipop1_out)

    # Read through a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(lipop1_out) as lipop_file:
        lipop_text = lipop_file.read()

    proteins = parse_lipop(lipop_text, proteins)
    return proteins
def annotate(params, proteins):
    """
    Runs MEMSAT3 and parses the output files.

    Takes a standard 'inmembrane' params dictionary and a global proteins
    dictionary which it populates with results. Returns a reference to the
    proteins dictionary, like the other plugin `annotate` functions.

    Sequences are fed to MEMSAT3 one by one: each sequence is written to
    its own '<name>.fasta' file (an existing file of that name is reused),
    where <name> comes from `seqid_to_filename(seqid)`. The command
    '<memsat3_bin> <name>.fasta' is handed to `run` with the output file
    name '<name>.memsat'. The result is parsed with `parse_memsat` only
    when `has_transmembrane_in_globmem('<name>.globmem')` is true.
    NOTE(review): '<name>.globmem' is presumably written by MEMSAT3 next to
    its input; this function only reads it. Confirm against the tool.

    These keys are added to the proteins dictionary (initialised to empty
    lists here; presumably filled in by `parse_memsat`):

      - 'memsat3_helices', a list of tuples describing the first and last
        residue number of each transmembrane segment;

      - 'memsat3_scores', a list of confidence scores (floats) for each
        predicted tm segment;

      - 'memsat3_inner_loops', a list of tuples describing the first and
        last residue number of each predicted internal loop segment;

      - 'memsat3_outer_loops', a list of tuples describing the first and
        last residue number of each predicted outer loop segment;
    """
    for seqid in proteins:
        protein = proteins[seqid]

        # Initialize the protein data structure.
        protein.update({
            'memsat3_scores': [],
            'memsat3_helices': [],
            'memsat3_inner_loops': [],
            'memsat3_outer_loops': [],
        })

        # Derive every file name from the same base. Substituting 'fasta'
        # inside the full file name would also rewrite any occurrence of
        # 'fasta' in the sequence-derived part of the name.
        base_name = seqid_to_filename(seqid)

        # Write the sequence to a single-entry fasta file.
        single_fasta = base_name + '.fasta'
        if not os.path.isfile(single_fasta):
            write_proteins_fasta(single_fasta, proteins, [seqid])

        memsat_out = base_name + '.memsat'
        run('%s %s' % (params['memsat3_bin'], single_fasta), memsat_out)

        globmem_out = base_name + '.globmem'
        if has_transmembrane_in_globmem(globmem_out):
            parse_memsat(protein, memsat_out)

    return proteins
def annotate(params, proteins):
    """
    Runs MEMSAT3 and parses the output files.

    Takes a standard 'inmembrane' params dictionary and a global proteins
    dictionary which it populates with results. Returns a reference to the
    proteins dictionary, like the other plugin `annotate` functions.

    Sequences are fed to MEMSAT3 one by one: each sequence is written to
    its own '<name>.fasta' file (an existing file of that name is reused),
    where <name> comes from `seqid_to_filename(seqid)`. The command
    '<memsat3_bin> <name>.fasta' is handed to `run` with the output file
    name '<name>.memsat'. The result is parsed with `parse_memsat` only
    when `has_transmembrane_in_globmem('<name>.globmem')` is true.
    NOTE(review): '<name>.globmem' is presumably written by MEMSAT3 next to
    its input; this function only reads it. Confirm against the tool.

    These keys are added to the proteins dictionary (initialised to empty
    lists here; presumably filled in by `parse_memsat`):

      - 'memsat3_helices', a list of tuples describing the first and last
        residue number of each transmembrane segment;

      - 'memsat3_scores', a list of confidence scores (floats) for each
        predicted tm segment;

      - 'memsat3_inner_loops', a list of tuples describing the first and
        last residue number of each predicted internal loop segment;

      - 'memsat3_outer_loops', a list of tuples describing the first and
        last residue number of each predicted outer loop segment;
    """
    for seqid in proteins:
        protein = proteins[seqid]

        # Initialize the protein data structure.
        protein.update({
            'memsat3_scores': [],
            'memsat3_helices': [],
            'memsat3_inner_loops': [],
            'memsat3_outer_loops': [],
        })

        # Derive every file name from the same base. Substituting 'fasta'
        # inside the full file name would also rewrite any occurrence of
        # 'fasta' in the sequence-derived part of the name.
        base_name = seqid_to_filename(seqid)

        # Write the sequence to a single-entry fasta file.
        single_fasta = base_name + '.fasta'
        if not os.path.isfile(single_fasta):
            write_proteins_fasta(single_fasta, proteins, [seqid])

        memsat_out = base_name + '.memsat'
        run('%s %s' % (params['memsat3_bin'], single_fasta), memsat_out)

        globmem_out = base_name + '.globmem'
        if has_transmembrane_in_globmem(globmem_out):
            parse_memsat(protein, memsat_out)

    return proteins
def annotate(params, proteins):
    """
    Annotates proteins with SignalP results and returns `proteins`.

    Every entry of `proteins` first receives the defaults
    'is_signalp' = False and 'signalp_cleave_position' = None. The command
    '<signalp4_bin> -t <signalp4_organism> <fasta>' is then built from
    `params` and handed to `run` together with the output file name
    'signalp.out'.

    Each non-comment, non-blank line of that file is split on whitespace:
      - field 0 is passed to `parse_fasta_header` to obtain the sequence id;
      - field 4 is stored as an int in 'signalp_cleave_position';
      - 'is_signalp' is set to True when field 9 equals "Y".
    NOTE(review): fields 4 and 9 presumably correspond to the Ymax position
    and the "?" column of SignalP 4 short output. Confirm against the
    SignalP documentation.

    Raises KeyError if a sequence id in the output is not a key of
    `proteins` (or a required entry is missing from `params`), and
    IndexError/ValueError on a malformed data line.
    """
    for seqid in proteins:
        proteins[seqid]['is_signalp'] = False
        proteins[seqid]['signalp_cleave_position'] = None

    signalp4_out = 'signalp.out'
    cmd = '%(signalp4_bin)s -t %(signalp4_organism)s %(fasta)s' % params
    run(cmd, signalp4_out)

    # The context manager closes the output file deterministically.
    with open(signalp4_out) as signalp4_text:
        for line in signalp4_text:
            if line.startswith("#"):
                continue
            words = line.split()
            if not words:
                # A blank line (e.g. trailing newline) carries no record.
                continue
            seqid = parse_fasta_header(words[0])[0]
            proteins[seqid]['signalp_cleave_position'] = int(words[4])
            if words[9] == "Y":
                proteins[seqid]['is_signalp'] = True

    return proteins