def parse_signalp(signalp4_lines, proteins, id_mapping=None):
    """
    Parse SignalP 4 output lines and annotate `proteins` with
    'signalp_cleave_position' (int, column 5) and 'is_signalp'
    (True when column 10 is 'Y').

    If `id_mapping` is a non-empty {safe_seqid: seqid} dict, ids parsed
    from the output are mapped back to the original seqids.
    Returns the (mutated) proteins data structure.
    """
    if id_mapping is None:
        id_mapping = []
    in_results = False
    for raw_line in signalp4_lines:
        if raw_line.startswith("#"):
            # a '#' preamble/header line means result rows follow
            in_results = True
            continue
        if not in_results:
            # anything before the preamble (including blanks) is ignored
            continue
        stripped = raw_line.strip()
        if stripped == '':
            # in web output of concatenated signalp files, an empty line
            # after the preamble ends that section's result rows
            in_results = False
            continue
        cols = stripped.split()
        seqid = parse_fasta_header(cols[0])[0]
        if id_mapping:
            # re-map from a safe_seqid back to the original seqid
            seqid = id_mapping[seqid]
        proteins[seqid]['signalp_cleave_position'] = int(cols[4])
        proteins[seqid]['is_signalp'] = (cols[9] == 'Y')
    return proteins
def parse_tmbetadisc_output(output, proteins):
    """
    Parses the TMBETADISC-RBF output (file-like object or a list of strings)
    and uses it to annotate and return an associated 'proteins' data
    structure (sets the boolean 'is_tmbetadisc_rbf' field per protein).
    """
    # NOTE(review): BeautifulSoup is called without an explicit parser;
    # relies on the library's default — presumably acceptable for this
    # service's simple HTML, but verify against the installed bs version.
    soup = BeautifulSoup(output)
    # parse the table. we pop of single data cells one at a time
    fields = soup.findAll("td")
    # reverse so that pop() yields the cells in document order
    fields.reverse()
    f = fields.pop()  # discard first <td>1</td> field
    try:
        while len(fields) > 0:
            # each record is a pair of <td> cells: the first holds the
            # FASTA header plus the prediction text, the second is skipped
            f = fields.pop().text
            seqid, result = parse_fasta_header(f)
            if "Non-Outer Membrane Protein" in result:
                proteins[seqid]["is_tmbetadisc_rbf"] = False
            elif "is Outer Membrane Protein" in result:
                proteins[seqid]["is_tmbetadisc_rbf"] = True
            fields.pop()
    except IndexError:
        # we get here when we run out of table fields to pop
        pass
    return proteins
def parse_lipop(text, proteins, id_mapping=None):
    """
    Parses the text output of the LipoP program and returns a
    'proteins' datastructure with annotations: 'is_lipop' (bool),
    'lipop_cleave_position' (int or None) and, where detected,
    'lipop_im_retention_signal' (True).

    The parser can also handle the HTML returned by the LipoP web
    interface. If a dictionary of {safe_seqid : seqid} mappings is
    given, the parser will expect the input text to contain safe_seqids.
    """
    # FIX: the default was a shared mutable list (id_mapping=[]); use a
    # None sentinel instead. Behavior is unchanged for all callers since
    # only the argument's truthiness is ever tested.
    if id_mapping is None:
        id_mapping = []

    # initialize fields in each protein
    for seqid in proteins:
        proteins[seqid]['is_lipop'] = False
        proteins[seqid]['lipop_cleave_position'] = None

    for l in text.split('\n'):
        words = l.split()

        if 'SpII score' in l:
            if id_mapping:
                # re-map from a safe_seqid back to the original seqid
                lipop_seqid = parse_fasta_header(words[1])[0]
                seqid = id_mapping[lipop_seqid]
            else:
                seqid = parse_fasta_header(words[1])[0]
            if 'cleavage' in l:
                # e.g. "cleavage=19-20" -> take the first coordinate
                pair = words[5].split("=")[1]
                i = int(pair.split('-')[0])
            else:
                i = None
            proteins[seqid]['is_lipop'] = 'Sp' in words[2]
            proteins[seqid]['lipop_cleave_position'] = i

            # check for an E.coli style inner membrane retention signal
            # Asp+2 to cleavage site. There are other apparent retention
            # signals in E. coli and other gram- bacteria in addition to
            # the Asp+2 which we don't detect here (yet).
            # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
            if dict_get(proteins[seqid], 'lipop_cleave_position'):
                # cleave position is 1-based, so seq[pos+1] is the +2
                # residue of the mature lipoprotein
                plus_two = proteins[seqid]['lipop_cleave_position'] + 1
                if proteins[seqid]['seq'][plus_two] == 'D':
                    proteins[seqid]['lipop_im_retention_signal'] = True

    return proteins
def parse_tmhmm(text, proteins, id_mapping=None):
    """
    Parses TMHMM long-format output text and annotates `proteins` with
    'tmhmm_helices', 'tmhmm_inner_loops' and 'tmhmm_outer_loops' — each a
    list of (start, end) int pairs taken from the last two columns of the
    matching output line.

    If a non-empty {safe_seqid: seqid} dict is given via `id_mapping`,
    parsed ids are mapped back to the original seqids.
    """
    # FIX: the default was a shared mutable list (id_mapping=[]); use a
    # None sentinel. Only truthiness is tested, so callers are unaffected.
    if id_mapping is None:
        id_mapping = []

    seqid = None
    for i_line, l in enumerate(text.split('\n')):
        if i_line == 0:
            # skip the first line (header)
            continue
        words = l.split()
        if not words:
            continue
        if l.startswith("#"):
            seqid = parse_fasta_header(words[1])[0]
        else:
            seqid = parse_fasta_header(words[0])[0]
        if seqid is None:
            continue

        # re-map from a safe_seqid to the original seqid
        if id_mapping:
            seqid = id_mapping[seqid]

        # initialize fields in proteins[seqid]
        if 'tmhmm_helices' not in proteins[seqid]:
            proteins[seqid].update({
                'tmhmm_helices': [],
                'tmhmm_inner_loops': [],
                'tmhmm_outer_loops': []
            })

        if 'inside' in l:
            proteins[seqid]['tmhmm_inner_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'outside' in l:
            proteins[seqid]['tmhmm_outer_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'TMhelix' in l:
            proteins[seqid]['tmhmm_helices'].append(
                (int(words[-2]), int(words[-1])))

    return proteins
def parse_tmhmm(text, proteins, id_mapping=None):
    """
    Parse TMHMM long-format output and annotate each protein with
    'tmhmm_helices', 'tmhmm_inner_loops' and 'tmhmm_outer_loops', lists of
    (start, end) int pairs read from the last two columns of each line.

    `id_mapping`, when a non-empty {safe_seqid: seqid} dict, converts the
    safe ids in the output back to original seqids.

    NOTE(review): this function appears to be duplicated elsewhere in the
    file — consider consolidating to a single definition.
    """
    # FIX: replaced the mutable default argument (id_mapping=[]) with the
    # None-sentinel idiom; the old default list was shared across calls.
    if id_mapping is None:
        id_mapping = []

    seqid = None
    for i_line, l in enumerate(text.split('\n')):
        if i_line == 0:
            # first line is a header — skip it
            continue
        words = l.split()
        if not words:
            continue
        # comment lines carry the id in the second column, data lines in
        # the first
        if l.startswith("#"):
            seqid = parse_fasta_header(words[1])[0]
        else:
            seqid = parse_fasta_header(words[0])[0]
        if seqid is None:
            continue
        # re-map from a safe_seqid to the original seqid
        if id_mapping:
            seqid = id_mapping[seqid]
        # lazily initialize the annotation lists for this protein
        if 'tmhmm_helices' not in proteins[seqid]:
            proteins[seqid].update({
                'tmhmm_helices': [],
                'tmhmm_inner_loops': [],
                'tmhmm_outer_loops': []
            })
        if 'inside' in l:
            proteins[seqid]['tmhmm_inner_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'outside' in l:
            proteins[seqid]['tmhmm_outer_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'TMhelix' in l:
            proteins[seqid]['tmhmm_helices'].append(
                (int(words[-2]), int(words[-1])))
    return proteins
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This
    function annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
        motifs correspond to the basename of the .hmm files found in the
        directory indicated by the 'hmm_profiles_dir' field of 'params'.

    A hit is recorded when its conditional E-value is at most
    params['hmm_evalue_max'] and its score is at least
    params['hmm_score_min'].
    """
    log_stderr("# Searching for HMMER profiles in " + params['hmm_profiles_dir'])
    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        # the full profile path is stashed in params so the cmd template
        # below can interpolate it via %(hmm_profile)s
        params['hmm_profile'] = hmm_profile
        hmm_profile = os.path.basename(params['hmm_profile'])
        hmm_name = hmm_profile.replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name
        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)
        # init proteins data structure with blank hmmsearch field first
        for seqid in proteins:
            if 'hmmsearch' not in proteins[seqid]:
                proteins[seqid]['hmmsearch'] = []
        # parse the hmmsearch output file
        seqid = None
        for l in open(hmmsearch3_out):
            words = l.split()
            if l.startswith(">>"):
                # ">> <seqid>" starts a per-sequence result section
                seqid = parse_fasta_header(l[3:])[0]
                continue
            if seqid is None:
                continue
            if 'conditional E-value' in l:
                # E-value is the last column, score is 5th from the end
                evalue = float(words[-1])
                score = float(words[-5])
                if evalue <= params['hmm_evalue_max'] and \
                        score >= params['hmm_score_min']:
                    proteins[seqid]['hmmsearch'].append(hmm_name)
    return proteins
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Runs hmmsearch over every .hmm profile found in
    params['hmm_profiles_dir'] and appends each matching profile's
    basename to the protein's 'hmmsearch' list. A match counts when the
    conditional E-value <= params['hmm_evalue_max'] and the score >=
    params['hmm_score_min'].
    """
    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])
    profile_pattern = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for profile_path in glob.glob(profile_pattern):
        # stash the profile path in params so the command template can
        # pick it up via %(hmm_profile)s
        params['hmm_profile'] = profile_path
        profile_name = os.path.basename(params['hmm_profile'])
        profile_name = profile_name.replace('.hmm', '')
        out_fname = 'hmm.%s.out' % profile_name
        hmm_cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(hmm_cmd, out_fname)

        # make sure every protein starts with an (empty) hmmsearch list
        for seqid in proteins:
            proteins[seqid].setdefault('hmmsearch', [])

        # walk the hmmsearch output; ">> <id>" opens a result section
        current_seqid = None
        for line in open(out_fname):
            if line.startswith(">>"):
                current_seqid = parse_fasta_header(line[3:])[0]
                continue
            if current_seqid is None or 'conditional E-value' not in line:
                continue
            cols = line.split()
            # E-value is the last column, score sits 5 columns from the end
            evalue = float(cols[-1])
            score = float(cols[-5])
            if evalue <= params['hmm_evalue_max'] and \
                    score >= params['hmm_score_min']:
                proteins[current_seqid]['hmmsearch'].append(profile_name)
    return proteins
def parse_tatfind_output(output, proteins):
    """
    Parses the TatFind HTML output (file-like object or a list of strings)
    and uses it to annotate and return an associated 'proteins' data
    structure: each protein gets a boolean 'is_tatfind' field.
    """
    for line in output:
        if "Results for" not in line:
            continue
        # lines look like "... Results for <seqid>: <TRUE|FALSE>"
        tail = line.split("Results for ")[1]
        parts = tail.split(":")
        # everything before the final ':' holds the id; after it, the call
        raw_id = parts[:-1][0]
        verdict = parts[-1].strip()
        # parse id string to bring it to our format
        seqid, _unused = parse_fasta_header(raw_id)
        proteins[seqid]["is_tatfind"] = (verdict == "TRUE")
    return proteins
def parse_tmbhunt(proteins, out): """ Takes the filename of a TMB-HUNT output file (text format) & parses the outer membrane beta-barrel predictions into the proteins dictionary. """ # parse TMB-HUNT text output tmbhunt_classes = {} for l in open(out, 'r'): # inmembrane.log_stderr("# TMB-HUNT raw: " + l[:-1]) if l[0] == ">": # TMB-HUNT munges FASTA ids by making them all uppercase, # so we find the equivalent any-case id in our proteins list # and use that. ugly but necessary. seqid, desc = parse_fasta_header(l) for i in proteins.keys(): if seqid.upper() == i.upper(): seqid = i desc = proteins[i]['name'] probability = None classication = None tmbhunt_classes[seqid] = {} if l.find("Probability of a NON-BETA BARREL protein with this score:" ) != -1: # we convert from probability of NON-BARREL to probability of BARREL probability = 1 - float(l.split(":")[1].strip()) if l[0:11] == "Conclusion:": classication = l.split(":")[1].strip() if classication == "BBMP": tmbhunt_classes[seqid]['tmbhunt'] = True tmbhunt_classes[seqid]['tmbhunt_prob'] = probability proteins[seqid]['tmbhunt'] = True proteins[seqid]['tmbhunt_prob'] = probability elif classication == "Non BBMP": tmbhunt_classes[seqid]['tmbhunt'] = False tmbhunt_classes[seqid]['tmbhunt_prob'] = probability proteins[seqid]['tmbhunt'] = False proteins[seqid]['tmbhunt_prob'] = probability # inmembrane.log_stderr(str(tmbhunt_classes)) return tmbhunt_classes
def parse_tmbhunt(proteins, out):
    """
    Takes the filename of a TMB-HUNT output file (text format) & parses
    the outer membrane beta-barrel predictions into the proteins
    dictionary. Returns a {seqid: {'tmbhunt': bool, 'tmbhunt_prob': float}}
    dict for the sequences seen.
    """
    predictions = {}
    for line in open(out, 'r'):
        if line.startswith(">"):
            # TMB-HUNT munges FASTA ids by making them all uppercase,
            # so we find the equivalent any-case id in our proteins list
            # and use that. ugly but necessary.
            seqid, desc = parse_fasta_header(line)
            for known_id in proteins.keys():
                if seqid.upper() == known_id.upper():
                    seqid = known_id
                    desc = proteins[known_id]['name']
            # reset per-sequence state for the data lines that follow
            probability = None
            conclusion = None
            predictions[seqid] = {}
        if "Probability of a NON-BETA BARREL protein with this score:" in line:
            # convert from probability of NON-BARREL to probability of BARREL
            probability = 1 - float(line.split(":")[1].strip())
        if line.startswith("Conclusion:"):
            conclusion = line.split(":")[1].strip()
            if conclusion in ("BBMP", "Non BBMP"):
                is_barrel = (conclusion == "BBMP")
                predictions[seqid]['tmbhunt'] = is_barrel
                predictions[seqid]['tmbhunt_prob'] = probability
                proteins[seqid]['tmbhunt'] = is_barrel
                proteins[seqid]['tmbhunt_prob'] = probability
    return predictions
def annotate(params, proteins):
    """
    Runs SignalP 4 on params['fasta'] and annotates each protein with
    'is_signalp' (True when column 10 of the output row is "Y") and
    'signalp_cleave_position' (int, column 5). Returns proteins.
    """
    # reset the annotation fields before parsing fresh output
    for seqid in proteins:
        proteins[seqid]['is_signalp'] = False
        proteins[seqid]['signalp_cleave_position'] = None

    out_fname = 'signalp.out'
    signalp_cmd = '%(signalp4_bin)s -t %(signalp4_organism)s %(fasta)s' % \
        params
    run(signalp_cmd, out_fname)

    for raw_line in open(out_fname):
        if raw_line.startswith("#"):
            # skip comment/header lines
            continue
        cols = raw_line.split()
        seqid = parse_fasta_header(cols[0])[0]
        proteins[seqid]['signalp_cleave_position'] = int(cols[4])
        if cols[9] == "Y":
            proteins[seqid]['is_signalp'] = True
    return proteins
def annotate(params, proteins, \
        url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.

    Annotates each protein's 'bomp' field with the integer BOMP category
    (or False for no prediction) and returns a {seqid: category} dict.
    Results are cached in 'bomp.out'; pass force=True to re-query.

    NOTE(review): uses Python-2-only APIs (StringIO.StringIO, dict.iteritems)
    and the twill browsing library — confirm the runtime before porting.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))
    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    if not force and os.path.isfile(bomp_out):
        # cached result file exists: parse it instead of hitting the service
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            # category is the last tab-delimited column
            bomp_category = int(words[-1:][0])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    # submit the FASTA file through the web form
    go(url)
    if __DEBUG__: showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__: show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__: log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    # poll with exponential backoff until the job page no longer says
    # "Not finished"
    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            find("Not finished")
            log_stderr(".")
        except:
            # Finished ! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__: log_stderr(show())
            break

        # Not finished. We keep polling for a time until
        # we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return
        go("viewOutput?id=%i" % (job_id))
        if __DEBUG__: log_stderr(show())

    bomp_html = show()
    if __DEBUG__: log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name, category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__: log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__: log_stderr(str(proteins))

    return bomp_categories
# NOTE(review): stray docstring delimiter below — presumably the opener of
# content truncated beyond this chunk; preserved as-is. TODO confirm.
"""