def get_annotations(params):
    """
    Creates the list of annotation names required by the gram- protocol.

    The returned strings are listed in the order they are scheduled here:
    signal peptide, lipoprotein, Tat signal, beta-barrel predictors,
    helix predictors and finally the HMM profile search.

    As a side effect, two entries of `params` are overwritten:
      - 'signalp4_organism' is set to 'gram-'
      - 'hmm_profiles_dir' is pointed at the 'gram_neg_profiles' directory
        that sits next to this module.

    Arguments:
      params - the settings dictionary; must contain 'signalp4_bin' and
               'lipop1_bin', and 'tmhmm_bin' when 'tmhmm' is one of the
               'helix_programs'. 'barrel_programs' and 'helix_programs'
               are optional.

    Returns the list of annotation names.
    """
    annotations = []

    params['signalp4_organism'] = 'gram-'

    signalp_bin = params['signalp4_bin']
    if not signalp_bin or signalp_bin == 'signalp_web':
        annotations.append('signalp_web')
    else:
        annotations.append('signalp4')

    # Any value other than the two web variants is treated as a local
    # LipoP binary. This matches the gram+ protocol and the signalp/tmhmm
    # handling in this function; previously an unrecognised value silently
    # dropped the LipoP annotation altogether.
    lipop_bin = params['lipop1_bin']
    if not lipop_bin or lipop_bin == 'lipop_scrape_web':
        annotations.append('lipop_scrape_web')
    elif lipop_bin == 'lipop_web':
        annotations.append('lipop_web')
    else:
        annotations.append('lipop1')

    annotations.append('tatfind_web')

    # A missing/empty 'barrel_programs' setting means "run no barrel
    # predictors" rather than failing on a membership test against a
    # non-iterable. (dict_get is used as a truthiness test below, so it
    # presumably yields a falsy value for absent keys.)
    barrel_programs = dict_get(params, 'barrel_programs') or ()
    if 'bomp' in barrel_programs:
        annotations.append('bomp_web')
    # DEPRECATED: TMB-HUNT server is permanently offline, so 'tmbhunt'
    # is deliberately never scheduled.
    if 'tmbetadisc-rbf' in barrel_programs:
        annotations.append('tmbetadisc_rbf_web')
    # TMBETA-NET knows to only run on predicted barrels
    # with the category 'OM(barrel)'
    if 'tmbeta' in barrel_programs:
        annotations.append('tmbeta_net_web')

    if dict_get(params, 'helix_programs'):
        helix_programs = params['helix_programs']
        if 'tmhmm' in helix_programs:
            tmhmm_bin = params['tmhmm_bin']
            if not tmhmm_bin or tmhmm_bin == 'tmhmm_scrape_web':
                annotations.append('tmhmm_scrape_web')
            elif tmhmm_bin == 'tmhmm_web':
                annotations.append('tmhmm_web')
            else:
                annotations.append('tmhmm')
        if 'memsat3' in helix_programs:
            annotations.append('memsat3')

    # run some hmm profiles to detect features (eg Tat signal)
    annotations.append('hmmsearch3')
    params['hmm_profiles_dir'] = os.path.join(
        os.path.dirname(__file__), 'gram_neg_profiles')

    return annotations
def get_annotations(params):
    """
    Builds the ordered list of annotation names that the gram_pos protocol
    needs. The main program maps each string to the matching annotation
    function and runs them.

    Besides building the list, this does some bookkeeping on `params`:
    'signalp4_organism' is set to 'gram+' and 'hmm_profiles_dir' is pointed
    at the 'gram_pos_profiles' directory next to this module.
    """
    params['signalp4_organism'] = 'gram+'

    # signal peptide predictor: web service unless a binary is configured
    signalp_bin = params['signalp4_bin']
    if signalp_bin and signalp_bin != 'signalp_web':
        signalp_annotation = 'signalp4'
    else:
        signalp_annotation = 'signalp_web'

    # lipoprotein predictor: two web variants, otherwise the local program
    lipop_bin = params['lipop1_bin']
    if not lipop_bin or lipop_bin == 'lipop_scrape_web':
        lipop_annotation = 'lipop_scrape_web'
    elif lipop_bin == 'lipop_web':
        lipop_annotation = 'lipop_web'
    else:
        lipop_annotation = 'lipop1'

    annotations = [signalp_annotation, lipop_annotation, 'hmmsearch3']

    # transmembrane helix predictors are optional
    if dict_get(params, 'helix_programs'):
        if 'tmhmm' in params['helix_programs']:
            tmhmm_bin = params['tmhmm_bin']
            if tmhmm_bin == 'tmhmm_scrape_web' or not tmhmm_bin:
                annotations.append('tmhmm_scrape_web')
            elif tmhmm_bin == 'tmhmm_web':
                annotations.append('tmhmm_web')
            else:
                annotations.append('tmhmm')
        if 'memsat3' in params['helix_programs']:
            annotations.append('memsat3')

    module_dir = os.path.dirname(__file__)
    params['hmm_profiles_dir'] = os.path.join(module_dir, 'gram_pos_profiles')

    return annotations
def parse_lipop(text, proteins, id_mapping=None):
    """
    Parses the text output of the LipoP program and returns the 'proteins'
    datastructure with annotations added in place.

    The parser can also handle the HTML returned by the LipoP web
    interface. Only lines containing 'SpII score' are interpreted.

    If a dictionary of {safe_seqid : seqid} mappings is given, the parser
    will expect the input text to contain safe_seqids.

    Keys written to each protein:
      'is_lipop'                  - False by default; for a protein with an
                                    'SpII score' line, whether the third
                                    word of that line contains 'Sp'
      'lipop_cleave_position'     - None by default; otherwise the first
                                    number of the 'x-y' cleavage pair
      'lipop_im_retention_signal' - set to True (and only then created)
                                    when an Asp is found at the +2 position

    Raises KeyError if a parsed sequence id is not in `proteins` (or not in
    `id_mapping` when one is given).
    """
    if id_mapping is None:
        # documented as a dictionary, so default to an empty one
        id_mapping = {}

    # initialize fields in each protein
    for seqid in proteins:
        proteins[seqid]['is_lipop'] = False
        proteins[seqid]['lipop_cleave_position'] = None

    for line in text.split('\n'):
        if 'SpII score' not in line:
            continue

        words = line.split()
        seqid = parse_fasta_header(words[1])[0]
        if id_mapping:
            seqid = id_mapping[seqid]
        protein = proteins[seqid]

        if 'cleavage' in line:
            # the sixth word looks like 'key=x-y'; keep x
            pair = words[5].split("=")[1]
            cleave_position = int(pair.split('-')[0])
        else:
            cleave_position = None
        protein['is_lipop'] = 'Sp' in words[2]
        protein['lipop_cleave_position'] = cleave_position

        # check for an E.coli style inner membrane retention signal
        # Asp+2 to cleavage site. There are other apparent retention
        # signals in E. coli and other gram- bacteria in addition to
        # the Asp+2 which we don't detect here (yet).
        # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
        if cleave_position:
            plus_two = cleave_position + 1
            seq = protein['seq']
            # A cleavage site predicted right at the C-terminus has no +2
            # residue; treat that as "no retention signal" instead of
            # failing with an IndexError.
            if plus_two < len(seq) and seq[plus_two] == 'D':
                protein['lipop_im_retention_signal'] = True

    return proteins
def has_tm_helix(protein):
    """
    Returns True if any program listed in params['helix_programs'] has a
    non-empty '<program>_helices' annotation on the protein, else False.

    NOTE(review): `params` is not an argument of this function; it is
    looked up in the enclosing/module scope, so it has to be defined
    there - confirm against the rest of the file.
    """
    return any(
        dict_get(protein, '%s_helices' % helix_program)
        for helix_program in params['helix_programs'])
def post_process_protein(params, protein):
    """
    Main analysis step for one protein: combines the annotations already
    stored in the `protein` dictionary into a final category.

    The protein is modified in place: any predicted N-terminal signal
    (LipoP first, otherwise SignalP) is chopped off, and the keys
    'details', 'category' and 'loop_extent' are written.

    Possible categories: "PSE-Cellwall", "PSE-Membrane",
    "MEMBRANE(non-PSE)", "PSE-Lipoprotein", "LIPOPROTEIN(non-PSE)",
    "SECRETED" and "CYTOPLASM(non-PSE)".

    Returns the tuple (details, category), where details is a list of
    short evidence strings (["."] when there is no evidence).
    """

    def has_tm_helix(protein):
        # True as soon as one helix program reports helices
        return any(
            dict_get(protein, '%s_helices' % name)
            for name in params['helix_programs'])

    def has_surface_exposed_loop(protein):
        # True if any helix program predicts a sufficiently long outer loop
        return any(
            eval_surface_exposed_loop(
                protein['sequence_length'],
                len(protein['%s_helices' % name]),
                protein['%s_outer_loops' % name],
                params['terminal_exposed_loop_min'],
                params['internal_exposed_loop_min'])
            for name in params['helix_programs'])

    def exposed_loop_extent(protein):
        # largest exposed loop over the programs that annotated this
        # protein, or 0 when none of them did
        extents = [
            max_exposed_loop(
                protein['sequence_length'],
                len(protein['%s_helices' % name]),
                protein['%s_outer_loops' % name],
                params['terminal_exposed_loop_min'],
                params['internal_exposed_loop_min'])
            for name in params['helix_programs']
            if name + '_helices' in protein
        ]
        return max(extents) if extents else 0

    terminal_exposed_loop_min = params['terminal_exposed_loop_min']

    is_hmm_profile_match = dict_get(protein, 'hmmsearch')

    # cleavage positions are read up front, before anything is chopped
    is_lipop = dict_get(protein, 'is_lipop')
    if is_lipop:
        i_lipop_cut = protein['lipop_cleave_position']
    is_signalp = dict_get(protein, 'is_signalp')
    if is_signalp:
        i_signalp_cut = protein['signalp_cleave_position']

    # collect the evidence strings
    details = []
    if is_hmm_profile_match:
        details.append("hmm(%s)" % "|".join(protein['hmmsearch']))
    if is_lipop:
        details.append("lipop")
    if is_signalp:
        details.append("signalp")
    if has_tm_helix(protein):
        for program in params['helix_programs']:
            n_helices = len(protein['%s_helices' % program])
            details.append(program + "(%d)" % n_helices)

    # a LipoP cleavage site takes precedence over the SignalP one
    if is_lipop:
        chop_nterminal_peptide(protein, i_lipop_cut)
    elif is_signalp:
        chop_nterminal_peptide(protein, i_signalp_cut)

    # decide the category
    if is_hmm_profile_match:
        category = "PSE-Cellwall"
    elif has_tm_helix(protein):
        if has_surface_exposed_loop(protein):
            category = "PSE-Membrane"
        else:
            category = "MEMBRANE(non-PSE)"
    elif is_lipop:
        # whole protein considered outer terminal loop
        if protein['sequence_length'] < terminal_exposed_loop_min:
            category = "LIPOPROTEIN(non-PSE)"
        else:
            category = "PSE-Lipoprotein"
    elif is_signalp:
        category = "SECRETED"
    else:
        category = "CYTOPLASM(non-PSE)"

    if not details:
        details = ["."]

    protein['details'] = details
    protein['category'] = category
    if 'CYTOPLASM' in category or 'SECRETED' in category:
        protein['loop_extent'] = "."
    else:
        protein['loop_extent'] = exposed_loop_extent(protein)

    return details, category
def post_process_protein(params, protein):
    """
    Combines the annotations stored on one protein into a gram- category.

    The protein is modified in place: a predicted N-terminal signal is
    chopped off, and the keys 'details' and 'category' are written.

    Possible categories: 'OM(barrel)', "IM" (optionally followed by
    "+peri" and/or "+cyto"), "LIPOPROTEIN(IM)", "LIPOPROTEIN(OM)",
    "PERIPLASMIC/SECRETED" and "CYTOPLASM".

    Returns the tuple (details, category), where details is a list of
    short evidence strings (["."] when there is no evidence).
    """
    min_loop_length = params['internal_exposed_loop_min']

    def has_tm_helix(protein):
        return any(
            dict_get(protein, '%s_helices' % name)
            for name in params['helix_programs'])

    # these functions detect if TM-containing IM proteins
    # have large loops / terminal regions in the periplasm or cytoplasm
    # that may be accessible / inaccessible in spheroplast shaving
    # experiments.
    def has_long_loops(protein, loop_str='_outer_loops',
                       loop_length=min_loop_length):
        return any(
            loop[1] - loop[0] >= loop_length
            for annot in protein
            if loop_str in annot
            for loop in protein[annot])

    def long_in_periplasm(protein, loop_length=min_loop_length):
        return has_long_loops(protein, '_outer_loops', loop_length)

    def long_in_cytoplasm(protein, loop_length=min_loop_length):
        return has_long_loops(protein, '_inner_loops', loop_length)

    details = []
    category = "UNKNOWN"

    is_hmm_profile_match = dict_get(protein, 'hmmsearch')
    is_signalp = dict_get(protein, 'is_signalp')
    is_tatfind = dict_get(protein, 'is_tatfind')
    is_lipop = dict_get(protein, 'is_lipop')

    # in terms of most sublocalization logic, a Tat signal is similar to a
    # Sec (signalp) signal. We use has_signal_pept to denote that either
    # is present.
    has_signal_pept = bool(
        is_signalp
        or is_tatfind
        or ('hmmsearch' in protein and "Tat_PS51318" in protein['hmmsearch']))

    def passes_cutoffs(score, clearly_key, maybe_key):
        # high scoring hits don't require a signal peptide,
        # low scoring ones do
        if score >= params[clearly_key]:
            return True
        return has_signal_pept and score >= params[maybe_key]

    # annotate the barrels
    has_barrel = False

    bomp_score = dict_get(protein, 'bomp')
    if passes_cutoffs(bomp_score, 'bomp_clearly_cutoff', 'bomp_maybe_cutoff'):
        details.append('bomp(%i)' % (bomp_score))
        has_barrel = True

    tmbhunt_prob = dict_get(protein, 'tmbhunt_prob')
    if passes_cutoffs(tmbhunt_prob,
                      'tmbhunt_clearly_cutoff', 'tmbhunt_maybe_cutoff'):
        details.append('tmbhunt(%.2f)' % (tmbhunt_prob))
        has_barrel = True

    if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
        details.append('tmbetadisc-rbf')
        has_barrel = True

    if has_barrel:
        category = 'OM(barrel)'
        # set number of predicted OM barrel strands in details
        if dict_get(protein, 'tmbeta_strands'):
            num_strands = len(protein['tmbeta_strands'])
            details.append('tmbeta_strands(%i)' % (num_strands))

    if has_signal_pept and not is_lipop and \
            dict_get(protein, 'signalp_cleave_position'):
        # we use the SignalP signal peptidase cleavage site for Tat signals
        chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

    if is_tatfind:
        details.append("tatfind")

    if is_signalp:
        details.append("signalp")

    if is_lipop:
        details.append("lipop")
        chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

    if is_hmm_profile_match:
        details.append("hmm(%s)" % "|".join(protein['hmmsearch']))

    # helices are evaluated after chopping; a barrel call overrides them
    tm_helix_found = has_tm_helix(protein)
    if not has_barrel:
        if tm_helix_found:
            for program in params['helix_programs']:
                n_helices = len(protein['%s_helices' % program])
                details.append(program + "(%d)" % n_helices)
            category = "IM"
            if long_in_periplasm(protein):
                category += "+peri"
            if long_in_cytoplasm(protein):
                category += "+cyto"
        elif is_lipop:
            if dict_get(protein, 'lipop_im_retention_signal'):
                category = "LIPOPROTEIN(IM)"
            else:
                category = "LIPOPROTEIN(OM)"
        elif has_signal_pept:
            category = "PERIPLASMIC/SECRETED"
        else:
            category = "CYTOPLASM"

    if not details:
        details = ["."]

    protein['details'] = details
    protein['category'] = category

    return details, category
def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane
    beta-barrel.

    Note that the default URL we use is different to the regular form used
    by web browsers, since we need to bypass some AJAX fun.

    Arguments:
      params   - settings dictionary; 'fasta' is the file that is uploaded
                 and 'tmbetadisc_rbf_method' must be one of aa, dp, aadp
                 or pssm.
      proteins - the proteins dictionary handed to the output parser.
      url      - page holding the submission form.
      force    - when True, ignore an existing 'tmbetadisc-rbf.out' and
                 query the server again.

    Returns what parse_tmbetadisc_output returns for the (cached or fresh)
    server output. Returns None without contacting the server when 5000 or
    more proteins are given, and the unmodified `proteins` when the server
    response contains no result link.

    Exits the program (status 1) when 'tmbetadisc_rbf_method' is missing
    or not one of the accepted values.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC seems to not like more than take
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        with open(outfn, 'r') as fh:
            cached_output = fh.read()
        return parse_tmbetadisc_output(cached_output, proteins)

    # Resolve the user defined method before touching the network. A
    # missing setting is reported the same way as an invalid one; it used
    # to surface later as a NameError.
    # NOTE: the values are the option labels of the web form and are sent
    # verbatim, spelling included.
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    method = method_map.get(dict_get(params, 'tmbetadisc_rbf_method'))
    if method is None:
        log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
                   "Must be set to aa, dp, aadp or pssm.")
        sys.exit(1)

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__:
        showforms()
    formfile("1", "userfile", params["fasta"])
    fv("1", "format", "file")
    fv("1", "select", method)
    submit()

    waiting_page = show()
    if __DEBUG__:
        log_stderr(waiting_page)

    # the waiting page carries the link to the results in single quotes;
    # if several lines match, the last one wins
    result_url = None
    for l in waiting_page.split('\n'):
        if "TMBETADISC-RBF-action.php?UniqueName=" in l:
            result_url = l.split("'")[1]
    if result_url is None:
        log_stderr("# ERROR: TMBETADISC-RBF(web): "
                   "no result link found in the server response.")
        return proteins

    time.sleep(5)

    go(result_url)

    output = show()
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    with open(outfn, 'w') as fh:
        fh.write(output)

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins
def annotate(params, proteins,
             url="http://psfs.cbrc.jp/tmbeta-net/",
             category='OM(barrel)', force=False):
    """
    Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
    predict strands of outer membrane beta-barrels.

    By default, category='OM(barrel)' means prediction will only be run on
    proteins in the set with this category property. To process all
    proteins, change category to None (force=True also bypasses the
    filter).

    These keys are added to the proteins dictionary:
      'tmbeta_strands' - a list of lists with paired start and end
                         residues of each predicted strand.
                         (eg [[3,9],[14,21], ..etc ])

    Results are cached as JSON in 'tmbeta_net.out'; unless force is True,
    an existing cache file is loaded instead of querying the server.

    Returns a dictionary {seqid: strands} of the predictions, or an empty
    dictionary if the server reports that another query is still running
    (in which case nothing is written to the cache file).
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfile = 'tmbeta_net.out'
    log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

    tmbeta_strands = {}
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        with open(outfile, 'r') as fh:
            tmbeta_strands = json.loads(fh.read())
        for seqid in tmbeta_strands:
            proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]
        return tmbeta_strands

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    for seqid in proteins:
        # only run on sequences which match the category filter
        selected = (
            force
            or category is None
            or dict_get(proteins[seqid], 'category') == category)
        if not selected:
            continue

        go(url)
        if __DEBUG__:
            showforms()
        fv("1", "sequence", proteins[seqid]['seq'])
        submit()
        log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n"
                   % (seqid, proteins[seqid]['name']))
        out = show()
        time.sleep(1)

        if "Some query is already running. Please try again." in out:
            log_stderr("# TMBETA-NET(web) error: %s" % (out))
            return {}

        # parse the web page returned, extract strand boundaries
        strands = []
        proteins[seqid]['tmbeta_strands'] = strands
        for l in out.split('\n'):
            if __DEBUG__:
                log_stderr("## " + l)

            if "<BR>Segment " in l:
                # the text after the first ':' is '<start> to <end>';
                # the first character of each stripped token is dropped
                # before converting to int
                i, j = l.split(":")[1].split("to")
                i = int(i.strip()[1:])
                j = int(j.strip()[1:])
                strands.append([i, j])

                if __DEBUG__:
                    log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

        tmbeta_strands[seqid] = strands

    # we store the parsed strand boundaries in JSON format
    with open(outfile, 'w') as fh:
        fh.write(json.dumps(tmbeta_strands, separators=(',', ':\n')))

    return tmbeta_strands
def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane
    beta-barrel.

    Arguments:
      params   - settings dictionary; 'fasta' is the file whose contents
                 are submitted and 'tmbetadisc_rbf_method' must be one of
                 aa, dp, aadp or pssm.
      proteins - the proteins dictionary handed to the output parser.
      url      - kept for interface compatibility; the request is posted
                 to the fixed TMBETADISC-RBF.php address, not to this URL.
      force    - when True, ignore an existing 'tmbetadisc-rbf.out' and
                 query the server again.

    Returns what parse_tmbetadisc_output returns for the (cached or fresh)
    server output. Returns None without contacting the server when 5000 or
    more proteins are given, and the unmodified `proteins` when the server
    response contains no result link.

    Exits the program (status 1) when 'tmbetadisc_rbf_method' is missing
    or not one of the accepted values.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC seems to not like more than take
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        with open(outfn, 'r') as fh:
            cached_output = fh.read()
        return parse_tmbetadisc_output(cached_output, proteins)

    # Resolve the user defined method. A missing setting is reported the
    # same way as an invalid one; it used to surface as a NameError.
    # NOTE: the values are the option labels expected by the web form and
    # are sent verbatim, spelling included.
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    method = method_map.get(dict_get(params, 'tmbetadisc_rbf_method'))
    if method is None:
        log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
                   "Must be set to aa, dp, aadp or pssm.")
        sys.exit(1)

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    headers = {
        'User-Agent':
            "Python-urllib/%s (requests; inmembrane)" % python_version,
    }

    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data, headers=headers)

    # .text is the decoded body; .content is bytes on Python 3 and can
    # neither be split on '\n' nor written to a text-mode file
    waiting_page = response.text
    if __DEBUG__:
        log_stderr(waiting_page)

    # the waiting page carries the link to the results in single quotes;
    # if several lines match, the last one wins
    result_url = None
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]
    if result_url is None:
        log_stderr("# ERROR: TMBETADISC-RBF(web): "
                   "no result link found in the server response.")
        return proteins

    time.sleep(5)

    output = requests.get(result_url, headers=headers).text
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    with open(outfn, 'w') as fh:
        fh.write(output)

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins
def annotate(params, proteins,
             url="http://psfs.cbrc.jp/tmbeta-net/",
             category='OM(barrel)', force=False):
    """
    Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
    predict strands of outer membrane beta-barrels.

    By default, category='OM(barrel)' means prediction will only be run on
    proteins in the set with this category property. To process all
    proteins, change category to None (force=True also bypasses the
    filter).

    These keys are added to the proteins dictionary:
      'tmbeta_strands' - a list of lists with paired start and end
                         residues of each predicted strand.
                         (eg [[3,9],[14,21], ..etc ])

    Results are cached as JSON in 'tmbeta_net.out'; unless force is True,
    an existing cache file is loaded instead of querying the server.

    Returns a dictionary {seqid: strands} of the predictions, or an empty
    dictionary if the server reports that another query is still running
    (in which case nothing is written to the cache file).
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfile = 'tmbeta_net.out'
    log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

    tmbeta_strands = {}

    use_cache = not force and os.path.isfile(outfile)
    if use_cache:
        log_stderr("# -> skipped: %s already exists" % outfile)
        with open(outfile, 'r') as cache_file:
            tmbeta_strands = json.loads(cache_file.read())
        for seqid in tmbeta_strands:
            proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]
        return tmbeta_strands

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    for seqid in proteins:
        protein = proteins[seqid]

        # only run on sequences which match the category filter
        if not (force
                or category is None
                or dict_get(protein, 'category') == category):
            continue

        go(url)
        if __DEBUG__:
            showforms()
        fv("1", "sequence", protein['seq'])
        submit()
        log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n"
                   % (seqid, protein['name']))
        out = show()
        time.sleep(1)

        if "Some query is already running. Please try again." in out:
            log_stderr("# TMBETA-NET(web) error: %s" % (out))
            return {}

        # parse the web page returned, extract strand boundaries
        protein['tmbeta_strands'] = []
        for l in out.split('\n'):
            if __DEBUG__:
                log_stderr("## " + l)

            if "<BR>Segment " in l:
                # the text after the first ':' is '<start> to <end>';
                # the first character of each stripped token is dropped
                # before converting to int
                i, j = l.split(":")[1].split("to")
                i = int(i.strip()[1:])
                j = int(j.strip()[1:])
                protein['tmbeta_strands'].append([i, j])

                if __DEBUG__:
                    log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

        tmbeta_strands[seqid] = protein['tmbeta_strands']

    # we store the parsed strand boundaries in JSON format
    with open(outfile, 'w') as cache_file:
        cache_file.write(json.dumps(tmbeta_strands, separators=(',', ':\n')))

    return tmbeta_strands
def post_process_protein(params, protein):
    """
    Derives the final gram- localisation category for one protein from
    the annotations already stored on it.

    The protein is modified in place: a predicted N-terminal signal is
    chopped off, and the keys 'details' and 'category' are written.

    Possible categories: 'OM(barrel)', "IM" (optionally followed by
    "+peri" and/or "+cyto"), "LIPOPROTEIN(IM)", "LIPOPROTEIN(OM)",
    "PERIPLASMIC/SECRETED" and "CYTOPLASM".

    Returns the tuple (details, category), where details is a list of
    short evidence strings (["."] when there is no evidence).
    """
    default_loop_length = params['internal_exposed_loop_min']

    def has_tm_helix(protein):
        for helix_program in params['helix_programs']:
            helices = dict_get(protein, '%s_helices' % helix_program)
            if helices:
                return True
        return False

    # these functions detect if TM-containing IM proteins
    # have large loops / terminal regions in the periplasm or cytoplasm
    # that may be accessible / inaccessible in spheroplast shaving
    # experiments.
    def has_long_loops(protein, loop_str='_outer_loops',
                       loop_length=default_loop_length):
        loop_annotations = [key for key in protein if loop_str in key]
        for key in loop_annotations:
            for start, end in ((loop[0], loop[1]) for loop in protein[key]):
                if end - start >= loop_length:
                    return True
        return False

    def long_in_periplasm(protein, loop_length=default_loop_length):
        return has_long_loops(protein, '_outer_loops', loop_length)

    def long_in_cytoplasm(protein, loop_length=default_loop_length):
        return has_long_loops(protein, '_inner_loops', loop_length)

    details = []
    category = "UNKNOWN"

    is_hmm_profile_match = dict_get(protein, 'hmmsearch')
    is_signalp = dict_get(protein, 'is_signalp')
    is_tatfind = dict_get(protein, 'is_tatfind')
    is_lipop = dict_get(protein, 'is_lipop')

    # in terms of most sublocalization logic, a Tat signal is similar to a
    # Sec (signalp) signal. We use has_signal_pept to denote that either
    # is present.
    if is_signalp or is_tatfind:
        has_signal_pept = True
    elif 'hmmsearch' in protein and "Tat_PS51318" in protein['hmmsearch']:
        has_signal_pept = True
    else:
        has_signal_pept = False

    # annotate the barrels - high scoring bomp hits don't require a
    # signal peptide, low scoring ones do
    has_barrel = False
    bomp_score = dict_get(protein, 'bomp')
    if bomp_score >= params['bomp_clearly_cutoff']:
        bomp_hit = True
    elif has_signal_pept and bomp_score >= params['bomp_maybe_cutoff']:
        bomp_hit = True
    else:
        bomp_hit = False
    if bomp_hit:
        details.append('bomp(%i)' % (bomp_score))
        has_barrel = True

    # DEPRECATED: TMB-HUNT server is permanently offline, so its score
    # is not consulted here.

    if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
        details.append('tmbetadisc-rbf')
        has_barrel = True

    if has_barrel:
        category = 'OM(barrel)'
        # set number of predicted OM barrel strands in details
        if dict_get(protein, 'tmbeta_strands'):
            details.append(
                'tmbeta_strands(%i)' % (len(protein['tmbeta_strands'])))

    if has_signal_pept and not is_lipop:
        if dict_get(protein, 'signalp_cleave_position'):
            # we use the SignalP signal peptidase cleavage site for Tat
            # signals
            chop_nterminal_peptide(
                protein, protein['signalp_cleave_position'])

    if is_tatfind:
        details.append("tatfind")

    if is_signalp:
        details.append("signalp")

    if is_lipop:
        details.append("lipop")
        chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

    if is_hmm_profile_match:
        details.append("hmm(%s)" % "|".join(protein['hmmsearch']))

    # helices are evaluated after chopping; a barrel call overrides them
    helix_found = has_tm_helix(protein)
    if has_barrel:
        pass
    elif helix_found:
        for program in params['helix_programs']:
            helix_count = len(protein['%s_helices' % program])
            details.append(program + "(%d)" % helix_count)

        category = "IM"
        if long_in_periplasm(protein):
            category += "+peri"
        if long_in_cytoplasm(protein):
            category += "+cyto"
    elif is_lipop:
        if dict_get(protein, 'lipop_im_retention_signal'):
            category = "LIPOPROTEIN(IM)"
        else:
            category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
        category = "PERIPLASMIC/SECRETED"
    else:
        category = "CYTOPLASM"

    if not details:
        details = ["."]

    protein['details'] = details
    protein['category'] = category

    return details, category
def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane
    beta-barrel.

    Arguments:
      params   - settings dictionary; 'fasta' is the file whose contents
                 are submitted and 'tmbetadisc_rbf_method' must be one of
                 aa, dp, aadp or pssm.
      proteins - the proteins dictionary handed to the output parser.
      url      - kept for interface compatibility; the request is posted
                 to the fixed TMBETADISC-RBF.php address, not to this URL.
      force    - when True, ignore an existing 'tmbetadisc-rbf.out' and
                 query the server again.

    Returns what parse_tmbetadisc_output returns for the (cached or fresh)
    server output. Returns None without contacting the server when 5000 or
    more proteins are given, and the unmodified `proteins` when the server
    response contains no result link.

    Exits the program (status 1) when 'tmbetadisc_rbf_method' is missing
    or not one of the accepted values.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC seems to not like more than take
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences."
        )
        return

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        with open(outfn, 'r') as cached:
            return parse_tmbetadisc_output(cached.read(), proteins)

    # Resolve the user defined method. A missing setting is reported the
    # same way as an invalid one; it used to surface as a NameError.
    # NOTE: the values are the option labels expected by the web form and
    # are sent verbatim, spelling included.
    method_map = {
        "aa": "Amino Acid Composition",
        "dp": "Depipetide Composition",
        "aadp": "Amino Acid & Depipetide Composition",
        "pssm": "PSSM"
    }
    requested_method = dict_get(params, 'tmbetadisc_rbf_method')
    method = method_map.get(requested_method)
    if method is None:
        log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
                   "Must be set to aa, dp, aadp or pssm.")
        sys.exit(1)

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    user_agent = "Python-urllib/%s (requests; inmembrane)" % python_version
    headers = {'User-Agent': user_agent}

    with open(params["fasta"], 'r') as ff:
        fasta_text = ff.read()
    data = {'format': 'fasta', 'select': method, 'seq': fasta_text}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data, headers=headers)

    # .text is the decoded body; .content is bytes on Python 3 and can
    # neither be split on '\n' nor written to a text-mode file
    waiting_page = response.text
    if __DEBUG__:
        log_stderr(waiting_page)

    # the waiting page carries the link to the results in single quotes;
    # if several lines match, the last one wins
    result_url = None
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]
    if result_url is None:
        log_stderr("# ERROR: TMBETADISC-RBF(web): "
                   "no result link found in the server response.")
        return proteins

    time.sleep(5)

    output = requests.get(result_url, headers=headers).text
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    with open(outfn, 'w') as fh:
        fh.write(output)

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins