def annotate(params, proteins): """ Returns a reference to the proteins data structure. Uses HMMER to identify sequence motifs in proteins. This function annotates the proteins with: - 'hmmsearch': a list of motifs that are found in the protein. The motifs correspond to the basename of the .hmm files found in the directory indicated by the 'hmm_profiles_dir' field of 'params'. """ log_stderr( "# Searching for HMMER profiles in " + params['hmm_profiles_dir']) file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm') for hmm_profile in glob.glob(file_tag): params['hmm_profile'] = hmm_profile hmm_profile = os.path.basename(params['hmm_profile']) hmm_name = hmm_profile.replace('.hmm', '') hmmsearch3_out = 'hmm.%s.out' % hmm_name cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params run(cmd, hmmsearch3_out) # init proteins data structure with blank hmmsearch field first for seqid in proteins: if 'hmmsearch' not in proteins[seqid]: proteins[seqid]['hmmsearch'] = [] # parse the hmmsearch output file seqid = None for l in open(hmmsearch3_out): words = l.split() if l.startswith(">>"): seqid = parse_fasta_header(l[3:])[0] continue if seqid is None: continue if 'conditional E-value' in l: evalue = float(words[-1]) score = float(words[-5]) if evalue <= params['hmm_evalue_max'] and \ score >= params['hmm_score_min']: proteins[seqid]['hmmsearch'].append(hmm_name) return proteins
def annotate(params, proteins): """ Returns a reference to the proteins data structure. Uses HMMER to identify sequence motifs in proteins. This function annotates the proteins with: - 'hmmsearch': a list of motifs that are found in the protein. The motifs correspond to the basename of the .hmm files found in the directory indicated by the 'hmm_profiles_dir' field of 'params'. """ log_stderr("# Searching for HMMER profiles in " + params['hmm_profiles_dir']) file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm') for hmm_profile in glob.glob(file_tag): params['hmm_profile'] = hmm_profile hmm_profile = os.path.basename(params['hmm_profile']) hmm_name = hmm_profile.replace('.hmm', '') hmmsearch3_out = 'hmm.%s.out' % hmm_name cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params run(cmd, hmmsearch3_out) # init proteins data structure with blank hmmsearch field first for seqid in proteins: if 'hmmsearch' not in proteins[seqid]: proteins[seqid]['hmmsearch'] = [] # parse the hmmsearch output file seqid = None for l in open(hmmsearch3_out): words = l.split() if l.startswith(">>"): seqid = parse_fasta_header(l[3:])[0] continue if seqid is None: continue if 'conditional E-value' in l: evalue = float(words[-1]) score = float(words[-5]) if evalue <= params['hmm_evalue_max'] and \ score >= params['hmm_score_min']: proteins[seqid]['hmmsearch'].append(hmm_name) return proteins
def annotate(params, proteins,
             url="http://signalfind.org/tatfind.html", force=False):
    """
    Interfaces with the TatFind web service at
    http://signalfind.org/tatfind.html to predict if protein sequences
    contain Twin-Arginine Translocation (Tat) signal peptides.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfn = 'tatfind.out'
    log_stderr("# TatFind(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tatfind_output(fh, proteins)
        fh.close()
        return proteins

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__:
        showforms()
    formfile("1", "seqFile", params["fasta"])
    submit()
    if __DEBUG__:
        show()

    tatfind_output = show()
    if __DEBUG__:
        log_stderr(tatfind_output)

    # write raw TatFind output to a file
    fh = open(outfn, 'w')
    fh.write(tatfind_output)
    fh.close()

    proteins = parse_tatfind_output(tatfind_output.split("\n"), proteins)

    return proteins

def annotate(params, proteins,
             url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))

    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    if not force and os.path.isfile(bomp_out):
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            bomp_category = int(words[-1])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__:
        showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__:
        show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__:
        log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            find("Not finished")
            log_stderr(".")
        except:
            # Finished! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__:
                log_stderr(show())
            break

        # Not finished. We keep polling for a time until we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return

    go("viewOutput?id=%i" % (job_id))
    if __DEBUG__:
        log_stderr(show())

    bomp_html = show()
    if __DEBUG__:
        log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name: category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__:
        log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__:
        log_stderr(str(proteins))

    return bomp_categories

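# Note on the returned values: BOMP assigns an integer category to each
# predicted barrel (1-5 on the published server, with higher numbers
# indicating more confident predictions). The cache written above is plain
# tab-delimited text, one "seqid<TAB>category" pair per line, so it can be
# reloaded with a sketch like this (a hypothetical helper, not part of the
# plugin):
def read_bomp_cache(fname):
    # returns {seqid: category} from a tab-delimited BOMP cache file
    categories = {}
    for line in open(fname):
        seqid, category = line.rstrip("\n").split("\t")
        categories[seqid] = int(category)
    return categories
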
def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/"
                 "~sachen/OMPpredict/"
                 "TMBETADISC-RBF-Content.html",
             force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php to predict
    if a protein sequence is likely to be an outer membrane beta-barrel.

    Note that the default URL we use is different from the regular form
    used by web browsers, since we need to bypass some AJAX fun.
    """
    # TODO: automatically split large sets into multiple jobs,
    #       since TMBETADISC seems to fail with more than ~5000
    #       sequences at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    # TODO: Set User-Agent header for requests
    # agent("Python-urllib/%s (requests; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tmbetadisc_output(fh.read(), proteins)
        fh.close()
        return proteins

    # set the user defined method
    # (values must match the web form's option labels, typos and all)
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    # default method if the user hasn't set one (without a default,
    # 'method' would be unbound below)
    method = method_map["aa"]
    if dict_get(params, 'tmbetadisc_rbf_method'):
        try:
            method = method_map[params['tmbetadisc_rbf_method']]
        except KeyError:
            log_stderr("# ERROR: Invalid setting for tmbetadisc_rbf_method. "
                       "Must be set to aa, dp, aadp or pssm.")
            sys.exit()

    # files = {'userfile': open(params["fasta"], 'rb')}
    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}

    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)  # , files=files)
    waiting_page = response.content
    if __DEBUG__:
        log_stderr(waiting_page)

    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    time.sleep(5)

    output = requests.get(result_url).content
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    fh = open(outfn, 'w')
    fh.write(output)
    fh.close()

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins

def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/"
                 "~sachen/OMPpredict/"
                 "TMBETADISC-RBF-Content.html",
             force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php to predict
    if a protein sequence is likely to be an outer membrane beta-barrel.

    Note that the default URL we use is different from the regular form
    used by web browsers, since we need to bypass some AJAX fun.
    """
    # TODO: automatically split large sets into multiple jobs,
    #       since TMBETADISC seems to fail with more than ~5000
    #       sequences at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tmbetadisc_output(fh.read(), proteins)
        fh.close()
        return proteins

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__:
        showforms()
    formfile("1", "userfile", params["fasta"])
    fv("1", "format", "file")

    # set the user defined method
    # (values must match the web form's option labels, typos and all)
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    # default method if the user hasn't set one (without a default,
    # 'method' would be unbound below)
    method = method_map["aa"]
    if dict_get(params, 'tmbetadisc_rbf_method'):
        try:
            method = method_map[params['tmbetadisc_rbf_method']]
        except KeyError:
            log_stderr("# ERROR: Invalid setting for tmbetadisc_rbf_method. "
                       "Must be set to aa, dp, aadp or pssm.")
            sys.exit()

    fv("1", "select", method)
    submit()

    waiting_page = show()
    if __DEBUG__:
        log_stderr(waiting_page)

    for l in waiting_page.split('\n'):
        if l.find("TMBETADISC-RBF-action.php?UniqueName=") != -1:
            result_url = l.split("'")[1]

    time.sleep(5)
    go(result_url)

    output = show()
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    fh = open(outfn, 'w')
    fh.write(output)
    fh.close()

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins

def annotate(params, proteins,
             url="http://psfs.cbrc.jp/tmbeta-net/",
             category='OM(barrel)', force=False):
    """
    Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
    predict strands of outer membrane beta-barrels.

    By default, category='OM(barrel)' means prediction will only be run
    on proteins in the set with this category property. To process all
    proteins, change category to None.

    These keys are added to the proteins dictionary:
      - 'tmbeta_strands': a list of lists with paired start and end
        residues of each predicted strand (eg [[3,9],[14,21], ...etc])
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfile = 'tmbeta_net.out'
    log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

    tmbeta_strands = {}
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        tmbeta_strands = json.loads(fh.read())
        fh.close()
        for seqid in tmbeta_strands:
            proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]
        return tmbeta_strands

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    for seqid in proteins:
        # only run on sequences which match the category filter
        if force or \
                (category is None) or \
                (dict_get(proteins[seqid], 'category') == category):
            pass
        else:
            continue

        go(url)
        if __DEBUG__:
            showforms()
        fv("1", "sequence", proteins[seqid]['seq'])
        submit()
        log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n"
                   % (seqid, proteins[seqid]['name']))
        out = show()
        time.sleep(1)

        if "Some query is already running. Please try again." in out:
            log_stderr("# TMBETA-NET(web) error: %s" % (out))
            return {}

        # parse the web page returned, extract strand boundaries
        proteins[seqid]['tmbeta_strands'] = []
        for l in out.split('\n'):
            if __DEBUG__:
                log_stderr("## " + l)

            if "<BR>Segment " in l:
                i, j = l.split(":")[1].split("to")
                i = int(i.strip()[1:])
                j = int(j.strip()[1:])
                proteins[seqid]['tmbeta_strands'].append([i, j])

                if __DEBUG__:
                    log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

        tmbeta_strands[seqid] = proteins[seqid]['tmbeta_strands']

    # we store the parsed strand boundaries in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(tmbeta_strands, separators=(',', ':\n')))
    fh.close()

    return tmbeta_strands

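# The cache written above is a JSON object mapping each seqid to its list
# of [start, end] strand boundary pairs. A tiny round-trip sketch of that
# format (hypothetical data):
if __name__ == '__main__':
    example_strands = {'seq1': [[3, 9], [14, 21]]}
    cached = json.dumps(example_strands, separators=(',', ':\n'))
    # whitespace after ':' is legal JSON, so this loads back unchanged
    assert json.loads(cached) == example_strands
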
def annotate(params, proteins,
             url='http://www.cbs.dtu.dk/ws/SignalP4/SignalP4_4_0_ws0.wsdl',
             # url='http://www.cbs.dtu.dk/ws/SignalP/SignalP_3_1_ws0.wsdl',
             batchsize=500,
             result_poll_retries=100,
             force=False):
    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in & out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "signalp_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            proteins[seqid]['is_signalp'] = annots[seqid]['is_signalp']
            proteins[seqid]['signalp_cleave_position'] = \
                annots[seqid]['signalp_cleave_position']
            citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# SignalP(web), %s > %s" % (params['fasta'], outfile))
    log_stderr("# SignalP(web): submitting in batches of %i sequences" %
               batchsize)

    seqids = proteins.keys()
    signalp_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        client = Client(url, cache=None)
        request = client.factory.create('runService.parameters')

        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')
            seq.id = seqid
            seq.seq = proteins[seqid]['seq']
            # organism can be 'euk', 'gram+' or 'gram-'
            request.organism = params['signalp4_organism']
            # default for SignalP 4.0
            # request.method = 'best'
            # default for SignalP 3.1
            # request.method = 'nn+hmm'
            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")
        response = client.service.runService(request)
        sys.stderr.write("\n")

        # pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for SignalP(web) results ")
        while response.status != "FINISHED" and retries < result_poll_retries:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries * 2))
            retries += 1
            sys.stderr.write(".")

        # if something goes wrong, note it and skip SignalP by returning
        if response.status == "REJECTED" or \
                response.status == "UNKNOWN JOBID" or \
                response.status == "QUEUE DOWN" or \
                response.status == "FAILED":
            log_stderr("\nSignalP(web) failed: '%s'" % (response.status))
            return proteins

        if retries >= result_poll_retries:
            log_stderr(
                "\nSignalP(web) failed: result_poll_retries limit exceeded (%i)"
                % (result_poll_retries))
            return proteins

        sys.stderr.write(" done !\n")

        # fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        # log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        # TODO: the better way to do this would be to save the entire SOAP
        #       response returned by client.last_received() and then parse
        #       that upon plugin invocation (above) using suds.sax. This
        #       way we save everything in the analysis, not just the
        #       details we are interested in right now.
        for res in result.ann:
            seqid = res.sequence.id
            proteins[seqid]['signalp_cleave_position'] = 0
            proteins[seqid]['is_signalp'] = False
            if len(res.annrecords) > 0:
                # range.end is the last residue of the signal peptide if
                # there is a cleavage site
                cleavage_site = int(res.annrecords.annrecord[0].range.end)
                if cleavage_site == 1:
                    cleavage_site = 0
                proteins[seqid]['signalp_cleave_position'] = cleavage_site
                # from 'comment': "Y", "N noTm" or "N TM", where "Y" means
                # a signal peptide was found
                signal_yn = res.annrecords[0][0].comment[0]
                if signal_yn == "Y":
                    proteins[seqid]['is_signalp'] = True
                else:
                    proteins[seqid]['is_signalp'] = False

            # for caching in the outfile
            if seqid not in signalp_dict:
                signalp_dict[seqid] = {}
            signalp_dict[seqid]['is_signalp'] = proteins[seqid]['is_signalp']
            signalp_dict[seqid]['signalp_cleave_position'] = \
                proteins[seqid]['signalp_cleave_position']
            signalp_dict[seqid]['program_name'] = citation['name']

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(signalp_dict, separators=(',', ':\n')))
    fh.close()

    return proteins

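# Inputs and outputs at a glance (values illustrative, not from the
# source): the plugin expects params['fasta'] and
# params['signalp4_organism'] (one of 'euk', 'gram+' or 'gram-'), and for
# every seqid it sets proteins[seqid]['is_signalp'] (bool) and
# proteins[seqid]['signalp_cleave_position'] (int, 0 when there is no
# predicted cleavage site).
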
def annotate(params, proteins,
             # url='http://www.cbs.dtu.dk/ws/LipoP/LipoP_1_0_ws0.wsdl',
             # we host our own fixed version of the WSDL for the moment
             url="http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/LipoP_1_0_ws0.wsdl",
             batchsize=2000,
             force=False):
    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in & out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "lipop_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            proteins[seqid]['is_lipop'] = annots[seqid]['is_lipop']
            proteins[seqid]['lipop_cleave_position'] = \
                annots[seqid]['lipop_cleave_position']
            citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# LipoP(web), %s > %s" % (params['fasta'], outfile))
    log_stderr(
        "# LipoP(web): submitting in batches of %i sequences" % batchsize)

    # ensure schemas are correctly imported (workaround for broken schemas;
    # unused while we host a fixed WSDL instead):
    # from suds.xsd.doctor import ImportDoctor
    # from suds.xsd.doctor import Import
    # imp = Import("http://www.cbs.dtu.dk/ws/ws-common",
    #              location="http://www.cbs.dtu.dk/ws/common/ws_common_1_0b.xsd")
    # imp.filter.add("http://www.cbs.dtu.dk/ws/WSLipoP_1_0_ws0")
    # doctor = ImportDoctor(imp)
    # client = Client(url, doctor=doctor, cache=None)

    seqids = proteins.keys()
    lipop_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        client = Client(url, cache=None)
        request = client.factory.create('runService.parameters')

        # this is a horrible workaround to account for the fact that the
        # LipoP SOAP service returns null results if there are certain
        # non-alphanumeric characters in the sequence id provided. horrible.
        lipop_seq_id_mapping = {}
        seqcount = 0
        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')
            # workaround: remove any non-alphanumeric character (except '_')
            # and prepend a unique number so every id stays unique after
            # mangling
            newseqid = str(seqcount) + re.sub(r'[^\w]', "", seqid)
            seqcount += 1
            lipop_seq_id_mapping[newseqid] = seqid
            seq.id = newseqid
            seq.seq = proteins[seqid]['seq']
            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")
        try:
            response = client.service.runService(request)
        except urllib2.URLError as e:
            log_stderr("ERROR LipoP(web) failed: '%s'" % str(e.reason))
            return proteins
        sys.stderr.write("\n")

        # pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for LipoP(web) results ")
        while response.status != "FINISHED" and retries < 12:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries ** 2))
            retries += 1
            sys.stderr.write(".")

        # if something goes wrong, note it and skip LipoP by returning
        if response.status == "REJECTED" or \
                response.status == "UNKNOWN JOBID" or \
                response.status == "QUEUE DOWN" or \
                response.status == "FAILED":
            log_stderr("LipoP(web) failed: '%s'" % (response.status))
            return proteins

        sys.stderr.write(" done !\n")

        # fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        if __DEBUG__:
            log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        # TODO: the better way to do this would be to save the entire SOAP
        #       response returned by client.last_received() and then parse
        #       that upon plugin invocation (above) using suds.sax. This
        #       way we save everything in the analysis, not just the
        #       details we are interested in right now.
        for res in result.ann:
            seqid = lipop_seq_id_mapping[res.sequence.id]
            # init as if no lipop hit, may be reset below
            proteins[seqid]['is_lipop'] = False
            proteins[seqid]['lipop_cleave_position'] = 0
            proteins[seqid]['lipop_im_retention_signal'] = False
            if len(res.annrecords) > 0:
                # range.begin is the first residue (Cys) of the mature
                # protein if there is an SpII cleavage site
                for annrec in res.annrecords.annrecord:
                    if annrec.feature == "CleavII":
                        proteins[seqid]['lipop_cleave_position'] = int(
                            annrec.range.begin)
                        proteins[seqid]['is_lipop'] = True

                        # check for an E. coli style inner membrane
                        # retention signal, Asp+2 from the cleavage site.
                        # There are other apparent retention signals in
                        # E. coli and other gram- bacteria in addition to
                        # the Asp+2 which we don't detect here (yet).
                        # (Yamaguchi et al, 1988; Tokuda and Matsuyama,
                        # 2005 [review])
                        plus_two = proteins[seqid]['lipop_cleave_position'] + 1
                        if proteins[seqid]['seq'][plus_two] == 'D':
                            proteins[seqid]['lipop_im_retention_signal'] = True

            # for caching in the outfile
            if seqid not in lipop_dict:
                lipop_dict[seqid] = {}
            lipop_dict[seqid]['is_lipop'] = proteins[seqid]['is_lipop']
            lipop_dict[seqid]['lipop_cleave_position'] = \
                proteins[seqid]['lipop_cleave_position']
            lipop_dict[seqid]['lipop_im_retention_signal'] = \
                proteins[seqid]['lipop_im_retention_signal']
            lipop_dict[seqid]['program_name'] = citation['name']

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(lipop_dict, separators=(',', ':\n')))
    fh.close()

    return proteins

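# A standalone sketch of the Asp+2 retention rule applied above
# (hypothetical helper, not part of the plugin; 'cys_index' is the
# 0-based index of the mature lipoprotein's N-terminal Cys, the "+1"
# residue):
def _has_asp_plus_two(seq, cys_index):
    # the residue immediately after the +1 Cys is the +2 position
    plus_two = cys_index + 1
    return plus_two < len(seq) and seq[plus_two] == 'D'
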
def annotate(params, proteins,
             force=False):
    """
    DEPRECATED: The TMB-HUNT server appears to be permanently offline.

    Uses the TMB-HUNT web service
    (http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi)
    to predict if proteins are outer membrane beta-barrels.

    NOTE: In my limited testing, TMB-HUNT tends to perform very poorly in
    terms of false positives and false negatives. I'd suggest using only
    BOMP.
    """
    # TODO: automatically split large sets into multiple jobs;
    #       TMB-HUNT will only take 10000 seqs at a time
    if len(proteins) >= 10000:
        log_stderr(
            "# ERROR: TMB-HUNT(web): can't take more than 10,000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    out = 'tmbhunt.out'
    log_stderr("# TMB-HUNT(web) %s > %s" % (params['fasta'], out))

    if not force and os.path.isfile(out):
        log_stderr("# -> skipped: %s already exists" % out)
        return parse_tmbhunt(proteins, out)

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__:
        twill.set_output(StringIO.StringIO())

    go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi")
    if __DEBUG__:
        showforms()

    # read up the FASTA format seqs
    fh = open(params['fasta'], 'r')
    fasta_seqs = fh.read()
    fh.close()

    # fill out the form
    fv("1", "sequences", fasta_seqs)
    submit()
    if __DEBUG__:
        showlinks()

    # small jobs will lead us straight to the results, big jobs
    # go via a 'waiting' page which we skip past if we get it
    job_id = None
    try:
        # we see this with big jobs
        result_table_url = follow(
            "http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output.*.html")
        job_id = result_table_url.split('tmp_output')[-1:][0].split('.')[0]
    except:
        # small jobs take us straight to the html results table
        pass

    # parse the job_id from the url, since due to a bug in TMB-HUNT the
    # link on the results page from large jobs is wrong
    if not job_id:
        job_id = follow("Full results").split('/')[-1:][0].split('.')[0]
    log_stderr(
        "# TMB-HUNT(web) job_id is: %s <http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output%s.html>"
        % (job_id, job_id))

    # polling until TMB-HUNT finishes
    # TMB-HUNT advises that 4000 sequences take ~10 mins,
    # so we poll a little faster than that
    polltime = (len(proteins) * 0.1) + 2
    while True:
        log_stderr("# TMB-HUNT(web): waiting another %i sec ..." % (polltime))
        time.sleep(polltime)
        try:
            go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/%s.txt"
               % (job_id))
            break
        except:
            polltime = polltime * 2
            if polltime >= 7200:  # 2 hours
                log_stderr("# TMB-HUNT error: Taking too long.")
                return

    txt_out = show()

    # write raw TMB-HUNT results
    fh = open(out, 'w')
    fh.write(txt_out)
    fh.close()

    return parse_tmbhunt(proteins, out)

def annotate(params, proteins,
             batchsize=500,
             force=False):
    """
    This plugin interfaces with the TMHMM web interface (for humans) and
    scrapes the results. This is a silly way to do it, since there is a
    SOAP service ... however when the SOAP service goes down, as it does
    from time to time, this plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work
        # around some quirks in the CBS CGI (the server expects parameters
        # in a certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")
        ])

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {
            "User-Agent": "python-requests/%s (inmembrane/%s)" %
                          (requests.__version__, inmembrane.__version__)
        }
        r_post = requests.post(url, data=payload, files=files,
                               headers=headers)

        # HACK: the initial POST throws us a 302 redirect and we grab the
        # redirect url from the text (... not sure why requests'
        # allow_redirects option doesn't handle this transparently)
        pollingurl = r_post.url + r_post.text.split("Location: ")[1]
        r = requests.get(pollingurl)

        if __DEBUG__:
            log_stderr(r.text)
        # Example:
        #
        # <HTML>
        # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
        # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
        #
        # <script LANGUAGE="JavaScript"><!--
        # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
        # //--></script>
        # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()

        # sometimes we get a polling page, other times the result page is
        # sent immediately.
        if "<title>Job status of" in r.text:
            r = r.text.replace("<noscript>", "").replace("</noscript", "")
            soup = BeautifulSoup(r)
            resultlink = soup.findAll('a')[0]['href']
            if __DEBUG__:
                log_stderr(resultlink)

            # try grabbing the result, then keep polling until it's ready
            sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
            time.sleep(len(proteins) / 500)
            resultpage = requests.get(resultlink).text
            retries = 0
            while ("<title>Job status of" in resultpage) and retries < 10:
                sys.stderr.write(".")
                time.sleep(len(proteins) / 100 + retries ** 2)
                resultpage = requests.get(resultlink).text
                retries += 1
        else:
            resultpage = r.text

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins

def annotate(params, proteins,
             # url='http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws0.wsdl',
             # url='http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws1.wsdl',
             url="http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/TMHMM_2_0_ws0.wsdl",
             batchsize=2000,
             force=False):
    mapping = {'TMhelix': 'tmhmm_helices',
               'outside': 'tmhmm_outer_loops',
               'inside': 'tmhmm_inner_loops'}

    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in & out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "tmhmm_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            for k in mapping.values():
                proteins[seqid][k] = annots[seqid][k]
            citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# TMHMM(web), %s > %s" % (params['fasta'], outfile))
    log_stderr("# TMHMM(web): submitting in batches of %i sequences" %
               batchsize)

    seqids = proteins.keys()
    tmhmm_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        client = Client(url, cache=None)
        request = client.factory.create('runService.parameters')

        # this is a horrible workaround to account for the fact that the
        # TMHMM SOAP service (like the LipoP one) returns null results if
        # there are certain non-alphanumeric characters in the sequence id
        # provided. horrible.
        tmhmm_seq_id_mapping = {}
        seqcount = 0
        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')
            # workaround: remove any non-alphanumeric character (except '_')
            # and prepend a unique number so every id stays unique after
            # mangling
            newseqid = str(seqcount) + re.sub(r'[^\w]', "", seqid)
            seqcount += 1
            tmhmm_seq_id_mapping[newseqid] = seqid
            seq.id = newseqid
            seq.seq = proteins[seqid]['seq']
            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")
        response = client.service.runService(request)
        sys.stderr.write("\n")

        # pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for TMHMM(web) results ")
        while response.status != "FINISHED" and retries < 100:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries * 2))
            retries += 1
            sys.stderr.write(".")

        # if something goes wrong, note it and skip TMHMM by returning
        if response.status == "REJECTED" or \
                response.status == "UNKNOWN JOBID" or \
                response.status == "QUEUE DOWN" or \
                response.status == "FAILED":
            log_stderr("TMHMM(web) failed: '%s'" % (response.status))
            return proteins

        sys.stderr.write(" done !\n")

        # fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        if __DEBUG__:
            log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        for res in result.ann:
            seqid = tmhmm_seq_id_mapping[res.sequence.id]
            if 'tmhmm_helices' not in proteins[seqid]:
                proteins[seqid].update({
                    'tmhmm_helices': [],
                    'tmhmm_inner_loops': [],
                    'tmhmm_outer_loops': []
                })
            if len(res.annrecords) > 0:
                for segment in res.annrecords.annrecord:
                    if segment.comment in mapping:
                        tmhmmkey = mapping[segment.comment]
                        proteins[seqid][tmhmmkey].append(
                            (segment.range.begin, segment.range.end))

            # for caching in the outfile: extract a copy of the results
            # from the proteins dictionary, ready to be written to the
            # cache file
            if seqid not in tmhmm_dict:
                tmhmm_dict[seqid] = {}
            for k in mapping.values():
                tmhmm_dict[seqid][k] = proteins[seqid][k]
            tmhmm_dict[seqid]['program_name'] = citation['name']

    if __DEBUG__:
        print_proteins(proteins)

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(tmhmm_dict, separators=(',', ':\n')))
    fh.close()

    return proteins

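# After a successful run, each protein record carries paired (begin, end)
# residue ranges for every predicted topology segment, e.g. (values
# illustrative, not from the source):
#
#   proteins['seq1'] == {'seq': 'MKT...', 'name': 'seq1',
#                        'tmhmm_helices': [(5, 27), (40, 62)],
#                        'tmhmm_inner_loops': [(1, 4), (28, 39)],
#                        'tmhmm_outer_loops': [(63, 80)]}
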
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the SignalP web interface (for humans) and
    scrapes the results. There once was a SOAP service but it was
    discontinued, so now we use this.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "signalp_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        proteins = parse_signalp(resultpage.splitlines(), proteins,
                                 id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work
        # around some quirks in the CBS CGI (the server expects parameters
        # in a certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/SignalP-4.1/SignalP.cf"),
            ("SEQPASTE", ""),
            ("orgtype", params['signalp4_organism']),  # gram+, gram-, euk
            ("Dcut-type", "default"),
            ("method", "best"),  # best, notm
            ("minlen", ""),
            ("trunc", ""),
            ("format", "short")  # summary, short, long, all
        ])

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'SEQSUB': StringIO(safe_fasta)}

        log_stderr(
            "# SignalP(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {
            "User-Agent": "python-requests/%s (inmembrane/%s)" %
                          (requests.__version__, inmembrane.__version__)
        }
        r_post = requests.post(url, data=payload, files=files,
                               headers=headers)

        if __DEBUG__:
            log_stderr(r_post.text)
        # Example:
        #
        # <HTML>
        # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
        # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
        #
        # <script LANGUAGE="JavaScript"><!--
        # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
        # //--></script>
        # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r_post.text:
            log_stderr(r_post.text)
            sys.exit()

        r_post_clean = r_post.text.replace("<noscript>", "").replace(
            "</noscript", "")
        soup = BeautifulSoup(r_post_clean)
        pollingurl = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + pollingurl + "\n")

        # try grabbing the result, then keep polling until it's ready
        sys.stderr.write("# Waiting for SignalP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)
        resultpage = requests.get(pollingurl).text
        retries = 0
        while ("<title>Job status of" in resultpage) and (retries < 15):
            sys.stderr.write(".")
            time.sleep(waittime)
            resultpage = requests.get(pollingurl).text
            waittime += 1
            retries += 1
            waittime = min(waittime, 20)

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
        # Example:
        #
        # <pre>
        # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
        # # Cut-off=-3
        # lcl_AE004092.1_cdsid_AAK33146.1 LipoP1.0:Best CYT 1 1 -0.200913
        # <P>
        # <hr>
        # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
        # # Cut-off=-3
        # lcl_AE004092.1_cdsid_AAK33147.1 LipoP1.0:Best CYT 1 1 -0.200913
        # <P>
        # <hr>

        allresultpages += html2text(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_signalp(allresultpages.splitlines(), proteins,
                             id_mapping=id_mapping)
    return proteins

def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the SignalP web interface (for humans) and
    scrapes the results. There was once a SOAP service, but it was
    discontinued, so now we do it this way.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "signalp_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        proteins = parse_signalp(resultpage.splitlines(), proteins,
                                 id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get a batch of sequences in fasta format with munged ids
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/SignalP-4.1/SignalP.cf"),
            ("SEQPASTE", ""),
            ("orgtype", params['signalp4_organism']),  # gram+, gram-, euk
            ("Dcut-type", "default"),
            ("method", "best"),  # best, notm
            ("minlen", ""),
            ("trunc", ""),
            ("format", "short"),  # summary, short, long, all
        ])
        files = {'SEQSUB': StringIO(safe_fasta)}

        log_stderr("# SignalP(scrape_web), %s > %s" %
                   (params['fasta'], outfile))

        headers = {"User-Agent":
                       "python-requests/%s (inmembrane/%s)" %
                       (requests.__version__, inmembrane.__version__)}
        r_post = requests.post(url, data=payload, files=files,
                               headers=headers)

        if __DEBUG__:
            log_stderr(r_post.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if the job is rejected ...)
        # (an alternative regex-based extraction is sketched after this
        # function)
        if "Job rejected" in r_post.text:
            log_stderr(r_post.text)
            sys.exit()
        r_post_clean = r_post.text.replace("<noscript>",
                                           "").replace("</noscript", "")
        soup = BeautifulSoup(r_post_clean)
        pollingurl = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + pollingurl + "\n")

        # grab the result, polling until it is ready
        sys.stderr.write("# Waiting for SignalP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)
        resultpage = requests.get(pollingurl).text
        retries = 0
        while ("<title>Job status of" in resultpage) and (retries < 15):
            sys.stderr.write(".")
            time.sleep(waittime)
            resultpage = requests.get(pollingurl).text
            waittime = min(waittime + 1, 20)
            retries += 1

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        # we store the cleaned up result pages concatenated together
        allresultpages += html2text(resultpage)

    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_signalp(allresultpages.splitlines(), proteins,
                             id_mapping=id_mapping)
    return proteins
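# The polling URL above could equally be extracted from the Webface
# job-submission page with a regular expression instead of BeautifulSoup.
# A minimal sketch, assuming the response looks like the example in the
# debug comment above; 'extract_polling_url' is a hypothetical helper and
# not part of inmembrane (the import would normally sit at the top of the
# module):
import re

def extract_polling_url(html):
    """Return the first href on a CBS Webface job-submission page, or None."""
    match = re.search(r'href="([^"]+)"', html)
    return match.group(1) if match else None

# e.g. pollingurl = extract_polling_url(r_post.text)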
def annotate(params, proteins,
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html",
             force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict whether a protein sequence is likely to be an outer membrane
    beta-barrel.

    Note that the default URL we use is different to the regular form used
    by web browsers, since we need to bypass some AJAX fun.
    """
    # TODO: automatically split large sets into multiple jobs, since
    #       TMBETADISC seems not to accept more than ~5000 seqs at a time
    #       (a sketch of one approach follows this function)
    if len(proteins) >= 5000:
        log_stderr("# ERROR: TMBETADISC-RBF(web): tends to fail with "
                   "> ~5000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    # TODO: set a User-Agent header on the requests calls below
    # agent("Python-urllib/%s (requests; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tmbetadisc_output(fh.read(), proteins)
        fh.close()
        return proteins

    # map the user defined method to the form value expected by the server
    # (values are submitted verbatim as the 'select' field)
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    # assume amino acid composition when tmbetadisc_rbf_method is unset
    # (the original left 'method' unassigned here, giving a NameError below)
    method = method_map["aa"]
    if dict_get(params, 'tmbetadisc_rbf_method'):
        try:
            method = method_map[params['tmbetadisc_rbf_method']]
        except KeyError:
            log_stderr("# ERROR: Invalid setting for tmbetadisc_rbf_method. "
                       "Must be set to aa, dp, aadp or pssm.")
            sys.exit()

    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}

    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)
    waiting_page = response.content
    if __DEBUG__:
        log_stderr(waiting_page)

    # find the URL of the result page on the waiting page
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    time.sleep(5)

    output = requests.get(result_url).content
    if __DEBUG__:
        log_stderr(output)

    # write raw output to a file
    fh = open(outfn, 'w')
    fh.write(output)
    fh.close()

    proteins = parse_tmbetadisc_output(output, proteins)
    return proteins
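# A possible shape for the TODO above (a sketch only, not wired into this
# plugin): split the input into batches below the ~5000 sequence limit and
# submit each batch separately. 'chunk_seqids' is a hypothetical helper
# name, not part of inmembrane:
def chunk_seqids(seqids, batchsize=4999):
    """Yield successive batches of at most batchsize seqids."""
    for i in range(0, len(seqids), batchsize):
        yield seqids[i:i + batchsize]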
def annotate(params, proteins, batchsize=500, force=False):
    """
    This plugin interfaces with the TMHMM web interface (for humans) and
    scrapes the results.

    This is a silly way to do it, since there is a SOAP service ... however
    when the SOAP service goes down, as it does from time to time, this
    plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get a batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")])
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" %
                   (params['fasta'], outfile))

        headers = {"User-Agent":
                       "python-requests/%s (inmembrane/%s)" %
                       (requests.__version__, inmembrane.__version__)}
        r = requests.post(url, data=payload, files=files, headers=headers)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if the job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()
        soup = BeautifulSoup(r.text)
        resultlink = soup.findAll('a')[0]['href']
        if __DEBUG__:
            log_stderr(resultlink)

        # brief pause, then grab the results at the result url, polling with
        # a quadratic backoff while the job sits in the queue (a generic
        # version of this loop is sketched after this function)
        sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
        time.sleep(len(proteins) / 500)
        resultpage = requests.post(resultlink).text
        retries = 0
        while ("Webservices : Job queue" in resultpage) and retries < 10:
            sys.stderr.write(".")
            time.sleep(len(proteins) / 100 + retries ** 2)
            resultpage = requests.post(resultlink).text
            retries += 1

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        # we store the cleaned up result pages concatenated together
        allresultpages += clean_result_page(resultpage)

    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
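# The retry loop above backs off quadratically with the retry count. The
# same pattern, factored into a generic helper (a sketch only, assuming the
# module-level 'time' import used elsewhere in this file; 'poll_until' is a
# hypothetical name, not part of inmembrane):
def poll_until(fetch, still_queued, max_retries=10, base_wait=1.0):
    """Call fetch() until still_queued(page) is False or retries run out."""
    page = fetch()
    retries = 0
    while still_queued(page) and retries < max_retries:
        time.sleep(base_wait + retries ** 2)
        page = fetch()
        retries += 1
    return page

# e.g. resultpage = poll_until(
#          lambda: requests.post(resultlink).text,
#          lambda page: "Webservices : Job queue" in page)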
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the LipoP web interface (for humans) and
    scrapes the results.

    This is a silly way to do it, since there is a SOAP service ... however
    when the SOAP service goes down, as it does from time to time, this
    plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "lipop_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        proteins = parse_lipop(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get a batch of sequences in fasta format with munged ids
        # (workaround for lipop sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/LipoP-1.0/LipoP.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")])
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# LipoP(scrape_web), %s > %s" %
                   (params['fasta'], outfile))

        headers = {"User-Agent":
                       "python-requests/%s (inmembrane/%s)" %
                       (requests.__version__, inmembrane.__version__)}
        r = requests.post(url, data=payload, files=files, headers=headers)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if the job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()
        r_clean = r.text.replace("<noscript>", "").replace("</noscript", "")
        soup = BeautifulSoup(r_clean)
        resultlink = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + resultlink + "\n")

        # grab the result, polling until it is ready
        sys.stderr.write("# Waiting for LipoP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)
        resultpage = requests.get(resultlink).text
        retries = 0
        while ("<title>Job status of" in resultpage) and (retries < 15):
            sys.stderr.write(".")
            time.sleep(waittime)
            resultpage = requests.get(resultlink).text
            waittime = min(waittime + 1, 20)
            retries += 1

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
            # Example:
            #
            # <pre>
            # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33146.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>
            # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33147.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>

        # we store the cleaned up result pages concatenated together
        allresultpages += clean_result_page(resultpage)

    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_lipop(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
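# A minimal usage sketch for this plugin (hypothetical values; 'fasta' is
# the only field of params read above, and proteins is the seqid-keyed dict
# of annotation dicts used throughout inmembrane):
#
#   params = {'fasta': 'input.fasta'}
#   proteins = annotate(params, proteins, batchsize=2000, force=True)
#
# With force=False (the default), a cached lipop_scrape_web.out from an
# earlier run is parsed instead of resubmitting the job to the server.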