def annotate(params, proteins, batchsize=500, force=False):
    """
    This plugin interfaces with the TMHMM web interface (for humans) and
    scrapes the results. This is a silly way to do it, since there is
    a SOAP service ... however when the SOAP service goes down, as it does
    from time to time, this plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile', "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")])

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {"User-Agent":
                   "python-requests/%s (inmembrane/%s)" %
                   (requests.__version__, inmembrane.__version__)}

        r = requests.post(url, data=payload, files=files, headers=headers)
        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()
        soup = BeautifulSoup(r.text)
        resultlink = soup.findAll('a')[0]['href']
        if __DEBUG__:
            log_stderr(resultlink)

        # brief pause, then grab the results at the result url
        sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
        time.sleep(len(proteins) / 500)
        resultpage = requests.post(resultlink).text
        retries = 0
        while ("Webservices : Job queue" in resultpage) and retries < 10:
            sys.stderr.write(".")
            time.sleep(len(proteins) / 100 + retries ** 2)
            resultpage = requests.post(resultlink).text
            retries += 1

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
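

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the plugin above): the batching idiom used
# in the while-loop -- slice the first `batchsize` seqids off the front of the
# list, submit them, repeat until the list is empty -- can equivalently be
# written as a small generator. `chunked` is a hypothetical helper shown only
# to make the pattern explicit; the plugins in this file keep their explicit
# while-loops.
def chunked(seq, size):
    """Yield successive lists of at most `size` items from `seq`."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

# Usage sketch:
#   for seqid_batch in chunked(list(proteins.keys()), batchsize):
#       ...submit one batch to the web service...
# ---------------------------------------------------------------------------

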
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the SignalP web interface (for humans) and
    scrapes the results. There once was a SOAP service but it was
    discontinued, so now we use this.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "signalp_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_signalp(resultpage.splitlines(), proteins,
                                 id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/SignalP-4.1/SignalP.cf"),
            ("SEQPASTE", ""),
            ("orgtype", params['signalp4_organism']),  # gram+, gram-, euk
            ("Dcut-type", "default"),
            ("method", "best"),  # best, notm
            ("minlen", ""),
            ("trunc", ""),
            ("format", "short")])  # summary, short, long, all

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'SEQSUB': StringIO(safe_fasta)}

        log_stderr("# SignalP(scrape_web), %s > %s" %
                   (params['fasta'], outfile))

        headers = {"User-Agent":
                   "python-requests/%s (inmembrane/%s)" %
                   (requests.__version__, inmembrane.__version__)}

        r_post = requests.post(url, data=payload, files=files,
                               headers=headers)

        if __DEBUG__:
            log_stderr(r_post.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r_post.text:
            log_stderr(r_post.text)
            sys.exit()

        r_post_clean = r_post.text.replace("<noscript>", "").replace(
            "</noscript", "")
        soup = BeautifulSoup(r_post_clean)
        pollingurl = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + pollingurl + "\n")

        # try grabbing the result, then keep polling until they are ready
        sys.stderr.write("# Waiting for SignalP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)  # (len(proteins)/500)
        resultpage = requests.get(pollingurl).text
        retries = 0
        while ("<title>Job status of" in resultpage) and (retries < 15):
            sys.stderr.write(".")
            time.sleep(waittime)  # (len(proteins)/500)
            resultpage = requests.get(pollingurl).text
            waittime += 1
            retries += 1
            waittime = min(waittime, 20)

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
            # Example:
            #
            # <pre>
            # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33146.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>
            # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33147.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>

        allresultpages += html2text(resultpage)  # += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_signalp(allresultpages.splitlines(), proteins,
                             id_mapping=id_mapping)
    return proteins
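

# ---------------------------------------------------------------------------
# The polling loop above (keep re-fetching while the "<title>Job status of"
# queue page is returned, waiting a little longer each time, capped at 20 s
# and 15 retries) is shared in spirit by all of the CBS scrapers in this file.
# The sketch below factors it into a standalone helper under those same
# assumptions; `poll_cbs_result` is a hypothetical name, not part of
# inmembrane.
import sys
import time

import requests


def poll_cbs_result(polling_url, max_retries=15, start_wait=1.0, max_wait=20):
    """Fetch polling_url until the CBS job-queue page goes away or we give up."""
    waittime = start_wait
    time.sleep(waittime)
    resultpage = requests.get(polling_url).text
    retries = 0
    while ("<title>Job status of" in resultpage) and retries < max_retries:
        sys.stderr.write(".")
        time.sleep(waittime)
        resultpage = requests.get(polling_url).text
        waittime = min(waittime + 1, max_wait)
        retries += 1
    return resultpage
# ---------------------------------------------------------------------------

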
def annotate(params, proteins, batchsize=500, force=False):
    """
    This plugin interfaces with the TMHMM web interface (for humans) and
    scrapes the results. This is a silly way to do it, since there is
    a SOAP service ... however when the SOAP service goes down, as it does
    from time to time, this plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile', "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")])

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {"User-Agent":
                   "python-requests/%s (inmembrane/%s)" %
                   (requests.__version__, inmembrane.__version__)}

        r_post = requests.post(url, data=payload, files=files, headers=headers)

        # HACK: the initial POST returns a 302 redirect and we grab the
        # redirect url from the response text
        # (... not sure why the requests allow_redirects=True option doesn't
        # handle this transparently)
        pollingurl = r_post.url + r_post.text.split("Location: ")[1]
        r = requests.get(pollingurl)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()

        # sometimes we get a polling page, other times the result page is
        # sent immediately.
        if "<title>Job status of" in r.text:
            cleaned = r.text.replace("<noscript>", "").replace("</noscript", "")
            soup = BeautifulSoup(cleaned)
            resultlink = soup.findAll('a')[0]['href']
            if __DEBUG__:
                log_stderr(resultlink)

            # try grabbing the result, then keep polling until they are ready
            sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
            time.sleep(len(proteins) / 500)
            resultpage = requests.get(resultlink).text
            retries = 0
            while ("<title>Job status of" in resultpage) and retries < 10:
                sys.stderr.write(".")
                time.sleep(len(proteins) / 100 + retries ** 2)
                resultpage = requests.get(resultlink).text
                retries += 1
        else:
            resultpage = r.text

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
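

# ---------------------------------------------------------------------------
# The HACK above exists because the initial POST to nph-webface answers with
# the redirect target embedded as literal "Location: ..." text in the response
# body rather than as a header that requests will follow. The sketch below
# isolates that string handling; `extract_polling_url` is a hypothetical
# helper named here only for illustration, and it assumes the same response
# format shown in the example HTML comments above.
def extract_polling_url(post_response):
    """Build the URL to poll for results from the nph-webface POST response."""
    # everything after the literal "Location: " marker is the job-specific
    # URL fragment; note this raises IndexError if the marker is missing.
    return post_response.url + post_response.text.split("Location: ")[1]
# ---------------------------------------------------------------------------

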
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the LipoP web interface (for humans) and
    scrapes the results. This is a silly way to do it, since there is
    a SOAP service ... however when the SOAP service goes down, as it does
    from time to time, this plugin can be used as a stopgap.
    """
    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "lipop_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_lipop(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for lipop sequence id munging)
        safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a
        # certain order in the HTTP headers).
        payload = OrderedDict([
            ('configfile', "/usr/opt/www/pub/CBS/services/LipoP-1.0/LipoP.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")])

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# LipoP(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {"User-Agent":
                   "python-requests/%s (inmembrane/%s)" %
                   (requests.__version__, inmembrane.__version__)}

        r = requests.post(url, data=payload, files=files, headers=headers)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()

        cleaned = r.text.replace("<noscript>", "").replace("</noscript", "")
        soup = BeautifulSoup(cleaned)
        resultlink = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + resultlink + "\n")

        # try grabbing the result, then keep polling until they are ready
        sys.stderr.write("# Waiting for LipoP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)  # (len(proteins)/500)
        resultpage = requests.get(resultlink).text
        retries = 0
        while ("<title>Job status of" in resultpage) and (retries < 15):
            sys.stderr.write(".")
            time.sleep(waittime)  # (len(proteins)/500)
            resultpage = requests.get(resultlink).text
            waittime += 1
            retries += 1
            waittime = min(waittime, 20)

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
            # Example:
            #
            # <pre>
            # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33146.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>
            # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33147.1 LipoP1.0:Best CYT 1 1 -0.200913
            # <P>
            # <hr>

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_lipop(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
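

# ---------------------------------------------------------------------------
# generate_safe_seqids() is called by every plugin above but defined elsewhere
# in inmembrane. The sketch below shows only the idea the scrapers rely on --
# give each protein a short, web-safe id and return a mapping back to the
# original seqids so the parse_* functions can undo the renaming. It assumes
# `proteins` maps seqid -> dict of attributes and uses a hypothetical
# 'safe_seqid' key; the real helper may differ in detail.
def generate_safe_seqids_sketch(proteins):
    id_mapping = {}
    for i, seqid in enumerate(sorted(proteins)):
        safe_id = "seq%05i" % i
        proteins[seqid]['safe_seqid'] = safe_id  # assumed attribute name
        id_mapping[safe_id] = seqid
    return proteins, id_mapping
# ---------------------------------------------------------------------------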