def annotate(params, proteins, batchsize=500, force=False):
  """
  This plugin interfaces with the TMHMM web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however, when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

  baseurl = "http://www.cbs.dtu.dk"
  url = baseurl + "/cgi-bin/nph-webface"

  # grab the cached results if present
  outfile = "tmhmm_scrape_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    proteins, id_mapping = generate_safe_seqids(proteins)
    fh = open(outfile, 'r')
    resultpage = fh.read()
    fh.close()
    #soup = BeautifulSoup(resultpage)
    proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
    return proteins

  proteins, id_mapping = generate_safe_seqids(proteins)

  seqids = list(proteins.keys())  # copy, so batches can be deleted as they are processed
  allresultpages = ""
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]

    # get batch of sequences in fasta format with munged ids 
    # (workaround for potential tmhmm sequence id munging)
    safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch, 
                                             use_safe_seqid=True)

    # we use an OrderedDict rather than a normal dictionary to work around 
    # some quirks in the CBS CGI (the server expects parameters in a certain 
    # order in the HTTP headers).
    payload = OrderedDict([('configfile',
                          "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
                          ("SEQ",""),
                          ("outform","-noplot")])

    #files = {'seqfile': open(params['fasta'], 'rb')}
    files = {'seqfile': StringIO(safe_fasta)}

    log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

    headers = {"User-Agent": 
               "python-requests/%s (inmembrane/%s)" % 
               (requests.__version__, inmembrane.__version__) }
    r = requests.post(url, data=payload, files=files, headers=headers)
    if __DEBUG__:
      log_stderr(r.text)
      # Example:
      #
      # <HTML>
      # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
      # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
      #
      # <script LANGUAGE="JavaScript"><!--
      # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
      # //--></script>
      # </HTML>

    # extract the result URL (or die if job is rejected ...)
    if "Job rejected" in r.text:
      sys.stderr.write(r.text)
      sys.exit()
    soup = BeautifulSoup(r.text)

    resultlink = soup.findAll('a')[0]['href']
    # the href in the submission page is usually relative; make it absolute
    if resultlink.startswith("/"):
      resultlink = baseurl + resultlink
    if __DEBUG__:
      log_stderr(resultlink)

    # brief pause, then grab the results at the result url
    sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
    time.sleep(len(proteins)/500)
    resultpage = requests.post(resultlink).text
    retries = 0
    while ("Webservices : Job queue" in resultpage) and retries < 10:
      sys.stderr.write(".")
      time.sleep(len(proteins)/100 + retries**2)
      resultpage = requests.post(resultlink).text
      retries += 1

    sys.stderr.write(" .. done !\n")

    if __DEBUG__:
      log_stderr(resultpage)

    allresultpages += clean_result_page(resultpage)
  
  # we store the cleaned up result pages concatenated together
  fh = open(outfile, 'a+')
  fh.write(allresultpages)
  fh.close()

  proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
  return proteins
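All of the scraper plugins in this listing assume the same module-level context. The block below is a minimal sketch of that context, inferred from the calls in the code: the standard-library and third-party imports are certain from usage, but the inmembrane helpers (log_stderr, generate_safe_seqids, proteins_to_fasta, clean_result_page, html2text and the parse_* functions) live elsewhere in the package, so their import lines are left as a comment rather than guessed.

import os
import sys
import time
from collections import OrderedDict
from StringIO import StringIO            # Python 2; use io.StringIO on Python 3

import requests
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API; bs4 also parses these pages

import inmembrane
# log_stderr, generate_safe_seqids, proteins_to_fasta, clean_result_page,
# html2text, parse_tmhmm, parse_signalp and parse_lipop are provided by the
# inmembrane package and its plugin modules (import paths omitted here).

__DEBUG__ = False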
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the SignalP web interface (for humans) and
    scrapes the results. There once was a SOAP service but it was discontinued,
    so now we use this.
    """

    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "signalp_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_signalp(resultpage.splitlines(),
                                 proteins,
                                 id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = list(proteins.keys())  # copy, so batches can be deleted as they are processed
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        safe_fasta = proteins_to_fasta(proteins,
                                       seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a certain
        # order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/SignalP-4.1/SignalP.cf"),
            ("SEQPASTE", ""),
            ("orgtype", params['signalp4_organism']),  # gram+, gram-, euk
            ("Dcut-type", "default"),
            ("method", "best"),  # best, notm
            ("minlen", ""),
            ("trunc", ""),
            ("format", "short")
        ])  # summary, short, long, all

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'SEQSUB': StringIO(safe_fasta)}

        log_stderr("# SignalP(scrape_web), %s > %s" %
                   (params['fasta'], outfile))

        headers = {
            "User-Agent":
            "python-requests/%s (inmembrane/%s)" %
            (requests.__version__, inmembrane.__version__)
        }
        r_post = requests.post(url, data=payload, files=files, headers=headers)

        if __DEBUG__:
            log_stderr(r_post.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r_post.text:
            log_stderr(r_post.text)
            sys.exit()

        r_post_clean = (r_post.text.replace("<noscript>", "")
                                   .replace("</noscript>", ""))
        soup = BeautifulSoup(r_post_clean)
        pollingurl = soup.findAll('a')[0]['href']
        # the href is usually relative; make it absolute before polling
        if pollingurl.startswith("/"):
            pollingurl = baseurl + pollingurl
        sys.stderr.write("# Fetching from: " + pollingurl + "\n")
        # try grabbing the result, then keep polling until they are ready
        sys.stderr.write("# Waiting for SignalP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)  # (len(proteins)/500)
        resultpage = requests.get(pollingurl).text
        retries = 0
        while (("<title>Job status of" in resultpage) and (retries < 15)):
            sys.stderr.write(".")
            time.sleep(waittime)  # (len(proteins)/500)
            resultpage = requests.get(pollingurl).text
            waittime += 1
            retries += 1
            waittime = min(waittime, 20)

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
            # Example:
            #
            #   <pre>
            # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33146.1	LipoP1.0:Best	CYT	1	1	-0.200913
            # <P>
            # <hr>
            # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33147.1	LipoP1.0:Best	CYT	1	1	-0.200913
            # <P>
            # <hr>

        allresultpages += html2text(
            resultpage)  # += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_signalp(allresultpages.splitlines(),
                             proteins,
                             id_mapping=id_mapping)
    return proteins
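A rough usage sketch for any of these annotate plugins, for orientation only: the 'fasta' and 'signalp4_organism' keys are the only params fields the snippets actually read, and the shape of the proteins dict (seqid mapped to a per-protein dict) is an assumption about the surrounding inmembrane pipeline rather than something shown above.

# hypothetical call site; in inmembrane the params and proteins structures
# are built by the pipeline rather than written out by hand like this
params = {
    'fasta': 'input.fasta',          # only used in log messages here
    'signalp4_organism': 'gram+',    # gram+, gram- or euk
}
proteins = {
    'seq001': {'name': 'seq001', 'seq': 'MKQSTIALALLPLLFTPVTKA'},
}
proteins = annotate(params, proteins)  # the parse_* helpers fold results back in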
def annotate(params, proteins, batchsize=500, force=False):
    """
  This plugin inferfaces with the TMHMM web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        #soup = BeautifulSoup(resultpage)
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = list(proteins.keys())  # copy, so batches can be deleted as they are processed
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins,
                                       seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a certain
        # order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
            ("SEQ", ""),
            ("outform", "-noplot")
        ])

        #files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {
            "User-Agent":
            "python-requests/%s (inmembrane/%s)" %
            (requests.__version__, inmembrane.__version__)
        }
        r_post = requests.post(url, data=payload, files=files, headers=headers)

        # HACK: the initial POST throws us a 302 redirect and we grab the redirect url from the text
        #       (... not sure why requests' allow_redirects=True option doesn't handle this transparently)
        pollingurl = r_post.url + r_post.text.split("Location: ")[1]
        r = requests.get(pollingurl)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()

        # sometimes we get a polling page, other times the result page is sent immediately.
        if ("<title>Job status of" in r.text):
            r = r.text.replace("<noscript>", "").replace("</noscript", "")
            soup = BeautifulSoup(r)
            resultlink = soup.findAll('a')[0]['href']
            if __DEBUG__:
                log_stderr(resultlink)

            # try grabbing the result, then keep polling until they are ready
            sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
            time.sleep(len(proteins) / 500)
            resultpage = requests.get(resultlink).text
            retries = 0
            while ("<title>Job status of" in resultpage) and retries < 10:
                sys.stderr.write(".")
                time.sleep(len(proteins) / 100 + retries**2)
                resultpage = requests.get(resultlink).text
                retries += 1
        else:
            resultpage = r.text

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
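Every plugin in this listing repeats the same wait-and-poll loop against the CBS job queue. The helper below is one way that loop could be factored out; the name poll_for_result and its default arguments are hypothetical, not part of inmembrane.

def poll_for_result(pollingurl, pending_marker="<title>Job status of",
                    max_retries=15, waittime=1.0, max_wait=20.0):
    """Fetch pollingurl until the 'job pending' marker disappears or
    max_retries is exhausted, backing off by one second per attempt.

    Hypothetical refactoring of the inline polling loops above; it is not
    part of inmembrane itself.
    """
    resultpage = requests.get(pollingurl).text
    retries = 0
    while pending_marker in resultpage and retries < max_retries:
        sys.stderr.write(".")
        time.sleep(waittime)
        resultpage = requests.get(pollingurl).text
        waittime = min(waittime + 1, max_wait)
        retries += 1
    return resultpage

Each plugin could then replace its inline loop with a single resultpage = poll_for_result(pollingurl) call.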
def annotate(params, proteins, batchsize=2000, force=False):
  """
  This plugin interfaces with the LipoP web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however, when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

  baseurl = "http://www.cbs.dtu.dk"
  url = baseurl + "/cgi-bin/webface2.fcgi"

  # grab the cached results if present
  outfile = "lipop_scrape_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    proteins, id_mapping = generate_safe_seqids(proteins)
    fh = open(outfile, 'r')
    resultpage = fh.read()
    fh.close()
    #soup = BeautifulSoup(resultpage)
    proteins = parse_lipop(resultpage, proteins, id_mapping=id_mapping)
    return proteins

  proteins, id_mapping = generate_safe_seqids(proteins)

  seqids = list(proteins.keys())  # copy, so batches can be deleted as they are processed
  allresultpages = ""
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]

    # get batch of sequences in fasta format with munged ids 
    # (workaround for lipop sequence id munging)
    safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                             use_safe_seqid=True)

    # we use an OrderedDict rather than a normal dictionary to work around 
    # some quirks in the CBS CGI (the server expects parameters in a certain 
    # order in the HTTP headers).
    payload = OrderedDict([('configfile',
                          "/usr/opt/www/pub/CBS/services/LipoP-1.0/LipoP.cf"),
                          ("SEQ",""),
                          ("outform","-noplot")])

    #files = {'seqfile': open(params['fasta'], 'rb')}
    files = {'seqfile': StringIO(safe_fasta)}

    log_stderr("# LipoP(scrape_web), %s > %s" % (params['fasta'], outfile))

    headers = {"User-Agent": 
               "python-requests/%s (inmembrane/%s)" %
               (requests.__version__, inmembrane.__version__) }
    r = requests.post(url, data=payload, files=files, headers=headers)
    if __DEBUG__:
      log_stderr(r.text)
      # Example:
      #
      # <HTML>
      # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
      # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
      #
      # <script LANGUAGE="JavaScript"><!--
      # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
      # //--></script>
      # </HTML>

    # extract the result URL (or die if job is rejected ...)
    if "Job rejected" in r.text:
      sys.stderr.write(r.text)
      sys.exit()

    cleaned = r.text.replace("<noscript>", "").replace("</noscript>", "")
    soup = BeautifulSoup(cleaned)
    resultlink = soup.findAll('a')[0]['href']
    # the href is usually relative; make it absolute before polling
    if resultlink.startswith("/"):
      resultlink = baseurl + resultlink
    sys.stderr.write("# Fetching from: " + resultlink + "\n")
    # try grabbing the result, then keep polling until they are ready
    sys.stderr.write("# Waiting for LipoP(scrape_web) results ")
    waittime = 1.0
    time.sleep(waittime) #(len(proteins)/500)
    resultpage = requests.get(resultlink).text
    retries = 0
    while (("<title>Job status of" in resultpage) and (retries < 15)):
        sys.stderr.write(".")
        time.sleep(waittime) #(len(proteins)/500)
        resultpage = requests.get(resultlink).text
        waittime += 1;
        retries += 1
        waittime = min(waittime, 20)

    sys.stderr.write(" .. done !\n")

    if __DEBUG__:
      log_stderr(resultpage)
      # Example:
      #
      #   <pre>
      # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
      # # Cut-off=-3
      # lcl_AE004092.1_cdsid_AAK33146.1	LipoP1.0:Best	CYT	1	1	-0.200913
      # <P>
      # <hr>
      # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
      # # Cut-off=-3
      # lcl_AE004092.1_cdsid_AAK33147.1	LipoP1.0:Best	CYT	1	1	-0.200913
      # <P>
      # <hr>

    allresultpages += clean_result_page(resultpage)

  # we store the cleaned up result pages concatenated together
  fh = open(outfile, 'a+')
  fh.write(allresultpages)
  fh.close()

  proteins = parse_lipop(allresultpages, proteins, id_mapping=id_mapping)
  return proteins
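None of the requests.post / requests.get calls above set a timeout or check the HTTP status, so a wedged CBS server stalls the whole pipeline. The sketch below shows one way the submission step could be hardened; the submit_to_cbs name, the 60-second timeout and the RuntimeError are hypothetical choices, not existing inmembrane behaviour.

def submit_to_cbs(url, payload, fasta_text, file_field="seqfile", timeout=60):
    """Hypothetical hardened wrapper around the submission POSTs above:
    adds an explicit timeout and fails loudly on HTTP errors or rejected jobs.
    """
    headers = {"User-Agent":
               "python-requests/%s (inmembrane/%s)" %
               (requests.__version__, inmembrane.__version__)}
    files = {file_field: StringIO(fasta_text)}
    r = requests.post(url, data=payload, files=files, headers=headers,
                      timeout=timeout)
    r.raise_for_status()                 # turn 4xx/5xx responses into exceptions
    if "Job rejected" in r.text:
        raise RuntimeError("CBS job rejected:\n" + r.text)
    return r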