Example #1
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename of the .hmm files found in the directory
         indicated by the 'hmm_profiles_dir' field of 'params'.
    """

    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        params['hmm_profile'] = hmm_profile

        hmm_profile = os.path.basename(params['hmm_profile'])
        hmm_name = hmm_profile.replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name

        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)

        # init proteins data structure with blank hmmsearch field first
        for seqid in proteins:
            if 'hmmsearch' not in proteins[seqid]:
                proteins[seqid]['hmmsearch'] = []

        # parse the hmmsearch output file
        seqid = None
        for l in open(hmmsearch3_out):
            words = l.split()

            if l.startswith(">>"):
                seqid = parse_fasta_header(l[3:])[0]
                continue

            if seqid is None:
                continue

            if 'conditional E-value' in l:
                evalue = float(words[-1])
                score = float(words[-5])
                if evalue <= params['hmm_evalue_max'] and \
                        score >= params['hmm_score_min']:
                    proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
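The shape of the two arguments is only implied by the code above, so here is a minimal sketch that spells it out. The key names are exactly the ones this annotate() reads; every value (paths, thresholds, sequences) is an illustrative placeholder, and the call itself is left commented out because it needs the inmembrane helpers (run, log_stderr, parse_fasta_header) and a local HMMER install.

# Illustrative inputs only; the keys match what annotate() reads above.
params = {
    'fasta': 'input.fasta',              # FASTA file passed to hmmsearch
    'hmm_profiles_dir': 'hmm_profiles',  # directory containing *.hmm profiles
    'hmmsearch3_bin': 'hmmsearch',       # HMMER 3 executable
    'hmm_evalue_max': 0.1,               # keep domains with E-value <= this
    'hmm_score_min': 10.0,               # ... and score >= this
}

# 'proteins' is keyed by sequence id; each value is a dict of annotations.
proteins = {'seq001': {'seq': 'MKKLLPTAA', 'name': 'seq001 example protein'}}

# After the call, proteins['seq001']['hmmsearch'] would be a (possibly empty)
# list of the .hmm basenames whose hits passed both thresholds.
# proteins = annotate(params, proteins)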
Example #2
def annotate(params, proteins, \
             url="http://signalfind.org/tatfind.html", force=False):
    """
    Interfaces with the TatFind web service at (http://signalfind.org/tatfind.html)
    to predict if protein sequences contain Twin-Arginine Translocation (Tat)
    signal peptides.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfn = 'tatfind.out'
    log_stderr("# TatFind(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tatfind_output(fh, proteins)
        fh.close()
        return proteins

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__: showforms()
    formfile("1", "seqFile", params["fasta"])
    submit()
    if __DEBUG__: show()

    tatfind_output = show()
    if __DEBUG__: log_stderr(tatfind_output)

    # write raw TatFind output to a file
    fh = open(outfn, 'w')
    fh.write(tatfind_output)
    fh.close()

    proteins = parse_tatfind_output(tatfind_output.split("\n"), proteins)

    return proteins
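Like most plugins in this collection, the raw service output is cached to a file ('tatfind.out' here) and simply re-parsed on later runs; passing force=True bypasses that cache. A small standalone sketch of the cache check, with the real call left commented out since it needs the twill helpers (go, formfile, submit, show) and a live web service:

import os

outfn = 'tatfind.out'
if os.path.isfile(outfn):
    # a previous run left results behind; annotate() would just re-parse these
    print("cached TatFind output found: %s" % outfn)
else:
    # annotate() would submit params['fasta'] to the web form and save the page
    print("no cache; annotate() would hit the TatFind web service")

# To force a fresh submission even when tatfind.out exists:
# proteins = annotate(params, proteins, force=True)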
Example #3
def annotate(params, proteins, \
             url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))

    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    if not force and os.path.isfile(bomp_out):
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            bomp_category = int(words[-1:][0])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__: showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__: show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__: log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            find("Not finished")
            log_stderr(".")
        except:
            # Finished ! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__: log_stderr(show())
            break

        # Not finished. We keep polling for a time until
        # we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return
        go("viewOutput?id=%i" % (job_id))
        if __DEBUG__: log_stderr(show())

    bomp_html = show()
    if __DEBUG__: log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name, category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__: log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__: log_stderr(str(proteins))

    return bomp_categories
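The bomp.out cache written above is just one tab-separated 'seqid<TAB>category' line per protein, which is what the cache-reading branch at the top of the function re-parses. A standalone sketch of that round trip; the sequence ids are made up, the file name is changed so the sketch cannot clobber a real cache, and the real code extracts ids with parse_fasta_header rather than a plain split:

# Write a cache in the same "%s\t%i\n" format annotate() uses.
bomp_categories = {'seq001': 5, 'seq002': 1}    # illustrative ids and categories
with open('bomp_example.out', 'w') as fh:
    for k, v in bomp_categories.items():
        fh.write("%s\t%i\n" % (k, v))

# Re-read it the way the cached branch does: the last whitespace-separated
# field is the integer BOMP category, the rest identifies the sequence.
reloaded = {}
for l in open('bomp_example.out'):
    words = l.split()
    reloaded[words[0]] = int(words[-1])
print(reloaded)                                 # {'seq001': 5, 'seq002': 1}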
    """
Example #4
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane beta-barrel.

    Note that the default URL we use is different from the regular form used
    by web browsers, since we need to bypass some AJAX fun.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC doesn't seem to take more than
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences."
        )
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    # TODO: Set User-Agent header for requests
    # agent("Python-urllib/%s (requests; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tmbetadisc_output(fh.read(), proteins)
        fh.close()
        return proteins

    # set the user defined method
    method_map = {
        "aa": "Amino Acid Composition",
        "dp": "Depipetide Composition",
        "aadp": "Amino Acid & Depipetide Composition",
        "pssm": "PSSM"
    }
    # default to amino acid composition unless the config asks for another method
    method = method_map['aa']
    if dict_get(params, 'tmbetadisc_rbf_method'):
        try:
            method = method_map[params['tmbetadisc_rbf_method']]
        except KeyError:
            log_stderr("# ERROR: Invalid setting for tmbetadisc_rbf_method. \
                    Must be set to aa, dp, aadp or pssm.")
            sys.exit()

    # files = {'userfile': open(params["fasta"], 'rb')}
    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)  # , files=files)

    waiting_page = response.content
    if __DEBUG__: log_stderr(waiting_page)

    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    time.sleep(5)

    output = requests.get(result_url).content

    if __DEBUG__: log_stderr(output)

    # write raw output to a file
    fh = open(outfn, 'w')
    # fh.write(waiting_page)
    # fh.write("<!-- ----------------------------------------------------------------------------------- -->")
    fh.write(output)
    fh.close()

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins
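The optional params['tmbetadisc_rbf_method'] setting maps a short code onto the form value the service expects, and anything outside the four recognised codes aborts the run. A standalone sketch of that lookup; the spellings are copied verbatim from the code above, and 'dipeptide' is a deliberately invalid code used only to show the error path:

method_map = {
    "aa": "Amino Acid Composition",
    "dp": "Depipetide Composition",             # spelling as used by the form
    "aadp": "Amino Acid & Depipetide Composition",
    "pssm": "PSSM",
}

method = method_map["aa"]                       # the composition-only method
try:
    method = method_map["dipeptide"]            # not a recognised short code
except KeyError:
    print("tmbetadisc_rbf_method must be one of: %s" % ", ".join(sorted(method_map)))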
Example #5
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/"+
                 "~sachen/OMPpredict/"+
                 "TMBETADISC-RBF-Content.html", force=False):
  """
  Interfaces with the TatFind web service at 
  (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php) 
  to predict if protein sequence is likely to be an outer membrane beta-barrel.
  
  Note that the default URL we use it different to the regular form used
  by web browsers, since we need to bypass some AJAX fun.
  """
  # TODO: automatically split large sets into multiple jobs
  #       since TMBETADISC doesn't seem to take more than
  #       ~5000 seqs at a time
  if len(proteins) >= 5000:
    log_stderr("# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
    return
  
  # set the user-agent so web services can block us if they want ... :/
  python_version = sys.version.split()[0]
  agent("Python-urllib/%s (twill; inmembrane)" % python_version)
  
  outfn = 'tmbetadisc-rbf.out'
  log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))
  
  if not force and os.path.isfile(outfn):
    log_stderr("# -> skipped: %s already exists" % outfn)
    fh = open(outfn, 'r')
    proteins = parse_tmbetadisc_output(fh.read(), proteins)
    fh.close()
    return proteins
  
  # dump extraneous output into this blackhole so we don't see it
  if not __DEBUG__: twill.set_output(StringIO.StringIO())
  
  go(url)
  if __DEBUG__: showforms()
  formfile("1", "userfile", params["fasta"])
  fv("1", "format", "file")

  # set the user defined method
  method_map = {"aa":"Amino Acid Composition",
                "dp":"Depipetide Composition",
                "aadp":"Amino Acid & Depipetide Composition",
                "pssm":"PSSM"}
  # default to amino acid composition unless the config asks for another method
  method = method_map['aa']
  if dict_get(params, 'tmbetadisc_rbf_method'):
    try:
      method = method_map[params['tmbetadisc_rbf_method']]
    except KeyError:
      log_stderr("# ERROR: Invalid setting for tmbetadisc_rbf_method. \
                    Must be set to aa, dp, aadp or pssm.")
      sys.exit()

  #fv("1", "select", "Amino Acid Composition")
  #fv("1", "select", "Depipetide Composition")
  #fv("1", "select", "Amino Acid & Depipetide Composition")
  #fv("1", "select", "PSSM")
  fv("1", "select", method)
  
  submit()
  
  waiting_page = show()
  if __DEBUG__: log_stderr(waiting_page)

  for l in waiting_page.split('\n'):
    if l.find("TMBETADISC-RBF-action.php?UniqueName=") != -1:
      result_url = l.split("'")[1]

  time.sleep(5)
  
  go(result_url)
  
  output = show()
  if __DEBUG__: log_stderr(output)
  
  # write raw output to a file
  fh = open(outfn, 'w')
  fh.write(output)
  fh.close()
  
  proteins = parse_tmbetadisc_output(output, proteins) 
  
  return proteins
Example #6
def annotate(params, proteins, \
                   url="http://psfs.cbrc.jp/tmbeta-net/", \
                   category='OM(barrel)',
                   force=False):
    """
  Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
  predict strands of outer membrane beta-barrels.
  
  By default, category='BARREL' means prediction will only be run
  on proteins in the set with this category property. To process all
  proteins, change category to None.

  These keys are added to the proteins dictionary: 
    'tmbeta_strands' - a list of lists with paired start and end 
                       residues of each predicted strand. 
                       (eg [[3,9],[14,21], ..etc ])
  """

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfile = 'tmbeta_net.out'
    log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

    tmbeta_strands = {}
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        tmbeta_strands = json.loads(fh.read())
        fh.close()
        for seqid in tmbeta_strands:
            proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]

        return tmbeta_strands

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    for seqid in proteins:

        # only run on sequences which match the category filter
        if force or \
           (category == None) or \
           (dict_get(proteins[seqid], 'category') == category):
            pass
        else:
            continue

        go(url)
        if __DEBUG__: showforms()
        fv("1", "sequence", proteins[seqid]['seq'])
        submit()
        log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n" \
                          % (seqid, proteins[seqid]['name']))
        out = show()
        time.sleep(1)

        if ("Some query is already running. Please try again." in out):
            log_stderr("# TMBETA-NET(web) error: %s" % (out))
            return {}

        # parse the web page returned, extract strand boundaries
        proteins[seqid]['tmbeta_strands'] = []
        for l in out.split('\n'):
            if __DEBUG__: log_stderr("## " + l)

            if "<BR>Segment " in l:
                i, j = l.split(":")[1].split("to")
                i = int(i.strip()[1:])
                j = int(j.strip()[1:])
                proteins[seqid]['tmbeta_strands'].append([i, j])

                if __DEBUG__:
                    log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

        tmbeta_strands[seqid] = proteins[seqid]['tmbeta_strands']

    # we store the parsed strand boundaries in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(tmbeta_strands, separators=(',', ':\n')))
    fh.close()

    return tmbeta_strands
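The docstring above describes 'tmbeta_strands' as a list of [start, end] residue pairs per protein, and that same structure is what gets cached to tmbeta_net.out as JSON. A standalone sketch of the cache format, using the same slightly unusual separators as the code:

import json

# Illustrative strand boundaries for two made-up sequence ids.
tmbeta_strands = {
    'seq001': [[3, 9], [14, 21]],
    'seq002': [],                  # no strands predicted
}

# Same separators as annotate() uses, so each key/value pair is easy to eyeball.
cached = json.dumps(tmbeta_strands, separators=(',', ':\n'))
print(cached)
assert json.loads(cached) == tmbeta_strands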
Example #7
def annotate(params, proteins, \
             url = 'http://www.cbs.dtu.dk/ws/SignalP4/SignalP4_4_0_ws0.wsdl', \
             #url = 'http://www.cbs.dtu.dk/ws/SignalP/SignalP_3_1_ws0.wsdl', \
             batchsize = 500, \
             force=False):
  if __DEBUG__:
    logging.basicConfig(level=logging.INFO)
    # soap messages (in&out) and http headers
    logging.getLogger('suds.client').setLevel(logging.DEBUG)             
      
  # grab the cached results if present
  outfile = "signalp_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    fh = open(outfile, 'r')
    annots = json.loads(fh.read())
    fh.close()
    for seqid in annots:
      proteins[seqid]['is_signalp'] = annots[seqid]['is_signalp']
      proteins[seqid]['signalp_cleave_position'] = \
        annots[seqid]['signalp_cleave_position']
    citation['name'] = annots[seqid]['program_name']
    return proteins
  
  log_stderr("# SignalP(web), %s > %s" % (params['fasta'], outfile))
  log_stderr("# SignalP(web): submitting in batches of %i sequences" % batchsize)
  
  seqids = proteins.keys()
  signalp_dict = {}
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]
    client = Client(url, cache=None)
    request=client.factory.create('runService.parameters')
    
    sys.stderr.write("# ")
    for seqid in seqid_batch:
      seq = client.factory.create('runService.parameters.sequencedata.sequence')
      seq.id = seqid
      seq.seq = proteins[seqid]['seq']
    
      # organism can be 'euk', 'gram+' or 'gram-'
      request.organism = params['signalp4_organism']
      # default for SignalP 4.0
      #request.method = 'best'
      # default for SignalP 3.1
      #request.method = 'nn+hmm'
      request.sequencedata.sequence.append(seq)
      sys.stderr.write(".")
      
    response = client.service.runService(request)

    sys.stderr.write("\n")
    
    #pollQueue
    job = client.factory.create('pollQueue.job')
    job.jobid = response.jobid
    response = client.service.pollQueue(job)
    retries = 0
    sys.stderr.write("# Waiting for SignalP(web) results ")
    while response.status != "FINISHED" and retries < 100:
      response = client.service.pollQueue(job)
      time.sleep(10 + (retries*2))
      retries += 1
      sys.stderr.write(".")
      
      # if something goes wrong, note it and skip SignalP
      # by returning
      if response.status == "REJECTED" or \
         response.status == "UNKNOWN JOBID" or \
         response.status == "QUEUE DOWN" or \
         response.status == "FAILED":
        log_stderr("SignalP(web) failed: '%s'" % (response.status))
        return proteins
        
    sys.stderr.write(" done !\n")
      
    #fetchResults
    done_job = client.factory.create('fetchResult.job')
    done_job.jobid = response.jobid
    result = client.service.fetchResult(done_job)
    #log_stderr(str(result))

    # end of signal-nn
    
    citation["name"] = result[0].method + " " + result[0].version
      
    # TODO: the better way to do this would be to save the entire SOAP
    #       response returned by client.last_received() and then parse
    #       that upon plugin invocation (above) using suds.sax
    #       This way we save everything in the analysis, not just
    #       the details we are interested in right now
    for res in result.ann:
      seqid = res.sequence.id
      proteins[seqid]['signalp_cleave_position'] = 0
      proteins[seqid]['is_signalp'] = False
      if len(res.annrecords) > 0:
        # range.end - this is the last residue of the signal peptide if
        #  there is a cleavage site
        cleavage_site = int(res.annrecords.annrecord[0].range.end)
        if cleavage_site == 1: cleavage_site = 0
        proteins[seqid]['signalp_cleave_position'] = cleavage_site
        # from 'comment', "Y" or "N noTm" or "N TM" where "Y" means signal peptide
        signal_yn = res.annrecords[0][0].comment[0]
        if signal_yn == "Y":
          proteins[seqid]['is_signalp'] = True
        else:
          proteins[seqid]['is_signalp'] = False
            
      # for caching in the outfile
      if seqid not in signalp_dict:
        signalp_dict[seqid] = {}
      signalp_dict[seqid]['is_signalp'] = proteins[seqid]['is_signalp']
      signalp_dict[seqid]['signalp_cleave_position'] = \
        proteins[seqid]['signalp_cleave_position']
      signalp_dict[seqid]['program_name'] = citation['name']
    
  # we store the minimal stuff in JSON format
  fh = open(outfile, 'w')
  fh.write(json.dumps(signalp_dict, separators=(',',':\n')))
  fh.close()    
  
  return proteins
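The submission loop above consumes proteins.keys() in slices of batchsize; the same idiom, stripped of all the SOAP calls, is shown below. Note that under Python 3 the keys() view would need an explicit list() before items can be deleted from it like this.

seqids = list(range(7))     # stand-in for the list of sequence ids
batchsize = 3
while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]
    print(seqid_batch)      # [0, 1, 2], then [3, 4, 5], then [6]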
Example #8
def annotate(params, proteins, \
             url = 'http://www.cbs.dtu.dk/ws/SignalP4/SignalP4_4_0_ws0.wsdl', \
             #url = 'http://www.cbs.dtu.dk/ws/SignalP/SignalP_3_1_ws0.wsdl', \
             batchsize = 500, \
             result_poll_retries = 100, \
             force=False):
    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in&out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "signalp_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            proteins[seqid]['is_signalp'] = annots[seqid]['is_signalp']
            proteins[seqid]['signalp_cleave_position'] = \
              annots[seqid]['signalp_cleave_position']
        citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# SignalP(web), %s > %s" % (params['fasta'], outfile))
    log_stderr("# SignalP(web): submitting in batches of %i sequences" %
               batchsize)

    seqids = proteins.keys()
    signalp_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]
        client = Client(url, cache=None)
        request = client.factory.create('runService.parameters')

        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')
            seq.id = seqid
            seq.seq = proteins[seqid]['seq']

            # organism can be 'euk', 'gram+' or 'gram-'
            request.organism = params['signalp4_organism']
            # default for SignalP 4.0
            #request.method = 'best'
            # default for SignalP 3.1
            #request.method = 'nn+hmm'
            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")

        response = client.service.runService(request)

        sys.stderr.write("\n")

        #pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for SignalP(web) results ")
        while response.status != "FINISHED" and retries < result_poll_retries:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries * 2))
            retries += 1
            sys.stderr.write(".")

            # if something goes wrong, note it and skip SignalP
            # by returning
            if response.status == "REJECTED" or \
               response.status == "UNKNOWN JOBID" or \
               response.status == "QUEUE DOWN" or \
               response.status == "FAILED":
                log_stderr("\nSignalP(web) failed: '%s'" % (response.status))
                return proteins

        if retries >= result_poll_retries:
            log_stderr(
                "\nSignalP(web) failed: result_poll_retries limit exceeded (%i)"
                % (result_poll_retries))
            return proteins

        sys.stderr.write(" done !\n")

        #fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        #log_stderr(str(result))

        # end of signal-nn

        citation["name"] = result[0].method + " " + result[0].version

        # TODO: the better way to do this would be to save the entire SOAP
        #       response returned by client.last_received() and then parse
        #       that upon plugin invocation (above) using suds.sax
        #       This way we save everything in the analysis, not just
        #       the details we are interested in right now
        for res in result.ann:
            seqid = res.sequence.id
            proteins[seqid]['signalp_cleave_position'] = 0
            proteins[seqid]['is_signalp'] = False
            if len(res.annrecords) > 0:
                # range.end - this is the last residue of the signal peptide if
                #  there is a cleavage site
                cleavage_site = int(res.annrecords.annrecord[0].range.end)
                if cleavage_site == 1: cleavage_site = 0
                proteins[seqid]['signalp_cleave_position'] = cleavage_site
                # from 'comment', "Y" or "N noTm" or "N TM" where "Y" means signal peptide
                signal_yn = res.annrecords[0][0].comment[0]
                if signal_yn == "Y":
                    proteins[seqid]['is_signalp'] = True
                else:
                    proteins[seqid]['is_signalp'] = False

            # for caching in the outfile
            if seqid not in signalp_dict:
                signalp_dict[seqid] = {}
            signalp_dict[seqid]['is_signalp'] = proteins[seqid]['is_signalp']
            signalp_dict[seqid]['signalp_cleave_position'] = \
              proteins[seqid]['signalp_cleave_position']
            signalp_dict[seqid]['program_name'] = citation['name']

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(signalp_dict, separators=(',', ':\n')))
    fh.close()

    return proteins
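With the defaults above (result_poll_retries=100 and a sleep of 10 + 2 * retries seconds per poll), the worst-case wait before a batch is abandoned works out to roughly three hours; a quick check:

result_poll_retries = 100
total_wait = sum(10 + 2 * retries for retries in range(result_poll_retries))
print(total_wait)              # 10900 seconds
print(total_wait / 3600.0)     # just over 3 hours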
Example #9
def annotate(params, proteins, \
             # url = 'http://www.cbs.dtu.dk/ws/LipoP/LipoP_1_0_ws0.wsdl', \
             # we host our own fixed version of the WSDL for the moment
             url="http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/LipoP_1_0_ws0.wsdl", \
             # url = "http://www.cbs.dtu.dk/ws/LipoP/LipoP_1_0_ws0.wsdl",
             batchsize=2000, \
             force=False):
    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in&out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "lipop_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            proteins[seqid]['is_lipop'] = annots[seqid]['is_lipop']
            proteins[seqid]['lipop_cleave_position'] = \
                annots[seqid]['lipop_cleave_position']

        citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# LipoP(web), %s > %s" % (params['fasta'], outfile))
    log_stderr(
        "# LipoP(web): submitting in batches of %i sequences" % batchsize)

    """
    # ensure schemas are correctly imported (workaround for broken schemas ..)
    from suds.xsd.doctor import ImportDoctor
    from suds.xsd.doctor import Import
    imp = Import("http://www.cbs.dtu.dk/ws/ws-common", location="http://www.cbs.dtu.dk/ws/common/ws_common_1_0b.xsd")
    imp.filter.add("http://www.cbs.dtu.dk/ws/WSLipoP_1_0_ws0")
    doctor = ImportDoctor(imp)
    client = Client(url, doctor=doctor, cache=None)
    #client = Client(url, plugins=[doctor], cache=None)
    """

    seqids = proteins.keys()
    lipop_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        client = Client(url, cache=None)

        request = client.factory.create('runService.parameters')

        # this is a horrible horrible workaround to account for the fact that
        # the lipop SOAP service returns null results if there are certain
        # non-alphanumeric characters in the sequence id provided. horrible.
        lipop_seq_id_mapping = {}
        seqcount = 0

        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')

            # workaround: removes any non-alphanumeric character (except '_') and adds
            # a unique number to the start to ensure every id is unique after mangling
            newseqid = `seqcount` + re.sub(r'[^\w]', "", seqid)
            seqcount += 1
            lipop_seq_id_mapping[newseqid] = seqid
            seq.id = newseqid
            # seq.id = seqid
            seq.seq = proteins[seqid]['seq']

            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")
        try:
            response = client.service.runService(request)
        except urllib2.URLError as e:
            log_stderr("ERROR LipoP(web) failed: '%s'" % `e.reason`)
            return proteins

        sys.stderr.write("\n")

        # pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for LipoP(web) results ")
        while response.status != "FINISHED" and retries < 12:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries ** 2))
            retries += 1
            sys.stderr.write(".")

            # if something goes wrong, note it and skip LipoP
            # by returning
            if response.status == "REJECTED" or \
                    response.status == "UNKNOWN JOBID" or \
                    response.status == "QUEUE DOWN" or \
                    response.status == "FAILED":
                log_stderr("LipoP(web) failed: '%s'" % (response.status))
                return proteins

        sys.stderr.write(" done !\n")

        # fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        if __DEBUG__: log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        # TODO: the better way to do this would be to save the entire SOAP
        #       response returned by client.last_received() and then parse
        #       that upon plugin invocation (above) using suds.sax
        #       This way we save everything in the analysis, not just
        #       the details we are interested in right now
        for res in result.ann:
            # seqid = res.sequence.id
            seqid = lipop_seq_id_mapping[res.sequence.id]
            # init as if no lipop hit, may be reset below
            proteins[seqid]['is_lipop'] = False
            proteins[seqid]['lipop_cleave_position'] = 0
            proteins[seqid]['lipop_im_retention_signal'] = False
            if len(res.annrecords) > 0:
                # range.end - this is the first residue (Cys) of the mature protein if
                #  there is a SpII cleavage site
                for annrec in res.annrecords.annrecord:
                    if annrec.feature == "CleavII":
                        proteins[seqid]['lipop_cleave_position'] = int(
                            annrec.range.begin)
                        proteins[seqid]['is_lipop'] = True

                        # check for an E.coli style inner membrane retention signal
                        # Asp+2 to cleavage site. There are other apparent retention
                        # signals in E. coli and other gram- bacteria in addition to
                        # the Asp+2 which we don't detect here (yet).
                        # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
                        plus_two = proteins[seqid]['lipop_cleave_position'] + 1
                        if proteins[seqid]['seq'][plus_two] == 'D':
                            proteins[seqid]['lipop_im_retention_signal'] = True

            # for caching in the outfile
            if seqid not in lipop_dict:
                lipop_dict[seqid] = {}
            lipop_dict[seqid]['is_lipop'] = proteins[seqid]['is_lipop']
            lipop_dict[seqid]['lipop_cleave_position'] = \
                proteins[seqid]['lipop_cleave_position']
            lipop_dict[seqid]['lipop_im_retention_signal'] = \
                proteins[seqid]['lipop_im_retention_signal']
            lipop_dict[seqid]['program_name'] = citation['name']

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(lipop_dict, separators=(',', ':\n')))
    fh.close()

    return proteins
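The sequence-id mangling workaround above boils down to stripping anything that is not a word character and prefixing a running counter so the mangled ids stay unique, while a reverse map recovers the original id from the SOAP response. A standalone version of just that step, with str() standing in for the Python 2 backquote repr used above and made-up sequence ids:

import re

seqids = ['sp|P12345|ABC_ECOLI', 'gi|107836852|gb|ABF84721.1']
lipop_seq_id_mapping = {}
for seqcount, seqid in enumerate(seqids):
    # remove any non-alphanumeric character (except '_') and prepend a
    # unique number so every id is still unique after mangling
    newseqid = str(seqcount) + re.sub(r'[^\w]', "", seqid)
    lipop_seq_id_mapping[newseqid] = seqid

# maps mangled ids back to the originals,
# e.g. '0spP12345ABC_ECOLI' -> 'sp|P12345|ABC_ECOLI'
print(lipop_seq_id_mapping)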
Example #10
def annotate(params, proteins, \
             force=False):
    """
    DEPRECATED: The TMB-HUNT server appears to be permanently offline.

    Uses the TMB-HUNT web service
    (http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi) to
    predict if proteins are outer membrane beta-barrels.

    NOTE: In my limited testing, TMB-HUNT tends to perform very poorly in
          terms of false positives and false negatives. I'd suggest using only
          BOMP.
    """
    # TODO: automatically split large sets into multiple jobs
    #       TMB-HUNT will only take 10000 seqs at a time
    if len(proteins) >= 10000:
        log_stderr(
            "# ERROR: TMB-HUNT(web): can't take more than 10,000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    out = 'tmbhunt.out'
    log_stderr("# TMB-HUNT(web) %s > %s" % (params['fasta'], out))

    if not force and os.path.isfile(out):
        log_stderr("# -> skipped: %s already exists" % out)
        return parse_tmbhunt(proteins, out)

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi"
       )
    if __DEBUG__: showforms()

    # read up the FASTA format seqs
    fh = open(params['fasta'], 'r')
    fasta_seqs = fh.read()
    fh.close()

    # fill out the form
    fv("1", "sequences", fasta_seqs)

    submit()
    if __DEBUG__: showlinks()

    # small jobs will lead us straight to the results, big jobs
    # go via a 'waiting' page which we skip past if we get it
    job_id = None
    try:
        # we see this with big jobs
        result_table_url = follow(
            "http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output.*.html"
        )
        job_id = result_table_url.split('tmp_output')[-1:][0].split('.')[0]
    except:
        # small jobs take us straight to the html results table
        pass

    # parse the job_id from the url, since due to a bug in
    # TMB-HUNT the link on the results page from large jobs is wrong
    if not job_id:
        job_id = follow("Full results").split('/')[-1:][0].split('.')[0]
    log_stderr(
        "# TMB-HUNT(web) job_id is: %s <http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output%s.html>"
        % (job_id, job_id))

    # polling until TMB-HUNT finishes
    # TMB-HUNT advises that 4000 sequences take ~10 mins
    # we poll a little faster than that
    polltime = (len(proteins) * 0.1) + 2
    while True:
        log_stderr("# TMB-HUNT(web): waiting another %i sec ..." % (polltime))
        time.sleep(polltime)
        try:
            go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/%s.txt"
               % (job_id))
            break
        except:
            polltime = polltime * 2

        if polltime >= 7200:  # 2 hours
            log_stderr("# TMB-HUNT error: Taking too long.")
            return

    txt_out = show()

    # write raw TMB-HUNT results
    fh = open(out, 'w')
    fh.write(txt_out)
    fh.close()

    return parse_tmbhunt(proteins, out)
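The polling schedule above starts at len(proteins) * 0.1 + 2 seconds and doubles each time the results file is not there yet, giving up once the next wait would pass two hours. For the ~4000-sequence case mentioned in the comments that schedule looks like this:

n_proteins = 4000                   # the size the comments use as a reference
polltime = (n_proteins * 0.1) + 2   # 402 seconds for the first wait
schedule = []
while polltime < 7200:              # the 2-hour cutoff used above
    schedule.append(polltime)
    polltime = polltime * 2
print(schedule)                     # [402.0, 804.0, 1608.0, 3216.0, 6432.0]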
Example #11
def annotate(params, proteins, \
             batchsize=500, \
             force=False):
    """
  This plugin inferfaces with the TMHMM web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/nph-webface"

    # grab the cached results if present
    outfile = "tmhmm_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        #soup = BeautifulSoup(resultpage)
        proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        # get batch of sequences in fasta format with munged ids
        # (workaround for potential tmhmm sequence id munging)
        safe_fasta = proteins_to_fasta(proteins,
                                       seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a certain
        # order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"), ("SEQ", ""),
            ("outform", "-noplot")
        ])

        #files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'seqfile': StringIO(safe_fasta)}

        log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {
            "User-Agent":
            "python-requests/%s (inmembrane/%s)" %
            (requests.__version__, inmembrane.__version__)
        }
        r_post = requests.post(url, data=payload, files=files, headers=headers)

        # HACK: the initial POST throws us a 302 redirect and we grab the redirect url from the text
        #       (... not sure why the requests allow_redirects=True option doesn't handle this transparently)
        pollingurl = r_post.url + r_post.text.split("Location: ")[1]
        r = requests.get(pollingurl)

        if __DEBUG__:
            log_stderr(r.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r.text:
            sys.stderr.write(r.text)
            sys.exit()

        # sometimes we get a polling page, other times the result page is sent immediately.
        if ("<title>Job status of" in r.text):
            r = r.text.replace("<noscript>", "").replace("</noscript", "")
            soup = BeautifulSoup(r)
            resultlink = soup.findAll('a')[0]['href']
            if __DEBUG__:
                log_stderr(resultlink)

            # try grabbing the result, then keep polling until they are ready
            sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
            time.sleep(len(proteins) / 500)
            resultpage = requests.get(resultlink).text
            retries = 0
            while ("<title>Job status of" in resultpage) and retries < 10:
                sys.stderr.write(".")
                time.sleep(len(proteins) / 100 + retries**2)
                resultpage = requests.get(resultlink).text
                retries += 1
        else:
            resultpage = r.text

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)

        allresultpages += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
    return proteins
Example #12
def annotate(params, proteins, \
             #url = 'http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws0.wsdl', \
             #url = 'http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws1.wsdl', \
             url = "http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/TMHMM_2_0_ws0.wsdl", \
             batchsize = 2000, \
             force=False):
  mapping = {'TMhelix':'tmhmm_helices',\
              'outside':'tmhmm_outer_loops',\
               'inside':'tmhmm_inner_loops',\
  }
  
  if __DEBUG__:
    logging.basicConfig(level=logging.INFO)
    # soap messages (in&out) and http headers
    logging.getLogger('suds.client').setLevel(logging.DEBUG)             
      
  # grab the cached results if present
  outfile = "tmhmm_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    fh = open(outfile, 'r')
    annots = json.loads(fh.read())
    fh.close()
    for seqid in annots:
      for k in mapping.values():
        proteins[seqid][k] = annots[seqid][k]

    citation['name'] = annots[seqid]['program_name']
    return proteins
  
  log_stderr("# TMHMM(web), %s > %s" % (params['fasta'], outfile))
  log_stderr("# TMHMM(web): submitting in batches of %i sequences" % batchsize)
  
  seqids = proteins.keys()
  tmhmm_dict = {}
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]
    client = Client(url, cache=None)
    request=client.factory.create('runService.parameters')
    
    # this is a horrible horrible workaround to account for the fact that
    # the lipop SOAP service returns null results if there are certain
    # non-alphanumeric characters in the sequence id provided. horrible.
    tmhmm_seq_id_mapping = {}
    seqcount = 0
    
    sys.stderr.write("# ")
    for seqid in seqid_batch:
      seq = client.factory.create('runService.parameters.sequencedata.sequence')

      # workaround: removes any non-alphanumeric character (except '_') and adds
      # a unique number to the start to ensure every id is unique after mangling
      newseqid = `seqcount`+re.sub(r'[^\w]', "", seqid)
      seqcount += 1
      tmhmm_seq_id_mapping[newseqid] = seqid
      #seq.id = seqid
      seq.id = newseqid

      seq.seq = proteins[seqid]['seq']
      request.sequencedata.sequence.append(seq)
      sys.stderr.write(".")
      
    response = client.service.runService(request)

    sys.stderr.write("\n")
    
    #pollQueue
    job = client.factory.create('pollQueue.job')
    job.jobid = response.jobid
    response = client.service.pollQueue(job)
    retries = 0
    sys.stderr.write("# Waiting for TMHMM(web) results ")
    while response.status != "FINISHED" and retries < 100:
      response = client.service.pollQueue(job)
      time.sleep(10 + (retries*2))
      retries += 1
      sys.stderr.write(".")
      
      # if something goes wrong, note it and skip TMHMM
      # by returning
      if response.status == "REJECTED" or \
         response.status == "UNKNOWN JOBID" or \
         response.status == "QUEUE DOWN" or \
         response.status == "FAILED":
        log_stderr("TMHMM(web) failed: '%s'" % (response.status))
        return proteins
        
    sys.stderr.write(" done !\n")
      
    #fetchResults
    done_job = client.factory.create('fetchResult.job')
    done_job.jobid = response.jobid
    result = client.service.fetchResult(done_job)
    
    if __DEBUG__: log_stderr(str(result))

    citation["name"] = result[0].method + " " + result[0].version
      
    for res in result.ann:
      #seqid = res.sequence.id
      seqid = tmhmm_seq_id_mapping[res.sequence.id]
      if 'tmhmm_helices' not in proteins[seqid]:
        proteins[seqid].update({
          'tmhmm_helices':[],
          'tmhmm_inner_loops':[],
          'tmhmm_outer_loops':[]
        })
      if len(res.annrecords) > 0:
        for segment in res.annrecords.annrecord:
          if segment.comment in mapping:
            tmhmmkey = mapping[segment.comment]
            proteins[seqid][tmhmmkey].append(\
                           (segment.range.begin, segment.range.end))
                           
      # for caching in the outfile
      if seqid not in tmhmm_dict:
        tmhmm_dict[seqid] = {}

      # extract a copy of results from proteins dictionary
      # ready to be written to the cache file
      for k in mapping.values():
        tmhmm_dict[seqid][k] = proteins[seqid][k]

      tmhmm_dict[seqid]['program_name'] = citation['name']
   
  if __DEBUG__: print_proteins(proteins)
  # we store the minimal stuff in JSON format
  fh = open(outfile, 'w')
  fh.write(json.dumps(tmhmm_dict, separators=(',',':\n')))
  fh.close()    
  
  return proteins
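The mapping dict at the top of this plugin is what turns raw annrecord comments into the three per-protein lists, so after a successful run each protein carries (begin, end) residue pairs under those keys. An illustrative sketch of the resulting structure and of what ends up in the tmhmm_web.out cache (the ranges are made up):

mapping = {'TMhelix': 'tmhmm_helices',
           'outside': 'tmhmm_outer_loops',
           'inside': 'tmhmm_inner_loops'}

# Illustrative annotations for one protein, in the shape built by the loop above.
protein = {
    'tmhmm_helices': [(7, 29), (41, 63)],
    'tmhmm_outer_loops': [(30, 40)],
    'tmhmm_inner_loops': [(1, 6), (64, 120)],
}

# The cache keeps exactly the keys listed in mapping.values(), plus program_name.
cached_entry = dict((k, protein[k]) for k in mapping.values())
print(cached_entry)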
Example #13
def annotate(params, proteins, \
             force=False):
  """
  Uses the TMB-HUNT web service 
  (http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi) to
  predict if proteins are outer membrane beta-barrels.
  
  NOTE: In my limited testing, TMB-HUNT tends to perform very poorly in
        terms of false positives and false negatives. I'd suggest using only
        BOMP.
  """
  # TODO: automatically split large sets into multiple jobs
  #       TMB-HUNT will only take 10000 seqs at a time
  if len(proteins) >= 10000:
    log_stderr("# ERROR: TMB-HUNT(web): can't take more than 10,000 sequences.")
    return
  
  # set the user-agent so web services can block us if they want ... :/
  python_version = sys.version.split()[0]
  agent("Python-urllib/%s (twill; inmembrane)" % python_version)
  
  out = 'tmbhunt.out'
  log_stderr("# TMB-HUNT(web) %s > %s" % (params['fasta'], out))
  
  if not force and os.path.isfile(out):
    log_stderr("# -> skipped: %s already exists" % out)
    return parse_tmbhunt(proteins, out)
  
  # dump extraneous output into this blackhole so we don't see it
  if not __DEBUG__: twill.set_output(StringIO.StringIO())
  
  go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/aaTMB_Hunt.cgi")
  if __DEBUG__: showforms()

  # read up the FASTA format seqs
  fh = open(params['fasta'], 'r')
  fasta_seqs = fh.read()
  fh.close()
  
  # fill out the form
  fv("1", "sequences", fasta_seqs)

  submit()
  if __DEBUG__: showlinks()

  # small jobs will lead us straight to the results, big jobs
  # go via a 'waiting' page which we skip past if we get it
  job_id = None
  try:
    # we see this with big jobs
    result_table_url = follow("http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output.*.html")
    job_id = result_table_url.split('tmp_output')[-1:][0].split('.')[0]
  except:
    # small jobs take us straight to the html results table
    pass

  # parse the job_id from the url, since due to a bug in
  # TMB-HUNT the link on the results page from large jobs is wrong
  if not job_id: job_id = follow("Full results").split('/')[-1:][0].split('.')[0]
  log_stderr("# TMB-HUNT(web) job_id is: %s <http://www.bioinformatics.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/tmp_output%s.html>" % (job_id, job_id))
  
  # polling until TMB-HUNT finishes
  # TMB-HUNT advises that 4000 sequences take ~10 mins
  # we poll a little faster than that
  polltime = (len(proteins)*0.1)+2
  while True:
    log_stderr("# TMB-HUNT(web): waiting another %i sec ..." % (polltime))
    time.sleep(polltime)
    try:
      go("http://bmbpcu36.leeds.ac.uk/~andy/betaBarrel/AACompPred/tmp/%s.txt" % (job_id))
      break
    except:
      polltime = polltime * 2
      
    if polltime >= 7200: # 2 hours
      log_stderr("# TMB-HUNT error: Taking too long.")
      return
    
  txt_out = show()
  
  # write raw TMB-HUNT results
  fh = open(out, 'w')
  fh.write(txt_out)
  fh.close()
  
  return parse_tmbhunt(proteins, out)
Example #14
def annotate(params, proteins, \
                   url="http://psfs.cbrc.jp/tmbeta-net/", \
                   category='OM(barrel)',
                   force=False):
  """
  Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
  predict strands of outer membrane beta-barrels.
  
  By default, category='OM(barrel)' means prediction will only be run
  on proteins in the set with this category property. To process all
  proteins, change category to None.

  These keys are added to the proteins dictionary: 
    'tmbeta_strands' - a list of lists with paired start and end 
                       residues of each predicted strand. 
                       (eg [[3,9],[14,21], ..etc ])
  """

  # set the user-agent so web services can block us if they want ... :/
  python_version = sys.version.split()[0]
  agent("Python-urllib/%s (twill; inmembrane)" % python_version)
  
  outfile = 'tmbeta_net.out'
  log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))
  
  tmbeta_strands = {}
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    fh = open(outfile, 'r')
    tmbeta_strands = json.loads(fh.read())
    fh.close()    
    for seqid in tmbeta_strands:
      proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]
      
    return tmbeta_strands

  # dump extraneous output into this blackhole so we don't see it
  if not __DEBUG__: twill.set_output(StringIO.StringIO())

  for seqid in proteins:
    
    # only run on sequences which match the category filter
    if force or \
       (category == None) or \
       (dict_get(proteins[seqid], 'category') == category):
      pass
    else:
      continue
      
    go(url)
    if __DEBUG__: showforms()
    fv("1","sequence",proteins[seqid]['seq'])
    submit()
    log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n" \
                      % (seqid, proteins[seqid]['name']))
    out = show()
    time.sleep(1)

    if ("Some query is already running. Please try again." in out):
      log_stderr("# TMBETA-NET(web) error: %s" % (out))
      return {}

    # parse the web page returned, extract strand boundaries
    proteins[seqid]['tmbeta_strands'] = []
    for l in out.split('\n'):
      if __DEBUG__: log_stderr("## " + l)

      if "<BR>Segment " in l:
        i,j = l.split(":")[1].split("to")
        i = int(i.strip()[1:])
        j = int(j.strip()[1:])
        proteins[seqid]['tmbeta_strands'].append([i,j])

        if __DEBUG__: log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

    tmbeta_strands[seqid] = proteins[seqid]['tmbeta_strands']

  # we store the parsed strand boundaries in JSON format
  fh = open(outfile, 'w')
  fh.write(json.dumps(tmbeta_strands, separators=(',',':\n')))
  fh.close()

  return tmbeta_strands
Example #15
def annotate(params, proteins, \
             # url = 'http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws0.wsdl', \
             # url = 'http://www.cbs.dtu.dk/ws/TMHMM/TMHMM_2_0_ws1.wsdl', \
             url="http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/TMHMM_2_0_ws0.wsdl", \
             batchsize=2000, \
             force=False):
    mapping = {'TMhelix': 'tmhmm_helices', \
               'outside': 'tmhmm_outer_loops', \
               'inside': 'tmhmm_inner_loops', \
               }

    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in&out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "tmhmm_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            for k in mapping.values():
                proteins[seqid][k] = annots[seqid][k]

        citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# TMHMM(web), %s > %s" % (params['fasta'], outfile))
    log_stderr("# TMHMM(web): submitting in batches of %i sequences" %
               batchsize)

    seqids = proteins.keys()
    tmhmm_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]
        client = Client(url, cache=None)
        request = client.factory.create('runService.parameters')

        # this is a horrible horrible workaround to account for the fact that
        # the lipop SOAP service returns null results if there are certain
        # non-alphanumeric characters in the sequence id provided. horrible.
        tmhmm_seq_id_mapping = {}
        seqcount = 0

        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')

            # workaround: removes any non-alphanumeric character (except '_') and adds
            # a unique number to the start to ensure every id is unique after mangling
            newseqid = `seqcount` + re.sub(r'[^\w]', "", seqid)
            seqcount += 1
            tmhmm_seq_id_mapping[newseqid] = seqid
            # seq.id = seqid
            seq.id = newseqid

            seq.seq = proteins[seqid]['seq']
            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")

        response = client.service.runService(request)

        sys.stderr.write("\n")

        # pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for TMHMM(web) results ")
        while response.status != "FINISHED" and retries < 100:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries * 2))
            retries += 1
            sys.stderr.write(".")

            # if something goes wrong, note it and skip TMHMM
            # by returning
            if response.status == "REJECTED" or \
                    response.status == "UNKNOWN JOBID" or \
                    response.status == "QUEUE DOWN" or \
                    response.status == "FAILED":
                log_stderr("TMHMM(web) failed: '%s'" % (response.status))
                return proteins

        sys.stderr.write(" done !\n")

        # fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)

        if __DEBUG__: log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        for res in result.ann:
            # seqid = res.sequence.id
            seqid = tmhmm_seq_id_mapping[res.sequence.id]
            if 'tmhmm_helices' not in proteins[seqid]:
                proteins[seqid].update({
                    'tmhmm_helices': [],
                    'tmhmm_inner_loops': [],
                    'tmhmm_outer_loops': []
                })
            if len(res.annrecords) > 0:
                for segment in res.annrecords.annrecord:
                    if segment.comment in mapping:
                        tmhmmkey = mapping[segment.comment]
                        proteins[seqid][tmhmmkey].append( \
                            (segment.range.begin, segment.range.end))

            # for caching in the outfile
            if seqid not in tmhmm_dict:
                tmhmm_dict[seqid] = {}

            # extract a copy of the results from the proteins dictionary,
            # ready to be written to the cache file
            for k in mapping.values():
                tmhmm_dict[seqid][k] = proteins[seqid][k]

            tmhmm_dict[seqid]['program_name'] = citation['name']

    if __DEBUG__: print_proteins(proteins)
    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(tmhmm_dict, separators=(',', ':\n')))
    fh.close()

    return proteins
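
The sequence-id mangling workaround above appears in both the TMHMM and LipoP SOAP plugins. As an illustrative sketch only (the helper name is not from the source), the same idea can be isolated into a small reversible helper:

import re

def make_safe_seqids(seqids):
    """Return (mapping, safe_ids): each safe id contains only word
    characters plus a counter prefix so it stays unique after mangling;
    mapping maps the safe ids back to the original ids."""
    mapping = {}
    safe_ids = []
    for count, seqid in enumerate(seqids):
        safe_id = str(count) + re.sub(r'[^\w]', "", seqid)
        mapping[safe_id] = seqid
        safe_ids.append(safe_id)
    return mapping, safe_ids

# usage: submit the safe ids to the service, then map results back
# mapping, safe_ids = make_safe_seqids(proteins.keys())
# original_id = mapping[safe_ids[0]]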
Ejemplo n.º 18
0
def annotate(params, proteins, \
             url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))

    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    if not force and os.path.isfile(bomp_out):
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            bomp_category = int(words[-1:][0])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    go(url)
    if __DEBUG__: showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__: show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__: log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            find("Not finished")
            log_stderr(".")
        except:
            # Finished ! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__: log_stderr(show())
            break

        # Not finished. We keep polling for a time until
        # we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return
        go("viewOutput?id=%i" % (job_id))
        if __DEBUG__: log_stderr(show())

    bomp_html = show()
    if __DEBUG__: log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name, category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__: log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__: log_stderr(str(proteins))

    return bomp_categories
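
Downstream code can filter on the categories returned above. A small sketch, assuming BOMP's integer categories run from 1 (weakest) to 5 (strongest); the threshold used here is only an example:

def likely_barrels(bomp_categories, min_category=3):
    """Return the seqids whose BOMP category meets the threshold
    (assumed 1-5 scale, higher = stronger beta-barrel prediction)."""
    return [seqid for seqid, category in bomp_categories.items()
            if category >= min_category]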

    """
Ejemplo n.º 19
0
def annotate(params, proteins, \
             # url = 'http://www.cbs.dtu.dk/ws/LipoP/LipoP_1_0_ws0.wsdl', \
             # we host our own fixed version of the WSDL for the moment
             url="http://raw.github.com/boscoh/inmembrane/master/inmembrane/plugins/extra/LipoP_1_0_ws0.wsdl", \
             # url = "http://www.cbs.dtu.dk/ws/LipoP/LipoP_1_0_ws0.wsdl",
             batchsize=2000, \
             force=False):
    if __DEBUG__:
        logging.basicConfig(level=logging.INFO)
        # soap messages (in&out) and http headers
        logging.getLogger('suds.client').setLevel(logging.DEBUG)

    # grab the cached results if present
    outfile = "lipop_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        fh = open(outfile, 'r')
        annots = json.loads(fh.read())
        fh.close()
        for seqid in annots:
            proteins[seqid]['is_lipop'] = annots[seqid]['is_lipop']
            proteins[seqid]['lipop_cleave_position'] = \
              annots[seqid]['lipop_cleave_position']

        citation['name'] = annots[seqid]['program_name']
        return proteins

    log_stderr("# LipoP(web), %s > %s" % (params['fasta'], outfile))
    log_stderr("# LipoP(web): submitting in batches of %i sequences" %
               batchsize)
    """
  # ensure schemas are correctly imported (workaround for broken schemas ..)
  from suds.xsd.doctor import ImportDoctor
  from suds.xsd.doctor import Import
  imp = Import("http://www.cbs.dtu.dk/ws/ws-common", location="http://www.cbs.dtu.dk/ws/common/ws_common_1_0b.xsd")
  imp.filter.add("http://www.cbs.dtu.dk/ws/WSLipoP_1_0_ws0")
  doctor = ImportDoctor(imp)
  client = Client(url, doctor=doctor, cache=None)
  #client = Client(url, plugins=[doctor], cache=None)
  """

    seqids = proteins.keys()
    lipop_dict = {}
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        client = Client(url, cache=None)

        request = client.factory.create('runService.parameters')

        # this is a horrible horrible workaround to account for the fact that
        # the lipop SOAP service returns null results if there are certain
        # non-alphanumeric characters in the sequence id provided. horrible.
        lipop_seq_id_mapping = {}
        seqcount = 0

        sys.stderr.write("# ")
        for seqid in seqid_batch:
            seq = client.factory.create(
                'runService.parameters.sequencedata.sequence')

            # workaround: removes any non-alphanumeric character (except '_') and adds
            # a unique number to the start to ensure every id is unique after mangling
            newseqid = str(seqcount) + re.sub(r'[^\w]', "", seqid)
            seqcount += 1
            lipop_seq_id_mapping[newseqid] = seqid
            seq.id = newseqid
            #seq.id = seqid
            seq.seq = proteins[seqid]['seq']

            request.sequencedata.sequence.append(seq)
            sys.stderr.write(".")
        try:
            response = client.service.runService(request)
        except urllib2.URLError as e:
            log_stderr("ERROR LipoP(web) failed: '%s'" % ` e.reason `)
            return proteins

        sys.stderr.write("\n")

        #pollQueue
        job = client.factory.create('pollQueue.job')
        job.jobid = response.jobid
        response = client.service.pollQueue(job)
        retries = 0
        sys.stderr.write("# Waiting for LipoP(web) results ")
        while response.status != "FINISHED" and retries < 12:
            response = client.service.pollQueue(job)
            time.sleep(10 + (retries**2))
            retries += 1
            sys.stderr.write(".")

            # if something goes wrong, note it and skip LipoP
            # by returning
            if response.status == "REJECTED" or \
               response.status == "UNKNOWN JOBID" or \
               response.status == "QUEUE DOWN" or \
               response.status == "FAILED":
                log_stderr("LipoP(web) failed: '%s'" % (response.status))
                return proteins

        sys.stderr.write(" done !\n")

        #fetchResults
        done_job = client.factory.create('fetchResult.job')
        done_job.jobid = response.jobid
        result = client.service.fetchResult(done_job)
        if __DEBUG__: log_stderr(str(result))

        citation["name"] = result[0].method + " " + result[0].version

        # TODO: the better way to do this would be to save the entire SOAP
        #       response returned by client.last_received() and then parse
        #       that upon plugin invocation (above) using suds.sax
        #       This way we save everything in the analysis, not just
        #       the details we are interested in right now
        for res in result.ann:
            #seqid = res.sequence.id
            seqid = lipop_seq_id_mapping[res.sequence.id]
            # init as if no lipop hit, may be reset below
            proteins[seqid]['is_lipop'] = False
            proteins[seqid]['lipop_cleave_position'] = 0
            proteins[seqid]['lipop_im_retention_signal'] = False
            if len(res.annrecords) > 0:
                # range.end - this is the first residue (Cys) of the mature protein if
                #  there is a SpII cleavage site
                for annrec in res.annrecords.annrecord:
                    if annrec.feature == "CleavII":
                        proteins[seqid]['lipop_cleave_position'] = int(
                            annrec.range.begin)
                        proteins[seqid]['is_lipop'] = True

                        # check for an E. coli style inner membrane retention signal:
                        # an Asp at position +2 of the mature protein. There are other
                        # apparent retention signals in E. coli and other gram-negative
                        # bacteria in addition to Asp+2 which we don't detect here (yet).
                        # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
                        plus_two = proteins[seqid]['lipop_cleave_position'] + 1
                        if proteins[seqid]['seq'][plus_two] == 'D':
                            proteins[seqid]['lipop_im_retention_signal'] = True

            # for caching in the outfile
            if seqid not in lipop_dict:
                lipop_dict[seqid] = {}
            lipop_dict[seqid]['is_lipop'] = proteins[seqid]['is_lipop']
            lipop_dict[seqid]['lipop_cleave_position'] = \
              proteins[seqid]['lipop_cleave_position']
            lipop_dict[seqid]['lipop_im_retention_signal'] = \
              proteins[seqid]['lipop_im_retention_signal']
            lipop_dict[seqid]['program_name'] = citation['name']

    # we store the minimal stuff in JSON format
    fh = open(outfile, 'w')
    fh.write(json.dumps(lipop_dict, separators=(',', ':\n')))
    fh.close()

    return proteins
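
The Asp+2 retention-signal test above can be expressed as a small standalone helper. This is an illustrative sketch only (with an added bounds check); it uses the same index arithmetic as the plugin:

def has_im_retention_signal(seq, lipop_cleave_position):
    """True if the +2 residue of the mature lipoprotein is Asp (D),
    given the cleavage position reported by LipoP."""
    plus_two = lipop_cleave_position + 1
    return plus_two < len(seq) and seq[plus_two] == 'D'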
Ejemplo n.º 20
0
def annotate(params, proteins, batchsize=2000, force=False):
    """
    This plugin interfaces with the SignalP web interface (for humans) and
    scrapes the results. There once was a SOAP service but it was discontinued,
    so now we use this.
    """

    baseurl = "http://www.cbs.dtu.dk"
    url = baseurl + "/cgi-bin/webface2.fcgi"

    # grab the cached results if present
    outfile = "signalp_scrape_web.out"
    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        proteins, id_mapping = generate_safe_seqids(proteins)
        fh = open(outfile, 'r')
        resultpage = fh.read()
        fh.close()
        # soup = BeautifulSoup(resultpage)
        proteins = parse_signalp(resultpage.splitlines(),
                                 proteins, id_mapping=id_mapping)
        return proteins

    proteins, id_mapping = generate_safe_seqids(proteins)

    seqids = proteins.keys()
    allresultpages = ""
    while seqids:
        seqid_batch = seqids[0:batchsize]
        del seqids[0:batchsize]

        safe_fasta = proteins_to_fasta(proteins,
                                       seqids=seqid_batch,
                                       use_safe_seqid=True)

        # we use an OrderedDict rather than a normal dictionary to work around
        # some quirks in the CBS CGI (the server expects parameters in a certain
        # order in the HTTP headers).
        payload = OrderedDict([
            ('configfile',
             "/usr/opt/www/pub/CBS/services/SignalP-4.1/SignalP.cf"),
            ("SEQPASTE", ""),
            ("orgtype", params['signalp4_organism']),  # gram+, gram-, euk
            ("Dcut-type", "default"),
            ("method", "best"),  # best, notm
            ("minlen", ""),
            ("trunc", ""),
            ("format", "short")])  # summary, short, long, all

        # files = {'seqfile': open(params['fasta'], 'rb')}
        files = {'SEQSUB': StringIO(safe_fasta)}

        log_stderr(
            "# SignalP(scrape_web), %s > %s" % (params['fasta'], outfile))

        headers = {"User-Agent":
                       "python-requests/%s (inmembrane/%s)" %
                       (requests.__version__, inmembrane.__version__)}
        r_post = requests.post(url, data=payload, files=files, headers=headers)

        if __DEBUG__:
            log_stderr(r_post.text)
            # Example:
            #
            # <HTML>
            # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
            # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
            #
            # <script LANGUAGE="JavaScript"><!--
            # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
            # //--></script>
            # </HTML>

        # extract the result URL (or die if job is rejected ...)
        if "Job rejected" in r_post.text:
            log_stderr(r_post.text)
            sys.exit()

        r_post_clean = r_post.text.replace("<noscript>", "").replace(
            "</noscript", "")
        soup = BeautifulSoup(r_post_clean)
        pollingurl = soup.findAll('a')[0]['href']
        sys.stderr.write("# Fetching from: " + pollingurl + "\n");
        # try grabbing the result, then keep polling until they are ready
        sys.stderr.write("# Waiting for SignalP(scrape_web) results ")
        waittime = 1.0
        time.sleep(waittime)  # (len(proteins)/500)
        resultpage = requests.get(pollingurl).text
        retries = 0
        while (("<title>Job status of" in resultpage) and (retries < 15)):
            sys.stderr.write(".")
            time.sleep(waittime)  # (len(proteins)/500)
            resultpage = requests.get(pollingurl).text
            waittime += 1
            retries += 1
            waittime = min(waittime, 20)

        sys.stderr.write(" .. done !\n")

        if __DEBUG__:
            log_stderr(resultpage)
            # Example:
            #
            #   <pre>
            # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33146.1	LipoP1.0:Best	CYT	1	1	-0.200913
            # <P>
            # <hr>
            # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
            # # Cut-off=-3
            # lcl_AE004092.1_cdsid_AAK33147.1	LipoP1.0:Best	CYT	1	1	-0.200913
            # <P>
            # <hr>

        allresultpages += html2text(
            resultpage)  # += clean_result_page(resultpage)

    # we store the cleaned up result pages concatenated together
    fh = open(outfile, 'a+')
    fh.write(allresultpages)
    fh.close()

    proteins = parse_signalp(allresultpages.splitlines(), proteins,
                             id_mapping=id_mapping)
    return proteins
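
The actual result parsing is done by parse_signalp(), which is not shown here. Purely as a hypothetical sketch, a minimal parser for SignalP 4.1 "short" output might look like the following. It assumes the usual column layout (name, Cmax, pos, Ymax, pos, Smax, pos, Smean, D, ?, Dmaxcut, Networks-used), that id_mapping maps the munged ids back to the original seqids, and the annotation keys are illustrative:

def parse_signalp_short(lines, proteins, id_mapping):
    """Hypothetical minimal parser for SignalP 4.1 'short' output."""
    for l in lines:
        words = l.split()
        # skip comments, blank lines and anything that isn't a data row
        if not words or words[0].startswith("#") or len(words) < 12:
            continue
        seqid = id_mapping.get(words[0])
        if seqid is None:
            continue
        proteins[seqid]['signalp_dscore'] = float(words[8])
        proteins[seqid]['is_signalp'] = (words[9] == 'Y')
    return proteins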
Ejemplo n.º 22
0
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if a protein sequence is likely to be an outer membrane beta-barrel.

    Note that the default URL we use is different from the regular form used
    by web browsers, since we need to bypass some AJAX fun.
    """
    # TODO: automatically split large sets into multiple jobs,
    #       since TMBETADISC seems unable to handle more than
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    # TODO: Set User-Agent header for requests
    # agent("Python-urllib/%s (requests; inmembrane)" % python_version)

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        fh = open(outfn, 'r')
        proteins = parse_tmbetadisc_output(fh.read(), proteins)
        fh.close()
        return proteins

    # set the user defined method
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    if dict_get(params, 'tmbetadisc_rbf_method'):
        try:
            method = method_map[params['tmbetadisc_rbf_method']]
        except KeyError:
            log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. \
                    Must be set to aa, dp, aadp or pssm.")
            sys.exit()

    # files = {'userfile': open(params["fasta"], 'rb')}
    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)  # , files=files)

    waiting_page = response.content
    if __DEBUG__: log_stderr(waiting_page)

    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    time.sleep(5)

    output = requests.get(result_url).content

    if __DEBUG__: log_stderr(output)

    # write raw output to a file
    fh = open(outfn, 'w')
    # fh.write(waiting_page)
    # fh.write("<!-- ----------------------------------------------------------------------------------- -->")
    fh.write(output)
    fh.close()

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins
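
One fragile spot above is the result-URL extraction: if the expected link is missing from the waiting page, result_url is never assigned and the later requests.get() raises a NameError. A slightly more defensive sketch of the same extraction (illustrative only, not from the source):

def extract_result_url(waiting_page):
    """Find the TMBETADISC-RBF result URL in the waiting page, or fail
    with a clear error instead of an unbound-variable NameError."""
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            return l.split("'")[1]
    raise ValueError("TMBETADISC-RBF(web): result URL not found in response")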
Ejemplo n.º 23
0
def annotate(params, proteins, \
             batchsize=500, \
             force=False):
  """
  This plugin interfaces with the TMHMM web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

  baseurl = "http://www.cbs.dtu.dk"
  url = baseurl + "/cgi-bin/nph-webface"

  # grab the cached results if present
  outfile = "tmhmm_scrape_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    proteins, id_mapping = generate_safe_seqids(proteins)
    fh = open(outfile, 'r')
    resultpage = fh.read()
    fh.close()
    #soup = BeautifulSoup(resultpage)
    proteins = parse_tmhmm(resultpage, proteins, id_mapping=id_mapping)
    return proteins

  proteins, id_mapping = generate_safe_seqids(proteins)

  seqids = proteins.keys()
  allresultpages = ""
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]

    # get batch of sequences in fasta format with munged ids 
    # (workaround for potential tmhmm sequence id munging)
    safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch, 
                                             use_safe_seqid=True)

    # we use an OrderedDict rather than a normal dictionary to work around 
    # some quirks in the CBS CGI (the server expects parameters in a certain 
    # order in the HTTP headers).
    payload = OrderedDict([('configfile',
                          "/usr/opt/www/pub/CBS/services/TMHMM-2.0/TMHMM2.cf"),
                          ("SEQ",""),
                          ("outform","-noplot")])

    #files = {'seqfile': open(params['fasta'], 'rb')}
    files = {'seqfile': StringIO(safe_fasta)}

    log_stderr("# TMHMM(scrape_web), %s > %s" % (params['fasta'], outfile))

    headers = {"User-Agent": 
               "python-requests/%s (inmembrane/%s)" % 
               (requests.__version__, inmembrane.__version__) }
    r = requests.post(url, data=payload, files=files, headers=headers)
    if __DEBUG__:
      log_stderr(r.text)
      # Example:
      #
      # <HTML>
      # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
      # If Javascript is disabled, follow <a href="/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait">This link</a>
      #
      # <script LANGUAGE="JavaScript"><!--
      # location.replace("/cgi-bin/nph-webface?jobid=TMHMM2,50B5432A10A9CD51&opt=wait")
      # //--></script>
      # </HTML>

    # extract the result URL (or die if job is rejected ...)
    if "Job rejected" in r.text:
      sys.stderr.write(r.text)
      sys.exit()
    soup = BeautifulSoup(r.text)
      
    resultlink = soup.findAll('a')[0]['href']
    if __DEBUG__:
      log_stderr(resultlink)

    # brief pause, then grab the results at the result url
    sys.stderr.write("# Waiting for TMHMM(scrape_web) results")
    time.sleep(len(proteins)/500)
    resultpage = requests.post(resultlink).text
    retries = 0
    while ("Webservices : Job queue" in resultpage) and retries < 10:
      sys.stderr.write(".")
      time.sleep(len(proteins)/100 + retries**2)
      resultpage = requests.post(resultlink).text
      retries += 1

    sys.stderr.write(" .. done !\n")

    if __DEBUG__:
      log_stderr(resultpage)

    allresultpages += clean_result_page(resultpage)
  
  # we store the cleaned up result pages concatenated together
  fh = open(outfile, 'a+')
  fh.write(allresultpages)
  fh.close()

  proteins = parse_tmhmm(allresultpages, proteins, id_mapping=id_mapping)
  return proteins
Ejemplo n.º 24
0
def annotate(params, proteins, \
             batchsize=2000, \
             force=False):
  """
  This plugin interfaces with the LipoP web interface (for humans) and
  scrapes the results. This is a silly way to do it, since there is
  a SOAP service ... however when the SOAP service goes down, as it does
  from time to time, this plugin can be used as a stopgap.
  """

  baseurl = "http://www.cbs.dtu.dk"
  url = baseurl + "/cgi-bin/webface2.fcgi"

  # grab the cached results if present
  outfile = "lipop_scrape_web.out"
  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    proteins, id_mapping = generate_safe_seqids(proteins)
    fh = open(outfile, 'r')
    resultpage = fh.read()
    fh.close()
    #soup = BeautifulSoup(resultpage)
    proteins = parse_lipop(resultpage, proteins, id_mapping=id_mapping)
    return proteins

  proteins, id_mapping = generate_safe_seqids(proteins)

  seqids = proteins.keys()
  allresultpages = ""
  while seqids:
    seqid_batch = seqids[0:batchsize]
    del seqids[0:batchsize]

    # get batch of sequences in fasta format with munged ids 
    # (workaround for lipop sequence id munging)
    safe_fasta = proteins_to_fasta(proteins, seqids=seqid_batch,
                                             use_safe_seqid=True)

    # we use an OrderedDict rather than a normal dictionary to work around 
    # some quirks in the CBS CGI (the server expects parameters in a certain 
    # order in the HTTP headers).
    payload = OrderedDict([('configfile',
                          "/usr/opt/www/pub/CBS/services/LipoP-1.0/LipoP.cf"),
                          ("SEQ",""),
                          ("outform","-noplot")])

    #files = {'seqfile': open(params['fasta'], 'rb')}
    files = {'seqfile': StringIO(safe_fasta)}

    log_stderr("# LipoP(scrape_web), %s > %s" % (params['fasta'], outfile))

    headers = {"User-Agent": 
               "python-requests/%s (inmembrane/%s)" %
               (requests.__version__, inmembrane.__version__) }
    r = requests.post(url, data=payload, files=files, headers=headers)
    if __DEBUG__:
      log_stderr(r.text)
      # Example:
      #
      # <HTML>
      # <HEAD><TITLE>Webface Jobsubmission</TITLE></HEAD>
      # If Javascript is disabled, follow <a href="/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait">This link</a>
      #
      # <script LANGUAGE="JavaScript"><!--
      # location.replace("/cgi-bin/webface?jobid=LipoP,50B5432A10A9CD51&opt=wait")
      # //--></script>
      # </HTML>

    # extract the result URL (or die if job is rejected ...)
    if "Job rejected" in r.text:
      sys.stderr.write(r.text)
      sys.exit()

    r = r.text.replace("<noscript>","").replace("</noscript","")
    soup = BeautifulSoup(r)
    resultlink = soup.findAll('a')[0]['href']
    sys.stderr.write("# Fetching from: " + resultlink + "\n");
    # try grabbing the result, then keep polling until they are ready
    sys.stderr.write("# Waiting for LipoP(scrape_web) results ")
    waittime = 1.0
    time.sleep(waittime) #(len(proteins)/500)
    resultpage = requests.get(resultlink).text
    retries = 0
    while (("<title>Job status of" in resultpage) and (retries < 15)):
        sys.stderr.write(".")
        time.sleep(waittime) #(len(proteins)/500)
        resultpage = requests.get(resultlink).text
        waittime += 1
        retries += 1
        waittime = min(waittime, 20)

    sys.stderr.write(" .. done !\n")

    if __DEBUG__:
      log_stderr(resultpage)
      # Example:
      #
      #   <pre>
      # # lcl_AE004092.1_cdsid_AAK33146.1 CYT score=-0.200913
      # # Cut-off=-3
      # lcl_AE004092.1_cdsid_AAK33146.1	LipoP1.0:Best	CYT	1	1	-0.200913
      # <P>
      # <hr>
      # # lcl_AE004092.1_cdsid_AAK33147.1 CYT score=-0.200913
      # # Cut-off=-3
      # lcl_AE004092.1_cdsid_AAK33147.1	LipoP1.0:Best	CYT	1	1	-0.200913
      # <P>
      # <hr>

    allresultpages += clean_result_page(resultpage)

  # we store the cleaned up result pages concatenated together
  fh = open(outfile, 'a+')
  fh.write(allresultpages)
  fh.close()

  proteins = parse_lipop(allresultpages, proteins, id_mapping=id_mapping)
  return proteins
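
The actual parsing is done by parse_lipop(), which is not shown here. Purely as a hypothetical sketch grounded in the "Best" lines visible in the debug example above, a minimal parser might look like this; it assumes id_mapping maps the munged ids back to the original seqids and reuses the 'is_lipop' key from the SOAP plugin (SpII is LipoP's lipoprotein class):

def parse_lipop_best_lines(text, proteins, id_mapping):
    """Hypothetical minimal parser for LipoP 'Best' result lines."""
    for l in text.splitlines():
        words = l.split()
        if len(words) < 3 or words[0].startswith("#"):
            continue
        if words[1] != "LipoP1.0:Best":
            continue
        seqid = id_mapping.get(words[0])
        if seqid is None:
            continue
        # SpII = lipoprotein signal peptide; CYT, SpI, TMH are the other classes
        proteins[seqid]['is_lipop'] = (words[2] == 'SpII')
    return proteins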