Ejemplo n.º 1
0
def parse_signalp(signalp4_lines, proteins, id_mapping=None):
    """
    Annotates 'proteins' with SignalP 4 predictions parsed from output lines.

    Sets 'signalp_cleave_position' (int) and 'is_signalp' (bool) on each
    protein named in the result lines. If id_mapping is given (truthy),
    seqids found in the output are translated through it back to the
    original ids. Returns proteins.
    """
    if id_mapping is None:
        id_mapping = []

    in_results = False
    for raw_line in signalp4_lines:
        if raw_line.startswith("#"):
            # a '#' preamble/header line means result lines follow
            in_results = True
            continue
        if not in_results:
            continue
        if raw_line.strip() == '':
            # in concatenated web output, an empty line after the preamble
            # means we have finished all 'result' lines for that section
            in_results = False
            continue
        fields = raw_line.split()
        seqid = parse_fasta_header(fields[0])[0]
        if id_mapping:
            seqid = id_mapping[seqid]
        proteins[seqid]['signalp_cleave_position'] = int(fields[4])
        proteins[seqid]['is_signalp'] = (fields[9] == 'Y')

    return proteins
def parse_tmbetadisc_output(output, proteins):
  """
  Parses the TMBETADISC-RBF output (a file-like object or a list of
  strings) and uses it to annotate and return the associated 'proteins'
  data structure.
  """
  soup = BeautifulSoup(output)
  # the results live in an HTML table; reverse the <td> cell list so we
  # can consume cells one at a time from the end with pop()
  cells = soup.findAll("td")
  cells.reverse()
  cells.pop()  # discard first <td>1</td> field
  try:
    while cells:
      header_text = cells.pop().text
      seqid, verdict = parse_fasta_header(header_text)
      if "Non-Outer Membrane Protein" in verdict:
        proteins[seqid]["is_tmbetadisc_rbf"] = False
      elif "is Outer Membrane Protein" in verdict:
        proteins[seqid]["is_tmbetadisc_rbf"] = True
      cells.pop()  # skip the row-number cell between results
  except IndexError:
    # we get here when we run out of table fields to pop
    pass

  return proteins
Ejemplo n.º 3
0
def parse_signalp(signalp4_lines, proteins, id_mapping=None):
    """
    Reads SignalP 4 output lines and records the predictions on 'proteins'.

    For each result line, stores the cleavage position (column 5, as int)
    under 'signalp_cleave_position' and whether the signal-peptide call
    was 'Y' under 'is_signalp'. A truthy id_mapping translates output
    seqids back to original ids. Returns proteins.
    """
    if id_mapping is None:
        id_mapping = []

    reading_results = False
    for entry in signalp4_lines:
        blank = (entry.strip() == '')
        if entry.startswith("#"):
            # preamble/header marker: subsequent lines are results
            reading_results = True
        elif reading_results and blank:
            # concatenated web output: a blank line closes the section
            reading_results = False
        elif reading_results:
            cols = entry.split()
            name = parse_fasta_header(cols[0])[0]
            if id_mapping:
                name = id_mapping[name]
            proteins[name]['signalp_cleave_position'] = int(cols[4])
            proteins[name]['is_signalp'] = (cols[9] == 'Y')

    return proteins
Ejemplo n.º 4
0
def parse_tmbetadisc_output(output, proteins):
    """
    Parses the TMBETADISC-RBF output (file-like object or a list of
    strings) and uses it to annotate and return the associated
    'proteins' data structure.
    """
    soup = BeautifulSoup(output)
    # results arrive as an HTML table; we reverse the <td> cells so that
    # pop() yields them in document order, one at a time
    td_cells = soup.findAll("td")
    td_cells.reverse()
    td_cells.pop()  # discard first <td>1</td> field
    try:
        while td_cells:
            cell_text = td_cells.pop().text
            seqid, outcome = parse_fasta_header(cell_text)
            if "Non-Outer Membrane Protein" in outcome:
                proteins[seqid]["is_tmbetadisc_rbf"] = False
            elif "is Outer Membrane Protein" in outcome:
                proteins[seqid]["is_tmbetadisc_rbf"] = True
            td_cells.pop()  # drop the row-number cell between results
    except IndexError:
        # we get here when we run out of table fields to pop
        pass

    return proteins
Ejemplo n.º 5
0
def parse_lipop(text, proteins, id_mapping=None):
  """
  Parses the text output of the LipoP program and returns a 'proteins'
  datastructure with annotations.

  The parser can also handle the HTML returned by the LipoP web
  interface. If a dictionary of {safe_seqid : seqid} mappings is given,
  the parser will expect the input text to contain safe_seqids.

  Annotations set per protein: 'is_lipop' (bool),
  'lipop_cleave_position' (int or None), and possibly
  'lipop_im_retention_signal' (True).
  """
  # BUGFIX: use None as the default instead of a shared mutable [] --
  # behavior is unchanged since both are falsy for the 'if id_mapping'
  # truthiness checks below.
  if id_mapping is None:
    id_mapping = []

  # initialize fields in each protein
  for seqid in proteins:
    proteins[seqid]['is_lipop'] = False
    proteins[seqid]['lipop_cleave_position'] = None

  for l in text.split('\n'):
    words = l.split()

    if 'SpII score' in l:
      lipop_seqid = parse_fasta_header(words[1])[0]
      # re-map from a safe_seqid to the original seqid if needed
      seqid = id_mapping[lipop_seqid] if id_mapping else lipop_seqid
      if 'cleavage' in l:
        # e.g. "cleavage=19-20" -> 19
        pair = words[5].split("=")[1]
        i = int(pair.split('-')[0])
      else:
        i = None
      proteins[seqid]['is_lipop'] = 'Sp' in words[2]
      proteins[seqid]['lipop_cleave_position'] = i

      # check for an E.coli style inner membrane retention signal
      # Asp+2 to cleavage site. There are other apparent retention
      # signals in E. coli and other gram- bacteria in addition to
      # the Asp+2 which we don't detect here (yet).
      # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
      # BUGFIX: this check previously sat at loop level, running for
      # every line with a stale seqid left over from earlier iterations;
      # it now runs once, right after the prediction line it belongs to.
      # A bounds check also guards against an IndexError on sequences
      # that end at the cleavage site.
      if dict_get(proteins[seqid], 'lipop_cleave_position'):
        plus_two = proteins[seqid]['lipop_cleave_position'] + 1
        if plus_two < len(proteins[seqid]['seq']) and \
            proteins[seqid]['seq'][plus_two] == 'D':
          proteins[seqid]['lipop_im_retention_signal'] = True

  return proteins
Ejemplo n.º 6
0
def parse_tmhmm(text, proteins, id_mapping=None):
    """
    Parses TMHMM long-format output and annotates 'proteins' with the
    predicted membrane topology.

    Each protein that appears in the output gains three lists of
    (start, end) residue pairs: 'tmhmm_helices', 'tmhmm_inner_loops'
    and 'tmhmm_outer_loops'. If id_mapping is given (truthy), seqids
    found in the output are mapped back to original ids through it.
    Returns proteins.
    """
    # BUGFIX: avoid the mutable default argument ([]) shared across
    # calls; None is equivalent for the truthiness check below.
    if id_mapping is None:
        id_mapping = []

    seqid = None
    for i_line, l in enumerate(text.split('\n')):
        if i_line == 0:
            # skip the first line of the output
            continue
        words = l.split()
        if not words:
            continue

        # '#' summary lines carry the id in column 2, data lines in column 1
        if l.startswith("#"):
            seqid = parse_fasta_header(words[1])[0]
        else:
            seqid = parse_fasta_header(words[0])[0]
        if seqid is None:
            continue

        # re-map from a safe_seqid to the original seqid
        if id_mapping:
            seqid = id_mapping[seqid]

        # initialize fields in proteins[seqid]
        if 'tmhmm_helices' not in proteins[seqid]:
            proteins[seqid].update({
                'tmhmm_helices': [],
                'tmhmm_inner_loops': [],
                'tmhmm_outer_loops': []
            })

        # the last two columns of a region line are its start/end residues
        if 'inside' in l:
            proteins[seqid]['tmhmm_inner_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'outside' in l:
            proteins[seqid]['tmhmm_outer_loops'].append(
                (int(words[-2]), int(words[-1])))
        if 'TMhelix' in l:
            proteins[seqid]['tmhmm_helices'].append(
                (int(words[-2]), int(words[-1])))

    return proteins
Ejemplo n.º 7
0
def parse_tmhmm(text, proteins, id_mapping=None):
  """
  Parses TMHMM long-format output and annotates 'proteins' with the
  predicted membrane topology.

  Each protein that appears in the output gains three lists of
  (start, end) residue pairs: 'tmhmm_helices', 'tmhmm_inner_loops' and
  'tmhmm_outer_loops'. If id_mapping is given (truthy), seqids found in
  the output are mapped back to original ids through it.
  Returns proteins.
  """
  # BUGFIX: avoid the mutable default argument ([]) shared across calls;
  # None is equivalent for the truthiness check below.
  if id_mapping is None:
    id_mapping = []

  seqid = None
  for i_line, l in enumerate(text.split('\n')):
    if i_line == 0:
      # skip the first line of the output
      continue
    words = l.split()
    if not words:
      continue

    # '#' summary lines carry the id in column 2, data lines in column 1
    if l.startswith("#"):
      seqid = parse_fasta_header(words[1])[0]
    else:
      seqid = parse_fasta_header(words[0])[0]
    if seqid is None:
      continue

    # re-map from a safe_seqid to the original seqid
    if id_mapping:
      seqid = id_mapping[seqid]

    # initialize fields in proteins[seqid]
    if 'tmhmm_helices' not in proteins[seqid]:
      proteins[seqid].update({
        'tmhmm_helices':[],
        'tmhmm_inner_loops':[],
        'tmhmm_outer_loops':[]
      })

    # the last two columns of a region line are its start/end residues
    if 'inside' in l:
      proteins[seqid]['tmhmm_inner_loops'].append(
          (int(words[-2]), int(words[-1])))
    if 'outside' in l:
      proteins[seqid]['tmhmm_outer_loops'].append(
          (int(words[-2]), int(words[-1])))
    if 'TMhelix' in l:
      proteins[seqid]['tmhmm_helices'].append(
          (int(words[-2]), int(words[-1])))

  return proteins
Ejemplo n.º 8
0
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename of the .hmm files found in the
         directory indicated by the 'hmm_profiles_dir' field of 'params'.

    Side effect: sets params['hmm_profile'] while iterating profiles
    (required for the command-template substitution below).
    """
    log_stderr("# Searching for HMMER profiles in " +
               params['hmm_profiles_dir'])

    # init proteins data structure with a blank hmmsearch field first
    # (hoisted out of the per-profile loop; it only needs to run once)
    for seqid in proteins:
        if 'hmmsearch' not in proteins[seqid]:
            proteins[seqid]['hmmsearch'] = []

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        params['hmm_profile'] = hmm_profile

        hmm_profile = os.path.basename(params['hmm_profile'])
        hmm_name = hmm_profile.replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name

        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)

        # parse the hmmsearch output file; '>>' header lines name the
        # sequence that subsequent domain lines belong to
        seqid = None
        # BUGFIX: use a context manager so the output file handle is
        # closed (previously `for l in open(...)` leaked it)
        with open(hmmsearch3_out) as hmm_fh:
            for l in hmm_fh:
                words = l.split()

                if l.startswith(">>"):
                    seqid = parse_fasta_header(l[3:])[0]
                    continue

                if seqid is None:
                    continue

                if 'conditional E-value' in l:
                    evalue = float(words[-1])
                    score = float(words[-5])
                    # keep hits that pass both configured thresholds
                    if evalue <= params['hmm_evalue_max'] and \
                            score >= params['hmm_score_min']:
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
Ejemplo n.º 9
0
def annotate(params, proteins):
    """
    Returns a reference to the proteins data structure.

    Uses HMMER to identify sequence motifs in proteins. This function
    annotates the proteins with:
      - 'hmmsearch': a list of motifs that are found in the protein. The
         motifs correspond to the basename of the .hmm files found in the
         directory indicated by the 'hmm_profiles_dir' field of 'params'.

    Side effect: sets params['hmm_profile'] while iterating profiles
    (required for the command-template substitution below).
    """
    log_stderr(
        "# Searching for HMMER profiles in " + params['hmm_profiles_dir'])

    # init proteins data structure with a blank hmmsearch field first
    # (hoisted out of the per-profile loop; it only needs to run once)
    for seqid in proteins:
        if 'hmmsearch' not in proteins[seqid]:
            proteins[seqid]['hmmsearch'] = []

    file_tag = os.path.join(params['hmm_profiles_dir'], '*.hmm')
    for hmm_profile in glob.glob(file_tag):
        params['hmm_profile'] = hmm_profile

        hmm_profile = os.path.basename(params['hmm_profile'])
        hmm_name = hmm_profile.replace('.hmm', '')
        hmmsearch3_out = 'hmm.%s.out' % hmm_name

        cmd = '%(hmmsearch3_bin)s -Z 2000 -E 10 %(hmm_profile)s %(fasta)s' % params
        run(cmd, hmmsearch3_out)

        # parse the hmmsearch output file; '>>' header lines name the
        # sequence that subsequent domain lines belong to
        seqid = None
        # BUGFIX: use a context manager so the output file handle is
        # closed (previously `for l in open(...)` leaked it)
        with open(hmmsearch3_out) as hmm_fh:
            for l in hmm_fh:
                words = l.split()

                if l.startswith(">>"):
                    seqid = parse_fasta_header(l[3:])[0]
                    continue

                if seqid is None:
                    continue

                if 'conditional E-value' in l:
                    evalue = float(words[-1])
                    score = float(words[-5])
                    # keep hits that pass both configured thresholds
                    if evalue <= params['hmm_evalue_max'] and \
                            score >= params['hmm_score_min']:
                        proteins[seqid]['hmmsearch'].append(hmm_name)

    return proteins
Ejemplo n.º 10
0
def parse_tatfind_output(output, proteins):
    """
    Parses the TatFind HTML output (file-like object or a list of
    strings) and uses it to annotate and return the associated
    'proteins' data structure.
    """
    for line in output:
        if "Results for" not in line:
            continue
        payload = line.split("Results for ")[1]
        tokens = payload.split(":")
        # the id is everything before the last ':'
        raw_id = tokens[:-1][0]
        # parse id string to bring it to our format
        seqid, _unused = parse_fasta_header(raw_id)
        # the token after the last ':' is "TRUE" or "FALSE"
        verdict = tokens[-1:][0].strip()
        proteins[seqid]["is_tatfind"] = (verdict == "TRUE")

    return proteins
Ejemplo n.º 11
0
def parse_tatfind_output(output, proteins):
    """
    Reads TatFind HTML output (a file-like object or list of strings),
    recording each sequence's twin-arginine translocation prediction on
    the 'proteins' data structure, which is returned.
    """
    marker = "Results for "
    for row in output:
        if marker in row:
            after = row.split(marker)[1]
            # sequence id: the part before the final ':'
            raw_id = after.split(":")[:-1][0]
            # parse id string to bring it to our format
            seqid, _desc = parse_fasta_header(raw_id)
            # prediction text: "TRUE" or "FALSE" after the final ':'
            prediction = after.split(":")[-1:][0].strip()
            if prediction == "TRUE":
                proteins[seqid]["is_tatfind"] = True
            else:
                proteins[seqid]["is_tatfind"] = False

    return proteins
Ejemplo n.º 12
0
def parse_tmbhunt(proteins, out):
    """
    Takes the filename of a TMB-HUNT output file (text format)
    & parses the outer membrane beta-barrel predictions into the
    proteins dictionary.

    Returns a {seqid: {'tmbhunt': bool, 'tmbhunt_prob': float}} summary
    of the classifications that were parsed.
    """
    # parse TMB-HUNT text output
    tmbhunt_classes = {}
    # BUGFIX: use a context manager so the file handle is always closed
    # (previously `for l in open(...)` leaked it)
    with open(out, 'r') as tmbhunt_fh:
        for l in tmbhunt_fh:
            # inmembrane.log_stderr("# TMB-HUNT raw: " + l[:-1])
            if l[0] == ">":
                # TMB-HUNT munges FASTA ids by making them all uppercase,
                # so we find the equivalent any-case id in our proteins list
                # and use that. ugly but necessary.
                seqid, desc = parse_fasta_header(l)
                for i in proteins.keys():
                    if seqid.upper() == i.upper():
                        seqid = i
                        desc = proteins[i]['name']

                probability = None
                classification = None
                tmbhunt_classes[seqid] = {}
            if l.find("Probability of a NON-BETA BARREL protein with this score:"
                      ) != -1:
                # we convert from probability of NON-BARREL to probability
                # of BARREL
                probability = 1 - float(l.split(":")[1].strip())
            if l[0:11] == "Conclusion:":
                classification = l.split(":")[1].strip()
                if classification == "BBMP":
                    tmbhunt_classes[seqid]['tmbhunt'] = True
                    tmbhunt_classes[seqid]['tmbhunt_prob'] = probability

                    proteins[seqid]['tmbhunt'] = True
                    proteins[seqid]['tmbhunt_prob'] = probability

                elif classification == "Non BBMP":
                    tmbhunt_classes[seqid]['tmbhunt'] = False
                    tmbhunt_classes[seqid]['tmbhunt_prob'] = probability

                    proteins[seqid]['tmbhunt'] = False
                    proteins[seqid]['tmbhunt_prob'] = probability

    # inmembrane.log_stderr(str(tmbhunt_classes))
    return tmbhunt_classes
Ejemplo n.º 13
0
def parse_tmbhunt(proteins, out):
    """
    Takes the filename of a TMB-HUNT output file (text format)
    & parses the outer membrane beta-barrel predictions into the
    proteins dictionary.

    Returns a {seqid: {'tmbhunt': bool, 'tmbhunt_prob': float}} summary
    of the classifications that were parsed.
    """
    # parse TMB-HUNT text output
    tmbhunt_classes = {}
    # BUGFIX: use a context manager so the file handle is always closed
    # (previously `for l in open(...)` leaked it)
    with open(out, 'r') as tmbhunt_fh:
        for l in tmbhunt_fh:
            # inmembrane.log_stderr("# TMB-HUNT raw: " + l[:-1])
            if l[0] == ">":
                # TMB-HUNT munges FASTA ids by making them all uppercase,
                # so we find the equivalent any-case id in our proteins list
                # and use that. ugly but necessary.
                seqid, desc = parse_fasta_header(l)
                for i in proteins.keys():
                    if seqid.upper() == i.upper():
                        seqid = i
                        desc = proteins[i]['name']

                probability = None
                classification = None
                tmbhunt_classes[seqid] = {}
            if l.find(
                    "Probability of a NON-BETA BARREL protein with this score:") != -1:
                # we convert from probability of NON-BARREL to probability
                # of BARREL
                probability = 1 - float(l.split(":")[1].strip())
            if l[0:11] == "Conclusion:":
                classification = l.split(":")[1].strip()
                if classification == "BBMP":
                    tmbhunt_classes[seqid]['tmbhunt'] = True
                    tmbhunt_classes[seqid]['tmbhunt_prob'] = probability

                    proteins[seqid]['tmbhunt'] = True
                    proteins[seqid]['tmbhunt_prob'] = probability

                elif classification == "Non BBMP":
                    tmbhunt_classes[seqid]['tmbhunt'] = False
                    tmbhunt_classes[seqid]['tmbhunt_prob'] = probability

                    proteins[seqid]['tmbhunt'] = False
                    proteins[seqid]['tmbhunt_prob'] = probability

    # inmembrane.log_stderr(str(tmbhunt_classes))
    return tmbhunt_classes
Ejemplo n.º 14
0
def annotate(params, proteins):
  """
  Runs SignalP 4 over the query fasta file and annotates each protein
  with 'is_signalp' (bool) and 'signalp_cleave_position' (int, or None
  if the protein does not appear in the output).

  Returns a reference to the proteins data structure.
  """
  for seqid in proteins:
    proteins[seqid]['is_signalp'] = False
    proteins[seqid]['signalp_cleave_position'] = None

  signalp4_out = 'signalp.out'
  cmd = '%(signalp4_bin)s -t %(signalp4_organism)s  %(fasta)s' % \
             params
  run(cmd, signalp4_out)

  # BUGFIX: use a context manager so the output file handle is closed
  # (previously `for line in open(...)` leaked it)
  with open(signalp4_out) as signalp_fh:
    for line in signalp_fh:
      if line.startswith("#"):
        # skip preamble/header lines
        continue
      words = line.split()
      seqid = parse_fasta_header(words[0])[0]
      proteins[seqid]['signalp_cleave_position'] = int(words[4])
      if (words[9] == "Y"):
        proteins[seqid]['is_signalp'] = True

  return proteins
Ejemplo n.º 15
0
def annotate(params, proteins, \
             url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.

    params: run-parameter dict; params['fasta'] is the query file path.
    proteins: dict of {seqid: annotation_dict}; every entry gains a
              'bomp' field holding the integer BOMP category, or False
              when BOMP reported no category for that sequence.
    url: address of the BOMP submission form.
    force: when True, re-query the service even if a cached 'bomp.out'
           results file already exists on disk.

    Returns a {seqid: category} dict, or None on error/timeout.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))

    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    # reuse cached results from a previous run unless 'force' is set
    if not force and os.path.isfile(bomp_out):
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            # cached lines are written below as "<seqid>\t<category>",
            # so the category is the last whitespace-separated token
            bomp_category = int(words[-1:][0])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    # submit the fasta file through the web form (twill browser commands)
    go(url)
    if __DEBUG__: showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__: show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__: log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    # poll the job page with exponential backoff, up to ~2 hours total
    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            # NOTE(review): presumably twill's find() raises once the
            # "Not finished" text disappears, i.e. success here means
            # the job is still running -- confirm against twill docs
            find("Not finished")
            log_stderr(".")
        except:
            # Finished ! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__: log_stderr(show())
            break

        # Not finished. We keep polling for a time until
        # we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return
        go("viewOutput?id=%i" % (job_id))
        if __DEBUG__: log_stderr(show())

    bomp_html = show()
    if __DEBUG__: log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name, category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__: log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__: log_stderr(str(proteins))

    return bomp_categories
    # NOTE(review): the stray triple-quote below looks like a scrape or
    # paste artifact after the return statement -- confirm and remove
    """
Ejemplo n.º 16
0
def annotate(params, proteins, \
             url="http://services.cbu.uib.no/tools/bomp/", force=False):
    """
    Uses the BOMP web service (http://services.cbu.uib.no/tools/bomp/) to
    predict if proteins are outer membrane beta-barrels.

    params: run-parameter dict; params['fasta'] is the query file path.
    proteins: dict of {seqid: annotation_dict}; every entry gains a
              'bomp' field holding the integer BOMP category, or False
              when BOMP reported no category for that sequence.
    url: address of the BOMP submission form.
    force: when True, re-query the service even if a cached 'bomp.out'
           results file already exists on disk.

    Returns a {seqid: category} dict, or None on error/timeout.
    """
    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane/%s)" %
          (python_version, inmembrane.__version__))

    bomp_out = 'bomp.out'
    log_stderr("# BOMP(web) %s > %s" % (params['fasta'], bomp_out))

    # reuse cached results from a previous run unless 'force' is set
    if not force and os.path.isfile(bomp_out):
        log_stderr("# -> skipped: %s already exists" % bomp_out)
        bomp_categories = {}
        fh = open(bomp_out, 'r')
        for l in fh:
            words = l.split()
            # cached lines are written below as "<seqid>\t<category>",
            # so the category is the last whitespace-separated token
            bomp_category = int(words[-1:][0])
            seqid = parse_fasta_header(l)[0]
            proteins[seqid]['bomp'] = bomp_category
            bomp_categories[seqid] = bomp_category
        fh.close()
        return bomp_categories

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    # submit the fasta file through the web form (twill browser commands)
    go(url)
    if __DEBUG__: showforms()
    formfile("1", "queryfile", params["fasta"])
    submit()
    if __DEBUG__: show()

    # extract the job id from the page
    links = showlinks()
    job_id = None
    for l in links:
        if l.url.find("viewOutput") != -1:
            # grab job id from "viewOutput?id=16745338"
            job_id = int(l.url.split("=")[1])

    if __DEBUG__: log_stderr("BOMP job id: %d" % job_id)

    if not job_id:
        # something went wrong
        log_stderr("# BOMP error: Can't find job id")
        return

    # parse the HTML table and extract categories
    go("viewOutput?id=%i" % (job_id))

    # poll the job page with exponential backoff, up to ~2 hours total
    polltime = 10
    log_stderr("# Waiting for BOMP to finish .")
    while True:
        try:
            # NOTE(review): presumably twill's find() raises once the
            # "Not finished" text disappears, i.e. success here means
            # the job is still running -- confirm against twill docs
            find("Not finished")
            log_stderr(".")
        except:
            # Finished ! Pull down the result page.
            log_stderr(". done!\n")
            go("viewOutput?id=%i" % (job_id))
            if __DEBUG__: log_stderr(show())
            break

        # Not finished. We keep polling for a time until
        # we give up
        time.sleep(polltime)
        polltime = polltime * 2
        if polltime >= 7200:  # 2 hours
            log_stderr("# BOMP error: Taking too long.")
            return
        go("viewOutput?id=%i" % (job_id))
        if __DEBUG__: log_stderr(show())

    bomp_html = show()
    if __DEBUG__: log_stderr(bomp_html)

    # Results are in the only <table> on this page, formatted like:
    # <tr><th>gi|107836852|gb|ABF84721.1<th>5</tr>
    soup = BeautifulSoup(bomp_html)
    bomp_categories = {}  # dictionary of {name, category} pairs
    for tr in soup.findAll('tr')[1:]:
        n, c = tr.findAll('th')
        name = parse_fasta_header(n.text.strip())[0]
        category = int(c.text)
        bomp_categories[name] = category

    # write BOMP results to a tab delimited file
    fh = open(bomp_out, 'w')
    for k, v in bomp_categories.iteritems():
        fh.write("%s\t%i\n" % (k, v))
    fh.close()

    if __DEBUG__: log_stderr(str(bomp_categories))

    # label proteins with bomp classification (int) or False
    for name in proteins:
        if "bomp" not in proteins[name]:
            if name in bomp_categories:
                category = int(bomp_categories[name])
                proteins[name]['bomp'] = category
            else:
                proteins[name]['bomp'] = False

    if __DEBUG__: log_stderr(str(proteins))

    return bomp_categories

    # NOTE(review): the stray triple-quote below looks like a scrape or
    # paste artifact after the return statement -- confirm and remove
    """