Exemple #1
0
def get_annotations(params):
  """
  Creates the list of annotation names required by this
  gram-negative protocol, in the order they are appended below.

  As well, the function does some bookkeeping on params:
    'signalp4_organism' is set to 'gram-'
    'hmm_profiles_dir'  is pointed at the 'gram_neg_profiles'
                        directory next to this file

  Returns a list of annotation name strings.
  """
  annotations = []
  params['signalp4_organism'] = 'gram-'

  if not params['signalp4_bin'] or params['signalp4_bin'] == 'signalp_web':
    annotations.append('signalp_web')
  else:
    annotations.append('signalp4')

  if not params['lipop1_bin'] or params['lipop1_bin'] == 'lipop_scrape_web':
    annotations.append('lipop_scrape_web')
  elif params['lipop1_bin'] == 'lipop_web':
    annotations.append('lipop_web')
  else:
    # any other setting is treated as a locally installed binary, the
    # same way the signalp4 and tmhmm settings are handled; previously
    # only the literal value 'lipop1' was accepted and every other
    # binary name silently dropped the LipoP annotation
    annotations.append('lipop1')

  annotations.append('tatfind_web')

  # an unset or empty 'barrel_programs' selects no barrel predictors
  # (assumes dict_get returns a falsy value for a missing key - TODO confirm)
  barrel_programs = dict_get(params, 'barrel_programs') or ()

  if 'bomp' in barrel_programs:
    annotations.append('bomp_web')
  # DEPRECATED: TMB-HUNT server is permanently offline, so 'tmbhunt'
  # is no longer mapped to an annotation
  if 'tmbetadisc-rbf' in barrel_programs:
    annotations.append('tmbetadisc_rbf_web')

  # TMBETA-NET knows to only run on predicted barrels
  # with the category 'OM(barrel)'
  if 'tmbeta' in barrel_programs:
    annotations.append('tmbeta_net_web')

  if dict_get(params, 'helix_programs'):
    if 'tmhmm' in params['helix_programs']:
      if not params['tmhmm_bin'] or params['tmhmm_bin'] == 'tmhmm_scrape_web':
        annotations.append('tmhmm_scrape_web')
      elif params['tmhmm_bin'] == 'tmhmm_web':
        annotations.append('tmhmm_web')
      else:
        annotations.append('tmhmm')
    if 'memsat3' in params['helix_programs']:
      annotations.append('memsat3')

  # run some hmm profiles to detect features (eg Tat signal)
  annotations.append('hmmsearch3')
  params['hmm_profiles_dir'] = os.path.join(
      os.path.dirname(__file__), 'gram_neg_profiles')

  return annotations
Exemple #2
0
def get_annotations(params):
  """
  Builds the ordered list of annotation names that the
  gram_pos protocol needs. The main program maps each
  name to its annotation function and runs them.

  As bookkeeping, params is updated in place:
  'signalp4_organism' becomes 'gram+' and 'hmm_profiles_dir'
  is pointed at the 'gram_pos_profiles' directory that sits
  beside this file.
  """
  params['signalp4_organism'] = 'gram+'

  # signal peptide prediction: local binary if one is named,
  # otherwise the web service
  signalp_bin = params['signalp4_bin']
  if signalp_bin and signalp_bin != 'signalp_web':
    selected = ['signalp4']
  else:
    selected = ['signalp_web']

  # lipoprotein prediction
  lipop_bin = params['lipop1_bin']
  if lipop_bin == 'lipop_web':
    selected.append('lipop_web')
  elif not lipop_bin or lipop_bin == 'lipop_scrape_web':
    selected.append('lipop_scrape_web')
  else:
    selected.append('lipop1')

  selected.append('hmmsearch3')

  # transmembrane helix predictors are optional
  if dict_get(params, 'helix_programs'):
    helix_programs = params['helix_programs']
    if 'tmhmm' in helix_programs:
      tmhmm_bin = params['tmhmm_bin']
      if tmhmm_bin == 'tmhmm_web':
        selected.append('tmhmm_web')
      elif not tmhmm_bin or tmhmm_bin == 'tmhmm_scrape_web':
        selected.append('tmhmm_scrape_web')
      else:
        selected.append('tmhmm')
    if 'memsat3' in helix_programs:
      selected.append('memsat3')

  here = os.path.dirname(__file__)
  params['hmm_profiles_dir'] = os.path.join(here, 'gram_pos_profiles')

  return selected
Exemple #3
0
def parse_lipop(text, proteins, id_mapping=None):
    """
    Parses the text output of the LipoP program and returns the 'proteins'
    datastructure with annotations added in place.

    The parser can also handle the HTML returned by the LipoP web interface.
    If a dictionary of {safe_seqid : seqid} mappings is given, the parser
    will expect the input text to contain safe_seqids.

    These keys are set on each protein:
      'is_lipop'                  - False unless a 'SpII score' line for
                                    the protein has 'Sp' in its third word
      'lipop_cleave_position'     - integer taken from the 'cleavage='
                                    field of that line, otherwise None
      'lipop_im_retention_signal' - set to True (and only then present)
                                    when the residue checked after the
                                    cleavage site is 'D'
    """

    if id_mapping is None:
        id_mapping = {}

    # initialize fields in each protein
    for seqid in proteins:
        proteins[seqid]['is_lipop'] = False
        proteins[seqid]['lipop_cleave_position'] = None

    for line in text.split('\n'):
        # only the per-sequence SpII summary lines carry a prediction
        if 'SpII score' not in line:
            continue
        words = line.split()

        seqid = parse_fasta_header(words[1])[0]
        if id_mapping:
            seqid = id_mapping[seqid]
        protein = proteins[seqid]

        if 'cleavage' in line:
            # field looks like 'cleavage=<i>-<j>'; keep the first number
            pair = words[5].split("=")[1]
            cleave_position = int(pair.split('-')[0])
        else:
            cleave_position = None
        protein['is_lipop'] = 'Sp' in words[2]
        protein['lipop_cleave_position'] = cleave_position

        # check for an E.coli style inner membrane retention signal
        # Asp+2 to cleavage site. There are other apparent retention
        # signals in E. coli and other gram- bacteria in addition to
        # the Asp+2 which we don't detect here (yet).
        # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
        #
        # This is done once per prediction, right after the cleavage
        # position is known, instead of on every line of the input; it
        # no longer depends on a seqid left over from the loop above,
        # which failed when 'proteins' was empty.
        if cleave_position:
            plus_two = cleave_position + 1
            seq = protein['seq']
            # guard against a cleavage site at the very end of the sequence
            if plus_two < len(seq) and seq[plus_two] == 'D':
                protein['lipop_im_retention_signal'] = True

    return proteins
Exemple #4
0
def parse_lipop(text, proteins, id_mapping=None):
    """
    Parses the text output of the LipoP program and returns the 'proteins'
    datastructure with annotations added in place.

    The parser can also handle the HTML returned by the LipoP web interface.
    If a dictionary of {safe_seqid : seqid} mappings is given, the parser
    will expect the input text to contain safe_seqids.

    These keys are set on each protein:
      'is_lipop'                  - False unless a 'SpII score' line for
                                    the protein has 'Sp' in its third word
      'lipop_cleave_position'     - integer taken from the 'cleavage='
                                    field of that line, otherwise None
      'lipop_im_retention_signal' - set to True (and only then present)
                                    when the residue checked after the
                                    cleavage site is 'D'
    """

    if id_mapping is None:
        id_mapping = {}

    # initialize fields in each protein
    for seqid in proteins:
        proteins[seqid]['is_lipop'] = False
        proteins[seqid]['lipop_cleave_position'] = None

    for line in text.split('\n'):
        # only the per-sequence SpII summary lines carry a prediction
        if 'SpII score' not in line:
            continue
        words = line.split()

        seqid = parse_fasta_header(words[1])[0]
        if id_mapping:
            seqid = id_mapping[seqid]
        protein = proteins[seqid]

        if 'cleavage' in line:
            # field looks like 'cleavage=<i>-<j>'; keep the first number
            pair = words[5].split("=")[1]
            cleave_position = int(pair.split('-')[0])
        else:
            cleave_position = None
        protein['is_lipop'] = 'Sp' in words[2]
        protein['lipop_cleave_position'] = cleave_position

        # check for an E.coli style inner membrane retention signal
        # Asp+2 to cleavage site. There are other apparent retention
        # signals in E. coli and other gram- bacteria in addition to
        # the Asp+2 which we don't detect here (yet).
        # (Yamaguchi et al, 1988; Tokuda and Matsuyama, 2005 [review])
        #
        # This is done once per prediction, right after the cleavage
        # position is known, instead of on every line of the input; it
        # no longer depends on a seqid left over from the loop above,
        # which failed when 'proteins' was empty.
        if cleave_position:
            plus_two = cleave_position + 1
            seq = protein['seq']
            # guard against a cleavage site at the very end of the sequence
            if plus_two < len(seq) and seq[plus_two] == 'D':
                protein['lipop_im_retention_signal'] = True

    return proteins
Exemple #5
0
 def has_tm_helix(protein):
   """
   True when at least one program listed in params['helix_programs']
   left a non-empty '<program>_helices' entry on the protein.
   """
   return any(
       dict_get(protein, '%s_helices' % helix_program)
       for helix_program in params['helix_programs'])
Exemple #6
0
def post_process_protein(params, protein):
  """
  This is the main analysis of the protein, where the protein
  dictionary should contain all the necessary information
  from the annotations. From these, post_process_protein
  determines the final category.

  Reads from params:
    'helix_programs' - names of the helix predictors whose
                       '<program>_helices' and '<program>_outer_loops'
                       annotations are consulted
    'terminal_exposed_loop_min', 'internal_exposed_loop_min' -
                       thresholds handed to eval_surface_exposed_loop
                       and max_exposed_loop

  These keys are set on the protein dictionary:
    'details'     - list of strings naming the predictions that fired
                    (["."] when there are none)
    'category'    - one of 'PSE-Cellwall', 'PSE-Membrane',
                    'MEMBRANE(non-PSE)', 'LIPOPROTEIN(non-PSE)',
                    'PSE-Lipoprotein', 'SECRETED' or 'CYTOPLASM(non-PSE)'
    'loop_extent' - result of exposed_loop_extent, or "." for the
                    CYTOPLASM and SECRETED categories

  The protein is also passed to chop_nterminal_peptide when LipoP or
  SignalP predicted a cleavage site.

  Returns the tuple (details, category).
  """

  def sequence_length(protein):
    return protein['sequence_length']

  def has_tm_helix(protein):
    for program in params['helix_programs']:
      if dict_get(protein, '%s_helices' % program):
        return True
    return False

  def has_surface_exposed_loop(protein):
    for program in params['helix_programs']:
      # a program that left no helix annotation on this protein is
      # skipped, exactly as exposed_loop_extent does below, instead
      # of raising KeyError
      if program + '_helices' not in protein:
        continue
      if eval_surface_exposed_loop(
          protein['sequence_length'],
          len(protein['%s_helices' % (program)]),
          protein['%s_outer_loops' % (program)],
          params['terminal_exposed_loop_min'],
          params['internal_exposed_loop_min']):
        return True
    return False

  def exposed_loop_extent(protein):
    extents = []
    for program in params['helix_programs']:
      if program + '_helices' in protein:
        extents.append(max_exposed_loop(
            protein['sequence_length'],
            len(protein['%s_helices' % (program)]),
            protein['%s_outer_loops' % (program)],
            params['terminal_exposed_loop_min'],
            params['internal_exposed_loop_min']))
    if extents:
      return max(extents)
    else:
      return 0

  terminal_exposed_loop_min = \
      params['terminal_exposed_loop_min']

  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_lipop = dict_get(protein, 'is_lipop')
  if is_lipop:
    i_lipop_cut = protein['lipop_cleave_position']
  is_signalp = dict_get(protein, 'is_signalp')
  if is_signalp:
    i_signalp_cut = protein['signalp_cleave_position']

  details = []
  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]
  if is_lipop:
    details += ["lipop"]
  if is_signalp:
    details += ["signalp"]
  # helix counts are reported as predicted, i.e. before the
  # N-terminal peptide is chopped off below
  if has_tm_helix(protein):
    for program in params['helix_programs']:
      helices_key = '%s_helices' % program
      if helices_key in protein:
        details += ["%s(%d)" % (program, len(protein[helices_key]))]

  if is_lipop:
    chop_nterminal_peptide(protein, i_lipop_cut)
  elif is_signalp:
    chop_nterminal_peptide(protein, i_signalp_cut)

  # has_tm_helix is evaluated again here on purpose: it runs after
  # chop_nterminal_peptide, which is handed the protein and may have
  # changed its annotations (not visible from here - TODO confirm)
  if is_hmm_profile_match:
    category = "PSE-Cellwall"
  elif has_tm_helix(protein):
    if has_surface_exposed_loop(protein):
      category = "PSE-Membrane"
    else:
      category = "MEMBRANE(non-PSE)"
  else:
    if is_lipop:
      # whole protein considered outer terminal loop
      if sequence_length(protein) < terminal_exposed_loop_min:
        category = "LIPOPROTEIN(non-PSE)"
      else:
        category = "PSE-Lipoprotein"
    elif is_signalp:
      category = "SECRETED"
    else:
      category = "CYTOPLASM(non-PSE)"

  if not details:
    details = ["."]

  protein['details'] = details
  protein['category'] = category
  if 'CYTOPLASM' not in category and 'SECRETED' not in category:
    protein['loop_extent'] = exposed_loop_extent(protein)
  else:
    protein['loop_extent'] = "."

  return details, category
def post_process_protein(params, protein):
  """
  Decides the localization category of one protein from the
  annotations already stored in its dictionary.

  Reads from params: 'helix_programs', 'internal_exposed_loop_min',
  'bomp_clearly_cutoff', 'bomp_maybe_cutoff',
  'tmbhunt_clearly_cutoff' and 'tmbhunt_maybe_cutoff'.

  These keys are set on the protein dictionary:
    'details'  - list of strings naming the predictions that fired
                 (["."] when there are none)
    'category' - one of 'OM(barrel)', 'IM' (optionally followed by
                 '+peri' and/or '+cyto'), 'LIPOPROTEIN(IM)',
                 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
                 'CYTOPLASM'

  The protein is also passed to chop_nterminal_peptide when a
  SignalP or LipoP cleavage position is available.

  Returns the tuple (details, category).
  """

  def has_tm_helix(protein):
    for program in params['helix_programs']:
      if dict_get(protein, '%s_helices' % program):
        return True
    return False

  # these functions detect if TM-containing IM proteins
  # have large loops / terminal regions in the periplasm or cytoplasm
  # that may be accessible / inaccessible in spheroplast shaving
  # experiments.
  def has_long_loops(protein, loop_str='_outer_loops',
                     loop_length=params['internal_exposed_loop_min']):
    for annot in protein:
      if loop_str in annot:
        for loop in protein[annot]:
          if loop[1] - loop[0] >= loop_length:
            return True
    return False

  def long_in_periplasm(protein,
                        loop_length=params['internal_exposed_loop_min']):
    return has_long_loops(protein, '_outer_loops', loop_length)

  def long_in_cytoplasm(protein,
                        loop_length=params['internal_exposed_loop_min']):
    return has_long_loops(protein, '_inner_loops', loop_length)

  details = []
  category = "UNKNOWN"
  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_signalp = dict_get(protein, 'is_signalp')
  is_tatfind = dict_get(protein, 'is_tatfind')
  is_lipop = dict_get(protein, 'is_lipop')

  # in terms of most sublocalization logic, a Tat signal is similar to a
  # Sec (signalp) signal. We use has_signal_pept to denote that either
  # is present.
  has_signal_pept = False
  if is_signalp or is_tatfind or \
     (('hmmsearch' in protein) and "Tat_PS51318" in protein['hmmsearch']):
    has_signal_pept = True

  # annotate the barrels - high scoring bomp hits don't require a
  # signal peptide, low scoring ones do
  has_barrel = False
  bomp_score = dict_get(protein, 'bomp')
  if (bomp_score >= params['bomp_clearly_cutoff']) or \
     (has_signal_pept and bomp_score >= params['bomp_maybe_cutoff']):
    details += ['bomp(%i)' % (bomp_score)]
    has_barrel = True

  # same two-level cutoff scheme for the TMB-HUNT probability
  tmbhunt_prob = dict_get(protein, 'tmbhunt_prob')
  if (tmbhunt_prob >= params['tmbhunt_clearly_cutoff']) or \
     (has_signal_pept and tmbhunt_prob >= params['tmbhunt_maybe_cutoff']):
    details += ['tmbhunt(%.2f)' % (tmbhunt_prob)]
    has_barrel = True

  # a TMBETADISC-RBF hit only counts together with a signal peptide
  if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
    details += ['tmbetadisc-rbf']
    has_barrel = True

  if has_barrel:
    category = 'OM(barrel)'

  # set number of predicted OM barrel strands in details
  if has_barrel and \
      dict_get(protein, 'tmbeta_strands'):
    num_strands = len(protein['tmbeta_strands'])
    details += ['tmbeta_strands(%i)' % (num_strands)]

  if has_signal_pept and not is_lipop and \
    (dict_get(protein, 'signalp_cleave_position')):
    # we use the SignalP signal peptidase cleavage site for Tat signals
    chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

  if is_tatfind:
    details += ["tatfind"]

  if is_signalp:
    details += ["signalp"]

  if is_lipop:
    details += ["lipop"]
    chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]

  if has_tm_helix(protein) and not has_barrel:
    for program in params['helix_programs']:
      helices_key = '%s_helices' % program
      # has_tm_helix only guarantees that one program reported
      # helices; a program that left no annotation on this protein
      # is skipped instead of raising KeyError
      if helices_key in protein:
        details += ["%s(%d)" % (program, len(protein[helices_key]))]

    category = "IM"
    if long_in_periplasm(protein):
      category += "+peri"
    if long_in_cytoplasm(protein):
      category += "+cyto"
  elif not has_barrel:
    if is_lipop:
      if dict_get(protein, 'lipop_im_retention_signal'):
        category = "LIPOPROTEIN(IM)"
      else:
        category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
      category = "PERIPLASMIC/SECRETED"
    else:
      category = "CYTOPLASM"

  if not details:
    details = ["."]

  protein['details'] = details
  protein['category'] = category

  return details, category
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/"+
                 "~sachen/OMPpredict/"+
                 "TMBETADISC-RBF-Content.html", force=False):
  """
  Interfaces with the TMBETADISC-RBF web service at
  (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
  to predict if protein sequence is likely to be an outer membrane beta-barrel.

  Note that the default URL we use is different to the regular form used
  by web browsers, since we need to bypass some AJAX fun.

  The FASTA file named by params['fasta'] is uploaded, and
  params['tmbetadisc_rbf_method'] (one of aa, dp, aadp or pssm) selects
  the prediction method. The raw result page is written to
  'tmbetadisc-rbf.out'; unless force is set, an existing copy of that
  file is parsed instead of contacting the server.

  Returns the proteins structure produced by parse_tmbetadisc_output.
  Returns None, without doing anything, when 5000 or more proteins are
  given. Returns proteins unchanged when the server response contains
  no link to a result page. Exits the program when
  params['tmbetadisc_rbf_method'] is missing or not a known method.
  """
  # TODO: automatically split large sets into multiple jobs
  #       since TMBETADISC seems to not like more than
  #       ~5000 seqs at a time
  if len(proteins) >= 5000:
    log_stderr("# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
    return

  # set the user-agent so web services can block us if they want ... :/
  python_version = sys.version.split()[0]
  agent("Python-urllib/%s (twill; inmembrane)" % python_version)

  outfn = 'tmbetadisc-rbf.out'
  log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

  if not force and os.path.isfile(outfn):
    log_stderr("# -> skipped: %s already exists" % outfn)
    with open(outfn, 'r') as fh:
      cached_output = fh.read()
    return parse_tmbetadisc_output(cached_output, proteins)

  # resolve the user defined method before any request is made; an
  # unset value is rejected the same way as an unknown one, since the
  # form cannot be filled in without it (previously an unset value
  # surfaced later as a NameError)
  method_map = {"aa":"Amino Acid Composition",
                "dp":"Depipetide Composition",
                "aadp":"Amino Acid & Depipetide Composition",
                "pssm":"PSSM"}
  method = method_map.get(dict_get(params, 'tmbetadisc_rbf_method'))
  if method is None:
    log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
               "Must be set to aa, dp, aadp or pssm.")
    # non-zero status, so the failure is visible to the calling shell
    sys.exit(1)

  # dump extraneous output into this blackhole so we don't see it
  if not __DEBUG__: twill.set_output(StringIO.StringIO())

  go(url)
  if __DEBUG__: showforms()
  formfile("1", "userfile", params["fasta"])
  fv("1", "format", "file")
  fv("1", "select", method)

  submit()

  waiting_page = show()
  if __DEBUG__: log_stderr(waiting_page)

  # the waiting page carries a quoted link to the page with the results;
  # if several lines match, the last one wins
  result_url = None
  for l in waiting_page.split('\n'):
    if "TMBETADISC-RBF-action.php?UniqueName=" in l:
      result_url = l.split("'")[1]

  if result_url is None:
    log_stderr("# ERROR: TMBETADISC-RBF(web): no result link found "
               "in the server response.")
    return proteins

  time.sleep(5)

  go(result_url)

  output = show()
  if __DEBUG__: log_stderr(output)

  # write raw output to a file
  with open(outfn, 'w') as fh:
    fh.write(output)

  proteins = parse_tmbetadisc_output(output, proteins)

  return proteins
def annotate(params, proteins, \
                   url="http://psfs.cbrc.jp/tmbeta-net/", \
                   category='OM(barrel)',
                   force=False):
    """
    Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
    predict strands of outer membrane beta-barrels.

    By default, category='OM(barrel)' means prediction will only be run
    on proteins in the set with this category property. To process all
    proteins, change category to None (force=True has the same effect
    and additionally ignores a cached result file).

    These keys are added to the proteins dictionary:
      'tmbeta_strands' - a list of lists with paired start and end
                         residues of each predicted strand.
                         (eg [[3,9],[14,21], ..etc ])

    The parsed strands are cached as JSON in 'tmbeta_net.out'.

    Returns a dictionary {seqid: strands} for the proteins that were
    processed, or {} as soon as the server reports that another query
    is already running.
    """

    # set the user-agent so web services can block us if they want ... :/
    python_version = sys.version.split()[0]
    agent("Python-urllib/%s (twill; inmembrane)" % python_version)

    outfile = 'tmbeta_net.out'
    log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

    if not force and os.path.isfile(outfile):
        log_stderr("# -> skipped: %s already exists" % outfile)
        with open(outfile, 'r') as fh:
            tmbeta_strands = json.loads(fh.read())
        for seqid in tmbeta_strands:
            # a cache entry for a sequence that is not part of this
            # run is ignored rather than raising KeyError
            if seqid in proteins:
                proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]

        return tmbeta_strands

    # dump extraneous output into this blackhole so we don't see it
    if not __DEBUG__: twill.set_output(StringIO.StringIO())

    tmbeta_strands = {}
    for seqid in proteins:

        # only run on sequences which match the category filter
        wanted = force or \
            (category is None) or \
            (dict_get(proteins[seqid], 'category') == category)
        if not wanted:
            continue

        go(url)
        if __DEBUG__: showforms()
        fv("1", "sequence", proteins[seqid]['seq'])
        submit()
        log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n" \
                          % (seqid, proteins[seqid]['name']))
        out = show()
        # pause between submissions
        time.sleep(1)

        if ("Some query is already running. Please try again." in out):
            log_stderr("# TMBETA-NET(web) error: %s" % (out))
            return {}

        # parse the web page returned, extract strand boundaries
        strands = []
        for l in out.split('\n'):
            if __DEBUG__: log_stderr("## " + l)

            if "<BR>Segment " in l:
                # the text after the first ':' holds '<start> to <end>';
                # each number is preceded by one character that is
                # dropped before conversion
                i, j = l.split(":")[1].split("to")
                i = int(i.strip()[1:])
                j = int(j.strip()[1:])
                strands.append([i, j])

                if __DEBUG__:
                    log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

        proteins[seqid]['tmbeta_strands'] = strands
        tmbeta_strands[seqid] = strands

    # we store the parsed strand boundaries in JSON format
    with open(outfile, 'w') as fh:
        fh.write(json.dumps(tmbeta_strands, separators=(',', ':\n')))

    return tmbeta_strands
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane beta-barrel.

    The contents of the FASTA file named by params['fasta'] are posted
    to the TMBETADISC-RBF.php address above, and
    params['tmbetadisc_rbf_method'] (one of aa, dp, aadp or pssm) selects
    the prediction method. The raw result page is written to
    'tmbetadisc-rbf.out'; unless force is set, an existing copy of that
    file is parsed instead of contacting the server.

    The url parameter is kept so existing callers keep working, but it
    is not used by this implementation.

    Returns the proteins structure produced by parse_tmbetadisc_output.
    Returns None, without doing anything, when 5000 or more proteins are
    given. Returns proteins unchanged when the server response contains
    no link to a result page. Exits the program when
    params['tmbetadisc_rbf_method'] is missing or not a known method.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC seems to not like more than
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences.")
        return

    # TODO: Set a User-Agent header for requests, so web services can
    #       block us if they want

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        with open(outfn, 'r') as fh:
            cached_output = fh.read()
        return parse_tmbetadisc_output(cached_output, proteins)

    # resolve the user defined method; an unset value is rejected the
    # same way as an unknown one, since the request cannot be built
    # without it (previously an unset value surfaced as a NameError)
    method_map = {"aa": "Amino Acid Composition",
                  "dp": "Depipetide Composition",
                  "aadp": "Amino Acid & Depipetide Composition",
                  "pssm": "PSSM"}
    method = method_map.get(dict_get(params, 'tmbetadisc_rbf_method'))
    if method is None:
        log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
                   "Must be set to aa, dp, aadp or pssm.")
        # non-zero status, so the failure is visible to the calling shell
        sys.exit(1)

    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)

    # NOTE(review): .content is the raw response body; splitting it on a
    # str separator assumes it is a str here - confirm for the Python
    # version this runs under
    waiting_page = response.content
    if __DEBUG__: log_stderr(waiting_page)

    # the waiting page carries a quoted link to the page with the results;
    # if several lines match, the last one wins
    result_url = None
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    if result_url is None:
        log_stderr("# ERROR: TMBETADISC-RBF(web): no result link found "
                   "in the server response.")
        return proteins

    time.sleep(5)

    output = requests.get(result_url).content

    if __DEBUG__: log_stderr(output)

    # write raw output to a file
    with open(outfn, 'w') as fh:
        fh.write(output)

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins
Exemple #11
0
def annotate(params, proteins, \
                   url="http://psfs.cbrc.jp/tmbeta-net/", \
                   category='OM(barrel)',
                   force=False):
  """
  Uses the TMBETA-NET web service (http://psfs.cbrc.jp/tmbeta-net/) to
  predict strands of outer membrane beta-barrels.

  By default, category='OM(barrel)' means prediction will only be run
  on proteins in the set with this category property. To process all
  proteins, change category to None (force=True has the same effect
  and additionally ignores a cached result file).

  These keys are added to the proteins dictionary:
    'tmbeta_strands' - a list of lists with paired start and end
                       residues of each predicted strand.
                       (eg [[3,9],[14,21], ..etc ])

  The parsed strands are cached as JSON in 'tmbeta_net.out'.

  Returns a dictionary {seqid: strands} for the proteins that were
  processed, or {} as soon as the server reports that another query
  is already running.
  """

  # set the user-agent so web services can block us if they want ... :/
  python_version = sys.version.split()[0]
  agent("Python-urllib/%s (twill; inmembrane)" % python_version)

  outfile = 'tmbeta_net.out'
  log_stderr("# TMBETA-NET(web) %s > %s" % (params['fasta'], outfile))

  if not force and os.path.isfile(outfile):
    log_stderr("# -> skipped: %s already exists" % outfile)
    with open(outfile, 'r') as fh:
      tmbeta_strands = json.loads(fh.read())
    for seqid in tmbeta_strands:
      # a cache entry for a sequence that is not part of this
      # run is ignored rather than raising KeyError
      if seqid in proteins:
        proteins[seqid]['tmbeta_strands'] = tmbeta_strands[seqid]

    return tmbeta_strands

  # dump extraneous output into this blackhole so we don't see it
  if not __DEBUG__: twill.set_output(StringIO.StringIO())

  tmbeta_strands = {}
  for seqid in proteins:

    # only run on sequences which match the category filter
    wanted = force or \
        (category is None) or \
        (dict_get(proteins[seqid], 'category') == category)
    if not wanted:
      continue

    go(url)
    if __DEBUG__: showforms()
    fv("1","sequence",proteins[seqid]['seq'])
    submit()
    log_stderr("# TMBETA-NET: Predicting strands for %s - %s\n" \
                      % (seqid, proteins[seqid]['name']))
    out = show()
    # pause between submissions
    time.sleep(1)

    if ("Some query is already running. Please try again." in out):
      log_stderr("# TMBETA-NET(web) error: %s" % (out))
      return {}

    # parse the web page returned, extract strand boundaries
    strands = []
    for l in out.split('\n'):
      if __DEBUG__: log_stderr("## " + l)

      if "<BR>Segment " in l:
        # the text after the first ':' holds '<start> to <end>';
        # each number is preceded by one character that is
        # dropped before conversion
        i,j = l.split(":")[1].split("to")
        i = int(i.strip()[1:])
        j = int(j.strip()[1:])
        strands.append([i,j])

        if __DEBUG__: log_stderr("# TMBETA-NET(web) segments: %s, %s" % (i, j))

    proteins[seqid]['tmbeta_strands'] = strands
    tmbeta_strands[seqid] = strands

  # we store the parsed strand boundaries in JSON format
  with open(outfile, 'w') as fh:
    fh.write(json.dumps(tmbeta_strands, separators=(',',':\n')))

  return tmbeta_strands
Exemple #12
0
def post_process_protein(params, protein):
  """
  Decides the localization category of one protein from the
  annotations already stored in its dictionary.

  Reads from params: 'helix_programs', 'internal_exposed_loop_min',
  'bomp_clearly_cutoff' and 'bomp_maybe_cutoff'.

  These keys are set on the protein dictionary:
    'details'  - list of strings naming the predictions that fired
                 (["."] when there are none)
    'category' - one of 'OM(barrel)', 'IM' (optionally followed by
                 '+peri' and/or '+cyto'), 'LIPOPROTEIN(IM)',
                 'LIPOPROTEIN(OM)', 'PERIPLASMIC/SECRETED' or
                 'CYTOPLASM'

  The protein is also passed to chop_nterminal_peptide when a
  SignalP or LipoP cleavage position is available.

  Returns the tuple (details, category).
  """

  def has_tm_helix(protein):
    for program in params['helix_programs']:
      if dict_get(protein, '%s_helices' % program):
        return True
    return False

  # these functions detect if TM-containing IM proteins
  # have large loops / terminal regions in the periplasm or cytoplasm
  # that may be accessible / inaccessible in spheroplast shaving
  # experiments.
  def has_long_loops(protein, loop_str='_outer_loops',
                     loop_length=params['internal_exposed_loop_min']):
    for annot in protein:
      if loop_str in annot:
        for loop in protein[annot]:
          if loop[1] - loop[0] >= loop_length:
            return True
    return False

  def long_in_periplasm(protein,
                        loop_length=params['internal_exposed_loop_min']):
    return has_long_loops(protein, '_outer_loops', loop_length)

  def long_in_cytoplasm(protein,
                        loop_length=params['internal_exposed_loop_min']):
    return has_long_loops(protein, '_inner_loops', loop_length)

  details = []
  category = "UNKNOWN"
  is_hmm_profile_match = dict_get(protein, 'hmmsearch')
  is_signalp = dict_get(protein, 'is_signalp')
  is_tatfind = dict_get(protein, 'is_tatfind')
  is_lipop = dict_get(protein, 'is_lipop')

  # in terms of most sublocalization logic, a Tat signal is similar to a
  # Sec (signalp) signal. We use has_signal_pept to denote that either
  # is present.
  has_signal_pept = False
  if is_signalp or is_tatfind or \
     (('hmmsearch' in protein) and "Tat_PS51318" in protein['hmmsearch']):
    has_signal_pept = True

  # annotate the barrels - high scoring bomp hits don't require a
  # signal peptide, low scoring ones do
  has_barrel = False
  bomp_score = dict_get(protein, 'bomp')
  if (bomp_score >= params['bomp_clearly_cutoff']) or \
     (has_signal_pept and bomp_score >= params['bomp_maybe_cutoff']):
    details += ['bomp(%i)' % (bomp_score)]
    has_barrel = True

  # DEPRECATED: TMB-HUNT server is permanently offline, so its
  # predictions are no longer consulted here

  # a TMBETADISC-RBF hit only counts together with a signal peptide
  if has_signal_pept and dict_get(protein, 'is_tmbetadisc_rbf'):
    details += ['tmbetadisc-rbf']
    has_barrel = True

  if has_barrel:
    category = 'OM(barrel)'

  # set number of predicted OM barrel strands in details
  if has_barrel and \
      dict_get(protein, 'tmbeta_strands'):
    num_strands = len(protein['tmbeta_strands'])
    details += ['tmbeta_strands(%i)' % (num_strands)]

  if has_signal_pept and not is_lipop and \
    (dict_get(protein, 'signalp_cleave_position')):
    # we use the SignalP signal peptidase cleavage site for Tat signals
    chop_nterminal_peptide(protein, protein['signalp_cleave_position'])

  if is_tatfind:
    details += ["tatfind"]

  if is_signalp:
    details += ["signalp"]

  if is_lipop:
    details += ["lipop"]
    chop_nterminal_peptide(protein, protein['lipop_cleave_position'])

  if is_hmm_profile_match:
    details += ["hmm(%s)" % "|".join(protein['hmmsearch'])]

  if has_tm_helix(protein) and not has_barrel:
    for program in params['helix_programs']:
      helices_key = '%s_helices' % program
      # has_tm_helix only guarantees that one program reported
      # helices; a program that left no annotation on this protein
      # is skipped instead of raising KeyError
      if helices_key in protein:
        details += ["%s(%d)" % (program, len(protein[helices_key]))]

    category = "IM"
    if long_in_periplasm(protein):
      category += "+peri"
    if long_in_cytoplasm(protein):
      category += "+cyto"
  elif not has_barrel:
    if is_lipop:
      if dict_get(protein, 'lipop_im_retention_signal'):
        category = "LIPOPROTEIN(IM)"
      else:
        category = "LIPOPROTEIN(OM)"
    elif has_signal_pept:
      category = "PERIPLASMIC/SECRETED"
    else:
      category = "CYTOPLASM"

  if not details:
    details = ["."]

  protein['details'] = details
  protein['category'] = category

  return details, category
def annotate(params, proteins, \
             url="http://rbf.bioinfo.tw/" +
                 "~sachen/OMPpredict/" +
                 "TMBETADISC-RBF-Content.html", force=False):
    """
    Interfaces with the TMBETADISC-RBF web service at
    (http://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php)
    to predict if protein sequence is likely to be an outer membrane beta-barrel.

    The contents of the FASTA file named by params['fasta'] are posted
    to the TMBETADISC-RBF.php address above, and
    params['tmbetadisc_rbf_method'] (one of aa, dp, aadp or pssm) selects
    the prediction method. The raw result page is written to
    'tmbetadisc-rbf.out'; unless force is set, an existing copy of that
    file is parsed instead of contacting the server.

    The url parameter is kept so existing callers keep working, but it
    is not used by this implementation.

    Returns the proteins structure produced by parse_tmbetadisc_output.
    Returns None, without doing anything, when 5000 or more proteins are
    given. Returns proteins unchanged when the server response contains
    no link to a result page. Exits the program when
    params['tmbetadisc_rbf_method'] is missing or not a known method.
    """
    # TODO: automatically split large sets into multiple jobs
    #       since TMBETADISC seems to not like more than
    #       ~5000 seqs at a time
    if len(proteins) >= 5000:
        log_stderr(
            "# ERROR: TMBETADISC-RBF(web): tends to fail with > ~5000 sequences."
        )
        return

    # TODO: Set a User-Agent header for requests, so web services can
    #       block us if they want

    outfn = 'tmbetadisc-rbf.out'
    log_stderr("# TMBETADISC-RBF(web) %s > %s" % (params['fasta'], outfn))

    if not force and os.path.isfile(outfn):
        log_stderr("# -> skipped: %s already exists" % outfn)
        with open(outfn, 'r') as fh:
            cached_output = fh.read()
        return parse_tmbetadisc_output(cached_output, proteins)

    # resolve the user defined method; an unset value is rejected the
    # same way as an unknown one, since the request cannot be built
    # without it (previously an unset value surfaced as a NameError)
    method_map = {
        "aa": "Amino Acid Composition",
        "dp": "Depipetide Composition",
        "aadp": "Amino Acid & Depipetide Composition",
        "pssm": "PSSM"
    }
    method = method_map.get(dict_get(params, 'tmbetadisc_rbf_method'))
    if method is None:
        log_stderr("# ERROR: Invalid setting from tmbetadisc_rbf_method. "
                   "Must be set to aa, dp, aadp or pssm.")
        # non-zero status, so the failure is visible to the calling shell
        sys.exit(1)

    with open(params["fasta"], 'r') as ff:
        data = {'format': 'fasta', 'select': method, 'seq': ff.read()}
    response = requests.post(
        'https://rbf.bioinfo.tw/~sachen/OMPpredict/TMBETADISC-RBF.php',
        data=data)

    # NOTE(review): .content is the raw response body; splitting it on a
    # str separator assumes it is a str here - confirm for the Python
    # version this runs under
    waiting_page = response.content
    if __DEBUG__: log_stderr(waiting_page)

    # the waiting page carries a quoted link to the page with the results;
    # if several lines match, the last one wins
    result_url = None
    for l in waiting_page.split('\n'):
        if 'TMBETADISC-RBF-action.php?UniqueName=' in l:
            result_url = l.split("'")[1]

    if result_url is None:
        log_stderr("# ERROR: TMBETADISC-RBF(web): no result link found "
                   "in the server response.")
        return proteins

    time.sleep(5)

    output = requests.get(result_url).content

    if __DEBUG__: log_stderr(output)

    # write raw output to a file
    with open(outfn, 'w') as fh:
        fh.write(output)

    proteins = parse_tmbetadisc_output(output, proteins)

    return proteins