# Example #1
# 0
def insert_best_seqid_column(params):
    """
    Reads a CSV of Uniprot IDs, picks the best seqid for every row, and
    writes a copy of the CSV with 'best_seqid' and 'is_reviewed' columns
    prepended.

    params is a dict with keys: 'csv' (input file), 'uniprot_ids_header'
    (column holding delimited IDs), 'delimiter', 'output_csv', 'cache_dir'.
    Raises IOError when no input is given, the input file is missing, or
    the named column is absent.
    """
    in_csv = params['csv']
    id_header = params['uniprot_ids_header']
    delim = params['delimiter']
    out_csv = params['output_csv']
    cache_dir = params['cache_dir']

    if not in_csv:
        raise IOError('No file selected')
    if not os.path.isfile(in_csv):
        raise IOError(in_csv + ' not found')

    headers = get_headers(in_csv)
    if id_header not in headers:
        msg = "Column header '%s' not found, available headers:\n" % id_header
        msg += ''.join('   ' + h + '\n' for h in headers)
        raise IOError(msg)

    logging('Reading %s\n' % in_csv)
    entries = read_csv(in_csv)

    # Split the ID column of every row and pool the IDs for one batch fetch.
    all_seqids = []
    for entry in entries:
        seqids = [token.strip() for token in entry[id_header].split(delim)]
        entry['seqids'] = seqids
        all_seqids.extend(seqids)

    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))

    uniprot_data = uniprot.batch_uniprot_metadata(all_seqids, cache_dir)

    for entry in entries:
        ranked = uniprot.sort_seqids_by_uniprot(entry['seqids'], uniprot_data)
        best = ranked[0]
        entry['best_seqid'] = best
        if best in uniprot_data:
            entry['is_reviewed'] = uniprot_data[best]['is_reviewed']
        else:
            entry['is_reviewed'] = False

    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    out_headers = ['best_seqid', 'is_reviewed'] + get_headers(in_csv)
    rows = [out_headers] + [[entry[h] for h in out_headers] for entry in entries]
    write_csv(out_csv, rows)
def add_uniprot_data(protxml_groups, cache_file=None):
    """
    Annotates every protein parsed from a PROTXML file with Uniprot
    metadata fetched from uniprot.org (organism, gene, description, etc.),
    mutating the protein dicts in place.

    protxml_groups: dict of group_id -> group dict with a 'proteins' list;
        each protein dict needs 'protein_name', 'other_seqids' (list) and
        'group_sibling_id'.
    cache_file: optional cache passed through to the uniprot fetcher.
    """
    seqids = get_all_seqids(protxml_groups)
    if is_url_connected('http://uniprot.org'):
        uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(
            seqids, cache_file)
    else:
        # Offline: continue with empty metadata so annotation still runs.
        print("Can't connect to www.uniprot.org, won't use uniprot metatdata")
        uniprot_dict = {}
    for group_id, protxml_group in protxml_groups.items():
        for protein in protxml_group['proteins']:
            # Defaults, so every protein carries these keys even when no
            # Uniprot metadata is found.
            # BUGFIX: 'link' was previously assigned only after the
            # metadata-existence check below, so proteins skipped by the
            # `continue` ended up without a 'link' key at all.
            protein['id'] = ''
            protein['link'] = ''
            protein['acc'] = protein['protein_name']
            names = [protein['protein_name']] + protein['other_seqids']
            # Prefer whichever seqid Uniprot knows best.
            new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
            if new_seqid != protein['protein_name']:
                print("Protein group %s%s is better represented with %s than %s" %
                      (group_id,
                       protein['group_sibling_id'],
                       uniprot.get_naked_seqid(new_seqid),
                       uniprot.get_naked_seqid(protein['protein_name'])))
                protein['protein_name'] = new_seqid
                protein['other_seqids'] = names[1:]
                protein['acc'] = new_seqid
            # Flatten the remaining seqids into a single display string.
            protein['other_seqids'] = ';'.join(protein['other_seqids'])
            if new_seqid not in uniprot_dict:
                print("No uniprot metadata for protein group %s%s seqid %s" %
                      (group_id,
                       protein['group_sibling_id'],
                       uniprot.get_naked_seqid(new_seqid)))
                continue
            uniprot_entry = uniprot_dict[new_seqid]
            protein['id'] = uniprot_entry['id']
            protein['acc'] = uniprot_entry['accs'][0]
            # Spreadsheet-style hyperlink formula to the Uniprot page.
            protein['link'] = \
                '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
                    uniprot_entry['id']
            if 'gene' in uniprot_entry:
                protein['gene'] = uniprot_entry['gene']
            if 'organism' in uniprot_entry:
                protein['organism'] = uniprot_entry['organism']
            protein['description'] = '; '.join(uniprot_entry['descriptions'])
            if 'length' in uniprot_entry:
                protein['length'] = uniprot_entry['length']
# Example #3
# 0
def add_uniprot_data(protxml_groups, cache_file=None):
  """
  Annotates the proteins parsed from a PROTXML file in place: reads the
  seqids of each protein, attempts to map each to a Uniprot ID, and then
  fetches that protein's metadata from the uniprot.org website (organism,
  gene, description, length, etc.).

  protxml_groups: dict of group_id -> group dict with a 'proteins' list;
  each protein dict is expected to have 'protein_name', 'other_seqids'
  (a list) and 'group_sibling_id'.
  cache_file: optional cache location handed to the uniprot fetcher.
  """
  seqids = get_all_seqids(protxml_groups)
  if is_url_connected('http://uniprot.org'):
    uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(seqids, cache_file)
  else:
    # Offline fallback: keep going with an empty metadata dict.
    print "Can't connect to www.uniprot.org, won't use uniprot metatdata"
    uniprot_dict = {}
  for group_id, protxml_group in protxml_groups.items():
    for protein in protxml_group['proteins']:
      # Defaults before any metadata lookup.
      protein['id'] = ''
      protein['acc'] = protein['protein_name']
      # Candidate seqids: the primary name first, then the alternates.
      names = [protein['protein_name']] + protein['other_seqids']
      # Pick the seqid that Uniprot represents best.
      new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
      if new_seqid != protein['protein_name']:
        # Promote the better-represented seqid to be the primary name.
        print "Protein group %s%s is better represented with %s than %s" % \
            (group_id, 
             protein['group_sibling_id'],
             uniprot.get_naked_seqid(new_seqid), 
             uniprot.get_naked_seqid(protein['protein_name']))
        protein['protein_name'] = new_seqid
        protein['other_seqids'] = names[1:]
        protein['acc'] = new_seqid
      # Flatten the alternate seqids into a single ';'-joined string.
      protein['other_seqids'] = ';'.join(protein['other_seqids'])
      if new_seqid not in uniprot_dict:
        # No metadata for this seqid; leave the defaults set above.
        # NOTE(review): 'link' is never assigned on this path — confirm
        # downstream consumers tolerate the missing key.
        print "No uniprot metadata for protein group %s%s seqid %s" % \
            (group_id, 
             protein['group_sibling_id'],
             uniprot.get_naked_seqid(new_seqid))
        continue
      protein['link'] = ''
      uniprot_entry = uniprot_dict[new_seqid]
      protein['id'] = uniprot_entry['id']
      protein['acc'] = uniprot_entry['accs'][0]
      # Spreadsheet-style hyperlink formula to the Uniprot entry page.
      protein['link'] = \
          '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
              uniprot_dict[new_seqid]['id']
      # Optional fields: only present when Uniprot supplies them.
      if 'gene' in uniprot_entry:
        protein['gene'] = uniprot_entry['gene']
      if 'organism' in uniprot_entry:
        protein['organism'] = uniprot_entry['organism']
      protein['description'] = '; '.join(uniprot_entry['descriptions'])
      if 'length' in uniprot_entry:
        protein['length'] = uniprot_entry['length']
# Example #4
# 0
def insert_best_seqid_column(params):
    """
    Reads a CSV, determines the best Uniprot seqid for each row, and writes
    a new CSV with 'best_seqid' and 'is_reviewed' columns prepended.

    params keys:
        'csv': path of the input CSV
        'uniprot_ids_header': column containing delimited Uniprot IDs
        'delimiter': delimiter between IDs inside that column
        'output_csv': path of the CSV to write
        'cache_dir': cache directory for the Uniprot batch fetch

    Raises IOError when no input is selected, the input file is missing,
    or the ID column header is not present.
    """
    csv = params['csv']
    uniprot_ids_header = params['uniprot_ids_header']
    delimiter = params['delimiter']
    out_csv = params['output_csv']
    uniprot_cache = params['cache_dir']

    if not csv:
        raise IOError('No file selected')

    if not os.path.isfile(csv):
        raise IOError(csv + ' not found')

    headers = get_headers(csv)

    if uniprot_ids_header not in headers:
        s = "Column header '%s' not found, available headers:\n" % uniprot_ids_header
        for header in headers:
            s += '   ' + header + '\n'
        raise IOError(s)

    logging('Reading %s\n' % csv)
    entries = read_csv(csv)

    # Split the ID column of every row and pool all IDs for one batch fetch.
    all_seqids = []
    for entry in entries:
        tokens = entry[uniprot_ids_header].split(delimiter)
        # renamed loop variable: the original reused 's', shadowing the
        # error-message buffer name above
        entry['seqids'] = [token.strip() for token in tokens]
        all_seqids.extend(entry['seqids'])

    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))

    uniprot_data = uniprot.batch_uniprot_metadata(all_seqids, uniprot_cache)

    for entry in entries:
        best_seqid = uniprot.sort_seqids_by_uniprot(entry['seqids'],
                                                    uniprot_data)[0]
        entry['best_seqid'] = best_seqid
        entry['is_reviewed'] = False
        if best_seqid in uniprot_data:
            entry['is_reviewed'] = \
                uniprot_data[best_seqid]['is_reviewed']

    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    # FIX: reuse the headers read above instead of calling get_headers(csv)
    # a second time — avoids a redundant re-read of the input file (and an
    # inconsistency if the file changed in the meantime).
    headers = ['best_seqid', 'is_reviewed'] + headers
    rows = [headers]
    for entry in entries:
        rows.append([entry[h] for h in headers])
    write_csv(out_csv, rows)