Example #1
0
def fetch_uniprot_data(accessions):
    """
    Fetch UniProt protein descriptions, gene names, sequences, etc.

    All information is stored in UNIPROT_DATA and can be accessed with
    :func:`.get_uniprot_data`.

    Only accessions that are not already keys of UNIPROT_DATA are requested.

    Parameters
    ----------
    accessions : list of str

    Returns
    -------
    dict
        Maps each newly requested accession to its UNIPROT_DATA entry, or to
        an empty dict when the lookup stored nothing under that accession.
        Accessions that were already cached are not included; the result is
        empty when nothing had to be requested.
    """
    accessions = set(accessions).difference(UNIPROT_DATA)

    if not accessions:
        return {}

    cache_dir = tempfile.mkdtemp(suffix="uniprot")

    # mkdtemp leaves cleanup to the caller, so remove the directory even when
    # the lookup or the cache update raises.
    try:
        UNIPROT_DATA.update(
            uniprot.get_metadata_with_some_seqid_conversions(
                accessions,
                cache_dir=cache_dir,
            ))
    finally:
        shutil.rmtree(cache_dir)

    return {i: UNIPROT_DATA.get(i, {}) for i in accessions}
Example #2
0
def fill_protein_sequences(proteins):
    """
    Look up every protein on UniProt and store its sequence on the object.

    For each item, ``item.id`` is sent to
    ``uniprot.get_metadata_with_some_seqid_conversions`` and the
    ``'sequence'`` field of the entry returned under that id is assigned to
    ``item.sequence``. Progress messages are printed before and after.

    Parameters
    ----------
    proteins : iterable
        Objects exposing a readable ``id`` and a writable ``sequence``
        attribute. They are modified in place.

    Raises
    ------
    KeyError
        If a non-empty response has no entry for the protein id, or the
        entry has no ``'sequence'`` field.
    """
    print('Filling protein sequences:')

    for protein in proteins:
        # 1. repeat the request until the response is non-empty
        # NOTE(review): there is no retry limit or back-off here, so an id
        # that never yields a response keeps this loop spinning forever.
        server_response = None
        while not server_response:
            server_response = uniprot.get_metadata_with_some_seqid_conversions([protein.id])

        # 2. store sequence in protein
        protein.sequence = server_response[protein.id]['sequence']
    print('Filling protein sequences: done')
Example #3
0
def load_sequences_from_uniprot(proteins, clean_seqid=None, cache_basename=None):
  """
  Look up UniProt metadata for all seqids in `proteins` and load it in.

  The lookup covers every key of `proteins` plus any ids listed under that
  entry's ['attr']['other_seqids']. The result is handed to
  load_fastas_into_proteins, and, when `cache_basename` is given, also
  written to '<cache_basename>.fasta'.

  If `clean_seqid` is truthy it is first passed, together with `proteins`,
  to change_seqids_in_proteins.
  """
  if clean_seqid:
    change_seqids_in_proteins(proteins, clean_seqid)

  # Collect each primary seqid followed directly by its alternatives.
  query_seqids = []
  for primary_seqid in proteins:
    query_seqids.append(primary_seqid)
    attributes = proteins[primary_seqid]['attr']
    if 'other_seqids' in attributes:
      query_seqids.extend(attributes['other_seqids'])

  uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
      query_seqids, cache_basename)
  load_fastas_into_proteins(proteins, uniprot_data)

  if cache_basename:
    fasta_fname = cache_basename+'.fasta'
    uniprot.write_fasta(fasta_fname, uniprot_data, uniprot_data.keys())
def add_uniprot_data(protxml_groups, cache_file=None):
    """
    Annotate the proteins of parsed PROTXML groups with UniProt metadata.

    Reads the seqids of all groups, attempts to map them to UniProt IDs and
    fetches the metadata of those proteins (organism, gene, description,
    etc.) through uniprot.get_metadata_with_some_seqid_conversions. The
    lookup is skipped, and no metadata is used, when uniprot.org cannot be
    reached.

    Every protein dict is updated in place:
      - 'id' and 'acc' are always set ('acc' defaults to the protein name)
      - 'protein_name' is replaced when another seqid ranks first
      - 'other_seqids' is converted from a list to a ';'-joined string
      - 'link', 'description', and where available 'gene', 'organism' and
        'length', are set only when metadata was found for the seqid

    Parameters:
      protxml_groups: dict of group id -> group dict holding a 'proteins'
        list; each protein dict needs 'protein_name', 'other_seqids' (list)
        and 'group_sibling_id'.
      cache_file: passed through unchanged as the second argument of the
        metadata lookup.
    """
    seqids = get_all_seqids(protxml_groups)
    if is_url_connected('http://uniprot.org'):
        uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(
            seqids, cache_file)
    else:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("Can't connect to www.uniprot.org, won't use uniprot metatdata")
        uniprot_dict = {}
    for group_id, protxml_group in protxml_groups.items():
        for protein in protxml_group['proteins']:
            protein['id'] = ''
            protein['acc'] = protein['protein_name']
            names = [protein['protein_name']] + protein['other_seqids']
            new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
            if new_seqid != protein['protein_name']:
                print("Protein group %s%s is better represented with %s than %s" %
                      (group_id,
                       protein['group_sibling_id'],
                       uniprot.get_naked_seqid(new_seqid),
                       uniprot.get_naked_seqid(protein['protein_name'])))
                protein['protein_name'] = new_seqid
                # NOTE(review): names[1:] only excludes new_seqid if
                # sort_seqids_by_uniprot reordered `names` in place - confirm.
                protein['other_seqids'] = names[1:]
                protein['acc'] = new_seqid
            protein['other_seqids'] = ';'.join(protein['other_seqids'])
            if new_seqid not in uniprot_dict:
                print("No uniprot metadata for protein group %s%s seqid %s" %
                      (group_id,
                       protein['group_sibling_id'],
                       uniprot.get_naked_seqid(new_seqid)))
                continue
            protein['link'] = ''
            uniprot_entry = uniprot_dict[new_seqid]
            protein['id'] = uniprot_entry['id']
            protein['acc'] = uniprot_entry['accs'][0]
            # Spreadsheet-style formula pointing at the UniProt entry page.
            protein['link'] = \
                '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
                    uniprot_dict[new_seqid]['id']
            if 'gene' in uniprot_entry:
                protein['gene'] = uniprot_entry['gene']
            if 'organism' in uniprot_entry:
                protein['organism'] = uniprot_entry['organism']
            protein['description'] = '; '.join(uniprot_entry['descriptions'])
            if 'length' in uniprot_entry:
                protein['length'] = uniprot_entry['length']
Example #5
0
def add_uniprot_data(protxml_groups, cache_file=None):
  """
  Processes the data from a PROTXML file, reads the
  seqids, and attempts to map them to a UNIPROT ID
  and then fetches the metadata of that protein from
  the uniprot.org website: organism, gene, description
  etc.

  The protein dicts inside `protxml_groups` are modified in place.
  'id' and 'acc' are always set, 'protein_name' is replaced when a
  different seqid ranks first, and 'other_seqids' is turned from a
  list into a ';'-joined string. 'link', 'description' and, where
  present in the metadata, 'gene', 'organism' and 'length' are only
  set for proteins whose seqid has a UniProt entry.

  `cache_file` is handed through unchanged as the second argument of
  uniprot.get_metadata_with_some_seqid_conversions. When uniprot.org
  cannot be reached the lookup is skipped and no metadata is used.
  """
  seqids = get_all_seqids(protxml_groups)
  if is_url_connected('http://uniprot.org'):
    uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(seqids, cache_file)
  else:
    # print(...) with one argument behaves the same on Python 2 and 3
    print("Can't connect to www.uniprot.org, won't use uniprot metatdata")
    uniprot_dict = {}
  for group_id, protxml_group in protxml_groups.items():
    for protein in protxml_group['proteins']:
      protein['id'] = ''
      protein['acc'] = protein['protein_name']
      names = [protein['protein_name']] + protein['other_seqids']
      new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
      if new_seqid != protein['protein_name']:
        print("Protein group %s%s is better represented with %s than %s" %
              (group_id,
               protein['group_sibling_id'],
               uniprot.get_naked_seqid(new_seqid),
               uniprot.get_naked_seqid(protein['protein_name'])))
        protein['protein_name'] = new_seqid
        # NOTE(review): dropping names[0] only removes new_seqid if the
        # sort above reordered `names` in place - verify in uniprot module.
        protein['other_seqids'] = names[1:]
        protein['acc'] = new_seqid
      protein['other_seqids'] = ';'.join(protein['other_seqids'])
      if new_seqid not in uniprot_dict:
        print("No uniprot metadata for protein group %s%s seqid %s" %
              (group_id,
               protein['group_sibling_id'],
               uniprot.get_naked_seqid(new_seqid)))
        continue
      protein['link'] = ''
      uniprot_entry = uniprot_dict[new_seqid]
      protein['id'] = uniprot_entry['id']
      protein['acc'] = uniprot_entry['accs'][0]
      # spreadsheet-style formula linking to the UniProt entry page
      protein['link'] = \
          '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
              uniprot_dict[new_seqid]['id']
      if 'gene' in uniprot_entry:
        protein['gene'] = uniprot_entry['gene']
      if 'organism' in uniprot_entry:
        protein['organism'] = uniprot_entry['organism']
      protein['description'] = '; '.join(uniprot_entry['descriptions'])
      if 'length' in uniprot_entry:
        protein['length'] = uniprot_entry['length']
Example #6
0
# Example 1 - map seqids to pairs of ids
# (`seqids` must be defined before this point; presumably RefSeq protein
# accessions, given the 'P_REFSEQ_AC' -> 'ACC' mapping - confirm)
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 2 - get UniProt metadata
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 3 - parse for isoforms in metadata
# Read through a context manager so the file handle is closed promptly.
with open('cache/metadata.0.txt') as metadata_file:
    text = metadata_file.read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 4 - chaining commands to map seqids
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
pprint.pprint(uniprot_data, indent=2)

# Example 4b - same lookup, then compare the isoform sequence with a FASTA file
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
fasta_db = "test-isoform/Q91ZU6-8.fasta"
read_seqids, fastas = uniprot.read_fasta(fasta_db)
# Only the first sequence of the FASTA file is compared.
test_sequence = list(fastas.values())[0]['sequence']
print(test_sequence == uniprot_data['Q91ZU6-8']['sequence'])

# Example 5 - check isoforms
seqids = ["Q91ZU6-{}".format(i) for i in [1, 2, 3, 4, 5, 6, 8]]
with open('test-isoform/Q91ZU6.txt') as isoform_file:
    txt = isoform_file.read()
results = uniprot.parse_uniprot_metadata_with_seqids(seqids, txt)
Example #7
0
# `pairs` must be defined before this point; each item unpacks into two ids.
pprint.pprint(pairs, indent=2)

# Example 2 - get UniProt metadata
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(
    uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 3 - parse for isoforms in metadata
# A `with` block closes the file as soon as its text has been read.
with open('cache/metadata.0.txt') as metadata_file:
    text = metadata_file.read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 4 - chaining commands to map seqids
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
pprint.pprint(uniprot_data, indent=2)

# Example 4b - repeat the lookup and check one isoform against a FASTA file
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
fasta_db = "test-isoform/Q91ZU6-8.fasta"
read_seqids, fastas = uniprot.read_fasta(fasta_db)
# The comparison uses the first sequence found in the FASTA file.
test_sequence = list(fastas.values())[0]['sequence']
print(test_sequence == uniprot_data['Q91ZU6-8']['sequence'])

# Example 5 - check isoforms
seqids = ["Q91ZU6-{}".format(i) for i in [1, 2, 3, 4, 5, 6, 8]]
with open('test-isoform/Q91ZU6.txt') as isoform_file:
    txt = isoform_file.read()
results = uniprot.parse_uniprot_metadata_with_seqids(seqids, txt)