def fetch_uniprot_data(accessions):
    """
    Fetch UniProt protein descriptions, gene names, sequences, etc.

    All information is stored in UNIPROT_DATA and can be accessed with
    :func:`.get_uniprot_data`.

    Parameters
    ----------
    accessions : list of str

    Returns
    -------
    dict
        Maps each newly requested accession to its metadata dict
        (empty dict when UniProt returned nothing for it).
    """
    # Only fetch accessions we have not already cached in UNIPROT_DATA.
    accessions = set(accessions).difference(UNIPROT_DATA)
    if not accessions:
        return {}
    cache_dir = tempfile.mkdtemp(suffix="uniprot")
    try:
        UNIPROT_DATA.update(
            uniprot.get_metadata_with_some_seqid_conversions(
                accessions, cache_dir=cache_dir,
            ))
    finally:
        # Always remove the scratch directory, even when the fetch
        # raises (the original leaked it on error); ignore_errors so
        # cleanup problems never mask the real exception.
        shutil.rmtree(cache_dir, ignore_errors=True)
    return {i: UNIPROT_DATA.get(i, {}) for i in accessions}
def fill_protein_sequences(proteins):
    """
    Populate each protein's ``sequence`` attribute from UniProt.

    Parameters
    ----------
    proteins : list
        Objects carrying an ``id`` attribute; their ``sequence``
        attribute is set in place.
    """
    print('Filling protein sequences:')
    for protein in proteins:
        # 1. Re-send the request until the server returns a non-empty
        #    response.
        # NOTE(review): unbounded retry loop — a persistently failing
        # server (or an id the service never resolves) spins forever;
        # consider a retry cap / backoff.
        server_response = None
        while not server_response:
            server_response = uniprot.get_metadata_with_some_seqid_conversions(
                [protein.id])
        # 2. Store the fetched sequence on the protein.
        protein.sequence = server_response[protein.id]['sequence']
    print('Filling protein sequences: done')
def load_sequences_from_uniprot(proteins, clean_seqid=None, cache_basename=None):
    """
    Fetch UniProt metadata for every seqid found in ``proteins`` and
    load the resulting FASTA sequences into them.

    Parameters
    ----------
    proteins : dict
        Maps seqid -> protein entry; entries may list alternate ids
        under ``['attr']['other_seqids']``.
    clean_seqid : callable, optional
        When given, applied to the seqids first via
        change_seqids_in_proteins.
    cache_basename : str, optional
        Basename for the metadata cache; when set, the fetched fastas
        are also written to ``<cache_basename>.fasta``.
    """
    if clean_seqid:
        change_seqids_in_proteins(proteins, clean_seqid)
    # Collect every known identifier: the primary key plus any
    # alternates recorded on the entry.
    seqids = []
    for seqid, protein in proteins.items():
        seqids.append(seqid)
        seqids.extend(protein['attr'].get('other_seqids', []))
    uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
        seqids, cache_basename)
    load_fastas_into_proteins(proteins, uniprot_data)
    if cache_basename:
        uniprot.write_fasta(
            cache_basename + '.fasta', uniprot_data, uniprot_data.keys())
def add_uniprot_data(protxml_groups, cache_file=None):
    """
    Process the data from a PROTXML file: read the seqids, attempt to
    map each to a UniProt ID, then fetch that protein's metadata from
    the uniprot.org website (organism, gene, description etc.) and
    annotate the protein entries in place.

    Parameters
    ----------
    protxml_groups : dict
        Maps group_id -> protxml group dict holding a 'proteins' list.
    cache_file : str, optional
        On-disk cache handed to the uniprot fetcher.
    """
    seqids = get_all_seqids(protxml_groups)
    # Best-effort: when uniprot.org is unreachable, proceed with empty
    # metadata instead of failing.
    if is_url_connected('http://uniprot.org'):
        uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(
            seqids, cache_file)
    else:
        print "Can't connect to www.uniprot.org, won't use uniprot metatdata"
        uniprot_dict = {}
    for group_id, protxml_group in protxml_groups.items():
        for protein in protxml_group['proteins']:
            # Defaults in case no UniProt entry is found below.
            protein['id'] = ''
            protein['acc'] = protein['protein_name']
            # Pick whichever of the known names is best represented in
            # the UniProt results.
            names = [protein['protein_name']] + protein['other_seqids']
            new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
            if new_seqid != protein['protein_name']:
                print "Protein group %s%s is better represented with %s than %s" % \
                    (group_id,
                     protein['group_sibling_id'],
                     uniprot.get_naked_seqid(new_seqid),
                     uniprot.get_naked_seqid(protein['protein_name']))
                protein['protein_name'] = new_seqid
                protein['other_seqids'] = names[1:]
                protein['acc'] = new_seqid
            # Flatten the alternate seqids for tabular output.
            protein['other_seqids'] = ';'.join(protein['other_seqids'])
            if new_seqid not in uniprot_dict:
                print "No uniprot metadata for protein group %s%s seqid %s" % \
                    (group_id,
                     protein['group_sibling_id'],
                     uniprot.get_naked_seqid(new_seqid))
                continue
            protein['link'] = ''
            uniprot_entry = uniprot_dict[new_seqid]
            protein['id'] = uniprot_entry['id']
            protein['acc'] = uniprot_entry['accs'][0]
            # Spreadsheet-style hyperlink formula to the UniProt page.
            protein['link'] = \
                '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
                uniprot_dict[new_seqid]['id']
            if 'gene' in uniprot_entry:
                protein['gene'] = uniprot_entry['gene']
            if 'organism' in uniprot_entry:
                protein['organism'] = uniprot_entry['organism']
            protein['description'] = '; '.join(uniprot_entry['descriptions'])
            if 'length' in uniprot_entry:
                protein['length'] = uniprot_entry['length']
def add_uniprot_data(protxml_groups, cache_file=None):
    """
    Process the data from a PROTXML file: read the seqids, attempt to
    map each to a UniProt ID, then fetch that protein's metadata from
    the uniprot.org website (organism, gene, description etc.) and
    annotate the protein entries in place.

    NOTE(review): this is a byte-for-byte duplicate of an earlier
    ``add_uniprot_data`` definition in this file; at import time this
    later definition shadows the earlier one — confirm whether the
    duplication is intentional.

    Parameters
    ----------
    protxml_groups : dict
        Maps group_id -> protxml group dict holding a 'proteins' list.
    cache_file : str, optional
        On-disk cache handed to the uniprot fetcher.
    """
    seqids = get_all_seqids(protxml_groups)
    # Best-effort: when uniprot.org is unreachable, proceed with empty
    # metadata instead of failing.
    if is_url_connected('http://uniprot.org'):
        uniprot_dict = uniprot.get_metadata_with_some_seqid_conversions(
            seqids, cache_file)
    else:
        print "Can't connect to www.uniprot.org, won't use uniprot metatdata"
        uniprot_dict = {}
    for group_id, protxml_group in protxml_groups.items():
        for protein in protxml_group['proteins']:
            # Defaults in case no UniProt entry is found below.
            protein['id'] = ''
            protein['acc'] = protein['protein_name']
            # Pick whichever of the known names is best represented in
            # the UniProt results.
            names = [protein['protein_name']] + protein['other_seqids']
            new_seqid = uniprot.sort_seqids_by_uniprot(names, uniprot_dict)[0]
            if new_seqid != protein['protein_name']:
                print "Protein group %s%s is better represented with %s than %s" % \
                    (group_id,
                     protein['group_sibling_id'],
                     uniprot.get_naked_seqid(new_seqid),
                     uniprot.get_naked_seqid(protein['protein_name']))
                protein['protein_name'] = new_seqid
                protein['other_seqids'] = names[1:]
                protein['acc'] = new_seqid
            # Flatten the alternate seqids for tabular output.
            protein['other_seqids'] = ';'.join(protein['other_seqids'])
            if new_seqid not in uniprot_dict:
                print "No uniprot metadata for protein group %s%s seqid %s" % \
                    (group_id,
                     protein['group_sibling_id'],
                     uniprot.get_naked_seqid(new_seqid))
                continue
            protein['link'] = ''
            uniprot_entry = uniprot_dict[new_seqid]
            protein['id'] = uniprot_entry['id']
            protein['acc'] = uniprot_entry['accs'][0]
            # Spreadsheet-style hyperlink formula to the UniProt page.
            protein['link'] = \
                '=HYPERLINK("http://uniprot.org/uniprot/%s")' % \
                uniprot_dict[new_seqid]['id']
            if 'gene' in uniprot_entry:
                protein['gene'] = uniprot_entry['gene']
            if 'organism' in uniprot_entry:
                protein['organism'] = uniprot_entry['organism']
            protein['description'] = '; '.join(uniprot_entry['descriptions'])
            if 'length' in uniprot_entry:
                protein['length'] = uniprot_entry['length']
# Map RefSeq protein accessions to UniProt accessions.
# NOTE(review): `seqids` is defined earlier in the file (not shown here)
# — confirm against the full script.
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 2 - get UniProt metadata
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 3 - parse for isoforms in metadata
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 4 - chaining commands to map seqids
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - chaining commands to map seqids
# NOTE(review): duplicated "Example 4" label; this fetch repeats the one
# above with identical seqids and cache — confirm whether intentional.
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
# Compare the fetched Q91ZU6-8 sequence against a reference FASTA.
fasta_db = "test-isoform/Q91ZU6-8.fasta"
read_seqids, fastas = uniprot.read_fasta(fasta_db)
test_sequence = list(fastas.values())[0]['sequence']
print(test_sequence == uniprot_data['Q91ZU6-8']['sequence'])

# Example 5 - check isoforms
seqids = ["Q91ZU6-{}".format(i) for i in [1, 2, 3, 4, 5, 6, 8]]
txt = open('test-isoform/Q91ZU6.txt').read()
results = uniprot.parse_uniprot_metadata_with_seqids(seqids, txt)
pprint.pprint(pairs, indent=2) # Example 2 - get UniProt metadata uniprot_seqids = [j for i,j in pairs] uniprot_data = uniprot.batch_uniprot_metadata( uniprot_seqids, 'cache') pprint.pprint(uniprot_data, indent=2) # Example 3 - parse for isoforms in metadata text = open('cache/metadata.0.txt').read() uniprot_data = uniprot.parse_isoforms(text) pprint.pprint(uniprot_data) # Example 4 - chaining commands to map seqids seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split() uniprot_data = uniprot.get_metadata_with_some_seqid_conversions( seqids, 'cache2') pprint.pprint(uniprot_data, indent=2) # Example 4 - chaining commands to map seqids seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split() uniprot_data = uniprot.get_metadata_with_some_seqid_conversions( seqids, 'cache2') fasta_db = "test-isoform/Q91ZU6-8.fasta" read_seqids, fastas = uniprot.read_fasta(fasta_db) test_sequence = list(fastas.values())[0]['sequence'] print(test_sequence == uniprot_data['Q91ZU6-8']['sequence']) # Example 5 - check isoforms seqids = ["Q91ZU6-{}".format(i) for i in [1, 2, 3, 4, 5, 6, 8]] txt = open('test-isoform/Q91ZU6.txt').read() results = uniprot.parse_uniprot_metadata_with_seqids(seqids, txt)