def load_sequences_from_uniprot(proteins, clean_seqid=None, cache_basename=None):
    """Look up the seqids of `proteins` through `uniprot` and load the results.

    Args:
        proteins: mapping keyed by seqid. Each value must expose an
            ``'attr'`` mapping, which may hold an ``'other_seqids'``
            iterable of additional ids to include in the lookup.
        clean_seqid: optional; when truthy it is forwarded to
            ``change_seqids_in_proteins`` before any ids are collected
            (presumably a seqid-rewriting callable -- confirm against
            that helper).
        cache_basename: optional; passed through to the metadata lookup
            and, when truthy, also used as the stem of the FASTA file
            written at the end (``cache_basename + '.fasta'``).

    Returns:
        None. The fetched metadata is handed to
        ``load_fastas_into_proteins`` together with `proteins`
        (presumably updating `proteins` in place -- confirm).
    """
    if clean_seqid:
        change_seqids_in_proteins(proteins, clean_seqid)

    # Every primary id is queried, immediately followed by any
    # alternative ids recorded for that entry.
    query_ids = []
    for primary_id in proteins:
        query_ids.append(primary_id)
        attr = proteins[primary_id]['attr']
        if 'other_seqids' in attr:
            query_ids.extend(attr['other_seqids'])

    uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
        query_ids, cache_basename)
    load_fastas_into_proteins(proteins, uniprot_data)

    if not cache_basename:
        return

    # Keep a FASTA copy of everything that was fetched next to the cache.
    fasta_path = cache_basename + '.fasta'
    uniprot.write_fasta(fasta_path, uniprot_data, uniprot_data.keys())
filename = '/data/cycd_targets/cycd_target_uniprot.txt' targetIDs = pd.read_csv(filename) already_seen = pd.concat((targetIDs['Entry'], entries)) # Load hit list from PSSM filename = '/data/cycd_targets/hsap_proteome/hsap_hits>20.csv' targetIDs = pd.read_csv(filename, sep='\t') entries = targetIDs['Entry'] # Do a merge to see what's not already seen in the hand-curated list merged = set(entries) - set(already_seen) # Fetch and write as FASTA out_name = '/data/cycd_targets/hsap_hits>20.fasta' upData = uniprot.batch_uniprot_metadata(merged, 'cache') uniprot.write_fasta(out_name, upData, merged) split_fastas(out_name) PSIPRED_DIR = '/data/cycd_targets/cycd_target_uniprot_individuals' seqs = [] for filename in os.listdir(PSIPRED_DIR): if filename.endswith('.ss2'): print 'Working on ', filename #Load PSIPRED VFORMAT in a sane way to extract only relevant info df = pd.read_csv(os.path.join(PSIPRED_DIR, filename), header=0, delim_whitespace=True, skiprows=0,
mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "cache.json") uniprot_seqids = mapping.values() # Example 4 - get UniProt metadata uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, "cache2.txt") pprint.pprint(uniprot_data, indent=2) for l in open("cache2.txt"): print l.strip() uniprot.write_fasta("example.output.fasta", uniprot_data, uniprot_seqids) # Example 5 - chaining commands to make your own # special mapper def map_to_refseq(seqids): uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "func.cache.json") uniprot_ids = uniprot_mapping.values() pairs = uniprot.batch_uniprot_id_mapping_pairs("ACC", "P_REFSEQ_AC", uniprot_ids) mapping = {} for seqid in seqids: if seqid in uniprot_mapping: uniprot_id = uniprot_mapping[seqid] for pair in pairs:
seqids, 'cache.json') uniprot_seqids = mapping.values() # Example 4 - get UniProt metadata uniprot_data = uniprot.batch_uniprot_metadata( uniprot_seqids, 'cache2.txt') pprint.pprint(uniprot_data, indent=2) for l in open('cache2.txt'): print l.strip() uniprot.write_fasta('example.output.fasta', uniprot_data, uniprot_seqids) # Example 5 - chaining commands to make your own # special mapper def map_to_refseq(seqids): uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id( seqids, 'func.cache.json') uniprot_ids = uniprot_mapping.values() pairs = uniprot.batch_uniprot_id_mapping_pairs( 'ACC', 'P_REFSEQ_AC', uniprot_ids) mapping = {} for seqid in seqids: if seqid in uniprot_mapping: