Example #1
def UIDs2JSON(base_uri, uids, fout):
  """Uses the uniprot library from Bosco Ho (https://github.com/boscoh/uniprot)."""
  import json
  import uniprot  ## Bosco Ho (https://github.com/boscoh/uniprot)
  uniprot_data = uniprot.batch_uniprot_metadata(uids, None)
  for uid in uniprot_data:
    # copy the key list so entries can be deleted while iterating
    for key in list(uniprot_data[uid].keys()):
      if key in ('accs', 'sequence', 'go', 'description'):  # drop bulky fields to keep the JSON simple
        del uniprot_data[uid][key]
  json_txt = json.dumps(uniprot_data, sort_keys=True, indent=2)
  fout.write(json_txt + '\n')
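# A minimal usage sketch (not from the original source): the accessions and
# output filename below are placeholders, and base_uri is accepted but unused
# by UIDs2JSON.
with open('uniprot_meta.json', 'w') as fout:
    UIDs2JSON(None, ['P04637', 'P00533'], fout)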
def get_uniprot_loc(seq_id):
    import string

    import pandas as pd
    import uniprot

    # input should be a string in any case
    if not isinstance(seq_id, str):
        return None

    # a bare accession contains no spaces; anything with spaces is
    # treated as comment text that was passed in directly
    elif ' ' not in seq_id:
        # get all protein info for this single ID
        uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
        prot_content = uniprot_data.get(seq_id)
        if prot_content is None:
            return None
        prot_content = str(prot_content.get('comment'))

    else:
        prot_content = seq_id

    # looking for the subcellular location annotation
    loc_header = 'SUBCELLULAR LOCATION: '
    loc_idx = prot_content.find(loc_header)

    # if the location is present
    if loc_idx > -1:

        # isolating location information (not including the header),
        # so the left side now begins with the location
        prot_content = prot_content[loc_idx + len(loc_header):]

        # making the string a bit shorter by cutting at the next
        # exclamation mark, if there is one
        next_exclam = prot_content.find('!')
        if next_exclam > -1:
            prot_content = prot_content[:next_exclam]

        # finding where each punctuation mark first occurs in what's remaining
        punct_df = pd.DataFrame(list(string.punctuation), columns=['mark'])
        punct_df['pos'] = punct_df.mark.apply(lambda x: prot_content.find(x))

        # ignoring punctuation marks that do not occur
        punct_df = punct_df[punct_df.pos > -1]

        # no punctuation at all: return whatever text is left
        if punct_df.empty:
            return prot_content.strip()

        # finding the mark that occurs first
        punct_df = punct_df.sort_values('pos').reset_index(drop=True)
        first_punct = punct_df.pos.iloc[0]

        # getting the location; removing leading and trailing spaces
        location = prot_content[:first_punct].strip()

        return location

    else:
        return None
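# Usage sketch (hypothetical accession; the string returned depends on the
# protein's current UniProt annotation, e.g. 'Cytoplasm'):
location = get_uniprot_loc('P04637')
print(location)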
Example #3
def insert_best_seqid_column(params):
    """Reads a CSV, picks the best UniProt ID for each row and writes an
    augmented CSV; get_headers/read_csv/write_csv/logging/open_file are
    helpers from the surrounding module."""
    csv = params['csv']
    uniprot_ids_header = params['uniprot_ids_header']
    delimiter = params['delimiter']
    out_csv = params['output_csv']
    uniprot_cache = params['cache_dir']

    if not csv:
        raise IOError('No file selected')

    if not os.path.isfile(csv):
        raise IOError(csv + ' not found')

    headers = get_headers(csv)

    if uniprot_ids_header not in headers:
        s = "Column header '%s' not found, available headers:\n" % uniprot_ids_header
        for header in headers:
            s += '   ' + header + '\n'
        raise IOError(s)

    logging('Reading %s\n' % csv)
    entries = read_csv(csv)

    all_seqids = []
    for entry in entries:
        tokens = entry[uniprot_ids_header].split(delimiter)
        entry['seqids'] = [s.strip() for s in tokens]
        all_seqids.extend(entry['seqids'])

    logging('Found %d potential Uniprot IDs\n' % len(all_seqids))

    uniprot_data = uniprot.batch_uniprot_metadata(
        all_seqids, uniprot_cache)

    for entry in entries:
        best_seqid = uniprot.sort_seqids_by_uniprot(
            entry['seqids'], uniprot_data)[0]
        entry['best_seqid'] = best_seqid
        entry['is_reviewed'] = False
        if best_seqid in uniprot_data:
            entry['is_reviewed'] = \
                uniprot_data[best_seqid]['is_reviewed']

    logging('Writing ')
    logging('%s\n' % os.path.abspath(out_csv), lambda: open_file(out_csv))
    headers = ['best_seqid', 'is_reviewed'] + get_headers(csv)
    rows = [headers]
    for entry in entries:
        rows.append([entry[h] for h in headers])
    write_csv(out_csv, rows)
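# Hypothetical parameter block for insert_best_seqid_column; the file names
# are placeholders, and the helper functions it calls (get_headers, read_csv,
# write_csv, logging, open_file) come from the surrounding module.
params = {
    'csv': 'proteins.csv',
    'uniprot_ids_header': 'uniprot_ids',
    'delimiter': ';',
    'output_csv': 'proteins_best_seqid.csv',
    'cache_dir': 'cache',
}
insert_best_seqid_column(params)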
Example #4
def get_prot_seq(seq_id):

    print(seq_id)

    # retry a few times in case UniProt severs the connection;
    # re-raise if the last attempt also fails
    uniprot_data = None
    for attempt in range(4):
        try:
            uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
            break
        except Exception:
            if attempt == 3:
                raise

    prot_content = uniprot_data.get(seq_id)

    if prot_content is None:
        return None

    sequence = str(prot_content.get('sequence'))
    return sequence
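# Usage sketch (hypothetical accession):
seq = get_prot_seq('P04637')
if seq is not None:
    print('%d residues' % len(seq))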
def main():
    file_rawdata = "data/MS_ISG15_Raw.tsv"
    MS_dict = read_MS_rawdata(file_rawdata)
    all_uniprot_ids = sorted(MS_dict.keys())
    processed_ids = [
        f.rsplit('_')[1].rsplit('.')[0] for f in glob.glob('Fasta/*.fa')
    ]
    uniprot_ids = sorted(set(all_uniprot_ids).difference(processed_ids))
    print("All IDs: %i" % len(all_uniprot_ids))
    print("Processed IDs: %i" % len(processed_ids))
    print("Remaining IDs: %i" % len(uniprot_ids))
    # process only the first 150 remaining IDs, caching results in 'cache'
    uniprot_data = uniprot.batch_uniprot_metadata(uniprot_ids[0:150], 'cache')
    parse_structure(uniprot_data)
    parse_ortholog(uniprot_ids)
def get_uniprot_str(seq_id, just_gene=False):

    # get all protein info
    uniprot_data = uniprot.batch_uniprot_metadata([seq_id])
    prot_content = uniprot_data.get(seq_id)

    if prot_content is None:
        return None

    if not just_gene:
        prot_content = json.dumps(prot_content.get('comment'))
    else:
        # [1:-1] strips the surrounding double quotes from the JSON string
        prot_content = json.dumps(prot_content.get('gene'))[1:-1]
    # stripping out all '\n' carefully
    prot_content = strip_slantn(prot_content)

    return prot_content
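# Usage sketch (hypothetical accession; strip_slantn is assumed to be defined
# in the surrounding module):
comment_text = get_uniprot_str('P04637')
gene_name = get_uniprot_str('P04637', just_gene=True)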
Example #8
def parse_uniprot_PDB(filename, MS_dict, dict_asa, outfile):
    uniprot_id = filename.rsplit('_')[1].rsplit('.')[0]
    uniprot_data = uniprot.batch_uniprot_metadata([uniprot_id], 'cache')
    uniprot_seq = uniprot_data[uniprot_id]['sequence']
    positions = list(set(MS_dict[uniprot_id]['Position']))
    PDBs = [line.rstrip() for line in open(filename, 'r').readlines()]
    for PDB in PDBs:
        file_PDB_zip = 'PDB/' + PDB + '.pdb.gz'
        file_PDB = 'PDB/' + PDB + '.pdb'
        file_dssp = 'dssp/' + PDB + '.dssp'
        fetchPDB(PDB)
        if len(glob.glob(PDB + '.pdb.gz')) == 0: continue
        os.system('mv ' + PDB + '.pdb.gz PDB/')
        os.system('gunzip -f ' + file_PDB_zip)
        os.system('/usr/local/Cellar/dssp/2.1.0/bin/mkdssp ' + file_PDB +
                  ' > ' + file_dssp)
        dict_dssp = reading_dssp(file_dssp, dict_asa)
        atoms = parsePDB(file_PDB)
        get_RSA_all_lys(atoms, dict_dssp, uniprot_id, PDB, outfile)
        chain_ids = sorted(list(set(atoms.getChids())))
        for chain_id in chain_ids:
            chain_atoms = atoms.select('chain ' + chain_id)
            top_aln = align_uniprot_PDB(uniprot_seq, atoms, chain_id)
            if top_aln == 'NA':
                print("No alignment")
                continue
            uniprot_seq_aln, pdb_seq_aln, score, begin, end = top_aln
            for position in positions:
                if position < int(begin): continue
                if position > int(end): continue
                pdb_resi, pdb_aa = identify_position(uniprot_seq_aln,
                                                     pdb_seq_aln, position,
                                                     uniprot_id)
                if pdb_resi == 'NA': continue
                pdb_resi = sorted(list(set(
                    chain_atoms.getResnums())))[pdb_resi - 1]
                residueID = chain_id + '-' + str(pdb_resi)
                if residueID not in dict_dssp: continue
                RSA = dict_dssp[residueID]['RSA']
                outfile.write("\t".join(
                    map(str, [
                        'ISG15', uniprot_id, position, PDB, chain_id, pdb_resi,
                        RSA
                    ])) + "\n")
Example #9
def protein_seq_update_celery_nofunction(full_batch=False):
    if full_batch:
        proteins = Protein.objects.all()
    else:
        # only proteins whose sequence is still empty
        proteins = Protein.objects.extra(where=["CHAR_LENGTH(sequence) = 0"])
    uniprot_data = uniprot.batch_uniprot_metadata([b.prot_id for b in proteins])
    for key in uniprot_data:
        defaults = {}
        # 'sequence' and 'description' may be missing from the metadata
        if 'sequence' in uniprot_data[key]:
            defaults['sequence'] = uniprot_data[key]['sequence']
        if 'description' in uniprot_data[key]:
            defaults['description'] = uniprot_data[key]['description']
        Protein.objects.update_or_create(prot_id=key, defaults=defaults)
    return 'Protein sequences updated'
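# Usage sketch: fill in sequences only where they are missing (the default),
# or refresh every Protein row with full_batch=True.
protein_seq_update_celery_nofunction(full_batch=False)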
Example #10
def retrieve_uniprot_meta(uniprotIDs_file):
    with open(uniprotIDs_file, 'r') as f:
        id_list = [uni_id.strip() for uni_id in f]

    meta_dict = uniprot.batch_uniprot_metadata(id_list)
    return meta_dict
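# Usage sketch (hypothetical file with one UniProt accession per line):
meta = retrieve_uniprot_meta('uniprot_ids.txt')
print('%d records retrieved' % len(meta))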
Example #11
import os
import pprint

import uniprot

# Clean up caches
os.system('rm cache*')

# Example 1 - reading a fasta file
seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)

# Example 2 - map identifiers from RefSeq to UniProt
seqids = "NP_000508.1  NP_001018081.3".split()
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 3 - get UniProt metadata
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - parse for isoforms in metadata
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 5 - chaining commands to map seqids
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
pprint.pprint(uniprot_data, indent=2)

Example #12
import pprint

import uniprot

# Example 3 - sequential identifier mapping to UniProt
# identifiers using a robust but slow method

seqids = """
EFG_MYCA1 YP_885981.1 CpC231_1796
""".split()

mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "cache.json")

uniprot_seqids = list(mapping.values())


# Example 4 - get UniProt metadata

uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, "cache2.txt")

pprint.pprint(uniprot_data, indent=2)

for line in open("cache2.txt"):
    print(line.strip())

uniprot.write_fasta("example.output.fasta", uniprot_data, uniprot_seqids)


# Example 5 - chaining commands to make your own
# special mapper


def map_to_refseq(seqids):
    uniprot_mapping = uniprot.sequentially_convert_to_uniprot_id(seqids, "func.cache.json")
Example #13
# Load hand-curated CycD putative target list
# (assumes 'entries' already holds a previously loaded hit list)
filename = '/data/cycd_targets/cycd_target_uniprot.txt'
targetIDs = pd.read_csv(filename)
already_seen = pd.concat((targetIDs['Entry'], entries))

# Load hit list from PSSM
filename = '/data/cycd_targets/hsap_proteome/hsap_hits>20.csv'
targetIDs = pd.read_csv(filename, sep='\t')
entries = targetIDs['Entry']

# Take the set difference to find hits not already in the hand-curated list
merged = sorted(set(entries) - set(already_seen))

# Fetch and write as FASTA
out_name = '/data/cycd_targets/hsap_hits>20.fasta'
upData = uniprot.batch_uniprot_metadata(merged, 'cache')
uniprot.write_fasta(out_name, upData, merged)

split_fastas(out_name)

PSIPRED_DIR = '/data/cycd_targets/cycd_target_uniprot_individuals'
seqs = []

for filename in os.listdir(PSIPRED_DIR):
    if filename.endswith('.ss2'):
        print('Working on', filename)

        # Load PSIPRED VFORMAT in a sane way to extract only the relevant info
        df = pd.read_csv(os.path.join(PSIPRED_DIR, filename),
                         header=0,
                         delim_whitespace=True,