コード例 #1
0
ファイル: metadata.py プロジェクト: kmcluskey/FlyOmics_1
def get_uniprot_metadata_online(uniprot_ids):
    """Look up display names for UniProt accessions through the online service.

    The ids are de-duplicated, fetched in batches of ``BATCH_SIZE`` and, for
    every returned record, each accession found in the record is mapped to the
    record's recommended short name (falling back to the full name when no
    short name is present). Progress is printed after each batch.

    :param uniprot_ids: iterable of UniProt accession strings
    :return: dict mapping accession -> ``{'display_name': label}``
    """
    uniprot_ids = list(set(uniprot_ids))
    print('get_uniprot_metadata', len(uniprot_ids))

    BATCH_SIZE = 200
    uniprot = UniProt()
    uniprot_lookup = {}

    cumulative_total = 0
    for chunk in batch(uniprot_ids, BATCH_SIZE):
        batch_ids = list(chunk)
        cumulative_total += len(batch_ids)
        print(cumulative_total, '/', len(uniprot_ids))

        # NOTE(review): some bioservices versions appear to return a bare
        # record instead of a list when only one id is requested -- confirm
        # that a single-id batch still iterates record-by-record here.
        res = uniprot.retrieve(batch_ids)
        for r in res:
            for key in r['accession']:
                protein_id = key.contents[0]
                # A distinct loop name is used so the batch variable of the
                # outer loop is not shadowed.
                for name_node in r['recommendedname']:
                    tag = name_node.find('shortname')
                    if tag is None:
                        tag = name_node.find('fullname')
                    if tag is None:
                        # Neither name tag is present: skip instead of
                        # failing on ``None.contents``.
                        continue
                    uniprot_lookup[protein_id] = {'display_name': tag.contents[0]}

    return uniprot_lookup
コード例 #2
0
def get_protein_info(uniprot_ids):
    """
    Build a lookup of EMBL accession number and NCBI taxonomy id per protein.

    The flat-text UniProt record of every id is downloaded, decoded from UTF-8
    and searched with one regular expression for the EMBL cross-reference line
    and one for the ``NCBI_TaxID`` line. Ids for which a value is reported as
    missing are listed on stdout at the end.

    :param uniprot_ids: List of Uniprot IDs, e.g., ['P0AAJ3', 'A0NAQ1']
    :return: dictionary mapping each uid to ``[embl_accession, taxonomy_id]``
    """
    from bioservices import UniProt

    no_embl = []
    no_taxid = []
    info_by_uid = {}
    service = UniProt()

    # The service hands back bytes in the WSL CLI setup, hence the decode.
    # (Under PyCharm the original author used the result without decoding.)
    raw_records = service.retrieve(uniprot_ids, frmt='txt')
    records = [raw.decode("utf-8") for raw in raw_records]

    embl_regex = re.compile(r"DR\s+EMBL;.*?;\s+(.*?);")
    taxid_regex = re.compile(r"OX\s+NCBI_TaxID=(\d+)")

    for idx, text in enumerate(records):
        # get_match presumably appends the id to the "missing" list when the
        # pattern is not found -- verify against its definition.
        embl_acc = get_match(embl_regex, text, uniprot_ids, no_embl, idx)
        taxonomy_id = get_match(taxid_regex, text, uniprot_ids, no_taxid, idx)
        info_by_uid[uniprot_ids[idx]] = [embl_acc, taxonomy_id]

    if missing_report := no_embl:
        print('\n{} Protein(s) Missing EMBL Accession Number: '.format(len(missing_report)) + ', '.join(missing_report))

    if missing_report := no_taxid:
        print('\n{} Protein(s) Missing NCBI TaxID: '.format(len(missing_report)) + ', '.join(missing_report))

    return info_by_uid
コード例 #3
0
    def get_fasta(self, id_):
        """Download the FASTA entry for *id_* from UniProt and keep a copy on the instance.

        Deprecated: a notice pointing to ``load_fasta`` is printed on every call.

        :param str id_: a given uniprot identifier
        :returns: the FASTA contents as returned by ``UniProt.retrieve``

        """
        print("get_fasta is deprecated. Use load_fasta instead")
        from bioservices import UniProt
        fasta = UniProt(verbose=False).retrieve(id_, frmt="fasta")
        # Store a sliced copy in ``_fasta``; the caller gets the original object.
        self._fasta = fasta[:]
        return fasta
コード例 #4
0
ファイル: fasta.py プロジェクト: pjshort/bioservices
    def get_fasta(self, id_):
        """Download the FASTA entry for *id_* from UniProt and keep a copy on the instance.

        Deprecated: a notice pointing to ``load_fasta`` is printed on every call.

        :param str id_: a given uniprot identifier
        :returns: the FASTA contents as returned by ``UniProt.retrieve``

        """
        print("get_fasta is deprecated. Use load_fasta instead")
        from bioservices import UniProt
        fasta = UniProt(verbose=False).retrieve(id_, frmt="fasta")
        # Store a sliced copy in ``_fasta``; the caller gets the original object.
        self._fasta = fasta[:]
        return fasta
コード例 #5
0
def write_fasta_for_ids(uniprot_ids, output_file):
    """Fetch the FASTA entry of every id and write them all to one file.

    Each id is retrieved individually from UniProt; a progress line is printed
    after every 500 ids. Results that are integers are left out of the output
    (presumably these are error/status codes returned for failed lookups --
    confirm against the bioservices version in use). The remaining entries are
    concatenated in input order and written to ``output_file``.

    :param uniprot_ids: sized sequence of UniProt identifiers
    :param output_file: path of the file to (over)write
    :returns: nothing
    """
    u = UniProt(verbose=False)
    total = len(uniprot_ids)
    fasta_seqs = []
    for count, uni_id in enumerate(uniprot_ids, start=1):
        seq = u.retrieve(uni_id, 'fasta')
        # Integer results carry no sequence text and cannot be joined.
        if not isinstance(seq, int):
            fasta_seqs.append(seq)
        if count % 500 == 0:
            print("Retrieved sequence for {}/{} IDs".format(count, total))
    with open(output_file, 'w') as f:
        f.write(''.join(fasta_seqs))
コード例 #6
0
ファイル: fasta.py プロジェクト: pjshort/bioservices
    def load_fasta(self, id_):
        """Fetches FASTA from uniprot and loads into attribute :attr:`fasta`

        The lookup is best-effort: if retrieval fails, or the service returns
        an empty string, ``_fasta`` is left untouched and no error is raised.

        :param str id_: a given uniprot identifier
        :returns: nothing

        .. note:: same as :meth:`get_fasta` but returns nothing
        """
        from bioservices import UniProt
        u = UniProt(verbose=False)
        try:
            res = u.retrieve(id_, frmt="fasta")
            # Some entries in uniprot are valid but obsolete and return an
            # empty string; in that case there is nothing to store.
            if res != "":
                self._fasta = res[:]
        except Exception:
            # Deliberately ignore lookup/slicing failures, but let
            # KeyboardInterrupt and SystemExit propagate (a bare ``except``
            # would have swallowed those too).
            pass
コード例 #7
0
    def load_fasta(self, id_):
        """Fetches FASTA from uniprot and loads into attribute :attr:`fasta`

        The lookup is best-effort: if retrieval fails, or the service returns
        an empty string, ``_fasta`` is left untouched and no error is raised.

        :param str id_: a given uniprot identifier
        :returns: nothing

        .. note:: same as :meth:`get_fasta` but returns nothing
        """
        from bioservices import UniProt
        u = UniProt(verbose=False)
        try:
            res = u.retrieve(id_, frmt="fasta")
            # Some entries in uniprot are valid but obsolete and return an
            # empty string; in that case there is nothing to store.
            if res != "":
                self._fasta = res[:]
        except Exception:
            # Deliberately ignore lookup/slicing failures, but let
            # KeyboardInterrupt and SystemExit propagate (a bare ``except``
            # would have swallowed those too).
            pass
コード例 #8
0
def hitrate(proteins, indexes, subclass):
    """Compute a weighted location hit rate for a set of matched proteins.

    For every protein whose UniProt XML record yields at least one
    ``subcellularlocation`` element, the first matching peptide is compared
    against the start/end stored for that peptide in ``indexes``. Each such
    protein contributes an ``offset_weight`` (1 when both starts coincide,
    otherwise ``min(|length / (start difference)|, 1)``) to the total, and
    that weight multiplied by ``determine_locations(loc, subclass)`` to the
    match score. Proteins whose record cannot be retrieved or parsed are
    skipped.

    :param proteins: iterable of protein objects exposing ``ac`` and
        ``matching_peptide``, or ``None`` for "no proteins"
    :param indexes: mapping peptide -> ``(start, end)``
    :param subclass: value forwarded to ``determine_locations`` and stored in
        the ``dl_loc`` column
    :returns: tuple ``(rate, df)`` where ``rate`` is ``match / total`` (0 when
        nothing was scored) and ``df`` is a one-row DataFrame whose cells hold
        the collected lists
    """
    columns = ['subsequence', 'sprot_start', 'sprot_end', 'sprot_loc', 'dl_start', 'dl_end', 'dl_loc']
    u = UniProt()
    match, total = 0, 0
    dl_peptides, dl_starts, dl_ends, sprot_starts, sprot_ends, sprot_locs = [], [], [], [], [], []
    if proteins is not None:
        for prot in proteins:
            try:
                entry = u.retrieve(prot.ac, frmt='xml')
                locs = entry['subcellularlocation']
            except Exception:
                # Best-effort lookup: skip proteins whose record cannot be
                # fetched or indexed, without trapping KeyboardInterrupt.
                continue
            if not locs:
                continue

            pep_metadata = prot.matching_peptide[0]
            seq_range = pep_metadata.match_range[0]
            peptide = pep_metadata.peptide
            dl_peptides.append(peptide)
            start, end = indexes[peptide]
            dl_starts.append(start)
            dl_ends.append(end)

            pos = seq_range.start
            sprot_starts.append(pos)
            sprot_ends.append(seq_range.end)

            # Full weight when the starts agree; otherwise the weight shrinks
            # as the start offset grows relative to the matched length.
            seq_len = seq_range.end - pos
            offset_weight = 1 if pos == start else min(abs(seq_len / (pos - start)), 1)

            # Text of the second child of the first location element.
            loc = list(locs[0].children)[1].string
            sprot_locs.append(loc)
            match += determine_locations(loc, subclass) * offset_weight
            total += offset_weight

    # A separate local name avoids shadowing this function inside its own body.
    if total == 0:
        rate = 0
    else:
        rate = match / total
    vals = [[dl_peptides, sprot_starts, sprot_ends, sprot_locs, dl_starts, dl_ends, subclass]]
    df = pd.DataFrame(vals, columns=columns)
    return (rate, df)
コード例 #9
0
ファイル: metadata.py プロジェクト: kmcluskey/FlyOmics_1
def get_single_uniprot_metadata_online(uniprot_id):
    """Retrieve the UniProt record for one id, using the service's default format.

    :param uniprot_id: UniProt identifier to look up
    :return: whatever ``UniProt.retrieve`` returns for that id
    """
    return UniProt().retrieve(uniprot_id)
コード例 #10
0
    Entry_selection_list = []  # empty list created called Entry_selection_list
    a = 2  # the numerical value 2 is assigned to

    for y in (Number_selections
              ):  # iterates n times, whereby n = Number_selections
        a = a + 1  # for every iteration a increases by one
        Entry_selection_list.append(
            int(sys.argv[a])
        )  # sets up sys.arg[] for n proteins, whereby n = Number_selections

    for w in Entry_selection_list:  # iterates over members of list called Entry_selection_list
        entry, gene_names = res.split("\n")[w].split(
            "\t"
        )  # protein's entry number and gene names are saved in entry and gene_names, respectively
        seq = u.retrieve(str(entry),
                         frmt="fasta")  # gets the fasta sequence for protein
        print(seq)  # outputs value of seq for each iteration

    entry1 = Entry_selection(
        int(sys.argv[3])
    )  # selects the n entry, whereby n = value of sys.argv[3] in numerical form
    f = open("Clustalfastaseq.txt",
             "w+")  # opens the file called Clustalfastaseq.txt
    f.write(entry1)  # writes a fasta sequence that entry1 currently equals
    f.close()  # closes the file

    length_list = int(
        len(Entry_selection_list))  # length of Entry_selection_list list

    for i in Entry_selection_list[
            1:length_list]:  # iteraretes n-1 times, whereby n = length_list
コード例 #11
0
ファイル: NEXUS.py プロジェクト: tgdev24/Nexus
     unilines = d.split("\n")
     #print(unilines,  "bitchHHHHHHHHHHHHHHHHHHHHHH")
     if len(unilines) > 1:
         uni_entry = unilines[1].split("\t")[0]
         uni_entries.append(uni_entry)
     else:
         uni_entries.append("NULL")
     #print("\t", i[0], i[1])
     #print(uni_entries)
 for i in uni_entries:
     if (i == "NULL"):
         listPfam.append("NULL")
         listProsite.append("NULL")
     else:
         with open("uniprot_out.txt", "w") as outfile:
             p = u.retrieve(i, "txt")
             #newone = open("wow.txt", "w")
             #print(p)
             outfile.write(p)
             #newone.write(p)
         with open("uniprot_out.txt", "r") as infile:
             listuno = []
             list2 = []
             list3 = []
             for j in infile.readlines():
                 listuno.append(j.strip())
             #print(listuno)
             for j in listuno:
                 if "Pfam;" in j:
                     list2.append(j)
                     #print(list2, "lISTT2222222222222222222")