예제 #1
0
def get_id_from_bioservice(entity):
    """Look up reviewed UniProt entries matching ``entity``.

    Runs one tab-format search limited to five rows and collects, for every
    returned row, its first field (requested as ``id``) and its last field
    (requested as ``comment(FUNCTION)``).

    Args:
        entity: Query text; ``'+reviewed:yes'`` is appended to it.

    Returns:
        ``(ids, functions)`` -- two parallel lists, both empty when the
        search returns nothing.  ``None`` is returned instead when the
        search result compares equal to 400, which this code treats as an
        invalid request.
    """
    from bioservices import UniProt

    u = UniProt(cache=True)

    # Database API query.  Only the reviewed search is issued: the former
    # second (unreviewed) search was never read, so it only cost a request.
    res_reviewed = u.search(entity + '+reviewed:yes',
                            frmt="tab",
                            columns="id, entry name, genes, comment(FUNCTION)",
                            limit=5)

    if res_reviewed == 400:
        print('请求无效\n')
        return None

    ids = []
    functions = []
    if res_reviewed:  # the service returned some text
        # Drop the header row and the trailing '' left after the last newline.
        for line in res_reviewed.split('\n')[1:-1]:
            fields = line.split('\t')
            ids.append(fields[0])
            functions.append(fields[-1])
    return ids, functions
예제 #2
0
def retrieve_label_from_uniparc(ID):
    """Derive a lineage label for ``ID`` through two UniProt searches.

    The first search (database ``uniparc``) supplies the first entry of the
    ``Organisms`` column; that name is then searched in the ``taxonomy``
    database and the label is taken from its ``Lineage`` column: the second
    ``"; "``-separated element, or the only element when there is just one.

    Both responses are expected to consist of exactly one header line and
    one value line once the final character has been dropped; anything else
    makes the two-way unpacking raise ``ValueError``.
    """
    service = UniProt()

    uniparc_reply = service.search(ID, database="uniparc", limit=1)
    header_row, value_row = uniparc_reply[:-1].split("\n")
    organism_col = header_row.split("\t").index("Organisms")
    organisms = value_row.split("\t")[organism_col]
    first_organism = organisms.split("; ")[0]

    taxonomy_reply = service.search(first_organism,
                                    database="taxonomy",
                                    limit=1)
    header_row, value_row = taxonomy_reply[:-1].split("\n")
    lineage_col = header_row.split("\t").index("Lineage")
    lineage = value_row.split("\t")[lineage_col].split("; ")
    return lineage[:2][-1]
예제 #3
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Map KEGG gene ids to gene names by way of UniProt accessions.

    All ids are sent in one ``KEGG_ID`` -> ``ACC`` mapping request.  Each
    accession mapped to an id is then searched individually; the gene name
    is recorded only when the returned entry id equals the accession and
    its review field is ``'reviewed'``.  A later accession for the same id
    overwrites an earlier one.  Ids absent from the mapping are counted and
    printed.

    Returns:
        dict of KEGG id -> gene name.
    """
    # One tab-separated string carries every id to the mapping service.
    query_text = '\t'.join(unknown_genes)
    kegg_to_gene_name = {}
    unmapped = set()
    uniprot = UniProt(verbose=True)
    # This is where it gets tricky: KEGG sometimes links to a different
    # UniProt entry than the one UniProt links back from, so each hit is
    # checked against the accession that was asked for.
    accessions_by_kegg = dict(
        uniprot.mapping("KEGG_ID", "ACC", query=query_text))

    for kegg_id in unknown_genes:
        if kegg_id not in accessions_by_kegg:
            unmapped.add(kegg_id)
            continue
        for accession in accessions_by_kegg[kegg_id]:
            reply = uniprot.search("accession:{}".format(accession),
                                   columns='genes(PREFERRED),reviewed,id',
                                   limit=1)
            # Exactly two lines are expected: a header and one data row.
            _, row = reply.rstrip('\n').split('\n')
            name, review, entry = row.split('\t')
            if accession != entry:
                print(kegg_id, accession, entry, reply, "dont match")
            elif review == 'reviewed':
                kegg_to_gene_name[kegg_id] = name

    print("{} mappings not found from kegg to"
          " gene name".format(len(unmapped)))
    print(unmapped)
    return kegg_to_gene_name
예제 #4
0
def search_uniprot(genes, columns):
    """Search UniProt for ``genes`` in batches and return one DataFrame.

    The genes are queried ten at a time (joined with ``+OR+``) in tab
    format.  The header line of the first non-empty response is kept; the
    header line of every later response is dropped and all of its data rows
    are appended.

    Args:
        genes: Gene identifiers, split into batches by ``chunks``.
        columns: UniProt column names requested after ``entry name``.

    Returns:
        pandas.DataFrame parsed from the combined tab-separated text.
    """
    # io.StringIO replaces pandas.compat.StringIO, which current pandas
    # releases no longer provide.
    from io import StringIO

    start_time = datetime.datetime.now()

    with yaspin(text="Performing UniProt search...", color="cyan") as sp:
        uniprot = UniProt(verbose=False)
        column_spec = f"entry name, {','.join(columns)}"
        table_lines = []

        for chunk in chunks(genes, 10):
            gene_search = "+OR+".join(list(chunk))
            new_data = uniprot.search(gene_search,
                                      frmt="tab",
                                      columns=column_spec)

            batch_lines = [line for line in new_data.split("\n") if line]
            if not batch_lines:
                # Nothing came back for this batch; in particular do not
                # treat it as the batch that supplied the header.
                continue
            if table_lines:
                # A header is already stored: keep every data row of this
                # batch, not just the first one.
                batch_lines = batch_lines[1:]
            table_lines.extend(batch_lines)

        data = pandas.read_csv(StringIO("\n".join(table_lines)), sep="\t")

        time_diff = (datetime.datetime.now() - start_time).total_seconds()

        sp.text = f"Performing UniProt Search => Task done in {time_diff} seconds."
        sp.ok("✔")

        return data
예제 #5
0
def main():
    """Fill empty ``Swissprot_Id`` values in the local SQLite table.

    Reads up to 1000 ``SP_PRIMARY`` values whose ``Swissprot_Id`` is the
    empty string from ``PDB_Chain_Uniprot.db``, searches UniProt for each
    and stores the second field of every returned row as ``Swissprot_Id``.
    Each update is committed immediately, so finished rows are not lost if
    a later lookup fails.

    Raises:
        ValueError: if a returned row does not have exactly seven
            tab-separated fields.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    # Interface to the UniProt service
    u = UniProt(verbose=False)
    con = sqlite3.connect("PDB_Chain_Uniprot.db")
    try:
        cur = con.cursor()
        cur.execute("SELECT SP_PRIMARY FROM PDB_Chain_Uniprot WHERE Swissprot_Id = '' LIMIT 1000")
        for row in cur.fetchall():
            sp_primary = str(row[0])
            res = u.search(sp_primary, limit=1)
            if res == "":
                continue
            # First line is the header; the last element is the '' left
            # after the trailing newline.
            for line in res.split("\n")[1:-1]:
                if line == "":
                    continue
                # Seven fields are required; only the second one is stored.
                (res_id, res_entry_name, res_status, res_protein_names,
                 res_gene_names, res_organism, length) = line.split("\t")
                cur.execute("UPDATE PDB_Chain_Uniprot SET Swissprot_Id=?  WHERE SP_PRIMARY = ?",
                            (str(res_entry_name), sp_primary))
                con.commit()
    finally:
        # Release the database even when a search or an update raises.
        con.close()
예제 #6
0
def kegg_to_symbol_through_uniprot(unknown_genes):
    """Translate KEGG gene ids into gene names using UniProt.

    A single ``KEGG_ID`` -> ``ACC`` mapping call covers all ids.  Every
    accession found for an id is searched on its own, and the name is kept
    only if the search returns that same accession as entry id with a
    review field of ``'reviewed'``; the last such accession wins.  Ids the
    mapping does not contain are collected, counted and printed.

    Returns:
        dict of KEGG id -> gene name.
    """
    # The mapping service takes all ids as one tab-joined string.
    joined_ids = '\t'.join(unknown_genes)
    kegg_to_gene_name = dict()
    not_found = set()
    uniprot = UniProt(verbose=True)
    # KEGG and UniProt do not always point at each other, so the entry id
    # that comes back is compared with the accession that was queried.
    mapping = dict(uniprot.mapping("KEGG_ID", "ACC", query=joined_ids))

    for gene in unknown_genes:
        if gene in mapping:
            for acc in mapping[gene]:
                response = uniprot.search(
                    "accession:{}".format(acc),
                    columns='genes(PREFERRED),reviewed,id',
                    limit=1)
                # Header line plus exactly one data line is expected.
                header, data = response.rstrip('\n').split('\n')
                name, review, entry = data.split('\t')
                if acc != entry:
                    print(gene, acc, entry, response, "dont match")
                    continue
                if review == 'reviewed':
                    kegg_to_gene_name[gene] = name
        else:
            not_found.add(gene)

    print("{} mappings not found from kegg to"
          " gene name".format(len(not_found)))
    print(not_found)
    return kegg_to_gene_name
예제 #7
0
파일: pfam.py 프로젝트: xapple/seqsearch
 def uniprot_acc_to_taxonmy(self, accesion):
     """Return the taxon names found for one UniProt accession.

     The accession is searched in XML format and the text of every
     ``taxon`` element is joined with ``', '`` and wrapped as ``' (...)'``.
     """
     from bioservices import UniProt
     xml_text = UniProt().search(accesion, frmt="xml")
     from bs4 import BeautifulSoup
     parsed = BeautifulSoup(xml_text, "html.parser")
     taxon_names = [node.text for node in parsed.find_all('taxon')]
     return ' (' + ', '.join(taxon_names) + ')'
예제 #8
0
 def uniprot_acc_to_taxonmy(self, accesion):
     """Build a parenthesised taxonomy string for a UniProt accession.

     Searches the accession with ``frmt="xml"``, collects the text of all
     ``taxon`` elements in document order and returns them comma-separated
     inside ``' ('`` and ``')'``.
     """
     from bioservices import UniProt
     client = UniProt()
     response = client.search(accesion, frmt="xml")
     from bs4 import BeautifulSoup
     document = BeautifulSoup(response, "html.parser")
     names = []
     for taxon in document.find_all('taxon'):
         names.append(taxon.text)
     return ' (%s)' % ', '.join(names)
예제 #9
0
    def uniprot2genename(self, name):
        """Return the gene names of a UniProt identifier.

        Searches UniProt for ``name`` (one result), parses the tab-separated
        response and returns the values of its ``Gene names`` column.

        Returns:
            list of the ``Gene names`` values, or ``None`` (after printing a
            message) when the search or the parsing fails.
        """
        from bioservices import UniProt
        c = UniProt(cache=True)

        try:
            res = pd.read_csv(StringIO(c.search(name, limit=1)), sep='\t')
            return list(res['Gene names'].values)
        except Exception:
            # Best-effort lookup: report and give up on ordinary errors, but
            # let KeyboardInterrupt / SystemExit propagate.
            print("Could not find %s" % name)
            return None
예제 #10
0
파일: gipsy.py 프로젝트: kaslangg/GIPSY
def pI_calc(dataframe):
    """Add a ``pI`` column computed from each row's UniProt sequence.

    For every row, the value in ``prot_acc`` is searched in UniProt (tab
    format, ``sequence`` column only) and the second line of the response
    is passed to ``ipc.predict_isoelectric_point``.  The result is written
    into the ``pI`` column of the same row.  The frame is modified in
    place and also returned.
    """
    service = UniProt()
    for idx, _ in dataframe.iterrows():
        accession = dataframe.loc[idx, "prot_acc"]
        reply = service.search(accession, frmt="tab", columns="sequence")
        reply_lines = reply.split('\n')
        # Line 0 is the header; line 1 is used as the sequence.
        dataframe.loc[idx, "pI"] = ipc.predict_isoelectric_point(
            reply_lines[1])

    return dataframe
예제 #11
0
def find_gene(prot_id):
    """Return gene names for the first accession mapped from ``prot_id``.

    ``prot_id`` is mapped from ``EMBL`` to ``ACC``.  Only the first mapped
    value is searched (``uniparc`` database, ``genes`` column): the function
    returns straight after it.

    Returns:
        ``(key, genes)`` where ``key`` is the mapping key and ``genes`` is a
        list of non-empty ``';'``-separated pieces, or the string ``'none'``
        when no piece is left.  ``(prot_id, 'none')`` when the mapping has
        no values at all.
    """
    service = UniProt(verbose=False)
    mapped = service.mapping("EMBL", "ACC", query=prot_id)
    for source_id, accessions in mapped.items():
        for accession in accessions:
            reply = service.search(accession,
                                   frmt="tab",
                                   limit=3,
                                   columns="genes",
                                   database='uniparc')

            # The first 11 characters are skipped -- presumably the header
            # line of the response; verify against the service output.
            pieces = set(reply[11:].split(';'))
            genes = [piece for piece in pieces if piece and piece != '\n']

            if not genes:
                genes = 'none'

            return source_id, genes
    return prot_id, 'none'
예제 #12
0
def getUniprotInfo(uni_id):
    """Search UniProt for ``uni_id`` and return the result in three shapes.

    The columns named in the module-level ``columns_name`` are requested in
    tab format.

    Returns:
        ``(dic_data, data, header)``: ``header`` is the split first line,
        ``data`` holds every following line split on tabs (including the
        final element produced by the trailing newline), and ``dic_data``
        has one dict per element of ``data`` except the last, keyed by
        ``columns_name`` in positional order.
    """
    service = UniProt()
    alldata = service.search(uni_id,
                             frmt="tab",
                             columns=','.join(columns_name))
    header_line, *body_lines = alldata.split("\n")
    data = [line.split("\t") for line in body_lines]
    header = header_line.split("\t")
    # The last element of ``data`` is left out of the dict view.
    dic_data = [
        {key: fields[position] for position, key in enumerate(columns_name)}
        for fields in data[:-1]
    ]
    return dic_data, data, header
예제 #13
0
def getTaxonomyProtein(taxonomy, format="tab"):
    """Search UniProt for all proteins of a taxonomy.

    The query is ``"taxonomy:" + taxonomy`` and the columns named in the
    module-level ``columns_name`` are requested in the given ``format``.

    Returns:
        ``(alldata, data, header)``: the raw response text, every line after
        the first split on tabs, and the first line split on tabs.
    """
    service = UniProt()
    alldata = service.search("taxonomy:" + taxonomy,
                             frmt=format,
                             columns=','.join(columns_name))
    lines = alldata.split("\n")
    rows = [line.split("\t") for line in lines[1:]]
    header = lines[0].split("\t")
    return alldata, rows, header
예제 #14
0
def main():
    """Copy the PDB-chain CSV, appending a UniProt entry name to each row.

    Reads ``pdb_chain_uniprot_1.csv`` and writes ``pdb_chain_uniprot_2.csv``.
    Only rows with exactly nine columns are handled: the header row (first
    cell ``"PDB"``) gains an ``Entry_Name`` column, and every other row is
    written once per UniProt search result for its third cell, with the
    second field of that result appended.  Rows with another column count,
    and rows whose search returns no data lines, are not written.

    Raises:
        ValueError: if a returned row does not have exactly seven
            tab-separated fields.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    # Interface to the UniProt service
    u = UniProt(verbose=False)

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires for files handed to csv.reader / csv.writer.
    with open('../DataFilesVarstructure/pdb_chain_uniprot_1.csv', 'r', newline='') as csvinput:
        with open('../DataFilesVarstructure/pdb_chain_uniprot_2.csv', 'w', newline='') as csvoutput:
            writer = csv.writer(csvoutput)

            for row in csv.reader(csvinput):
                column_count = len(row)
                print(column_count)
                if column_count != 9:
                    continue

                if row[0] == "PDB":
                    writer.writerow(row + ["Entry_Name"])
                    print(row)
                    continue

                res = u.search(str(row[2]), limit=1)
                print(res)
                if res == "":
                    continue
                # First line is the header; the last element is the ''
                # left after the trailing newline.
                for line in res.split("\n")[1:-1]:
                    if line == "":
                        continue
                    print(line)
                    # Seven fields are required; only the second is used.
                    (res_id, res_entry_name, res_status, res_protein_names,
                     res_gene_names, res_organism, length) = line.split("\t")
                    writer.writerow(row + [res_entry_name])
예제 #15
0
def _lineage_label_from_uniparc(uniprot, ID, first_word_only=False):
    """Look up a lineage label for ``ID`` via the uniparc and taxonomy databases.

    The first ``"; "``-separated entry of the ``Organisms`` column returned
    for ``ID`` (optionally cut down to its first space-separated word) is
    searched in the ``taxonomy`` database; the label is the second
    ``"; "``-separated element of its ``Lineage`` column, or the only
    element when there is just one.  Raises whatever the parsing raises
    when a response does not have exactly one header and one value line.
    """
    columns, values = uniprot.search(ID, database="uniparc",
                                     limit=1)[:-1].split("\n")
    name_idx = columns.split("\t").index("Organisms")
    name = values.split("\t")[name_idx].split("; ")[0]
    if first_word_only:
        name = name.split(" ")[0]
    columns, values = uniprot.search(name, database="taxonomy",
                                     limit=1)[:-1].split("\n")
    lineage_idx = columns.split("\t").index("Lineage")
    return values.split("\t")[lineage_idx].split("; ")[:2][-1]


def retrieve_labels(infile, outfile):
    """Append a ``"<seq id>: <label>"`` line to ``outfile`` per FASTA record.

    The run is resumable: as many records as ``outfile`` already has lines
    are skipped.  For each remaining record the label is looked up in this
    order, moving on only when the previous step raises:

    1. ``Taxonomic lineage (PHYLUM)`` from ``uniprot.get_df``; when that
       value is NaN, the uniparc/taxonomy lookup is used instead.
    2. The uniparc/taxonomy lookup with the full organism name.
    3. The same lookup with only the first word of the organism name.

    When every step fails a message is printed and ``breakpoint()`` is
    entered.  After the debugger continues, the current value of ``label``
    is written; it is ``None`` unless step 1 assigned a value before
    failing.

    Args:
        infile: FASTA file parsed with ``SeqIO.parse``.
        outfile: Output path object (its ``exists()`` method is used).
    """
    seqs = SeqIO.parse(infile, "fasta")
    uniprot = UniProt()

    if outfile.exists():
        with open(outfile, "r") as out:
            lines = out.readlines()
    else:
        lines = []
    already_done = len(lines)

    with open(outfile, "a") as out:
        for i, seq in enumerate(seqs):
            if i < already_done:
                continue

            if "|" in seq.id:
                ID = seq.id.split("|")[1]
            else:
                ID = seq.id.split("/")[0].replace("UniRef100_", "")
            print(f"Doing ID {i:6}, {ID + ':':15} ", end="")

            # Reset per record so a total failure can never silently reuse
            # the label of the previous sequence.
            label = None
            # ``except Exception`` rather than a bare ``except`` so Ctrl-C
            # stops the run instead of triggering the fallbacks.
            try:
                # Try get_df
                df = uniprot.get_df(ID)
                label = df["Taxonomic lineage (PHYLUM)"][0]

                if isinstance(label, np.float64) and np.isnan(label):
                    label = _lineage_label_from_uniparc(uniprot, ID)
            except Exception:
                try:
                    label = _lineage_label_from_uniparc(uniprot, ID)
                except Exception:
                    try:
                        label = _lineage_label_from_uniparc(
                            uniprot, ID, first_word_only=True)
                    except Exception:
                        print("Couldn't handle it!")
                        breakpoint()

            print(f"{label}")
            out.write(f"{seq.id}: {label}\n")
예제 #16
0
def _poly_proline_runs(sequence):
    """Find runs of two or more consecutive ``'P'`` characters.

    Returns ``(centres, lengths)``: for each run, the index of its middle
    character in ``sequence`` (the lower of the two middle characters for a
    run of even length) and the run length as ``float``.  The centre values
    are produced with ``numpy.floor``.
    """
    from itertools import groupby
    from numpy import floor

    centres = []
    lengths = []
    offset = 0  # index in ``sequence`` of the first character of the run
    for residue, run in groupby(sequence):
        run_length = len(list(run))
        if residue == 'P' and run_length > 1:
            half = floor(run_length / 2)
            if run_length % 2 == 0:
                centres.append(half - 1 + offset)
            else:
                centres.append(half + offset)
            lengths.append(float(run_length))
        offset += run_length
    return centres, lengths


def gathering(query):
    """Describe the poly-proline runs in the FH1 domain of a UniProt hit.

    ``query`` is searched in UniProt in XML format.  The first ``feature``
    element whose description is ``FH1`` gives the begin/end positions, and
    that slice of the last ``sequence`` element is reversed before runs of
    consecutive prolines are located in it.

    These packages need to be installed first, e.g. with pip:
    ``beautifulsoup``, ``bioservices`` (and ``numpy``).

    Args:
        query: UniProt search text.

    Returns:
        ``(fh1_length, pp_index_vec, pp_length_vec)``: the FH1 length as
        ``float``, the middle index of each run of two or more prolines in
        the reversed FH1 sequence, and the length of each run as ``float``.

    Raises:
        IndexError: when the result has no feature described as ``FH1``
            (the message is also printed).
    """
    from bioservices import UniProt

    # BeautifulSoup, a package specialized for interpreting xml data
    from bs4 import BeautifulSoup

    service = UniProt()
    result_xml = service.search(query, frmt="xml")
    soup = BeautifulSoup(result_xml, 'html.parser')  # xml "soup" object

    featureFH1 = soup.find_all('feature', description='FH1')

    # Note: the code below assumes there is one (and only one) annotated
    # FH1 in this structure; only the first match is used.
    if len(featureFH1) == 0:
        print('No FH1 domain in this protein')
        # Fail here with a clear message instead of at featureFH1[0] below;
        # the exception type is the one that indexing would have raised.
        raise IndexError('No FH1 domain in this protein')

    location = featureFH1[0].find('location')
    beginPosition = int(location.find('begin').get('position'))
    endPosition = int(location.find('end').get('position'))

    lengthOfFH1 = endPosition - beginPosition + 1
    if lengthOfFH1 == 0:
        print('error')

    soup_sequence = soup.find_all('sequence')[-1].get_text()
    # Get rid of newlines and carriage returns in the sequence text.
    soup_sequence = soup_sequence.replace("\n", "").replace("\r", "")

    # Positions are used as 1-based and inclusive; the domain is reversed
    # before scanning.
    fh1_sequence = soup_sequence[beginPosition - 1:endPosition][::-1]
    fh1_length = float(len(fh1_sequence))

    pp_index_vec, pp_length_vec = _poly_proline_runs(fh1_sequence)

    return fh1_length, pp_index_vec, pp_length_vec
예제 #17
0
# bioservice python

import pandas as pd
from bioservices import UniProt

# Quiet UniProt client used for the single query below.
u = UniProt(verbose=False)

# Columns requested from UniProt, in the order they appear in the result.
columns_list = [
    'entry name',
    'genes',
    'organism',
    'protein names',
    'sequence',
    'feature(TOPOLOGICAL DOMAIN)',
    'feature(TRANSMEMBRANE)',
    'feature(SIGNAL)',
    'feature(PROPEPTIDE)',
    'database(RefSeq)',
    'database(GeneID)',
    'subcellular locations',
    'feature(INITIATOR METHIONINE)',
    'id',
]
columns = ','.join(columns_list)

data = u.search(
    "gene_exact:pdcd1 organism:human AND reviewed:yes",
    frmt='tab',
    limit=3,
    columns=columns,
)

# Flatten the table into one list of cells, then skip as many leading cells
# as there are requested columns.
data_replace = data.replace("\n", "\t")
data_split = data_replace.split("\t")
RAW_data = data_split[len(columns_list):]

print(RAW_data)

# Show the column names the client class lists in ``_valid_columns``.
x = UniProt._valid_columns
print(x)
예제 #18
0
def process_sequences(records, cachepath: Path, disabletqdm: bool = True):
    """Triage SeqRecords into those that can/cannot be used

    This function also caches all inputs into the SQLite cache
    at cachepath

    Records guessed to be UniParc are skipped.  UniProt records need a GN
    field in their description and a non-empty EMBL cross-reference from
    the UniProt service; each ``;``-separated cross-reference is offered to
    the cache as a query.  NCBI records are cached with their own id as
    query.  A record is finally kept only if ``has_query`` reports a query
    for it in the cache.

    :param records:  collection of SeqRecords
    :param cachepath: path to local sequence cache
    :param disabletqdm:  turn off tqdm progress bar
    :returns: tuple ``(kept, skipped)`` of record lists
    """
    logger = logging.getLogger(__name__)
    logger.info("Processing sequences...")

    kept, skipped = [], []

    u_service = UniProt()

    for record in tqdm(records,
                       desc="1/5 Process input sequences",
                       disable=disabletqdm):
        seqtype = guess_seqtype(record)
        if seqtype == "UniParc":
            logger.warning("Record %s looks like a UniParc cluster (skipping)",
                           record.id)
            skipped.append(record)
            continue
        elif seqtype == "UniProt":
            match = re.search(re_uniprot_gn, record.description)
            if match is None:  #  No GN field
                logger.warning("Uniprot record %s has no GN field (skipping)",
                               record.id)
                skipped.append(record)
                continue
            logger.debug("Uniprot record has GN field: %s", match.group(0))
            # The UniProt API was updated in June 2022, requiring a change
            # to the returned field for the cross-reference to EMBL
            result = u_service.search(match.group(0),
                                      columns="xref_embl")  # type: ignore
            # Line 0 is the header.  A response without a second line is
            # treated like an empty cross-reference instead of raising.
            result_lines = result.split("\n")
            if len(result_lines) > 1:
                # The final character is dropped -- presumably a trailing
                # ';' separator; verify against the service output.
                qstring = result_lines[1].strip()[:-1]
            else:
                qstring = ""
            if qstring == "":
                logger.warning(
                    "Uniprot record %s has no EMBL cross-reference (skipping)",
                    record.id,
                )
                # Report the record as skipped, like the other skip paths.
                skipped.append(record)
                continue
            logger.debug("Recovered EMBL database record: %s", qstring)
            # UniProt can return multiple UIDs separated by semicolons. Sometimes the same
            # UID is repeated. However, the current cache schema uses the accession as primary
            # key in the same table as the query IDs.
            # TODO: Update schema to allow multiple queries per record
            for qid in qstring.split(";"):
                logger.debug("Adding record %s to cache with query %s",
                             record.id, qid)
                try:  # Uniprot sequences are added to cache as (accession, NULL, nt_query)
                    add_input_sequence(cachepath, record.id, None, qid)
                except sqlite3.IntegrityError:  # Sequence exists
                    logger.warning(
                        "Additional query terms found for %s: %s (not used)",
                        record.id,
                        qid,
                    )
        elif seqtype == "NCBI":
            try:  # NCBI sequences are added to cache as (accession, aa_query, NULL)
                add_input_sequence(cachepath, record.id, record.id, None)
            except sqlite3.IntegrityError:  # Sequence exists
                continue
        # If the record has no query terms, skip it
        if has_query(cachepath, record.id):
            kept.append(record)
        else:
            skipped.append(record)

    return kept, skipped
예제 #19
0
 def func():
     """Run the UniProt search for ``query`` and save the result to ``target_fn``.

     ``query``, ``fmt`` and ``target_fn`` are taken from the enclosing scope.
     The file is written in binary mode; a text result is encoded as UTF-8
     first.
     """
     u = UniProt()
     res = u.search(query, frmt=fmt)
     # A file opened with 'wb' only accepts bytes.  ``type(u"")`` is the text
     # type on both Python 2 and 3, so byte strings pass through unchanged.
     if isinstance(res, type(u"")):
         res = res.encode('utf-8')
     with open(target_fn, 'wb') as fp:
         fp.write(res)
예제 #20
0
def main(argv=sys.argv):
    """Write an IDR summary table for the reviewed UniProt entries of a species.

    Steps:

    1. Search UniProt for the ids of all reviewed entries of ``--tax-id``.
    2. Iterate over the D2P2 entries for those ids, rebuild the consensus,
       set the IDRs and write one tab-separated row per entry to
       ``--outfile`` (standard output when not given).
    3. Log the ids for which D2P2 yielded no entry.

    Progress and a summary are written to ``--logfile`` (``os.devnull`` by
    default).

    Args:
        argv: Full argument vector with the program name first.  Defaults
            to ``sys.argv``; ``None`` is treated the same way.

    Raises:
        ValueError: if an IDR of an entry is shorter than ``--size``.
    """
    if argv is None:
        argv = sys.argv

    # The first positional parameter of ArgumentParser is ``prog``, not the
    # argument list, so the arguments are passed to parse_args() below.
    parser = argparse.ArgumentParser(usage=__doc__)

    optional = parser.add_argument_group('optional arguments')
    required = parser.add_argument_group('required arguments')

    required.add_argument('-t', '--tax-id', dest="tax_id",
                          required=True, type=int,
                          help=("Tax id for species"))

    optional.add_argument('-o', '--outfile', dest="outfile",
                          help=("Where to write results"))

    optional.add_argument('-cons', '--consensus', dest="consensus",
                          default=4, type=int,
                          help=("How many tools must agree"))

    optional.add_argument('--size', dest="size",
                          default=20, type=int,
                          help=("Minimum size of IDR region"))

    optional.add_argument('--whitelist', dest="whitelist",
                          nargs='+',
                          help=("Comma separated list of tools to use"))

    optional.add_argument('--blacklist', dest="blacklist",
                          nargs='+',
                          help=("Comma separated list of tools to ignore"))

    optional.add_argument('-l', '--logfile', dest="logfile",
                          default=os.devnull,
                          help=("Enter a file name for logging program "
                                "output. Else, nothing will be printed"))

    args = vars(parser.parse_args(argv[1:]))

    logfile = open(args['logfile'] or os.devnull, 'w')
    outf = None
    try:
        logfile.write("Logfile for get_ids.py %s\n\n" % (
            datetime.datetime.now()))

        section_blocker = writeSectionHeader(logfile, "Script arguments:")
        for key, value in args.items():
            logfile.write("%s: %s\n" % (key, value))
        logfile.write("%s\n\n" % section_blocker)

        # 1. Get all uniprot IDs for species
        u = UniProt()
        results = u.search(
            "organism:%s+and+reviewed:yes" % args['tax_id'], columns="id")
        # Skip the header line; keep the first token of every other line.
        uniprot_ids = [x.split()[0] for x in results.strip().split("\n")[1:]]

        # 2. Get IDRs, rebuild consensus and get IDR blocks, write out
        if args['outfile']:
            outf = open(args['outfile'], "w")
        else:
            outf = sys.stdout

        outf.write(
            "\t".join(
                ("UniprotID", "protein_length", "total_idr_length",
                 "fraction_idr", "idrs", "idr_positions")) + "\n")

        section_blocker = writeSectionHeader(
            logfile, "Getting IDR info from D2P2...")
        idr_data_available = set()

        n = 0
        start_time = datetime.datetime.now()
        # Defined up front so the "finished" line below also works when the
        # iterator yields nothing.
        current_time = start_time

        for d2p2_entry in d2p2.iterator(uniprot_ids, chunk_size=50):

            d2p2_entry.rebuildConsensus(
                tools_whitelist=args['whitelist'],
                tools_blacklist=args['blacklist'])

            d2p2_entry.setIDRs(args['consensus'], args['size'])

            idr_data_available.add(d2p2_entry.name)
            protein_length = d2p2_entry.length

            idr_positions = []
            total_idr = 0
            for idr in d2p2_entry.idrs:
                idr_length = idr[1] - idr[0]
                if not idr_length >= args['size']:
                    # ``idr`` is wrapped in a tuple: formatting a 2-tuple
                    # directly into one %s raises TypeError instead.
                    raise ValueError(
                        "This IDR is too small, how did it get here?!: %s"
                        % (idr,))
                total_idr += idr_length
                idr_positions.append("[%s, %s]" % idr)
            fraction_idr = total_idr/protein_length
            outf.write("\t".join(map(str, (
                d2p2_entry.name, protein_length, total_idr,
                fraction_idr, len(d2p2_entry.idrs),
                ",".join(idr_positions)))) + "\n")
            outf.flush()

            n += 1
            current_time = datetime.datetime.now()
            # Linear extrapolation from the average time per entry so far.
            pred_finish = start_time + (
                (current_time-start_time) * len(uniprot_ids)/n)
            logfile.write(
                'proteins done: %i/%i  at %s. Pred. finish = %s\r' % (
                    n, len(uniprot_ids), current_time, pred_finish))
            logfile.flush()

        logfile.write('\nfinished: %s\n' % current_time)
        logfile.flush()

        logfile.write("%s\n\n" % section_blocker)

        # 3. Log proteins with no IDR data
        idr_data_unavailable = set(uniprot_ids).difference(idr_data_available)

        section_blocker = writeSectionHeader(logfile, "Missing IDRs")
        logfile.write("Unable to detect IDR for %s / %s proteins\n" % (
            len(idr_data_unavailable), len(uniprot_ids)))
        logfile.write("No IDRs for:\n%s\n" % "\n".join(idr_data_unavailable))
        # NOTE(review): the closing blocker is written twice, as before;
        # confirm whether the repetition is intended.
        logfile.write("%s\n\n" % section_blocker)
        logfile.write("%s\n\n" % section_blocker)
    finally:
        # Close what this function opened, also on errors, but never close
        # the interpreter's standard output.
        if outf is not None and outf is not sys.stdout:
            outf.close()
        logfile.close()
#original reference: https://www.youtube.com/watch?v=7pOcOvhG7xQ 13:50

from bioservices import UniProt  #specify provider

u = UniProt()  # client used to run the query

# The first argument is the search text.  frmt selects the output format
# ('tab' for a table) and columns names the table headers to return.
res = u.search('taxibp1 AND htlv',
               frmt='tab',
               columns='entry name, length, id, genes')

# print() as a function call is valid on Python 3 and, with a single
# argument, prints the same text on Python 2.
print(res)
예제 #22
0
from bioservices import UniProt

uniprot_handle = UniProt(verbose=False)

# The same query is sent three times in a row; after each one the second
# line of the response is printed as a list of tab-separated fields.
for _ in range(3):
    search = uniprot_handle.search(
        "Z9JIV0",
        columns=
        "go(cellular component), subcellular locations, go(biological process), go(molecular function),feature(TRANSMEMBRANE), id, entry name, protein names, 3d"
    )

    print(search.split("\n")[1].split("\t"))
예제 #23
0
def getUniprotAndComplexPortalData(df, gene='uniprotId'):
    '''
    Enrich a dataframe of UniProt identifiers with data from UniProt and Complex Portal.

    For every identifier in ``df[gene]`` the UniProt XML record is searched and
    parsed. When the record references a Complex Portal complex, the protein
    participants of that complex are fetched from the Complex Portal web service.
    Any field that is missing from a record is left at its empty default and a
    short message is printed.

    Input:
    - df: dataframe holding the identifiers to look up; it is modified in place.
    - gene: column name of df variable containing Uniprot identifiers
    Output:
    - The same dataframe with the added columns 'proteinNames', 'geneNames',
      'txt_subunit', 'function', 'id_uniprot', 'complexPortal',
      'complexPortal_uniprotId', 'complexPortal_protName' and 'structure'
      (comma-joined tokens of the subunit text that contain 'mer').
    '''
    # Raised when an optional node is missing or has an unexpected shape in the
    # parsed XML/JSON tree; such fields simply keep their default value.
    lookupErrors = (KeyError, TypeError, AttributeError)
    u = UniProt(verbose=False)
    lAllProteinNames = []
    lAllGeneNames = []
    lUniprotIds = []
    lsubunits = []
    lComplexPortal = []
    lAllComplexPortal_uniprotId = []
    lAllComplexPortal_protName = []
    lFunction = []
    for el in df[gene]:
        res = u.search("%s" % el, frmt="xml")
        # Per-identifier defaults; they are appended unchanged when the search
        # returns nothing, so every output list stays aligned with df.
        proteinNames = []
        geneNames = []
        complexPortal = ''
        subunitDescription = ''
        function = ''
        uniprotIds = []
        complexPortal_uniprotId = []
        complexPortal_protName = []
        if res != '':
            dRes = xmltodict.parse(res)

            # Complex Portal cross-reference (the last matching reference wins).
            try:
                for ref in dRes['uniprot']['entry']['dbReference']:
                    if ref['@type'] == 'ComplexPortal':
                        complexPortal = ref['@id']
            except lookupErrors:
                complexPortal = ''

            if complexPortal != '':
                url = 'https://www.ebi.ac.uk/intact/complex-ws/complex/' + complexPortal
                response = requests.get(url,
                                        headers={"Accept": "application/json"})
                jsonData = json.loads(response.text)
                try:
                    for participant in jsonData['participants']:
                        if participant['interactorType'] == 'protein':
                            complexPortal_uniprotId.append(
                                participant['identifier'])
                            complexPortal_protName.append(participant['name'])
                except lookupErrors:
                    print('no complex portal element')

            # Recommended protein names: a name is either a plain string or a
            # mapping carrying the text under '#text'.
            try:
                for k, val in dRes['uniprot']['entry']['protein'][
                        'recommendedName'].items():
                    if k == 'fullName' or k == 'shortName':
                        if isinstance(val, str):
                            proteinNames.append(val)
                        else:
                            proteinNames.append(val['#text'])
            except lookupErrors:
                print('no protein recommended name')

            # Alternative protein names.
            try:
                for diz in dRes['uniprot']['entry']['protein'][
                        'alternativeName']:
                    for k, val in diz.items():
                        if k == 'fullName' or k == 'shortName':
                            if isinstance(val, str):
                                proteinNames.append(val)
                            else:
                                proteinNames.append(val['#text'])
            except lookupErrors:
                print('no protein alternative name')

            # Gene names.
            try:
                for diz in dRes['uniprot']['entry']['gene']['name']:
                    for k, val in diz.items():
                        if k == '#text':
                            geneNames.append(val)
            except lookupErrors:
                print('no gene name')

            # Function comment (the last 'function' comment wins).
            try:
                for comment in dRes['uniprot']['entry']['comment']:
                    if comment['@type'] == 'function':
                        text = comment['text']
                        if isinstance(text, str):
                            function = text
                        else:
                            function = text['#text']
            except lookupErrors:
                print('no function')

            # Subunit comment: only read when the comments come as a list.
            try:
                comments = dRes['uniprot']['entry']['comment']
            except lookupErrors:
                comments = None
            if isinstance(comments, list):
                try:
                    for diz in comments:
                        if diz['@type'] == 'subunit':
                            for k, val in diz.items():
                                if k == 'text':
                                    subunitDescription = val['#text']
                except lookupErrors:
                    # The text was not a mapping with '#text'; take it as is.
                    for diz in comments:
                        if diz['@type'] == 'subunit':
                            subunitDescription = diz['text']

            # Accessions: always stored as a list, even for a single accession.
            try:
                accession = dRes['uniprot']['entry']['accession']
                if isinstance(accession, str):
                    uniprotIds = [accession]
                else:
                    uniprotIds = accession
            except lookupErrors:
                uniprotIds = []
        lAllProteinNames.append(proteinNames)
        lAllGeneNames.append(geneNames)
        lUniprotIds.append(uniprotIds)
        lFunction.append(function)
        lsubunits.append(subunitDescription)
        lComplexPortal.append(complexPortal)
        lAllComplexPortal_uniprotId.append(complexPortal_uniprotId)
        lAllComplexPortal_protName.append(complexPortal_protName)
    df['proteinNames'] = lAllProteinNames
    df['geneNames'] = lAllGeneNames
    df['txt_subunit'] = lsubunits
    df['function'] = lFunction
    df['id_uniprot'] = lUniprotIds
    df['complexPortal'] = lComplexPortal
    df['complexPortal_uniprotId'] = lAllComplexPortal_uniprotId
    df['complexPortal_protName'] = lAllComplexPortal_protName
    # Keep the tokens of each subunit description that contain 'mer'.
    # Iterating the collected list (rather than df['txt_subunit'][i]) keeps the
    # rows aligned whatever index df carries.
    u1mer = []
    for txtSubunit in lsubunits:
        struttura = [
            word
            for word in word_tokenize(txtSubunit, "english", False)
            if 'mer' in word
        ]
        u1mer.append(','.join(struttura))
    df['structure'] = u1mer
    return (df)
예제 #24
0
파일: NEXUS.py 프로젝트: tgdev24/Nexus
            count = 0
            wp = ""
            def1 = ""
            eval1 = ""

    #print(len(list1))
    #print(list1)
    uni_entries = []
    listPfam = []
    listProsite = []
    #    listuno = []
    #    list2 = []
    #    list3 = []
    for i in list1:
        u = UniProt()
        d = u.search(i[1], limit=3)
        unilines = d.split("\n")
        #print(unilines,  "bitchHHHHHHHHHHHHHHHHHHHHHH")
        if len(unilines) > 1:
            uni_entry = unilines[1].split("\t")[0]
            uni_entries.append(uni_entry)
        else:
            uni_entries.append("NULL")
        #print("\t", i[0], i[1])
        #print(uni_entries)
    for i in uni_entries:
        if (i == "NULL"):
            listPfam.append("NULL")
            listProsite.append("NULL")
        else:
            with open("uniprot_out.txt", "w") as outfile:
예제 #25
0
cols = ["name", "id","1", "2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25"]
df = pd.DataFrame(columns=cols)
print(df)
#
records = AlignIO.read(original_file, 'fasta')
#
for record in records:
    # print(record.id)
    # print(record.description)
    spl = str(record.description).split("/")
    name = record.description
    id = spl[-2]


    try:
        results = split_mine(u.search(id, columns="id, organism, lineage(SUPERKINGDOM), lineage(all)", limit=1))
        if results["Organism"] == '':
            lineage = ['No_identified']
            print('No_identified')
        else:
            lineage = str(results["Taxonomic lineage (all)"]).split(", ")
            print(lineage)


    except Exception as e:
        print('ошибка1', e)
        try:
            spl = str(record.description).split("[")
            search = spl[1][:-1]
            results = split_mine(u.search(search, columns="id, organism, lineage(SUPERKINGDOM), lineage(all)", limit=1))
            if results["Organism"] == '':
예제 #26
0
from bioservices import UniProt
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
help(NCBIWWW.qblast)
input_sequence = "PIO39916.1"

# Remote BLAST search; the hits are restricted by the Entrez query below.
delta_blast = NCBIWWW.qblast(
    program="blastp",
    database="nr",
    sequence=input_sequence,
    service="del",
    entrez_query="all[filter] NOT predicted[title]",
)

blast_record = NCBIXML.read(delta_blast)


def _report_hsp(alignment, hsp):
    """Print the summary lines for one HSP of an alignment."""
    print("****Alignment****")
    print("sequence:", alignment.title)
    print("length:", alignment.length)
    print("e value:", hsp.expect)
    print("identities:", (hsp.identities/ hsp.align_length*100),"%")
    for text in (hsp.query, hsp.match, hsp.sbjct):
        print(text[0:75] + "...")


proteinIds = []
for alignment in blast_record.alignments:
    # Keep the second-to-last '|'-separated field of the hit id.
    hit_fields = alignment.hit_id.rsplit("|", 2)
    proteinIds.append(hit_fields[-2])
    for hsp in alignment.hsps:
        _report_hsp(alignment, hsp)

# Look all collected ids up in UniProt with a single OR-joined query.
uni = UniProt(verbose=False)
search = "+OR+".join(proteinIds)
data = uni.search(search,
                  frmt="tab",
                  columns="entry name,length,id, go, database(kegg)")
print(data)
print(proteinIds)
예제 #27
0
                        'golgi apparatus membrane', 'Golgi membrane', 'lysosomal membrane', 'lysosome']

dname2GO = {}
dAnc2Name = {}
outFile = open(os.path.join(OUTDIR, dfGenesLoc + '.csv'), mode='w')
gL.writeLineByLineToFile(outFile, ['Gene', 'lCompartments'], '\t')

u = UniProt(verbose=False)

for gene in lAllGenes:
    lDefinitiveCompartment = []
    if gene in dGene2Uniprot:
        lCompartments = []
        lUniprotIds = dGene2Uniprot[gene]
        for unip in lUniprotIds:
            uniprotSearch = u.search("%s" % unip, frmt="xml")
            if uniprotSearch != '':
                dUniprotSearch = xmltodict.parse(uniprotSearch)
                dEcoEvidence = {}
                # Construct dictionary of the Evidence codes
                if 'evidence' in dUniprotSearch['uniprot']['entry']:
                    if type(dUniprotSearch['uniprot']['entry']['evidence']) == list:
                        for evidence in dUniprotSearch['uniprot']['entry']['evidence']:
                            for k, v in evidence.items():
                                if k == '@key':
                                    code = v
                                if k == '@type':
                                    ecoCode = v
                            dEcoEvidence[str(code)] = ecoCode
                    else:
                        for k, v in dUniprotSearch['uniprot']['entry']['evidence'].items():
예제 #28
0
#%% Nick180919 Test package bioservices
# Search UniProt for three terms joined with "or" and print the tab-separated
# response (at most 20 rows are requested).
from bioservices import UniProt
u = UniProt(verbose=False)
data = u.search("YER030W or YPL190C or YGR086C",
                frmt="tab",
                limit=20,
                columns="entry name,length,id, genes,comment(FUNCTION)")
print(data)
#%% Pramod180919 Test package bioservices
# Same kind of search for a single term, requesting the organism-id column.
# NOTE: the backslash below continues the string literal itself, so the
# leading spaces of the following line are part of the columns value.
from bioservices import UniProt
u = UniProt(verbose=False)
data = u.search("YER030W",
                frmt="tab",
                limit=20,
                columns="entry name,length,\
                id, genes, organism-id")
print(data)

#%% Nick181008 Get all data from Rain's code

from bioservices import UniProt
import pandas as pd
import re
import numpy as np

u = UniProt(verbose=False)
# Open the workbook and read the sheet at position 1 (pandas counts sheets
# from 0), then pick the 'Histatin-5_unique' column.
index = pd.ExcelFile('individual hits analysis_yeast_1.xlsx')
dfs = pd.read_excel(index, sheet_name=1)
print(dfs)
ele = dfs['Histatin-5_unique']
예제 #29
0
            print(self.id + " belongs to the same sister group as" +
                  self.descendant_which_splits_from_same_node)
            print("Thus," + self.id + " and" +
                  self.descendant_which_splits_from_same_node +
                  " are close relatives of each other")
            print(
                "In addition, all the other proteins in the Phylogenetic tree are the outgroup to"
                + self.id + " and" +
                self.descendant_which_splits_from_same_node)


if __name__ == '__main__':  # everything above can be imported by the test.AdvProAssessment.py unit-test file

    u = UniProt()  # create the UniProt client

    res = u.search(sys.argv[1], frmt="tab", columns="id, genes",
                   limit=None)  # query UniProt with the first command-line argument

    Number_selections = range(
        int(sys.argv[2])
    )  # range over the count given as the second command-line argument

    Entry_selection_list = []  # collects the selected entry numbers
    a = 2  # index of the last sys.argv slot consumed so far

    for y in (Number_selections
              ):  # one iteration per requested selection
        a = a + 1  # advance to the next slot: sys.argv[3], sys.argv[4], ...
        Entry_selection_list.append(
            int(sys.argv[a])
        )  # each of these arguments is read as an integer selection
예제 #30
0
def _as_field(value):
    """Render an identifier, or a collection of identifiers, as one text field."""
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    try:
        return ','.join(str(item) for item in value)
    except TypeError:
        # Not iterable (e.g. a number): fall back to its plain text form.
        return str(value)


def _write_row(outputfile, fields):
    """Write one tab-separated line, replacing empty fields with 'None'."""
    outputfile.write("\t".join(field or 'None' for field in fields))
    outputfile.write("\n")


def _lookup_protein_ids(u, ensp):
    """Return (UniProt ids, PDB ids) as text for an ENSP id, or None if unmapped.

    The first protein returned by the UniProt search that has a non-empty
    PDB mapping is used.
    NOTE(review): assumes u.mapping returns a dict of UniProt id -> PDB ids --
    confirm against the installed bioservices version.
    """
    protein_list = u.search(ensp, frmt="list")
    if not isinstance(protein_list, str):
        # The service handed back something other than text (e.g. an error code).
        return None
    for protein in protein_list.split("\n"):
        if protein == "":
            continue
        mapping = u.mapping(fr="ID", to="PDB_ID", query=str(protein))
        if isinstance(mapping, dict) and mapping:
            first_pdb = next(iter(mapping.values()))
            return _as_field(mapping.keys()), _as_field(first_pdb)
    return None


def main():
    """ Main function.

    Reads the VCF given by ``args.vcf`` and writes a tab-separated table to
    ``args.out``: one line per VEP ``CSQ`` annotation of every record (or a
    single line with empty annotation fields when a record has no ``CSQ``).
    For each annotation the ENSP id is looked up in UniProt and mapped to PDB
    ids; each distinct ENSP id is looked up only once. Empty fields are
    written as 'None'.

    The table is produced in a single pass over the VCF: the previous version
    looped over the same reader twice, so the second (writing) loop never saw
    any record.
    """
    args = parse_args()
    if args.log:
        logging.basicConfig(filename=args.log, level=logging.DEBUG,
                            filemode='w', format='%(asctime)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('Start.')
    logging.info('Command line: {}'.format(' '.join(sys.argv)))

    #Interface to the UniProt service
    u = UniProt(verbose=False)

    # ENSP id -> (UniProt ids, PDB ids), or None when nothing could be mapped.
    ensp_to_ids = {}

    with open(args.out, "w") as outputfile, open(args.vcf, 'r') as vcf_handle:
        # Output header: one name per field written by the rows below.
        outputfile.write("chr\tpos\tid\tref\talt\tgene\tfeature\tfeature_type\tconsequence\tswissprotid\tensp\tuniprotid\tpdbid\tprotein_position\tamino_acid\n")

        for record in vcf.Reader(vcf_handle):
            variant = [
                record.CHROM,
                str(record.POS),
                str(record.ID),
                record.REF,
                ','.join(str(v) for v in record.ALT),
            ]

            if "CSQ" not in record.INFO:
                # No VEP annotation: the ten annotation fields stay empty.
                _write_row(outputfile, variant + [''] * 10)
                continue

            # Go through the annotations of all transcripts.
            for current_csq_element in record.INFO['CSQ']:
                current_csq = current_csq_element.split('|')
                current_ENSP = current_csq[26]

                if current_ENSP != "" and current_ENSP not in ensp_to_ids:
                    ensp_to_ids[current_ENSP] = _lookup_protein_ids(
                        u, current_ENSP)
                current_protein, current_pdbid = (
                    ensp_to_ids.get(current_ENSP) or ('', ''))

                _write_row(outputfile, variant + [
                    current_csq[4],    # gene
                    current_csq[6],    # feature
                    current_csq[5],    # feature type
                    current_csq[1],    # consequence
                    current_csq[27],   # swissprot id
                    current_ENSP,
                    current_protein,
                    current_pdbid,
                    current_csq[14],   # protein position
                    current_csq[15],   # amino acid
                ])
ls = list(df_new['sseqid'])  ### list the final ids

e = "id,organism,pathway,go(biological process), go(molecular function), go(cellular component)"  ### columns to retrieve for every id

bar = progressbar.ProgressBar(
    maxval=len(ls),
    widgets=[progressbar.Bar('=', '[', ']'), ' ',
             progressbar.Percentage()])

# Each response is collected and joined once after the loop; growing one
# string with += on every query re-copies the text collected so far.
chunks = []

bar.start()

for i, query in enumerate(ls):

    bar.update(i + 1)
    if i % 1000 == 0:
        print(i)
    chunks.append(service.search(query, frmt="tab", columns=e))

bar.finish()

result = ''.join(chunks)

new_df2 = pd.read_table(io.StringIO(result))  ### load results into a dataframe

# Every response carries its own header line; drop the repeated ones.
final_df = new_df2[new_df2.Entry != 'Entry']

with open('GO.txt', 'wt') as txt_f:
    txt_f.write(final_df.to_csv(sep='\t', index=False,
                                header=True))  ### export as tab-separated text

# final_df
예제 #32
0
from bioservices import UniProt
from chembl_webresource_client import *
from chembl_webresource_client.new_client import new_client
import sys
from collections import defaultdict
from numpy import *

d = defaultdict(int)
targets = TargetResource()
u = UniProt()
""" the search funtion is used to extract the protein IDs from UniProt since the IDs are used as the
query set for data extraction from the chemical databases"""
res = u.search(
    '(GO:0006665+OR+sphingolipid+OR+sphingomyelin+OR+glycosphingolipid)+AND+organism:9606',
    frmt='tab',
    columns='id, entry name, database(chembl)')
identifier = res[42:]  #42 is just choosen to get rid off the headers
yeni = str(identifier)
lines = yeni.split('\n')
un = []
name = []
chembl = []
pro = []
""" At this step, the information retrieved from the UniProt is saved as separate columns
depending on the info it has. For instance, after splitting the lines, the first entry is the ID, mid one
is protein short name and the third one is ChEMBL ID. All of them are saved as separate arrays to use at
the next step"""

for i in range(len(lines)):
    lineList = lines[i].split('\t')
    if len(lineList) == 3:
예제 #33
0
class GetPDBThread(QThread):
    """
    Worker thread that looks for a SwissModel structure for each sequence.

    ``input`` is a pair ``(sequences, nodes)``: the sequences to search with
    and an optional collection of node indices, one per sequence, in the same
    order (it may be empty or None). Each sequence must provide ``id`` and
    ``seq``.

    Signals:
    - ``finished``: emitted once, after all sequences were processed, with a
      list of ``[structure, sequence, index, offset]`` entries, one for every
      sequence a structure could be parsed for.
    - ``logger``: progress messages meant for display.
    """
    finished = pyqtSignal(list)
    logger = pyqtSignal(str)

    def __init__(self, input, parent=None):
        QThread.__init__(self, parent)
        self.seqs = input[0]
        self.nodes = None if not input[1] else input[1]
        self.u = UniProt(verbose=False)
        self.PDBLogger = logging.getLogger("PDBSearch")

    def run(self):
        """Search every sequence in turn and emit ``finished`` with the results."""
        returned = []
        # TODO: Should pop up modal if not confident in the structure results!
        # enumerate() gives the position directly; looking the sequence up with
        # list.index() would depend on equality comparison of the records.
        for position, seq in enumerate(self.seqs):
            index = self.nodes[position] if self.nodes else None
            pid = seq.id
            self.PDBLogger.info("Searching with ID %s" % pid)
            self.logger.emit("Searching with ID %s" % pid)
            uniID = self.u.search(pid,
                                  columns="id, genes, organism, protein names")
            if not uniID or not isinstance(uniID, str):
                self.PDBLogger.info("NO STRUCTURE FOUND")
                self.logger.emit("No structure found, sorry!")
                continue

            self.PDBLogger.info('Results!\n\n%s' % uniID)
            self.logger.emit('Results collected for search: %s' % uniID)
            # Skip the first line of the response and any blank line (the
            # response ends with a newline, which leaves an empty last item).
            candidates = [
                line.split("\t") for line in uniID.split("\n")[1:]
                if line.strip()
            ]

            model = self._find_model(seq, candidates)
            if model is None:
                self.PDBLogger.debug("Sorry, no models found!!!")
                continue

            accession, coordinates, structseq, alignoff = model
            offset = self._alignment_offset(seq, structseq, alignoff)
            struct = self._load_structure(accession, coordinates)
            if struct is not None:
                returned.append([struct, seq, index, offset])

        # Emitted once, after the loop, so that every sequence is processed.
        self.finished.emit(returned)

    def _find_model(self, seq, candidates):
        """Return the first SwissModel model whose sequence equals ``seq``.

        ``candidates`` are the UniProt result rows (lists of fields, accession
        first). Returns ``(accession, coordinates, model sequence, alignment
        offset)`` or None when no candidate yields a matching model.
        """
        query_seq = str(seq.seq).replace("-", "")
        for fields in candidates:
            if len(fields) < 3:
                # The messages below need the id, gene and organism fields.
                continue
            accession = fields[0]
            self.PDBLogger.info('Attempting search with %s from %s' %
                                (fields[1], fields[2]))
            self.logger.emit('Attempting search with %s from %s' %
                             (fields[1], fields[2]))
            structurl = "https://swissmodel.expasy.org/repository/uniprot/%s.json" % accession
            self.PDBLogger.debug(
                'Searching SwissModel repository: %s' % structurl)
            self.logger.emit('Searching SwissModel for structure')
            try:
                with urllib.request.urlopen(structurl) as url:
                    data = json.loads(url.read().decode())
            except HTTPError:
                # Unchanged behaviour: an HTTP error ends the search for this
                # sequence instead of moving on to the next candidate.
                return None

            result = data.get('result') or {}
            structs = result.get('structures')
            if not structs:
                continue
            structseq = result.get('sequence')
            self.PDBLogger.info("QUERY: \n%s" % query_seq)
            self.PDBLogger.info("RESULT: \n%s" % structseq)
            if query_seq != structseq:
                # They should match, else keep looking
                self.PDBLogger.debug(
                    "Seq didn't match, trying with next model")
                continue

            struct0 = structs[0]
            if not struct0 or not struct0.get('coordinates'):
                continue
            alignoff = int(
                struct0['chains']['A'][0]['uniprot']['from']) - 1
            self.PDBLogger.debug("MODEL ACQUIRED")
            self.logger.emit('MODEL ACQUIRED!')
            return accession, struct0['coordinates'], structseq, alignoff

        self.PDBLogger.info("Sorry, no models found")
        return None

    def _alignment_offset(self, seq, structseq, alignoff):
        """Return the offset of the model within ``seq``.

        The first seven residues of the model sequence are searched in the
        sequence text; the last position where they occur, plus ``alignoff``,
        is the offset (0 when they never occur).
        """
        start = structseq[:7]
        aligned = str(seq.seq)
        offset = 0
        for x in range(len(seq)):
            if aligned[x:x + 7] == start:
                self.PDBLogger.debug(
                    "Sequence offset is %s residues" % x)
                offset = x + alignoff
                self.PDBLogger.info(
                    "Alignment offset is %s residues" % offset)
                self.logger.emit(
                    "Alignment offset is %s residues" % offset)
        return offset

    def _load_structure(self, accession, coordinates):
        """Download ``coordinates`` and parse it; None if no temp file opens."""
        parser = PDB.PDBParser()
        tmp = QTemporaryFile()
        with urllib.request.urlopen(coordinates) as url:
            myfile = url.read()
        if not tmp.open():
            return None
        tmp.write(myfile)
        # The parser reopens the file by name, so the data must be on disk.
        tmp.flush()
        struct = parser.get_structure(accession, tmp.fileName())
        self.PDBLogger.debug("STRUCTURE PARSED")
        self.logger.emit("STRUCTURE PARSED")
        return struct
# Last modified: 17 December 2013
#
# Program that retrieves the UniRef entry corresponding to a GenBank accession number

from bioservices import UniProt
import sys
import os
from BeautifulSoup import BeautifulSoup

# Directory the XML result of each contig is written to.
UNIREF_PATH = "../uniref/"

u = UniProt()
# Each input line is split on "-|-": the first field is the contig and the
# third field is the accession to look up. One line is written to
# uniref_mapping.txt for every input line.
with open("uniref_mapping.txt", "w") as r, open("resultatNBCI.txt", "r") as f:
    for line in f:
        temp = line.split("-|-")
        print("Traitement du contig " + temp[0])
        accession = temp[2].strip(" \t\n\r")
        # NOTE(review): the result of this mapping call is not used.
        u.mapping(fr='EMBL_ID', to='NF100', query=accession)
        res = u.search(accession, format='xml', limit=10)
        # Truthiness test instead of "res is ''": identity comparison with a
        # literal is not a reliable emptiness check.
        if not res:
            r.write(temp[0] + "\tNone\n")
            print("aucun résultat pour ce contig")
        else:
            contig = temp[0].strip(" \t\n\r")
            with open(UNIREF_PATH+"result"+contig+".xml", "w") as xml:
                xml.write(res)
            #xml = BeautifulSoup(res)
            r.write(contig + "\t Result\n")

예제 #35
0
 def func():
     """Run the UniProt search for ``query`` and write the response to ``target_fn``."""
     payload = UniProt().search(query, frmt=fmt)
     with open(target_fn, 'wb') as out:
         out.write(payload)
예제 #36
0
# Input alignment and the (still empty) output file.
original_file = open(r"data/Proteins_origin.fasta", 'r')
new_file = open(r"data/Proteins_named_Euk_Vir.fasta", 'w')

records = AlignIO.read(original_file, 'fasta')

# Prefix every record id with "<first 3 letters of superkingdom>./<organism>/",
# with 'No_identified' when UniProt gives no organism and 'Error' on failure.
for record in records:
    organism, description = '', ''
    try:
        # The part of the id before the first '/' is the search term.
        find = record.id.split('/')[0]
        response = u.search(
            find,
            columns="id, organism, lineage(SUPERKINGDOM), lineage(all)")
        results = split_mine(response)
        organism = (
            'No_identified' if results["Organism"] == '' else
            results["Taxonomic lineage (SUPERKINGDOM)"][:3] + './' +
            results["Organism"])
    except Exception as e:
        print('ошибка', e)
        organism = 'Error'

    record.id = organism + "/" + record.id
예제 #37
0
    input ::=
    protlist: list of proteins
    idnm: type of protein names, such as AC.
    output::
    dict of protein information::
    Entry name; Gene names; Length; Organism; Protein names; Status.
    """
    u = UniProt(verbose=False)
    return u.quick_search(protlist)


# if "__name__" == "__main__":
u = UniProt(verbose=False)

# Tab-separated search limited to three rows.
data = u.search("zap70+taxonomy:9606",
                frmt="tab",
                limit=3,
                columns="entry name, length, id, genes")
# Call form of print: valid on Python 3 and, for a single argument, on Python 2.
print(data)

res = u.search("DNMT1_HUMAN",
               frmt="tab",
               columns="entry name, protein names, pathway, comments")
print(res)

res = u.quick_search("DNMT1_HUMAN")
getreskey = list(res.keys())
# The collection of keys cannot itself be used as a key; look each entry up
# in turn and show its gene names.
for entry_key in getreskey:
    print(res[entry_key]['Gene names'])

df = u.get_df("GALK1_HUMAN")
df['Length'].hist()
plt.show()