예제 #1
0
def quick_getprotinfo(protlist):
    """Run a UniProt quick search and return its result unchanged.

    A non-verbose ``bioservices`` ``UniProt`` client is created for the
    call and ``protlist`` is passed straight to its ``quick_search``
    method.

    Args:
        protlist: The query handed to ``UniProt.quick_search``.
            NOTE(review): the name suggests a list of proteins, but the
            value is forwarded as-is -- confirm what callers pass.

    Returns:
        Whatever ``UniProt.quick_search`` returns. Presumably a dict of
        per-protein fields (Entry name, Gene names, Length, Organism,
        Protein names, Status) -- verify against the installed
        bioservices version.
    """
    client = UniProt(verbose=False)
    search_result = client.quick_search(protlist)
    return search_result
예제 #2
0
def _lookup_entry_and_struc(u, uprot):
    """Resolve one UniProt accession to ``(entry_name, structure_cell)``.

    Prints the same progress/diagnostic messages as the caller's original
    inline code. Either element of the returned pair is ``""`` when the
    corresponding lookup did not succeed.
    """
    if not uprot:
        print("-----No uprot ID")
        return "", ""

    data = u.quick_search("id:%s" % uprot)
    # The result may be empty, or keyed by a different accession than the
    # one queried; both cases are reported as "not found" instead of
    # raising KeyError.
    record = data.get(uprot) if data else None
    if record is None:
        print("-----Uprot ID not found")
        return "", ""

    entry = record['Entry name'].lower()
    struc_res = obtain_struc_pdb(entry)
    if struc_res:
        print(struc_res)
        return entry, struc_res

    # No structure for the protein itself: ask GPCRdb for a template
    # and look up the structure of that template instead.
    temp = requests.get(
        'http://gpcrdb.org/services/structure/template/' + entry).json()
    if not temp:
        print("-----No template")
        return entry, ""

    temp_res = obtain_struc_pdb(temp)
    if not temp_res:
        print("-----No struc for template")
        return entry, ""

    struccell = "[Model]: " + temp_res
    print(struccell)
    return entry, struccell


def search_struc():
    """Annotate ``myprot_list.csv`` with UniProt entry names and structures.

    Reads the ``;``-delimited file ``myprot_list.csv`` from ``resultspath``
    and writes ``myprot_list_struc.csv`` next to it. The header row (first
    cell ``"Family"``) gets two extra column titles; every other row gets
    the lower-cased UniProt entry name and a structure cell appended (both
    empty strings when nothing could be resolved).

    The UniProt accession is read from the fourth column (``row[3]``) and
    the third column (``row[2]``) is printed as a progress label. Blank
    CSV rows are skipped.
    """
    u = UniProt()
    in_path = os.path.join(resultspath, "myprot_list.csv")
    out_path = os.path.join(resultspath, 'myprot_list_struc.csv')
    # newline="" is what the csv module requires so that embedded newlines
    # are parsed correctly and no extra "\r" is written on Windows.
    with open(in_path, "r", newline="") as infile, \
            open(out_path, 'w', newline="") as outfile:
        mywriter = csv.writer(outfile, delimiter=';')
        for row in csv.reader(infile, delimiter=';'):
            if not row:
                # csv.reader yields [] for blank lines; nothing to annotate.
                continue
            if row[0] == "Family":
                mywriter.writerow(row + ["Uniprot entry", "Struc"])
                continue
            print("\n\n", row[2])
            entry, struccell = _lookup_entry_and_struc(u, row[3])
            mywriter.writerow(row + [entry, struccell])
예제 #3
0
파일: pyuniprot.py 프로젝트: fsoubes/Tulip
def obtained(liste, output_file):
    """Look up identifiers in UniProt and write a GO-term report.

    Each identifier in ``liste`` is searched together with the organism
    name "Escherichia coli (strain K12)", keeping at most one hit per
    identifier. The collected accessions are turned into a data frame with
    ``UniProt.get_df`` and its "Gene ontology (GO)" column is written,
    together with the input identifiers and the accessions, to
    ``output_file``.

    Args:
        liste: Iterable of identifiers; each one is converted with
            ``str`` before use.
        output_file: Path of the report file (overwritten).

    Returns:
        The "Gene ontology (GO)" column of the data frame.
    """
    organism = "Escherichia coli (strain K12)"
    u = UniProt()

    uprot = []
    for ids in liste:
        hits = u.quick_search(str(ids) + " " + organism, limit=1)
        uprot.extend(hits.keys())

    dd = u.get_df(uprot)
    GO = dd["Gene ontology (GO)"]

    # The context manager guarantees the report is flushed and closed even
    # if writing fails part-way (the file used to be left open).
    with open(output_file, 'w') as file2:
        file2.write("numero uniprot est le suivant \n \n")
        for ids in liste:
            # Convert before concatenating so non-string ids work too.
            file2.write(str(ids) + " ")

        file2.write("\n")
        file2.write("\n")
        file2.write("ID accession uniprot est le suivant \n")
        for accession in uprot:
            file2.write(str(accession) + "  ")

        file2.write("\n")
        file2.write("\n")
        file2.write(
            "L'enrichissement ontologique pour ces termes est la suivante \n\n")
        for terms in GO:
            # Each cell is written element by element with no separator,
            # exactly as before.
            for term in terms:
                file2.write(term)
            file2.write("\n \n")

    return GO
예제 #4
0
    def handle(self, *args, **options):
        """Precompute mutation and variant data for published dynamics.

        For every published ``DyndbDynamics`` object the associated UniProt
        accessions are collected, resolved to UniProt entry names, and then
        used to fetch:

        * variants from the ExAC REST service (stored in ``gpcr_vars.json``)
        * mutations from the GPCRdb REST service (stored in
          ``gpcr_muts.json``)

        Options read:
            update: when truthy, start from empty dictionaries instead of
                loading the existing JSON files.
            sel_mv: ``"var_only"``, ``"mut_only"`` or ``"all"``; selects
                which of the two data sets is fetched and written.
        """

        def open_json(json_filepath):
            """Return the parsed content of the JSON file at the given path."""
            with open(json_filepath) as json_file:
                return json.load(json_file)

        def open_dict_or_new(filepath):
            """Load a JSON dict from ``filepath``; return ``{}`` if missing."""
            try:
                return open_json(filepath)
            except FileNotFoundError:
                return {}

        def uniprot_mapping(fromtype, totype, identifier):
            """Call the UniProt mapping service for one identifier.

            ``fromtype`` and ``totype`` are UniProt identifier-type
            abbreviations (see https://www.uniprot.org/help/api_idmapping).
            Returns the raw, tab-delimited response body as bytes.
            """
            # NOTE(review): this is the legacy mapping endpoint -- confirm
            # it is still served by UniProt.
            base = 'http://www.uniprot.org'
            tool = 'mapping'
            params = {
                'from': fromtype,
                'to': totype,
                'format': 'tab',
                'query': identifier,
            }
            url = base + '/' + tool + '?' + urllib.parse.urlencode(params)
            with urllib.request.urlopen(url) as response:
                return response.read()

        def calc_foldchange(wt_val, mut_val):
            """Return the signed fold change between two non-zero values.

            The ratio is always >= 1 in magnitude: positive when the mutant
            value is at least the wild-type value, negative otherwise. The
            result is rounded to 3 decimals and returned as a string.
            """
            if mut_val >= wt_val:
                return str(round(mut_val / wt_val, 3))
            return str(-round(wt_val / mut_val, 3))

        def obtain_mut_info(muts_dict, uprot, entry):
            """Add GPCRdb mutation data for ``entry`` to ``muts_dict``.

            Entries already present are left untouched (and no request is
            made). The stored value maps sequence position to a list of
            mutation records; it is ``{}`` when GPCRdb returns nothing.
            """
            if entry in muts_dict:
                return muts_dict
            mut_info = requests.get('http://gpcrdb.org/services/mutants/' +
                                    entry).json()
            entry_muts = {}
            for pos_mut in mut_info or []:
                seqpos = pos_mut["mutation_pos"]
                if float(pos_mut["exp_wt_value"]) == 0 or float(
                        pos_mut["exp_mu_effect_value"]) == 0:
                    # A zero on either side makes the ratio undefined.
                    fchange = False
                else:
                    # Use the same numeric conversion as the zero check so
                    # string-typed values are handled consistently.
                    fchange = calc_foldchange(
                        float(pos_mut["exp_wt_value"]),
                        float(pos_mut["exp_mu_effect_value"]))
                pos_mut_d = {
                    "from": pos_mut["mutation_from"],
                    "to": pos_mut["mutation_to"],
                    "fchange": fchange,
                    "measure": pos_mut["exp_type"],
                    "unit": pos_mut["exp_wt_unit"],
                    "exp": pos_mut["exp_func"],
                    "lig": pos_mut["ligand_name"],
                    "pub_ref": pos_mut["reference"],
                    "qual": pos_mut["exp_mu_effect_qual"],
                }
                entry_muts.setdefault(seqpos, []).append(pos_mut_d)
            muts_dict[entry] = entry_muts
            return muts_dict

        # Protein-level HGVS notation, e.g.:
        #   frameshift_variant: p.Val152SerfsTer3
        #   stop_gained:        p.Trp32Ter
        #   missense_variant:   p.Leu340Pro
        #   synonymous_variant: p.Thr384Thr
        hgvsp_re = re.compile(r"p\.([A-Za-z]*)(\d*)(([A-Za-z]*).*)")
        frameshift_re = re.compile(r"^([A-Za-z]*)(fsTer.*)$")
        wanted_consequences = (
            "frameshift_variant", "missense_variant",
            "stop_gained", "synonymous_variant",
        )

        def obtain_var_info(vars_dict, uprot, entry):
            """Add ExAC variant data for ``entry`` to ``vars_dict``.

            Entries already present are left untouched (and no request is
            made). If the accession cannot be mapped to an Ensembl id the
            dictionary is returned unchanged. Variants whose HGVSp string
            cannot be parsed are skipped.
            """
            if entry in vars_dict:
                return vars_dict
            uprot_map = uniprot_mapping('ACC+ID', 'ENSEMBL_ID', uprot)
            # Presumably a response of <= 8 bytes is just the
            # "From\tTo\n" header, i.e. no mapping was found.
            if len(uprot_map) <= 8:
                return vars_dict
            ens_id = uprot_map.decode().strip("\n").split("\t")[-1]
            exac = requests.get(
                "http://exac.hms.harvard.edu/rest/gene/variants_in_gene/" +
                ens_id).json()
            entry_vars = {}
            for exac_var in exac:
                consequence = exac_var["major_consequence"]
                if consequence not in wanted_consequences:
                    continue
                mymatch = hgvsp_re.match(exac_var["HGVSp"] or "")
                if mymatch is None:
                    # Missing or unexpected HGVSp: skip this variant rather
                    # than abort the whole run.
                    continue
                fromAA = mymatch.group(1)
                seqNum = mymatch.group(2)
                toAA = mymatch.group(3)
                from_sAA = aa_short[fromAA.upper()]
                if consequence == "stop_gained":
                    to_sAA = toAA
                elif consequence == "frameshift_variant":
                    mymatch2 = frameshift_re.match(toAA)
                    if mymatch2 is None:
                        continue
                    to_sAA = (aa_short[mymatch2.group(1).upper()] +
                              mymatch2.group(2))
                else:  # missense_variant or synonymous_variant
                    to_sAA = aa_short[toAA.upper()]
                pos_vars = {
                    "from": from_sAA,
                    "to": to_sAA,
                    "consequence": consequence,
                    "exac_var_id": exac_var["variant_id"],
                    "allele_freq": exac_var["allele_freq"],
                }
                entry_vars.setdefault(seqNum, []).append(pos_vars)
            vars_dict[entry] = entry_vars
            return vars_dict

        mypath = "/protwis/sites/files/Precomputed/muts_vars_info"
        os.makedirs(mypath, exist_ok=True)
        vars_filepath = os.path.join(mypath, "gpcr_vars.json")
        muts_filepath = os.path.join(mypath, "gpcr_muts.json")
        if options['update']:
            vars_dict = {}
            muts_dict = {}
        else:
            vars_dict = open_dict_or_new(vars_filepath)
            muts_dict = open_dict_or_new(muts_filepath)

        # "all" must enable BOTH data sets; the former if/elif chain only
        # ever handled variants in that case.
        sel_mv = options["sel_mv"]
        want_vars = sel_mv in ("var_only", "all")
        want_muts = sel_mv in ("mut_only", "all")

        dyn_li = DyndbDynamics.objects.filter(is_published=True)

        dynobj = dyn_li.annotate(dyn_id=F('id'))
        dynobj = dynobj.annotate(
            uniprot=F('id_model__id_protein__uniprotkbac'))
        dynobj = dynobj.annotate(uniprot2=F(
            'id_model__id_complex_molecule__id_complex_exp__dyndbcomplexprotein__id_protein__uniprotkbac'
        ))
        dynprotdata = dynobj.values("dyn_id", "uniprot", "uniprot2")

        # dyn id -> list of UniProt accessions (falls back to "uniprot2"
        # when "uniprot" is empty).
        dyn_dict = {}
        for dyn in dynprotdata:
            dyn_id = dyn["dyn_id"]
            up = dyn["uniprot"] or dyn["uniprot2"]
            if not up:
                self.stdout.write(
                    self.style.NOTICE("UniProt ID not found for dyn %s" %
                                      (dyn_id)))
                continue
            dyn_dict.setdefault(dyn_id, []).append(up)

        u = UniProt()
        for dyn_id, uprot_li in dyn_dict.items():
            for uprot in uprot_li:
                data = u.quick_search("id:%s" % uprot)
                # The result may be empty or keyed by another accession;
                # both are reported instead of raising KeyError.
                record = data.get(uprot) if data else None
                if record is None:
                    self.stdout.write(
                        self.style.NOTICE(
                            "No uniprot id found for %s (dyn id:%i)." %
                            (uprot, dyn_id)))
                    continue
                entry = record['Entry name'].lower()
                if want_vars:
                    vars_dict = obtain_var_info(vars_dict, uprot, entry)
                if want_muts:
                    muts_dict = obtain_mut_info(muts_dict, uprot, entry)

        if want_vars:
            with open(vars_filepath, 'w') as outfile:
                json.dump(vars_dict, outfile)
        if want_muts:
            with open(muts_filepath, 'w') as outfile:
                json.dump(muts_dict, outfile)
예제 #5
0
    protlist: list of proteins
    idnm: type of protein names, such as AC.
    output::
    dict of protein information::
    Entry name; Gene names; Length; Organism; Protein names; Status.
    """
    u = UniProt(verbose=False)
    return u.quick_search(protlist)


# Demo queries against UniProt. These statements sit at module level, so
# they run whenever the file is executed or imported.
u = UniProt(verbose=False)
data = u.search("zap70+taxonomy:9606",
                frmt="tab",
                limit=3,
                columns="entry name, length, id, genes")
print(data)

res = u.search("DNMT1_HUMAN",
               frmt="tab",
               columns="entry name, protein names, pathway, comments")
print(res)

res = u.quick_search("DNMT1_HUMAN")
# A dict cannot be indexed with its whole key collection; look up each
# returned accession individually instead.
getreskey = list(res.keys())
for accession in getreskey:
    print(res[accession]['Gene names'])

df = u.get_df("GALK1_HUMAN")
df['Length'].hist()
plt.show()
예제 #6
0
class GOTermAdder(object):
    """Append a column of GO terms to a tab-separated gene table.

    For each input row the gene id is extracted from the configured
    column, resolved to a UniProt id, and the GO terms referenced by that
    UniProt entry (with their names from QuickGO) are appended as one
    extra column. Every remote lookup is cached as a small JSON file in
    the ``tmp_data`` folder so that repeated runs do not query again.
    """

    def __init__(self, input_file, gene_id_column, output_file):
        """Store the paths, create the service clients and the cache folder.

        Args:
            input_file: Path of the tab-separated input table.
            gene_id_column: 1-based index of the column holding the
                ``GeneID:<id>;`` annotation.
            output_file: Path of the table to write.
        """
        self._input_file = input_file
        self._gene_id_column = gene_id_column
        self._output_file = output_file
        self._tmp_folder = "tmp_data"
        self._uniprot = UniProt(verbose=False)
        self._quickgo = QuickGO(verbose=False)
        if not os.path.exists(self._tmp_folder):
            os.mkdir(self._tmp_folder)

    def add_go_terms(self):
        """Copy the input table to the output file, adding the GO column.

        Rows that are empty or whose first cell is empty are copied
        unchanged.
        """
        with open(self._input_file) as input_fh, \
                open(self._output_file, "w") as output_fh:
            for row in csv.reader(input_fh, delimiter="\t"):
                # csv.reader yields [] for blank lines, so check the row
                # itself before looking at its first cell.
                if row and row[0]:
                    row = self._add_go_term_column(row)
                self._write_row(row, output_fh)

    def _write_row(self, row, output_fh):
        """Write one row as a tab-joined line."""
        output_fh.write("\t".join(row) + "\n")

    def _add_go_term_column(self, row):
        """Append ``"<GO id> (<GO name>), ..."`` to ``row`` and return it.

        The row is returned unchanged when no unique UniProt id could be
        found for its gene.
        """
        gene_id = self._gene_id(row)
        uniprot_id = self._uniprot_id(gene_id)
        if uniprot_id is None:
            return row
        go_terms = self._go_terms(uniprot_id)
        row.append(", ".join(
            "%s (%s)" % (go_term, self._go_term_name(go_term))
            for go_term in go_terms))
        return row

    def _uniprot_id(self, gene_id):
        """Return the UniProt id for ``gene_id`` (cached), or ``None``."""
        file_path = self._tmp_file_path(gene_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["Uniprot"]
        uniprot_id = self._search_uniprot_id(gene_id)
        with open(file_path, "w") as json_fh:
            json.dump({"Uniprot": uniprot_id}, json_fh)
        # The freshly looked-up id must be returned as well; without this
        # the first run for a gene never received any GO terms.
        return uniprot_id

    def _go_terms(self, uniprot_id):
        """Return the list of GO ids referenced by a UniProt entry (cached)."""
        file_path = self._tmp_file_path(uniprot_id)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["GO-Terms"]
        uniprot_entry = self._uniprot.searchUniProtId(uniprot_id)
        go_ids = [
            dbref.attrs["id"]
            for dbref in uniprot_entry.findAll("dbreference")
            if dbref.attrs["type"] == "GO"]
        # Write the cache once, after all references were collected, so it
        # also exists for entries without any database reference.
        with open(file_path, "w") as json_fh:
            json.dump({"GO-Terms": go_ids}, json_fh)
        return go_ids

    def _go_term_name(self, go_term):
        """Return the name of a GO term as reported by QuickGO (cached)."""
        file_path = self._tmp_file_path(go_term)
        if os.path.exists(file_path):
            with open(file_path) as json_fh:
                return json.load(json_fh)["name"]
        go_term_info = self._quickgo.Term(go_term).soup
        go_term_name = go_term_info.term.find("name").text
        with open(file_path, "w") as json_fh:
            json.dump({"name": go_term_name}, json_fh)
        return go_term_name

    def _search_uniprot_id(self, gene_id):
        """Return the UniProt id if the quick search has exactly one hit.

        Zero or several hits are treated as "no unambiguous match" and
        yield ``None``.
        """
        uniprot_id_search = self._uniprot.quick_search(gene_id)
        if len(uniprot_id_search) == 1:
            # next(iter(...)) works for both list-like and view-like keys.
            return next(iter(uniprot_id_search))
        return None

    def _tmp_file_path(self, gene_id):
        """Return the cache file path used for the given identifier."""
        return "%s/%s.json" % (self._tmp_folder, gene_id)

    def _gene_id(self, row):
        """Extract ``<id>`` from the ``GeneID:<id>;`` annotation of a row.

        Raises:
            IndexError: if the configured column is missing or holds no
                ``GeneID:`` annotation.
        """
        return row[self._gene_id_column-1].split("GeneID:")[1].split(";")[0]
예제 #7
0
class uniprot_queries():
    '''
    Interrogation of the uniprot database.

    Every public lookup returns a 4-tuple
    (uniprot ID, entry name, gene names, protein names); all four are
    empty strings when nothing suitable was found. Results are cached in
    ``self.dicotested`` as 4-element lists.
    '''

    # Result returned (and cached) when a lookup finds nothing.
    _NO_MATCH = ('', '', '', '')

    def __init__(self, specie, taxid):
        '''
        specie: organism name appended to queries ('' to disable)
        taxid: taxonomy id appended after the organism ('' to disable)
        '''
        self.query = ""
        self.specie = specie
        self.taxid = taxid
        self.dicotested = {}  # avoid testing the same ID several times
        self.unip = UniProt()

    @staticmethod
    def _fields(unip_id, record):
        '''Build the 4-tuple result from one quick_search record.'''
        return (unip_id, record['Entry name'], record['Gene names'],
                record['Protein names'])

    def _remember(self, key, result):
        '''Cache ``result`` under ``key`` (as a list) and return it.'''
        self.dicotested[key] = list(result)
        return result

    def _cached(self, key):
        '''Return the cached result for ``key`` as a 4-tuple.'''
        saved = self.dicotested[key]
        return saved[0], saved[1], saved[2], saved[3]

    def create_query(self, query):
        '''
        Create the query that will be used to search the database.

        '+' is backslash-escaped and '"', ':' and ';' are removed; the
        cleaned text is stored in self.query. The organism (and taxonomy
        id, if set) is appended when self.specie is not empty.
        '''
        cleaned = query.replace('+', '\\+')
        for unwanted in ('"', ':', ';'):
            cleaned = cleaned.replace(unwanted, '')
        self.query = cleaned

        if self.specie == '':
            return cleaned
        cq = cleaned + ' AND organism: ' + self.specie
        if self.taxid != '':
            cq += ' [' + self.taxid + ']'
        return cq

    def uniprot_request(self, cq):
        '''
        Request to the uniprot database using the uniprot
        module from bioservices.

        Tries up to 10 times and returns {} if every attempt failed.
        '''
        cq = re.sub(r'^\[', '', cq)  # remove a leading '['
        d = {}
        for _ in range(10):
            try:
                d = self.unip.quick_search(cq, limit=1)  # best match only
                break
            except (zeep.exceptions.Fault, zeep.exceptions.TransportError,
                    TypeError, AttributeError):
                d = {}
        return d

    def query_id(self, query):
        '''
        Use bioservice
        research uniprot ID corresponding to a query
        RETURN: uniprotID, uniprot entry name, uniprot gene names,
        uniprot protein names

        The best hit is accepted only if the query matches one of its
        gene names or protein names (compared through ``clean``).
        '''
        cq = self.create_query(query)
        if cq in self.dicotested:  # cq already tested
            return self._cached(cq)

        d = self.uniprot_request(cq)
        if d:  # at least one result
            unipID = next(iter(d))
            record = d[unipID]
            for gene in record['Gene names'].split(' '):
                if clean(query) == clean(gene):
                    return self._remember(cq, self._fields(unipID, record))
                # Retry with the part before the first '_'; the shortened
                # query is kept for all following comparisons.
                if len(query.split('_')) > 1:
                    query = query.split('_')[0]
                if clean(query) == clean(gene):
                    return self._remember(cq, self._fields(unipID, record))
            for protein in record['Protein names'].split('('):
                protein = protein.strip()
                protein = re.sub(r"\)$", '', protein)
                protein = protein.replace('+', '')
                if clean(query) == clean(protein):
                    return self._remember(cq, self._fields(unipID, record))
                # By Cecile:
                # NOTE(review): the cleaned protein name is used as a regex
                # pattern; an empty name matches everything -- confirm that
                # this is intended.
                if re.search(clean(protein), clean(query)):
                    return self._remember(cq, self._fields(unipID, record))
        return self._remember(cq, self._NO_MATCH)

    def mapping_id(self, prot_id, database):
        '''
        Map a protein id to obtain the information from the
        uniprot database.

        The bioservices mapping call stopped working, so the mapping
        itself is done by mapping_id2 (direct request to uniprot); the
        mapped identifier is then looked up with quick_search.
        Results, including "not found", are cached under prot_id.
        '''
        if prot_id in self.dicotested:
            return self._cached(prot_id)

        mapping = self.mapping_id2(prot_id, database)
        if len(mapping) <= 1:
            return self._remember(prot_id, self._NO_MATCH)
        value = mapping[0].split('\t')[1]

        r = self.unip.quick_search(value, limit=1)
        if r:
            unipID = next(iter(r))
            return self._remember(prot_id, self._fields(unipID, r[unipID]))
        # Cache the miss under the same key that is checked on entry
        # (it used to be stored under a different key and was never hit).
        return self._remember(prot_id, self._NO_MATCH)

    def mapping_id2(self, prot_id, database):
        '''
        Ask the uniprot "uploadlists" service to map prot_id from
        ``database`` identifiers to uniprot 'ID'.

        RETURN: list of the response lines after the header line, or ''
        if the request failed 10 times.
        '''
        try:  # Python 3
            from urllib.parse import urlencode
            from urllib.request import Request, urlopen
            from urllib.error import HTTPError
            from http.client import BadStatusLine
        except ImportError:  # Python 2
            from urllib import urlencode
            from urllib2 import Request, urlopen, HTTPError
            from httplib import BadStatusLine

        url = 'https://www.uniprot.org/uploadlists/'

        params = {
            'from': database,
            'to': 'ID',
            'format': 'tab',
            'query': prot_id,
        }

        data = urlencode(params)
        if not isinstance(data, bytes):  # Python 3 needs a bytes body
            data = data.encode('utf-8')
        request = Request(url, data)
        contact = ""  # Please set a contact email address here to help us debug in case of problems (see https://www.uniprot.org/help/privacy).
        request.add_header('User-Agent', 'Python %s' % contact)

        page = ''
        for _ in range(10):  # deal with problems to connect to the server
            try:
                response = urlopen(request)
                page = response.read(200000)
            except (HTTPError, BadStatusLine):
                page = ''
                continue
            if not isinstance(page, str):  # bytes on Python 3
                page = page.decode('utf-8', 'replace')
            page = page.split('\n')[1:]  # drop the header line
            break
        return page

    def uni_search(self, protein):
        '''
        Do the quick search and return its best hit without checking
        that the hit actually matches the query. Results are cached
        under ``protein``; no hit yields four empty strings.
        '''
        if protein in self.dicotested:
            return self._cached(protein)

        cq = self.create_query(protein)
        r = self.unip.quick_search(cq, limit=1)
        if not r:
            return self._remember(protein, self._NO_MATCH)
        unipID = next(iter(r))
        return self._remember(protein, self._fields(unipID, r[unipID]))