Example #1
    def load_by_gtop_id(self, ligand_name, gtop_id, ligand_type):
        logger = logging.getLogger('build')

        # get the data from cache or web services
        cache_dir = ['guidetopharmacology', 'ligands']
        url = 'http://www.guidetopharmacology.org/services/ligands/$index'
        gtop = fetch_from_web_api(url, gtop_id, cache_dir)

        if gtop:
            # get name from response
            ligand_name = gtop['name']

        # does a ligand by this name already exist?
        try:
            existing_ligand = Ligand.objects.get(name=ligand_name,
                                                 canonical=True)
            return existing_ligand
        except Ligand.DoesNotExist:
            web_resource = False

            if gtop_id:
                # gtoplig webresource
                web_resource = WebResource.objects.get(slug='gtoplig')

            return self.update_ligand(ligand_name, {}, ligand_type,
                                      web_resource, gtop_id)
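All of these snippets rely on a shared fetch_from_web_api(url, index, cache_dir, raw=False, xml=False) helper whose implementation is not shown here. The following is only a minimal sketch of the observed contract, assuming a simple on-disk cache keyed by the index (the real helper may differ, e.g. it must also gunzip the SIFTS FTP responses used later): substitute $index into the URL, serve from cache when possible, return parsed JSON by default, raw text with raw=True, an ElementTree with xml=True, and a falsy value on failure.

    import json
    import xml.etree.ElementTree as ET
    from pathlib import Path
    from string import Template
    from urllib.parse import quote
    from urllib.request import urlopen

    def fetch_from_web_api(url, index, cache_dir, raw=False, xml=False):
        # Sketch only: cache under <cache_dir...>/<index>, hit the web on a miss.
        cache_file = Path('/'.join(cache_dir)) / quote(str(index), safe='')
        if cache_file.exists():
            data = cache_file.read_text()
        else:
            try:
                full_url = Template(url).substitute(index=quote(str(index), safe=''))
                data = urlopen(full_url).read().decode('utf-8')
            except Exception:
                return False  # callers treat a falsy return as "not found"
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            cache_file.write_text(data)
        if raw:
            return data
        if xml:
            return ET.fromstring(data)
        return json.loads(data)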
Example #2
    def main_func(self, positions, iteration, count, lock):
        while count.value < len(self.structures):
            with lock:
                s = self.structures[count.value]
                count.value += 1
                self.logger.info('Generating DSSP data for \'{}\'... ({} out of {})'.format(s, count.value, len(self.structures)))
            print(s)

            pdbcode = s.pdb_code.index.lower()
            chain = s.preferred_chain

            # Grab DSSP db index number
            url = 'http://mrs.cmbi.ru.nl/search?db=dssp&q=%s&count=3' % (pdbcode)
            r = request.urlopen(url)
            t = r.geturl()
            d_id = t.split('=')[2][:-3]

            # Grab DSSP file
            url = 'http://mrs.cmbi.ru.nl/download?db=dssp&nr=$index'
            cache_dir = ['dssp', 'id']
            dssp = fetch_from_web_api(url, d_id, cache_dir, raw=True)

            # Parse file
            dssp = self.dssp_dict(dssp, chain)

            rs = Residue.objects.filter(protein_conformation=s.protein_conformation).all()

            for r in rs:
                if r.sequence_number in dssp:
                    point, created = ResidueDataPoint.objects.get_or_create(data_type=self.dssp_type, residue=r, value_text=dssp[r.sequence_number])
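The t.split('=')[2][:-3] line above extracts the DSSP record number from the redirect URL purely by position, which breaks as soon as the URL layout shifts. Assuming the redirect carries the record number in an nr query parameter (the same parameter the download URL uses; this is an assumption, not confirmed by the source), a hypothetical more defensive version:

    from urllib.parse import urlparse, parse_qs

    # Parse the query string instead of counting '=' characters.
    params = parse_qs(urlparse(t).query)
    d_id = params['nr'][0] if 'nr' in params else None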
Example #3
    def load_by_gtop_id(self, ligand_name, gtop_id, ligand_type):
        logger = logging.getLogger('build')

        # get the data from cache or web services
        cache_dir = ['guidetopharmacology', 'ligands']
        url = 'http://www.guidetopharmacology.org/services/ligands/$index'
        gtop = fetch_from_web_api(url, gtop_id, cache_dir)
        
        if gtop:
            # get name from response
            ligand_name = gtop['name']
            if ligand_name=='11-<i>cis</i>-retinal':
                ligand_name = 'retinal'

        # does a ligand by this name already exist?
        try:
            existing_ligand = Ligand.objects.get(name=ligand_name, canonical=True)
            return existing_ligand
        except Ligand.DoesNotExist:
            web_resource = False
            
            if gtop_id:
                # gtoplig webresource
                web_resource = WebResource.objects.get(slug='gtoplig')
            
            return self.update_ligand(ligand_name, {}, ligand_type, web_resource, gtop_id)
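For context, a hypothetical call site for load_by_gtop_id (the LigandType lookup, slug, and Guide to Pharmacology ligand ID below are illustrative, not taken from the codebase):

    # Resolve a Guide to Pharmacology ligand to a canonical Ligand record,
    # creating it via update_ligand() if it does not exist yet.
    lt = LigandType.objects.get(slug='small-molecule')  # hypothetical slug
    ligand = Ligand().load_by_gtop_id('histamine', 1204, lt)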
Example #4
    def find_cid_for_chembl(self, chembl_mol_id):
        # function to find cid based on chembl
        cache_dir = ['ebi', 'chembl', 'src_compound_id_all']
        url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/$index/1/22'
        lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
        # print("Searching for ",chembl_mol_id)
        not_found = False
        cid = False
        temp = []
        if not lig_data:
            #if not successful
            not_found = True
        else:
            try:
                for i, x in enumerate(lig_data):
                    temp.append(lig_data[i]['src_compound_id'])
                if len(temp) > 1:
                    cid = ';'.join(temp)
                    self.add_cid_to_dict(chembl_mol_id,cid)
                elif len(temp) == 1 and temp[0] != '\n':
                    cid = temp[0] #lig_data[0]['src_compound_id'] 
                    self.add_cid_to_dict(chembl_mol_id,cid)
                else:
                    not_found = True
                    #print (chembl_mol_id)
            except KeyError:
                not_found = True
        if not cid:
            # not found
            cache_dir = ['pubchem', 'chembl', 'compound_name']
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/$index/json'
            lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
            if lig_data:
                try:
                    cid = lig_data['PC_Compounds'][0]['id']['id']['cid']
                    self.add_cid_to_dict(chembl_mol_id,cid)
                    not_found = False
                except KeyError:
                    not_found = True

        return cid,not_found
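The UniChem URL above translates between compound-ID sources, where source 1 is ChEMBL and source 22 is PubChem CID (per UniChem's source registry; worth verifying against the live service). A standalone sketch of the same call using requests:

    import requests

    # Map a ChEMBL ID (source 1) to PubChem CID(s) (source 22).
    resp = requests.get('https://www.ebi.ac.uk/unichem/rest/'
                        'src_compound_id_all/CHEMBL25/1/22')
    cids = [hit['src_compound_id'] for hit in resp.json()]
    print(cids)  # e.g. ['2244'] for aspirin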
Example #5
    def find_cid(self, chembl_mol_ids, chembl_cid_dict):
        notfound = set()
        for chembl_mol_id in chembl_mol_ids:

            if chembl_mol_id not in chembl_cid_dict.keys():
                temp = []

                # url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/'+chembl_mol_id+'/1/22'
                # response = requests.get(url)
                # lig_data = response.json()
                cache_dir = ['ebi', 'chembl', 'src_compound_id_all']
                url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/$index/1/22'
                lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
                # print("Searching for ",chembl_mol_id,len(chembl_mol_ids))
                if not lig_data:
                    #if not successful
                    notfound.add(chembl_mol_id)
                    continue
                try:
                    for i, x in enumerate(lig_data):
                        temp.append(lig_data[i]['src_compound_id'])
                    if len(temp) > 1:
                        cid = ';'.join(temp)
                        chembl_cid_dict[chembl_mol_id] = cid
                        self.add_cid_to_dict(chembl_mol_id, cid)
                    elif len(temp) == 1 and temp[0] != '\n':
                        cid = temp[0]  #lig_data[0]['src_compound_id']
                        #updating the existing dictionary
                        chembl_cid_dict[chembl_mol_id] = cid
                        self.add_cid_to_dict(chembl_mol_id, cid)
                    else:
                        notfound.add(chembl_mol_id)
                        #print (chembl_mol_id)
                except KeyError:
                    notfound.add(chembl_mol_id)
                    #print (chembl_mol_id)
            elif chembl_mol_id in chembl_cid_dict:
                continue
            else:
                #raise KeyError:
                print(
                    chembl_mol_id)  #to do put it to logfile where it should be

        return chembl_cid_dict, notfound  # to do perhaps a redundant to have found and chembl_cid_dict.
Example #6
    def find_cid(self, chembl_mol_ids, chembl_cid_dict):
        notfound = set()
        for chembl_mol_id in chembl_mol_ids:

            if chembl_mol_id not in chembl_cid_dict.keys():
                temp = []

                # url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/'+chembl_mol_id+'/1/22'
                # response = requests.get(url)
                # lig_data = response.json()
                cache_dir = ['ebi', 'chembl', 'src_compound_id_all']
                url = 'https://www.ebi.ac.uk/unichem/rest/src_compound_id_all/$index/1/22'
                lig_data = fetch_from_web_api(url, chembl_mol_id, cache_dir)
                # print("Searching for ",chembl_mol_id,len(chembl_mol_ids))
                if not lig_data:
                    # if not successful
                    notfound.add(chembl_mol_id)
                    continue
                try:
                    for i, x in enumerate(lig_data):
                        temp.append(lig_data[i]['src_compound_id'])
                    if len(temp) > 1:
                        cid = ';'.join(temp)
                        chembl_cid_dict[chembl_mol_id] = cid
                        self.add_cid_to_dict(chembl_mol_id, cid)
                    elif len(temp) == 1 and temp[0] != '\n':
                        cid = temp[0]  # lig_data[0]['src_compound_id']
                        # updating the existing dictionary
                        chembl_cid_dict[chembl_mol_id] = cid
                        self.add_cid_to_dict(chembl_mol_id, cid)
                    else:
                        notfound.add(chembl_mol_id)
                        # print(chembl_mol_id)
                except KeyError:
                    notfound.add(chembl_mol_id)
                    # print(chembl_mol_id)
            elif chembl_mol_id in chembl_cid_dict:
                continue
            else:
                # raise KeyError:
                print(chembl_mol_id)  # TODO: log this instead of printing

        return chembl_cid_dict, notfound  # TODO: perhaps redundant to return both notfound and chembl_cid_dict
Example #7
    def main_func(self, positions, iteration, count, lock):
        while count.value < len(self.structures):
            with lock:
                s = self.structures[count.value]
                count.value += 1
                self.logger.info(
                    'Generating DSSP data for \'{}\'... ({} out of {})'.format(
                        s, count.value, len(self.structures)))
            print(s)

            pdbcode = s.pdb_code.index.lower()
            chain = s.preferred_chain

            # Grab DSSP db index number
            url = 'http://mrs.cmbi.ru.nl/search?db=dssp&q=%s&count=3' % (
                pdbcode)
            r = request.urlopen(url)
            t = r.geturl()
            d_id = t.split('=')[2][:-3]

            # Grab DSSP file
            url = 'http://mrs.cmbi.ru.nl/download?db=dssp&nr=$index'
            cache_dir = ['dssp', 'id']
            dssp = fetch_from_web_api(url, d_id, cache_dir, raw=True)

            # Parse file
            dssp = self.dssp_dict(dssp, chain)

            rs = Residue.objects.filter(
                protein_conformation=s.protein_conformation).all()

            for r in rs:
                if r.sequence_number in dssp:
                    point, created = ResidueDataPoint.objects.get_or_create(
                        data_type=self.dssp_type,
                        residue=r,
                        value_text=dssp[r.sequence_number])
Example #8
    def main_func(self, positions, iteration, count, lock):
        #####Create chembl compound link and connect it to the corresponding ligand/cid#####

        if iteration==0:
            # First load makes sure ligands are there
            list_of_chembl_ids = self.chembl_mol_ids
            while count.value < len(list_of_chembl_ids):
                with lock:
                    chembl_ligand = list_of_chembl_ids[count.value]
                    count.value += 1
                    if count.value % 1000 == 0:
                        print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'), count.value, len(list_of_chembl_ids)))

                l = Ligand.objects.filter(properities__web_links__web_resource__slug = 'chembl_ligand', properities__web_links__index=chembl_ligand).first()
                if l:
                    cid = l.properities.web_links.filter(web_resource__slug = 'pubchem').first()
                    if cid:
                        cid = cid.index
                    else:
                        l = None
                        # make sure code below is run

                if not l:
                    # if l already has chembl link, assume all is good.
                    if chembl_ligand not in self.chembl_cid_dict.keys():
                        cids, not_found = self.find_cid_for_chembl(chembl_ligand)
                        if not_found:
                            print('SKIPPED: Could not determine CID',chembl_ligand,cids)
                            continue
                    else:
                        cids = self.chembl_cid_dict[chembl_ligand]

                    temp = str(cids).split(';') #perhaps we should load all of the CIDs
                    cid = str(temp[0])

                    l = get_or_make_ligand(cid,'PubChem CID') # use the first CID if there is more than one
                    if not l:
                        print('SKIPPED: Ligand not found in PubChem', cid)
                        continue

                    if not l.properities.web_links.filter(web_resource__slug = 'pubchem',index = cid).exists():
                        # NO CID FOR LIGAND! Rare cases where SMILES was used for initial look up
                        wl, created = WebLink.objects.get_or_create(index=cid, web_resource=self.wr_pubchem)
                        l.properities.web_links.add(wl)

                    if not l.properities.web_links.filter(web_resource__slug = 'chembl_ligand',index = chembl_ligand).exists():
                        wl, created = WebLink.objects.get_or_create(index=chembl_ligand, web_resource=self.wr)
                        l.properities.web_links.add(wl)
                
                ###### Vendor stuff  ######
                if not len(l.properities.vendors.all()):
                    # If it has some, assume they are all loaded
                    cache_dir = ['pubchem', 'cid', 'vendors']
                    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/$index/JSON/'
                    vendors = fetch_from_web_api(url, cid, cache_dir)
                    
                    if vendors:
                        for vendor_data in vendors['SourceCategories']['Categories'][0]['Sources'] :
                            lv, created = LigandVendors.objects.get_or_create(slug = slugify(vendor_data['SourceName']))
                            lv.name = vendor_data['SourceName']
                            if 'SourceURL' in vendor_data:
                                lv.url = vendor_data['SourceURL']
                            lv.save()

                            if 'SID' in vendor_data:
                                #print (vendor_data['SID'])
                                lvls = LigandVendorLink.objects.filter(sid = vendor_data['SID'] )
                                if not lvls.exists():
                                    lvl = LigandVendorLink()
                                    lvl.vendor = lv
                                    lvl.lp = l.properities
                                    lvl.sid =  vendor_data['SID'] 
                                    if 'RegistryID' in vendor_data:
                                        lvl.vendor_external_id = vendor_data['RegistryID']
                                    if 'SourceRecordURL' in vendor_data:
                                        lvl.url = vendor_data['SourceRecordURL']
                                    else:
                                        continue
                                    lvl.save()

        elif iteration==1:
            # Third load loads the exp (based on ligand/assay)
            header = self.header_dict
            skipped = 0
            non_p = []
            wr_chembl_assays = WebResource.objects.get(slug='chembl_assays')
            while count.value < len(self.data):
                with lock:
                    record = self.data[count.value]
                    count.value += 1
                    if count.value % 10000 == 0:
                        print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'), count.value, len(self.data)))


                target = record[header['target_chembl_id']]
                assay_id = record[header['assay_chembl_id']]

                assay, created = ChemblAssay.objects.get_or_create(assay_id=assay_id)
                if created:
                    wl, created = WebLink.objects.get_or_create(index=assay_id, web_resource=wr_chembl_assays)
                    assay.web_links.add(wl)


                ligand = record[header['molecule_chembl_id']]
                p = Protein.objects.filter(web_links__index = target, web_links__web_resource__slug = 'chembl').first()
                if not p:
                    if not target in non_p:
                        non_p.append(target)
                        print('Not found protein!',target)
                    continue

                ls = Ligand.objects.filter(properities__web_links__index=ligand, properities__web_links__web_resource__slug = 'chembl_ligand', canonical=True)
                if not ls.exists():
                    # if no ligand matches this, then ignore -- be sure this works later.
                    skipped += 1
                    continue
                for l in ls:
                    if len(ls)>1:
                        print('issue with canonical! give to munk',l,l.pk,ligand)
                        break
                assay_experiments = AssayExperiment.objects.filter( protein=p, ligand=l, assay=assay)
                
                if assay_experiments.exists():
                    assay_experiment = assay_experiments.get()
                else:
                    assay_experiment = AssayExperiment()
                    assay_experiment.assay = assay
                    assay_experiment.ligand = l
                    assay_experiment.protein = p

                
                
                assay_experiment.assay_type = record[header['assay_type']]
                assay_experiment.pchembl_value = record[header['pchembl_value']]
                assay_experiment.assay_description = record[header['assay_description']]
                assay_experiment.published_value = record[header['published_value']]
                assay_experiment.published_relation = record[header['published_relation']]
                assay_experiment.published_type = record[header['published_type']]
                assay_experiment.published_units = record[header['published_units']]
                
                assay_experiment.standard_value = record[header['standard_value']]
                assay_experiment.standard_relation = record[header['standard_relation']]
                assay_experiment.standard_type = record[header['standard_type']]
                assay_experiment.standard_units = record[header['standard_units']]
                
                try:
                    assay_experiment.save()
                except IntegrityError:
                    assay_experiment = AssayExperiment.objects.get( protein=p, ligand=l, assay=assay)

            print('done, skipped:',skipped)
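The vendor lookup above uses PubChem's PUG View categories endpoint, and the ['SourceCategories']['Categories'][0]['Sources'] path mirrors the code (the code assumes the first category holds the vendor sources). A minimal standalone sketch; CID 2244 is aspirin, used here purely as an example:

    import requests

    cid = 2244
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/'
           'compound/{}/JSON/'.format(cid))
    data = requests.get(url).json()
    for source in data['SourceCategories']['Categories'][0]['Sources']:
        print(source['SourceName'], source.get('SID'))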
Example #9
    def handle(self, *args, **options):
        cache_dir = ['ensembl2', 'isoform']
        url = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json&type=protein'
        # Alternative URLs (the last assignment wins)
        url = 'https://grch37.rest.ensembl.org/sequence/id/$index?db_type=core;object_type=predictiontranscript;content-type=application/json;species=homo_sapiens;type=protein'
        url = 'https://grch37.rest.ensembl.org/sequence/id/$index?type=protein;content-type=application/json'
        filepath = 'protein/data/Isoform_annotation_table.txt'
        isoforms = []
        with open(filepath, "r", encoding='UTF-8') as f:
            for row in f:
                c = row.split("\t")
                isoforms.append(c)

        # Skip header
        total_matches = 0
        total_mismatches = 0
        total_mismatches_1 = 0
        total_align_match = 0
        total_align_mismatch = 0

        isoforms_with_issue = {}

        dump = {}

        for c, i in enumerate(isoforms[1:]):

            p = '{}_human'.format(i[0].lower())
            print(p)
            protein = Protein.objects.get(entry_name=p,
                                          sequence_type__slug='wt',
                                          species__common_name='Human')
            wt_seq = protein.sequence
            rs = Residue.objects.filter(
                protein_conformation__protein=protein).prefetch_related(
                    'protein_segment', 'display_generic_number',
                    'generic_number')
            r_lookup = {}
            r_segment = {}
            for r in rs:
                r_lookup[r.sequence_number] = [
                    r.protein_segment.slug,
                    str(r.display_generic_number), r.sequence_number
                ]
                if r.protein_segment.slug not in r_segment:
                    r_segment[r.protein_segment.slug] = 0
                r_segment[r.protein_segment.slug] += 1

            seq_filename = "protein/data/MSA_GPCR_isoforms/{}_isoform_MSA.fa".format(
                p.lower())
            with open(seq_filename, "r") as myfile:
                fasta_raw = myfile.read()
                fasta = fasta_raw.splitlines()

            wt_seq2 = fasta[1]

            es = i[3].split(", ")
            isoform_id = i[1]

            print(c, len(isoforms), p, isoform_id, es)

            wt_check = wt_seq == wt_seq2.replace("-", "")
            if not wt_check:
                print(p, 'WT SEQ NO MATCH!!')
                # continue
            # print('WT SEQ',wt_seq==wt_seq2.replace("-",""))
            ranges = {}
            for e in es:
                iso_seq_msa = fasta[1 + int(isoform_id) * 2]
                iso_seq_msa_corrected = ''
                for pos, a in enumerate(iso_seq_msa):
                    if wt_seq2[pos] == '-' and a == '-':
                        continue
                    iso_seq_msa_corrected += a

                isoform_info = fetch_from_web_api(url, e, cache_dir)
                if (isoform_info):
                    iso_seq = isoform_info['seq']
                    iso_check = iso_seq == iso_seq_msa.replace("-", "")
                    if not iso_check:
                        isoforms_with_issue[
                            p + "_" + e] = "Sequence does not match with API"
                        # print("E_ID:", e, " SEQUENCE DO NOT MATCH")
                        # print("API:",iso_seq)
                        # print("MSA:",iso_seq_msa.replace("-",""))
                        total_mismatches += 1
                        if iso_seq == iso_seq_msa.replace("-", "")[:-1]:
                            total_mismatches_1 += 1
                    else:
                        total_matches += 1
                    pw2 = pairwise2.align.globalms(wt_seq, iso_seq, 2, -5, -10,
                                                   -.5)
                    aln_ref = pw2[0][0]
                    aln_isoform = pw2[0][1]
                    if aln_isoform != iso_seq_msa_corrected:
                        isoforms_with_issue[
                            p + "_" +
                            e] = "Alignment differs than pairwise, see alignment for sanity"
                        total_align_mismatch += 1
                        # print('misalign')
                        # print(aln_isoform)
                        # print(iso_seq_msa_corrected)
                    else:
                        total_align_match += 1

                    gaps = 0
                    gaps_iso = 0
                    missing_pos = []
                    missing_pos_iso = []
                    res_correct = {}
                    isoform_missing_segment = {}
                    count_segment = {}
                    # print("length",len(aln_ref),len(aln_isoform))
                    for i, r in enumerate(aln_ref, 1):
                        if aln_isoform[i - 1] == '-':
                            gaps_iso += 1
                        if r == "-":
                            res_correct[i] = ['', '', '']
                            gaps += 1
                            if aln_isoform[i - 1] != "-":
                                # Ref is missing
                                missing_pos_iso.append(i - gaps_iso)
                                if i - gaps == 0:
                                    # Take N-term if it's the beginning
                                    isoform_missing_segment[
                                        i - gaps_iso] = r_lookup[1][0]
                                else:
                                    isoform_missing_segment[i - gaps_iso] = (
                                        i - gaps, r_lookup[i - gaps][0])
                        else:
                            res_correct[i] = aln_ref[i - gaps - 1]
                            if aln_isoform[i - 1] == "-":
                                # Ref is missing
                                missing_pos.append(i - gaps)
                            else:
                                segment = r_lookup[i - gaps][0]
                                if segment not in count_segment:
                                    count_segment[segment] = 0
                                count_segment[segment] += 1

                    result_segment = {}
                    for segment, value in r_segment.items():
                        if segment in count_segment:
                            freq = round(count_segment[segment] / value, 2)
                            count = count_segment[segment]
                        else:
                            freq = 0
                            count = 0
                        if freq != 1:
                            # If incomplete segment, save it
                            result_segment[segment] = [freq, count, value]
                    # print(result_segment)
                    # print(missing_pos,missing_pos_iso)
                    ranges = {}
                    ranges['deleted_ref'] = []
                    ranges['inserts'] = []
                    ranges['segments_altered'] = result_segment
                    for k, g in groupby(enumerate(missing_pos),
                                        lambda x: x[0] - x[1]):
                        group = list(map(itemgetter(1), g))
                        # What the previous position's segment slug was
                        from_segment = r_lookup[group[0]][0]
                        to_segment = r_lookup[group[-1]][0]
                        ranges['deleted_ref'].append({
                            'from': [group[0], from_segment],
                            'to': [group[-1], to_segment],
                            'length':
                            len(group)
                        })

                    for k, g in groupby(enumerate(missing_pos_iso),
                                        lambda x: x[0] - x[1]):
                        group = list(map(itemgetter(1), g))
                        inserted_into = isoform_missing_segment[group[0]]
                        ranges['inserts'].append({
                            'from': group[0],
                            'to': group[-1],
                            'inserted_into': inserted_into,
                            'length': len(group)
                        })

                    # print(ranges)

                else:
                    print(e, 'no info')

                key = '{}_{}'.format(p, isoform_id)
                dump[key] = ranges
                dump[key]['e_ids'] = es
        #print(dump)
        f = open('protein/data/isoforms.json', 'w')
        json.dump(dump, f, indent=4, separators=(',', ': '))
        print("SUMMARY")
        print("TOTAL MATCHES of isoform seq", total_matches)
        print("ALIGNMENT MATCH", total_align_match, "MISMATCH",
              total_align_mismatch)
        print("TOTAL MISMATCHES of isoform seq", total_mismatches)
        print("TOTAL MISMATCHES of isoform seq (MSA has one extra)",
              total_mismatches_1)

        for e, r in isoforms_with_issue.items():
            print(e, r)
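The pairwise2.align.globalms(wt_seq, iso_seq, 2, -5, -10, -.5) call above scores match +2, mismatch -5, gap open -10 and gap extend -0.5; the harsh open penalty keeps isoform differences as a few large blocks rather than many scattered gaps. A self-contained toy run of the same call (sequences invented for illustration):

    from Bio import pairwise2

    # Same scoring as above: match=2, mismatch=-5, gap open=-10, gap extend=-0.5
    alignments = pairwise2.align.globalms('MTEYKLVVVG', 'MTEYKVVVG', 2, -5, -10, -0.5)
    ref, iso = alignments[0][0], alignments[0][1]
    print(ref)
    print(iso)  # the deleted residue shows up as a single '-' gap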
Example #10
    def update_from_doi(self, doi):
        logger = logging.getLogger('build')
        # should entrez be tried as a backup?
        try_entrez_on_fail = False

        # check whether this data is cached
        cache_dir = ['crossref', 'doi']
        url = 'http://api.crossref.org/works/$index'
        pub = fetch_from_web_api(url, doi, cache_dir)

        if pub:
            # update record
            try:
                self.title = pub['message']['title'][0]
                try:
                    self.year = pub['message']['created']['date-parts'][0][0]
                except:
                    self.year = pub['message']['deposited']['date-parts'][0][0]

                # go from [{'family': 'Gloriam', 'given': 'David E.'}] to ['Gloriam DE']
                authors = [
                    '{} {}'.format(
                        x['family'],
                        ''.join([y[:1] for y in x['given'].split()]))
                    for x in pub['message']['author']
                ]
                self.authors = ', '.join(authors)

                # get volume and pages if available
                reference = {}
                fields = ['volume', 'page']
                for f in fields:
                    if f in pub['message']:
                        reference[f] = pub['message'][f]
                    else:
                        reference[f] = 'X'
                self.reference = '{}:{}'.format(reference['volume'],
                                                reference['page'])

                # journal
                journal = pub['message']['container-title'][0]
                try:
                    # not all records have the journal abbreviation
                    journal_abbr = pub['message']['container-title'][1]
                except:
                    journal_abbr = slugify(journal)
                try:
                    self.journal, created = PublicationJournal.objects.get_or_create(
                        name=journal, defaults={'slug': journal_abbr})
                    if created:
                        logger.info('Created journal {}'.format(journal))
                except IntegrityError:
                    self.journal = PublicationJournal.objects.get(name=journal)
            except Exception as msg:
                logger.warning(
                    'Processing data from CrossRef for {} failed: {}'.format(
                        doi, msg))
                try_entrez_on_fail = False
        else:
            print("Publication not on crossref", doi)
            try_entrez_on_fail = False

        if try_entrez_on_fail:
            # try searching entrez for DOI
            try:
                Entrez.email = '*****@*****.**'
                record = Entrez.read(
                    Entrez.esearch(db='pubmed', retmax=1, term=doi))
                self.update_from_pubmed_data(record['IdList'][0])
            except:
                return False
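The CrossRef works endpoint used above returns a message object whose title, created/deposited date-parts, author, volume, page and container-title fields are the ones consumed here. A standalone sketch (the DOI is illustrative only):

    import requests

    doi = '10.1093/nar/gkv1178'  # example DOI only
    pub = requests.get('https://api.crossref.org/works/' + doi).json()
    msg = pub['message']
    print(msg['title'][0])
    print(msg['created']['date-parts'][0][0])   # publication year
    print(msg['container-title'][0])            # journal name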
Example #11
def fetch_pdb_info(pdbname, protein):
    logger = logging.getLogger('build')
    #d = {}
    d = OrderedDict()
    d['construct_crystal'] = {}
    d['construct_crystal']['pdb'] = pdbname
    d['construct_crystal']['pdb_name'] = 'auto_'+pdbname
    d['construct_crystal']['uniprot'] = protein.parent.entry_name

    d['contact_info'] = {}
    d['contact_info']['name_cont'] = 'gpcrdb'
    d['contact_info']['pi_email'] = '*****@*****.**'
    d['contact_info']['pi_name'] = 'gpcrdb'
    d['contact_info']['url'] = 'gpcrdb.org'
    d['contact_info']['date'] = time.strftime('%m/%d/%Y')
    d['contact_info']['address'] = ''

    d['protein'] = protein.parent.name
    d['wt_seq'] = protein.parent.sequence
    d['pdb'] = pdbname
    d['links'] = []
    d['xml_not_observed'] = []
    d['xml_segments'] = []

    pos_in_wt = list(range(1,len(d['wt_seq'])+1))

    #http://files.gpcrdb.org/uniprot_mapping.txt
    ## get uniprot to name mapping
    url = 'http://files.gpcrdb.org/uniprot_mapping.txt'
    req = urlopen(url)
    uniprot_mapping = req.read().decode('UTF-8')
    rows = ( line.split(' ') for line in uniprot_mapping.split('\n') )
    uniprot_mapping = { row[0]:row[1:] for row in rows }

    # known errors in the mapping; patch them manually
    uniprot_mapping['P08483'] = ['acm3_rat']
    uniprot_mapping['P42866'] = ['oprm_mouse']
     

    #ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/1xyz.xml.gz
    cache_dir = ['sifts', 'xml']
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/$index.xml.gz'
    sifts = fetch_from_web_api(url, pdbname.lower(), cache_dir, xml = True)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname.lower()), safe='')))
    d['mutations'] = []
    d['auxiliary'] = OrderedDict()
    receptor_seq_ids = []
    receptor_chain = ''
    if sifts: #success
        # print(sifts)
        insert_position = 'N-term'
        insert_start = 0
        msg_1 = 0
        msg_2 = 0
        # for elem in sifts:
        #     print(elem)
        for elem in sifts.findall('.//{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}segment'):
            receptor = False
            chain = elem.attrib['segId'].split('_')[1]
            for res in elem[0]: #first element is residuelist
                if receptor_chain!='':
                    break #break if found
                for node in res:
                    if node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb':
                        source = node.attrib['dbSource']
                        if source=='UniProt':
                            u_id = node.attrib['dbAccessionId']
                            if u_id in uniprot_mapping:
                                receptor_chain = chain
                                break
                                
        for elem in sifts.findall('.//{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}segment'):
            # print(elem.attrib)
            if 'segId' not in elem.attrib:
                continue #not receptor
            seg_uniprot_ids = []
            max_pos = 0
            min_pos = 99999
            pos_list = []
            uniprot_pos = None
            receptor = False
            u_id_source = 'N/A'
            chain = elem.attrib['segId'].split('_')[1]
            seg_resid_list = []
            # print(chain,'chain')
            for res in elem[0]: #first element is residuelist
                u_id = 'N/A'
                pdb_aa = ''
                for node in res:
                    if node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}crossRefDb':
                        source = node.attrib['dbSource']
                        if source=='UniProt':
                            u_id = node.attrib['dbAccessionId']
                            u_id_source = 'UniProt'
                            if u_id in uniprot_mapping:
                                u_id = uniprot_mapping[u_id][0] 
                                receptor = True ## this is receptor element
                                if receptor_chain=='' or receptor_chain==chain:
                                    receptor_chain = chain
                                elif msg_1==0:
                                    msg_1 = 1
                                    # print('\t', pdbname.lower(),'receptor in many chains?!',chain,receptor_chain)
                                    logger.warning('{} has receptor in many chains {} {}'.format(pdbname.lower(),chain,receptor_chain))
                                insert_position = 'Within Receptor'
                            if u_id not in seg_uniprot_ids:
                                seg_uniprot_ids.append(u_id)
                            uniprot_pos = int(node.attrib['dbResNum'])
                            uniprot_aa = node.attrib['dbResName']
                        elif source=='PDB' and node.attrib['dbResNum'].isdigit(): #use instead of isinstance(node.attrib['dbResNum'], int):
                            pos = int(node.attrib['dbResNum'])
                            try:
                                pdb_aa = AA_three[node.attrib['dbResName'].upper()]
                            except:
                                pdb_aa = "X"
                            if receptor:
                                receptor_seq_ids.append(pos)
                            seg_resid_list.append(pos)
                            if pos>max_pos: max_pos = pos
                            if pos<min_pos: min_pos = pos
                    elif pdb_aa and node.tag == '{http://www.ebi.ac.uk/pdbe/docs/sifts/eFamily.xsd}residueDetail':
                        #print(node.attrib['property'],node.text)
                        if node.text=='Not_Observed' and receptor:
                            d['xml_not_observed'].append(uniprot_pos)
                        elif node.attrib['property']=='Annotation' and u_id=='N/A':
                            u_id = node.text
                            if u_id not in seg_uniprot_ids:
                                seg_uniprot_ids.append(u_id)
                        elif receptor and node.attrib['property']=='Annotation' and node.text == 'Engineered mutation': ## only in receptor
                            if {'mut':pdb_aa,'wt':uniprot_aa,'pos':uniprot_pos,'type':''} not in d['mutations']: #prevent duplicates
                                d['mutations'].append({'mut':pdb_aa,'wt':uniprot_aa,'pos':uniprot_pos,'type':''})
                if uniprot_pos:
                    pos_list.append(uniprot_pos) 
                    if receptor and uniprot_pos in pos_in_wt:
                       pos_in_wt.remove(uniprot_pos)
                       insert_start =  str(uniprot_pos+1)
                    elif receptor:
                        # print('weird error with position already deleted', uniprot_pos)
                        pass
            ranges = []
            for k, g in groupby(enumerate(pos_list), lambda x:x[0]-x[1]):
                group = list(map(itemgetter(1), g))
                ranges.append((group[0], group[-1]))

            if receptor==False and u_id_source=='UniProt':
                url = 'http://www.uniprot.org/uniprot/$index.xml'
                insert_info = fetch_from_web_api(url, seg_uniprot_ids[0], cache_dir, xml = True)
                d['links'].append(Template(url).substitute(index=quote(str(seg_uniprot_ids[0]), safe='')))

                for elm in insert_info.findall('.//{http://uniprot.org/uniprot}recommendedName'):
                    seg_uniprot_ids[0] = elm.find('{http://uniprot.org/uniprot}fullName').text

            d['xml_segments'].append([elem.attrib['segId'],seg_uniprot_ids,min_pos,max_pos,ranges,insert_position,seg_resid_list])
            if receptor == False and receptor_chain==chain: #not receptor, but is in same chain
                if len(seg_uniprot_ids):
                    subtype =seg_uniprot_ids[0]
                else:
                    subtype ='N/A'
                    continue #do not add segments without information
                if subtype == 'Not_Observed':
                    continue #ignore "aux" that are 'not observed'
                if subtype == 'Engineered mutation':
                    continue #ignore "aux" that are engineered mutations
                if subtype == 'S-arrestin':
                    continue #  S-arrestin is not part of the chain
                d['auxiliary']['aux'+str(len(d['auxiliary']))] = {'type':'auto','subtype':subtype,'presence':'YES','position':insert_position, 'start':insert_start}
            elif receptor == False:
                # print('\t',pdbname.lower(),'Protein in PDB, not part of receptor chain',seg_uniprot_ids,'chain',chain)
                logger.warning('{} Protein in structure, but not part of receptor chain {} {}'.format(pdbname.lower(),seg_uniprot_ids,chain))
        d['deletions'] = []
        for k, g in groupby(enumerate(pos_in_wt), lambda x:x[0]-x[1]):
            group = list(map(itemgetter(1), g))
            d['deletions'].append({'start':group[0], 'end':group[-1], 'origin':'user'})

        d['not_observed'] = []
        for k, g in groupby(enumerate(d['xml_not_observed']), lambda x:x[0]-x[1]):
            group = list(map(itemgetter(1), g))
            d['not_observed'].append((group[0], group[-1]))

    else:
        pass
        # print('failed sifts')

    #http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/2RH1
    ## experiment data
    cache_dir = ['pdbe', 'experiment']
    url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/$index'
    pdbe = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if pdbe: #success
        r = pdbe[pdbname.lower()][0]
        d['resolution'] = r.get('resolution')
        d['crystal_growth'] = r.get('crystal_growth')
        d['r_factor'] = r.get('r_factor')
        d['experimental_method'] = r.get('experimental_method')
    else:
        pass
        # print('failed pdbe')

    # #http://www.ebi.ac.uk/pdbe/api/pdb/entry/modified_AA_or_NA/2RH1
    # ## modified AA (empty on 2RH1)
    # cache_dir = ['pdbe', 'modified_AA_or_NA']
    # url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/modified_AA_or_NA/$index'
    # pdbe_mod = fetch_from_web_api(url, pdbname, cache_dir)
    # d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    
    # if pdbe_mod: #success
    #     print(pdbe_mod)
    # else:
    #     d['modifications3'] = 'None'
    #     print('failed pdbe_mod')

    #http://www.rcsb.org/pdb/explore/jmol.do?structureId=4LDO&json=true
    ## modifications for their jmol -- "hacky" way to get it
    cache_dir = ['rcsb', 'jmol_modifications']
    url = 'http://www.rcsb.org/pdb/explore/jmol.do?structureId=$index&json=true'
    rcsb_mod = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    # print(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if rcsb_mod: #success
        d['modifications'] = []
        d['modifications2'] = rcsb_mod
        # print(receptor_seq_ids)
        for mod in rcsb_mod['protmod']['domains']:
            t = mod['range'].split(',')
            if t[0].split(':')[1]!=receptor_chain:
                # print('modification not in receptor chain, not interested')
                continue
            if len(t)>1:
                position_type = 'pair'
                position_info = [t[0].split(':')[0],t[1].split(':')[0]]
            elif len(t)==1:
                position_type = 'single'
                position_info = [t[0].split(':')[0],0]
            else:
                print('error',t)
                continue
            # print(mod['id'],pair,mod['description'])
            if mod['id'] == 'crosslink2':
                mod['id'] = "Disulfide bond"  # replace nondescript 'crosslink2'
            d['modifications'].append({'position':[position_type,position_info],'type':mod['id'],'remark':mod['description']})
            #{{v.id}} {{v.range}} {{v.description}} {{v.pdbCcId}} <br><br>
   
    else:
        d['modifications2'] = 'None'
        # print('failed pdbe_mod')

    #http://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/2RH1
    cache_dir = ['pdbe', 'ligands']
    url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/$index'
    pdbe_ligands = fetch_from_web_api(url, pdbname, cache_dir)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    # print(Template(url).substitute(index=quote(str(pdbname), safe='')))
    if pdbe_ligands: #success
        d['ligands'] = {}
        for name,pdb in pdbe_ligands.items():
            for ligand in pdb:
                if ligand['chem_comp_id'] not in d['ligands']:
                    d['ligands'][ligand['chem_comp_id']] = {'comp_name':ligand['chem_comp_name'], 'number_of_entries':1}
                else:
                    d['ligands'][ligand['chem_comp_id']]['number_of_entries'] += 1
        # print(d['ligands'])
   
    else:
        d['ligands'] = 'None'
        # print('failed pdbe_ligands')


    ## NOT NEEDED - mutations are fetched from the SIFTS XML above
    
    # #http://www.ebi.ac.uk/pdbe/api/pdb/entry/mutated_AA_or_NA/2RH1
    # ## mutated AA
    # ### got conflicts, engerineered mutation and expression tag examples
    # cache_dir = ['pdbe', 'mutated_AA_or_NA']
    # url = 'http://www.ebi.ac.uk/pdbe/api/pdb/entry/mutated_AA_or_NA/$index'
    # pdbe_mut = fetch_from_web_api(url, pdbname, cache_dir)
    # d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    
    # if pdbe_mut: #success
    #     r = pdbe_mut[pdbname.lower()]
    #     d['mutations_pdbe'] = []
    #     for mut in r:
    #         mut_from = mut['mutation_details']['from']
    #         mut_to = mut['mutation_details']['to']
    #         mut_type = mut['mutation_details']['type']
    #         construct_seq_number = mut['residue_number']
    #         wt_seq_number = mut['author_residue_number']
    #         t = {'wt':mut_from,'mut':mut_to,'type':mut_type,'c_seq_nr':construct_seq_number,'pos':wt_seq_number}
    #         d['mutations_pdbe'].append(t)
    # else:
    #     print('failed pdbe_mut')


    #http://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?query=2RH1
    ## uniprot mappings
    ### seems to return IDs, which can then be resolved via e.g.:
    # http://www.uniprot.org/uniprot/P00720.xml
    cache_dir = ['rcsb', 'pdb_uniprot_mapping']
    url = 'http://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?query=$index'
    uniprot_map = fetch_from_web_api(url, pdbname, cache_dir, xml = True)
    d['links'].append(Template(url).substitute(index=quote(str(pdbname), safe='')))
    
    if uniprot_map: #success
        inserts = {}
        inserts_fixed = {}
        for block in uniprot_map[0]:
            if block.tag[-5:]!='block':
                continue #only interested in the blocks...
            i = 0
            for segment in block:
                if i==0:
                    construct_range = [segment.attrib['start'],segment.attrib['end']]
                else:
                    insert_range = [segment.attrib['start'],segment.attrib['end']]
                    insert_id = segment.attrib['intObjectId']
                prev_block = segment
                i += 1
            i = inserts.setdefault(insert_id, [])
            i.append({'c':construct_range,'i':insert_range})
        for insert,blocks in inserts.items():

            if insert in uniprot_mapping:
                insert = uniprot_mapping[insert][0] 

            inserts_fixed[insert] = {}
            cache_dir = ['uniprot', 'id']
            url = 'http://www.uniprot.org/uniprot/$index.xml'
            insert_info = fetch_from_web_api(url, insert, cache_dir, xml = True)
            d['links'].append(Template(url).substitute(index=quote(str(insert), safe='')))

            for elm in insert_info.findall('.//{http://uniprot.org/uniprot}recommendedName'):
                inserts_fixed[insert]['alt_name'] = elm.find('{http://uniprot.org/uniprot}fullName').text
            # print(insert_info.findall('.//.'))

            blocks_num = len(blocks)
            prev_block = None
            temp = []
            for i, b in enumerate(blocks): #for each block, to glue them together
                if i==0:
                    start = [b['i'][0],b['c'][0]]
                    end = [b['i'][1],b['c'][1]]
                # print(i,b)
                if i<blocks_num-1: #if not last
                    # print('cur',b,'next',blocks[i+1])
                    if int(b['i'][1])==int(blocks[i+1]['i'][0])-1 and int(b['c'][1])==int(blocks[i+1]['c'][0])-1:
                        # if both the insert and the construct ranges continue contiguously
                        end = [blocks[i+1]['i'][1],blocks[i+1]['c'][1]]
                    else:
                        #gap
                        temp.append({'i_start':start[0],'i_end':end[0],'c_start':start[1],'c_end':end[1]})
                        # temp.append([start,end])
                        start = [blocks[i+1]['i'][0],blocks[i+1]['c'][0]]
                        end = [blocks[i+1]['i'][1],blocks[i+1]['c'][1]]
            temp.append({'i_start':start[0],'i_end':end[0],'c_start':start[1],'c_end':end[1]})
            i = inserts_fixed[insert].setdefault('positions', [])
            i.append(temp)

        d['inserts'] = inserts_fixed


    else:
        pass
        # print('failed uniprot_map')

    return d
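The groupby(enumerate(...), lambda x: x[0]-x[1]) pattern that recurs throughout fetch_pdb_info collapses a sorted list of positions into contiguous ranges: consecutive values share the same index - value key, so each group is one unbroken run. In isolation:

    from itertools import groupby
    from operator import itemgetter

    positions = [2, 3, 4, 9, 10, 15]
    ranges = []
    for _, g in groupby(enumerate(positions), lambda x: x[0] - x[1]):
        group = list(map(itemgetter(1), g))
        ranges.append((group[0], group[-1]))
    print(ranges)  # [(2, 4), (9, 10), (15, 15)]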
Example #12
    def load_from_pubchem(self, lookup_type, pubchem_id, ligand_type, ligand_title=False):
        logger = logging.getLogger('build')

        # if ligand title is specified, use that as the name
        if ligand_title:
            ligand_name = ligand_title

        # otherwise, fetch ligand name from pubchem
        else:
            # check cache
            cache_dir = ['pubchem', 'cid', 'synonyms']
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(lookup_type)
            pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
            ##print (pubchem)
            
            # get name from response
            try:
                ligand_name = pubchem['InformationList']['Information'][0]['Synonym'][0]
            except:
                ## Some compounds (e.g. peptides) do not have a name but are still valid PubChem entries.
                logger.warning('Ligand {} does not have a name in PubChem'.format(pubchem_id))
                ligand_name = lookup_type + ' ' + pubchem_id
                # return None

        # fetch ligand properties from pubchem
        properties = {}
        
        # check cache
        cache_dir = ['pubchem', 'cid', 'property']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey,MolecularWeight,HBondDonorCount,HBondAcceptorCount,XLogP,RotatableBondCount/json'.format(lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
        # get properties from response
        if pubchem==False:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        if pubchem['PropertyTable']['Properties'][0]:   
            if 'HBondAcceptorCount' in pubchem['PropertyTable']['Properties'][0] :
                properties['hacc'] =  pubchem['PropertyTable']['Properties'][0]['HBondAcceptorCount']
            if 'HBondDonorCount' in pubchem['PropertyTable']['Properties'][0] :
                properties['hdon'] =  pubchem['PropertyTable']['Properties'][0]['HBondDonorCount']
            if 'XLogP' in pubchem['PropertyTable']['Properties'][0] :
                properties['logp'] =  pubchem['PropertyTable']['Properties'][0]['XLogP']
            if 'RotatableBondCount' in pubchem['PropertyTable']['Properties'][0] :
                properties['rotatable_bonds'] =  pubchem['PropertyTable']['Properties'][0]['RotatableBondCount']
            if 'MolecularWeight' in pubchem['PropertyTable']['Properties'][0] :
                properties['mw'] = pubchem['PropertyTable']['Properties'][0]['MolecularWeight']
        try:
            
            properties['smiles'] =  pubchem['PropertyTable']['Properties'][0]['CanonicalSMILES']
            properties['inchikey'] =  pubchem['PropertyTable']['Properties'][0]['InChIKey']

        except:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        # pubchem webresource
        web_resource = WebResource.objects.get(slug='pubchem')
        #print (web_resource)
        
        # does a ligand with this canonical name already exist
        try:
            return Ligand.objects.get(name=ligand_name, canonical=True)
            # FIXME check inchikey
        except Ligand.DoesNotExist:
            pass # continue

        # does a (canonical) ligand with this inchikey already exist?
        try:
            existing_lp = LigandProperities.objects.get(inchikey=properties['inchikey'])
            self.properities = existing_lp
            self.name = ligand_name
            self.canonical = False
            self.ambigious_alias = False
            
            try:
                self.save()
                return self
            except IntegrityError:
                return Ligand.objects.get(name=ligand_name, canonical=False)
        except LigandProperities.DoesNotExist:
            return self.update_ligand(ligand_name, properties, ligand_type, web_resource, pubchem_id)
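The property fetch above uses PubChem's PUG REST property endpoint; the response shape ['PropertyTable']['Properties'][0] is the same as in the code. A standalone sketch (again using CID 2244, aspirin, as an example):

    import requests

    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/2244/'
           'property/CanonicalSMILES,InChIKey,MolecularWeight/json')
    props = requests.get(url).json()['PropertyTable']['Properties'][0]
    print(props['InChIKey'], props['MolecularWeight'])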
Example #13
    def handle(self, *args, **options):

        ## URL of the API that maps a gene name to an Ensembl ID
        cache_dir_genes = ['genenames', 'gene_lookup']
        url_gene = 'http://rest.genenames.org/fetch/symbol/$index'

        ensembl_version = 'grch37' # anything else uses the newest release

        if ensembl_version=='grch37':
            ## URL to look up an Ensembl ID and find its transcripts
            cache_dir_transcripts = ['ensembl37', 'transcripts']
            url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'

            ## Url to lookup sequence of transcript
            cache_dir_seq = ['ensembl37', 'seq']
            url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json'
        else:
            ## URL to look up an Ensembl ID and find its transcripts
            cache_dir_transcripts = ['ensembl', 'transcripts']
            url_ensembl = 'https://rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'

            ## Url to lookup sequence of transcript
            cache_dir_seq = ['ensembl', 'seq']
            url_ensembl_seq = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json'
        

        # Get all human GPCRs
        ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human", family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name')
       
        isoforms = {}
        total_transcripts = 0
        total_proteins_with_isoforms = 0
        gene_to_ensembl = {}
        for p in ps:
            transcripts = []
            genes = list(p.genes.all().values_list('name',flat=True))
            print(">" + p.entry_name, 'genes:',genes)
            for gene in genes:

                # Use requests method due to weird functionality of genenames.org
                import requests
                url = 'http://rest.genenames.org/fetch/symbol/{}'.format(gene)
                cache_file_path = '{}/{}'.format('/'.join(cache_dir_genes), gene)
                # try fetching from cache
                data = cache.get(cache_file_path)
                if not data:
                    headers = {'Accept': 'application/json'}
                    try:
                        resp = requests.get(url=url, headers=headers)
                        data = resp.json()
                        cache.set(cache_file_path, data, 60*60*24*7) #7 days
                    except:
                        print('Error converting',gene)
                        continue
                if data['response']['docs']:
                    try:
                        # Get ensembl_gene_id
                        ensembl_gene_id = data['response']['docs'][0]['ensembl_gene_id']
                        gene_to_ensembl[p.entry_name] = ensembl_gene_id
                        #print("E_ID: " +ensembl_gene_id)
                        ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
                        for t in ensembl_transcripts['Transcript']:
                            display_name = t['display_name']
                            is_canonical = t['is_canonical']
                            if is_canonical:
                                # Skip canonical entries
                                continue
                            biotype = t['biotype']
                            t_id = t['id']

                            # Only interested in protein_coding
                            if biotype=='protein_coding':
                                length = t['Translation']['length']
                                seq_id = t['Translation']['id']
                                transcript_info = OrderedDict([('display_name',display_name),('t_id',t_id),('length',length), ('seq_id',seq_id)])
                                seq = fetch_from_web_api(url_ensembl_seq, seq_id,cache_dir_seq)
                                transcript_info['seq'] = seq['seq']
                                transcripts.append(transcript_info)
                                total_transcripts += 1
                    except:
                        print('Error fetching ensembl_gene_id for gene', gene)
                        pass
            print(len(transcripts), 'transcripts found')
            
            # Add if transcripts found
            if len(transcripts):
                isoforms[p.entry_name] = transcripts
                total_proteins_with_isoforms += 1

        # print small summary results
        print('total_proteins_searched',len(ps))
        print('total_proteins_with_isoforms', total_proteins_with_isoforms)
        print('total_transcripts',total_transcripts)

        print(gene_to_ensembl)
        # save to file
        f = open('protein/data/all_isoforms.json', 'w')
        json.dump(isoforms,f, indent=4, separators=(',', ': '))
        f.close()
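
All of these examples funnel through fetch_from_web_api(url, index, cache_dir) from common.tools. A minimal, self-contained sketch of that calling convention, written purely as an assumption from how it is used here (the real helper may differ): substitute $index into the url template, serve from an on-disk cache when possible, and return False on failure.

import json
import os
from string import Template
from urllib.request import urlopen

def fetch_from_web_api_sketch(url, index, cache_dir, cache_root='/tmp/api_cache'):
    # Cache path mirrors the cache_dir lists used throughout these examples,
    # e.g. ['ensembl', 'transcripts'] -> /tmp/api_cache/ensembl/transcripts/<index>
    cache_path = os.path.join(cache_root, *cache_dir, str(index))
    if os.path.isfile(cache_path):
        with open(cache_path) as cached:
            return json.load(cached)
    try:
        # '$index' in the template is replaced by the lookup key
        full_url = Template(url).substitute(index=index)
        with urlopen(full_url) as response:
            data = json.loads(response.read().decode('utf-8'))
    except Exception:
        return False
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'w') as cached:
        json.dump(data, cached)
    return data
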
Example #14
0
    def load_from_pubchem(self, lookup_type, pubchem_id, ligand_type, ligand_title=False):
        logger = logging.getLogger('build')

        # if ligand title is specified, use that as the name
        if ligand_title:
            ligand_name = ligand_title

        # otherwise, fetch ligand name from pubchem
        else:
            # check cache
            cache_dir = ['pubchem', 'cid', 'synonyms']
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(lookup_type)
            pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
            
            # get name from response
            try:
                ligand_name = pubchem['InformationList']['Information'][0]['Synonym'][0]
            except:
                logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
                return None

        # fetch ligand properties from pubchem
        properties = {}
        
        # check cache
        cache_dir = ['pubchem', 'cid', 'property']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey/json'.format(lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
        
        # get properties from response
        try:
            properties['smiles'] =  pubchem['PropertyTable']['Properties'][0]['CanonicalSMILES']
            properties['inchikey'] =  pubchem['PropertyTable']['Properties'][0]['InChIKey']
        except:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        # pubchem webresource
        web_resource = WebResource.objects.get(slug='pubchem')

        # does a ligand with this canonical name already exist
        try:
            return Ligand.objects.get(name=ligand_name, canonical=True)
            # FIXME check inchikey
        except Ligand.DoesNotExist:
            pass # continue

        # does a (canonical) ligand with this inchikey already exist?
        try:
            existing_lp = LigandProperities.objects.get(inchikey=properties['inchikey'])
            self.properities = existing_lp
            self.name = ligand_name
            self.canonical = False
            self.ambigious_alias = False
            
            try:
                self.save()
                return self
            except IntegrityError:
                return Ligand.objects.get(name=ligand_name, canonical=False)
        except LigandProperities.DoesNotExist:
            return self.update_ligand(ligand_name, properties, ligand_type, web_resource, pubchem_id)
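
The PubChem PUG REST responses parsed above have a stable shape. Outside the Django context, the same two properties can be pulled with nothing but the standard library; the CID below (2244, aspirin) is only an illustration, and the property names simply mirror the ones this example requests.

import json
from urllib.request import urlopen

cid = '2244'  # illustrative compound id only
url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'
       '{}/property/CanonicalSMILES,InChIKey/json'.format(cid))
with urlopen(url) as resp:
    record = json.loads(resp.read().decode('utf-8'))

props = record['PropertyTable']['Properties'][0]
print(props['CanonicalSMILES'], props['InChIKey'])
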
Example #15
0
def AlignIsoformWildtype(request):

    p = request.GET.get("protein")
    es = request.GET.getlist("ensembl_id[]")
    iso = request.GET.get("iso_id")
    data = {}
    data['isoforms'] = {}
    protein = Protein.objects.get(entry_name__startswith=p.lower(), sequence_type__slug='wt', species__common_name='Human')
    parent_seq = protein.sequence
    rs = Residue.objects.filter(protein_conformation__protein=protein).prefetch_related('protein_segment','display_generic_number','generic_number')
    data['res'] = {}
    data['same'] = "true"
    for r in rs:
        data['res'][r.sequence_number] = [r.protein_segment.slug,str(r.display_generic_number), r.sequence_number]

    from common.tools import fetch_from_web_api
    from Bio import pairwise2
    from Bio.pairwise2 import format_alignment
    from Bio.SubsMat import MatrixInfo as matlist
    from Bio.Align.Applications import ClustalOmegaCommandline
    from Bio import AlignIO
    cache_dir = ['ensembl', 'isoform']
    # the GRCh37 endpoint below supersedes the default Ensembl REST url
    # url = 'https://rest.ensembl.org/sequence/id/$index?content-type=application/json&type=protein'
    url = 'https://grch37.rest.ensembl.org/sequence/id/$index?type=protein;content-type=application/json'

    # print(iso,'iso_id')
    # 1: 3, 2, 5, 3, 7
    seq_filename = "protein/data/MSA_GPCR_isoforms/{}_human_isoform_MSA.fa".format(p.lower())
    with open (seq_filename, "r") as myfile:
        fasta_raw = myfile.read()
        fasta=fasta_raw.splitlines()
    # print(aln_human)
    # print(fasta_raw)
    data['wt2']=fasta[1]
    data['pre_aligned']=fasta[1+int(iso)*2]

    new_wt2 = ''
    new_pre_aligned = ''
    for i,wt in enumerate(data['wt2']):
        pa = data['pre_aligned'][i]
        if not (wt=='-' and pa=='-'):
            new_wt2 += wt
            new_pre_aligned += pa
    gaps = 0
    data['res_correct2'] = {}
    for i, r in enumerate(data['wt2'], 1):
        if r == "-":
            data['res_correct2'][i] = ['','','']
            gaps += 1
        else:
            data['res_correct2'][i] = data['res'][i-gaps]

    for e in es[:1]:
        isoform_info = fetch_from_web_api(url, e, cache_dir)
        if (isoform_info):
            seq = isoform_info['seq']
            # seq_filename = "/tmp/" + e + ".fa"
            # with open(seq_filename, 'w') as seq_file:
            #     seq_file.write("> ref\n")
            #     seq_file.write(parent_seq + "\n")
            #     seq_file.write("> seq\n")
            #     seq_file.write(seq + "\n")

            # ali_filename = "/tmp/"+e +"_out.fa"
            # acmd = ClustalOmegaCommandline(infile=seq_filename, outfile=ali_filename, force=True)
            # stdout, stderr = acmd()
            # pw2 = AlignIO.read(ali_filename, "fasta")
            # aln_human = str(pw2[0].seq)
            # aln_isoform = str(pw2[1].seq)
            pw2 = pairwise2.align.globalms(parent_seq, seq, 2, -5, -10, -.5)
            # for a in pw2:
            #     print(format_alignment(*a))
            aln_human = pw2[0][0]
            aln_isoform = pw2[0][1]
            data['wt'] = aln_human
            data['isoforms'][e]=aln_isoform
            # print(aln_human)
            # print(aln_isoform)
            # with open (ali_filename, "r") as myfile:
            #     fasta=myfile.read()
            # data['fasta'] = fasta
            gaps = 0
            data['res_correct'] = {}
            for i, r in enumerate(data['wt'], 1):
                if r == "-":
                    data['res_correct'][i] = ['','','']
                    gaps += 1
                else:
                    data['res_correct'][i] = data['res'][i-gaps]
            # print(fasta)
            # pw = pairwise2.align.globalms(parent_seq, seq, 2, 1, -10, -.5)
            # for a in pw:
            #     print(format_alignment(*a))
            if new_pre_aligned!=aln_isoform:
                # print(new_pre_aligned,aln_isoform)
                data['same'] = "false"
        else:
            print('error fetching info from',e)

    # print(data['same'])
    return JsonResponse(data)
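
The res_correct loops above all rely on one trick: walk the gapped wild-type string with a running gap counter, so each 1-based alignment column can be mapped back to an ungapped residue number. A stripped-down sketch of just that mapping (the annotation payloads are made up):

def map_alignment_columns(aligned_wt, annotations):
    # aligned_wt: wild-type sequence including '-' gap characters
    # annotations: dict keyed by ungapped 1-based residue number
    mapped = {}
    gaps = 0
    for col, residue in enumerate(aligned_wt, 1):
        if residue == '-':
            mapped[col] = ['', '', '']  # gap column carries no residue data
            gaps += 1
        else:
            mapped[col] = annotations[col - gaps]
    return mapped

# Column 3 is a gap, so column 4 maps back to residue 3
print(map_alignment_columns('AB-C', {1: ['TM1', '1.50', 1],
                                     2: ['TM1', '1.51', 2],
                                     3: ['TM1', '1.52', 3]}))
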
Example #16
0
    def load_from_pubchem(self,
                          lookup_type,
                          pubchem_id,
                          ligand_type,
                          ligand_title=False):
        logger = logging.getLogger('build')

        # if ligand title is specified, use that as the name
        if ligand_title:
            ligand_name = ligand_title

        # otherwise, fetch ligand name from pubchem
        else:
            # check cache
            cache_dir = ['pubchem', 'cid', 'synonyms']
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(
                lookup_type)
            pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
            ##print (pubchem)

            # get name from response
            try:
                ligand_name = pubchem['InformationList']['Information'][0][
                    'Synonym'][0]
            except:
                ## Some compounds do not have a name but are still a valid pubchem entry. (Peptides)
                logger.warning(
                    'Ligand {} does not have a name in PubChem'.format(
                        pubchem_id))
                ligand_name = lookup_type + ' ' + pubchem_id
                # return None

        # fetch ligand properties from pubchem
        properties = {}

        # check cache
        cache_dir = ['pubchem', 'cid', 'property']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey,MolecularWeight,HBondDonorCount,HBondAcceptorCount,XLogP,RotatableBondCount/json'.format(
            lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)
        # get properties from response
        if pubchem == False:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        if pubchem['PropertyTable']['Properties'][0]:
            if 'HBondAcceptorCount' in pubchem['PropertyTable']['Properties'][
                    0]:
                properties['hacc'] = pubchem['PropertyTable']['Properties'][0][
                    'HBondAcceptorCount']
            if 'HBondDonorCount' in pubchem['PropertyTable']['Properties'][0]:
                properties['hdon'] = pubchem['PropertyTable']['Properties'][0][
                    'HBondDonorCount']
            if 'XLogP' in pubchem['PropertyTable']['Properties'][0]:
                properties['logp'] = pubchem['PropertyTable']['Properties'][0][
                    'XLogP']
            if 'RotatableBondCount' in pubchem['PropertyTable']['Properties'][
                    0]:
                properties['rotatable_bonds'] = pubchem['PropertyTable'][
                    'Properties'][0]['RotatableBondCount']
            if 'MolecularWeight' in pubchem['PropertyTable']['Properties'][0]:
                properties['mw'] = pubchem['PropertyTable']['Properties'][0][
                    'MolecularWeight']
        try:

            properties['smiles'] = pubchem['PropertyTable']['Properties'][0][
                'CanonicalSMILES']
            properties['inchikey'] = pubchem['PropertyTable']['Properties'][0][
                'InChIKey']

        except:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        # pubchem webresource
        web_resource = WebResource.objects.get(slug='pubchem')
        #print (web_resource)

        # does a ligand with this canonical name already exist
        try:
            return Ligand.objects.get(name=ligand_name, canonical=True)
            # FIXME check inchikey
        except Ligand.DoesNotExist:
            pass  # continue

        # does a (canonical) ligand with this inchikey already exist?
        try:
            existing_lp = LigandProperities.objects.get(
                inchikey=properties['inchikey'])
            self.properities = existing_lp
            self.name = ligand_name
            self.canonical = False
            self.ambigious_alias = False

            try:
                self.save()
                return self
            except IntegrityError:
                return Ligand.objects.get(name=ligand_name, canonical=False)
        except LigandProperities.DoesNotExist:
            return self.update_ligand(ligand_name, properties, ligand_type,
                                      web_resource, pubchem_id)
Example #17
0
    def handle(self, *args, **options):

        ## Prepare comparison info ##
        filepath = 'protein/data/Isoform_annotation_table.txt'
        lmb_data = OrderedDict()
        total_lmb_isoforms = 0
        all_lmb_isoforms = []
        with open(filepath, "r", encoding='UTF-8') as f:
            for i,row in enumerate(f):
                if i>0:
                    c = row.split("\t")
                    entry_name = "{}_human".format(c[1].lower())
                    transcripts = c[4].split(", ")

                    if not entry_name in lmb_data:
                        lmb_data[entry_name] = []
                    lmb_data[entry_name] += transcripts
                    total_lmb_isoforms += 1
                    all_lmb_isoforms += transcripts

        print('all_lmb_isoforms',len(all_lmb_isoforms),'distinct',len(set(all_lmb_isoforms)))

        
        ## Get parsed gtex annotation
        with open('protein/data/matched_gtex.json') as json_file:
            gtex_old = json.load(json_file)

        ## Need to rewrite these entries, as Ensembl doesn't use the '.' version suffix for transcripts
        gtex = {}
        for key, val in gtex_old['transcripts'].items():
            t,g = key.split("_")
            new_key = "{}_{}".format(t.split(".")[0],g)
            gtex[new_key] = val
            # del gtex[new_key]['subjects']


        ## Url API to map gene name to Ensembl ID
        cache_dir_genes = ['gtexportal', 'gene_lookup']
        url_gene = 'https://gtexportal.org/rest/v1/reference/gene?geneId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19&pageSize=250&format=json'

        ## Url to look up Ensembl ID to find transcripts
        cache_dir_transcripts_gtex = ['gtexportal', 'transcripts']
        url_transcripts = 'https://gtexportal.org/rest/v1/reference/transcript?gencodeId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19'

        cache_dir_transcripts = ['ensembl37', 'transcripts']
        url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'

        cache_dir_gtex_expression  = ['gtexportal', 'expression_data']
        url_expression = 'https://gtexportal.org/rest/v1/expression/medianTranscriptExpression?datasetId=gtex_v7&gencodeId=$index&format=json'

        ## Url to lookup sequence of transcript
        cache_dir_seq = ['ensembl37', 'seq_protein']
        url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json;type=protein'

        # Get all human GPCRs
        ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human", family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name')
       
        isoforms = {}
        total_transcripts = 0
        total_transcript_skipped_no_tissue=0
        total_proteins_with_isoforms = 0
        gene_to_ensembl = {}
        transcripts_ids_total = set()
        transcripts_ids_skipped_total = set()
        total_fetched_transcripts = 0
        canonical_disagreement_count = 0

        total_new_transcripts = []
        total_not_found = []
        total_not_found_due_to_skipped = []
        new_proteins = set()

        lmb_compare_sequences = [0,0,0] # correct, wrong, not exists in lmb

        sequence_lookup = {}

        ## COMPARE SEQUENCES
        filenames = os.listdir("protein/data/LMB_sequences/")
        all_lmb_sequences= {}
        for f in filenames:
            with open ("protein/data/LMB_sequences/"+f, "r") as myfile:
                fasta=myfile.read().splitlines()
                for i,l in enumerate(fasta):
                    if l[0]==">":
                        e_id = l[2:]
                        continue
                    if e_id in all_lmb_sequences:
                        print('already there!',e_id)
                    if i>2:
                        all_lmb_sequences[e_id]=l
        print('all_lmb_sequences',len(all_lmb_sequences))

        f = open("protein/data/20190726_transcripts.fa", "w")
        missing_sequences = 0
        total_lmb_sequences = 0
        sequences_lookup = defaultdict(list)
        for p,ts in lmb_data.items():
            seq = Protein.objects.get(entry_name=p).sequence
            sequences_lookup[seq].append([p,p])
            # print(p,ts)
            # print(seq)
            f.write(">{} GPCRdb sequence reference\n".format(p))
            f.write("{}\n".format(seq))
            seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p)
            lmb_sequences = {}
            try:
                with open (seq_filename, "r") as myfile:
                    #fasta_raw = myfile.read()
                    fasta=myfile.read().splitlines()
                    for i,l in enumerate(fasta):
                        if l[0]==">":
                            e_id = l[2:]
                            continue
                        lmb_sequences[e_id]=l
                        if i>2:
                            total_lmb_sequences += 1
            except:
                #print('No file for',p,' So no sequence for',ts)
                missing_sequences += len(ts)
            for t in ts:
                if not t in lmb_sequences:
                    #print('missing ',t,'in',"{}_nonstrict_transcripts.fa".format(p))
                    missing_sequences += 1

                seq = fetch_from_web_api(url_ensembl_seq, t,cache_dir_seq)['seq']
                sequences_lookup[seq].append([t,p])
                if t in lmb_sequences:
                    if seq!=lmb_sequences[t]:
                        print(t,'different from LMB - length ensembl:',len(seq),"length lmb:",len(lmb_sequences[t]))
                f.write(">{} ({})\n".format(t,p))
                f.write("{}\n".format(seq))
        f.close()
        print('total missing sequences',missing_sequences)
        print('total lmb transcript sequences provided',total_lmb_sequences)
        print('total lmb protein',len(lmb_data))
        #return
        for seq,ts in sequences_lookup.items():
            if len(ts)>1:
                print('Identical sequence:',ts)

        sequences_lookup = defaultdict(list)
        all_transcript_seq = {}
        for p in ps:# .filter(entry_name='gpc5b_human').all():
            transcripts = []
            transcripts_ids = []
            transcripts_ids_skipped = []
            ensembl_transcripts_count = 0
            genes = list(p.genes.all().values_list('name',flat=True))
            uniprot = p.accession
            canonical = ''
            canon_seq = p.sequence
            # sequence_lookup[canon_seq] = p.entry_name
            grch37_canonical_seq = ''
            uniprot_canonical = ''
            grch37_canonical = ''

            # print(">" + p.entry_name,uniprot, 'genes:',genes)
            seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p.entry_name)
            lmb_sequences = {}
            try:
                with open (seq_filename, "r") as myfile:
                    #fasta_raw = myfile.read()
                    fasta=myfile.read().splitlines()
                    for l in fasta:
                        if l[0]==">":
                            e_id = l[2:]
                            continue
                        lmb_sequences[e_id]=l
            except:
                pass
            #break

            alternative_ids_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
            # print(alternative_ids_uniprot)
            ensembl_gene_id = []
            for gene in genes:
                if not gene:
                    continue
                gene_lookup = fetch_from_web_api(url_gene, gene, cache_dir_genes)
                
                # try:
                same_gene_id = ''
                if gene_lookup['gene']:
                    for gene_info in gene_lookup['gene']:
                        if gene_info['geneSymbol']==gene:
                            ensembl_gene_id.append(gene_info['gencodeId'])

            if len(ensembl_gene_id)>1:
                print(ensembl_gene_id,'MORE THAN 1 !!!!')

            if len(ensembl_gene_id)==0:
                print('No ID found, using uniprot')
                if alternative_ids_uniprot['genes']:
                    ensembl_gene_id = alternative_ids_uniprot['genes'][0]
                else:
                    print("NO ID FOR THIS RECEPTOR")
                    continue
            else:
                ensembl_gene_id = ensembl_gene_id[0]

            #alternative_id = self.find_ensembl_id(gene)
            # alternative_id_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
            # print(ensembl_gene_id,alternative_ids_uniprot)
            # expression = fetch_from_web_api(url_expression,ensembl_gene_id,cache_dir_gtex_expression)
            # print(expression)
            # go through expression
            # expressed_transcripts = {}
            # for e in expression['medianTranscriptExpression']:
            #     if e['median']>0 or 1==1:
            #         #only if expression
            #         t_id = e['transcriptId']
            #         t_short = t_id.split(".")[0]
            #         tissue = e['tissueSiteDetailId']
            #         if t_short not in expressed_transcripts:
            #             expressed_transcripts[t_short] = {'long':t_id,'tissues':[], 'max_median':0}
            #         if expressed_transcripts[t_short]['max_median']<e['median']:
            #             expressed_transcripts[t_short]['max_median'] = e['median']
            #         expressed_transcripts[t_short]['tissues'].append([tissue,e['median']])   
            # print(expressed_transcripts)
            # print(ensembl_gene_id)
            gene_to_ensembl[p.entry_name] = ensembl_gene_id
            # print("E_ID: " +ensembl_gene_id,alternative_ids_uniprot)
            # ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
            # use uniprot gene ID instead
            ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
            # print(ensembl_gene_id)
            if (alternative_ids_uniprot['genes'] and ensembl_gene_id.split(".")[0]!=alternative_ids_uniprot['genes'][0]):
                print("##### ensembl gene id changed",ensembl_gene_id,alternative_ids_uniprot['genes'][0])

            #total_fetched_transcripts += len(ensembl_transcripts['Transcript'])
            # print(ensembl_transcripts)
            same_gene_id = True
            if not ensembl_transcripts:
                print('error',alternative_ids_uniprot,ensembl_gene_id)
                same_gene_id = False
                ensembl_transcripts = fetch_from_web_api(url_ensembl, alternative_ids_uniprot['genes'][0], cache_dir_transcripts)

            for t in ensembl_transcripts['Transcript']:
                ensembl_transcripts_count += 1
                display_name = t['display_name']
                is_canonical = t['is_canonical']
                biotype = t['biotype']
                t_id = t['id']
                # canonical entries are no longer skipped here; they are
                # flagged below via grch37_canonical / uniprot_canonical

                # Only interested in protein_coding
                if biotype=='protein_coding':
                    total_fetched_transcripts += 1

                    key = '{}_{}'.format(t_id,ensembl_gene_id)

                    if not key in gtex:
                        # print('t_id', t_id, 'not in expressed_transcripts')
                        total_transcript_skipped_no_tissue += 1
                        transcripts_ids_skipped_total.add(t_id)
                        transcripts_ids_skipped.append(t_id)
                        continue

                    if gtex[key]["count"]<3:
                        total_transcript_skipped_no_tissue += 1
                        transcripts_ids_skipped_total.add(t_id)
                        transcripts_ids_skipped.append(t_id)
                        continue

                    length = t['Translation']['length']
                    seq_id = t['Translation']['id']
                    transcript_info = OrderedDict([('display_name',display_name),('t_id',t_id),('length',length), ('seq_id',seq_id), ('expressed',gtex[key])])
                    seq = fetch_from_web_api(url_ensembl_seq, seq_id,cache_dir_seq)


                    if is_canonical:
                        grch37_canonical = t_id
                        transcript_info['grch37_canonical'] = True
                        grch37_canonical_seq = seq['seq']

                    if seq['seq']==canon_seq:
                        uniprot_canonical = t_id
                        transcript_info['uniprot_canonical'] = True
                        # skip entries matching the canonical sequence
                        continue

                    sequences_lookup[seq['seq']].append([t_id,p.entry_name])
                    all_transcript_seq[t_id] = seq['seq']
                    if seq['seq'] in sequence_lookup:
                        print('SEQUENCE ALREADY SEEN',t_id, sequence_lookup[seq['seq']])
                        continue
                    sequence_lookup[seq['seq']] = t_id


                    transcript_info['seq'] = seq['seq']
                    if not t_id in lmb_sequences:
                        transcript_info['lmb_sequences'] = False
                        lmb_compare_sequences[2] += 1
                    else:
                        if lmb_sequences[t_id]==seq['seq']:
                            transcript_info['lmb_sequences'] = True
                            lmb_compare_sequences[0] += 1
                        else:
                            transcript_info['lmb_sequences'] = lmb_sequences[t_id]
                            lmb_compare_sequences[1] += 1

                    if t_id in alternative_ids_uniprot['transcripts']:
                        transcript_info['in_uniprot'] = True
                    else:
                        transcript_info['in_uniprot'] = False

                    if p.entry_name in lmb_data and t_id in lmb_data[p.entry_name]:
                        transcript_info['in_lmb'] = True
                    else:
                        transcript_info['in_lmb'] = False

                    if t_id not in transcripts_ids:
                        transcripts.append(transcript_info)
                        transcripts_ids.append(t_id)
                        transcripts_ids_total.add(t_id)
                    total_transcripts += 1
                # except:
                #     print('Error fetching ensemble_gene_id for gene',gene)
                #     pass

            not_found = []
            not_found_due_to_skipped = []
            if p.entry_name in lmb_data:
                for t in lmb_data[p.entry_name]:
                    if t not in transcripts_ids:
                        if t in transcripts_ids_skipped:

                            f = open("protein/data/20190726_skipped_due_to_gtex.txt", "a")
                            not_found_due_to_skipped.append(t)
                            key = '{}_{}'.format(t,ensembl_gene_id)
                            if not key in gtex:
                                reason = 'Not in GTEX'
                            else:
                                reason = 'Subjects low in GTEX - count is {} - subject ids {}'.format(gtex[key]['count'],", ".join(gtex[key]['subjects']))
                            f.write("{}: {}\n".format(t,reason))
                            f.close()
                            # print(t)
                        else:
                            not_found.append(t)

            total_not_found += not_found
            total_not_found_due_to_skipped += not_found_due_to_skipped

            new = []
            for t in transcripts_ids:
                if p.entry_name in lmb_data and t in lmb_data[p.entry_name]:
                    pass
                else:
                    ts_check = sequences_lookup[all_transcript_seq[t]]
                    for t_check in ts_check:
                        if p.entry_name in lmb_data and t_check[0] in lmb_data[p.entry_name]:
                            print('found via duplicate',t_check,t)
                            continue
                    key = '{}_{}'.format(t,ensembl_gene_id)



                    #blast = BlastSearch(top_results=2)

                    blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']), top_results=2)
                    blast_out = blast.run(all_transcript_seq[t])
                    result = [(Protein.objects.get(pk=x[0]).entry_name, x[1].hsps[0].expect) for x in blast_out]
                    #print(result)
                    if result:
                        if result[0][0]==p.entry_name and result[0][1]<0.05:
                            f = open("protein/data/20190726_new_transcripts_for_consideration.txt", "a")
                            reason = 'GTEX count: {}'.format(gtex[key]['count'])
                            f.write(">{} ({}): {}\n".format(t,p.entry_name,reason))
                            f.write("{}\n".format(all_transcript_seq[t]))
                            f.close()
                            new.append(t)
                            if p.entry_name in lmb_data:
                                new_proteins.add(p.entry_name)
                        else:
                            print('bad blast match',result)
                    else:
                        print('bad blast match',result)

            total_new_transcripts += new

            # print(len(alternative_ids_uniprot['transcripts']), 'uniprot transcripts found',ensembl_transcripts_count, ' ensembl transcripts found',len(transcripts), 'transcripts kept after filtering')
            
            # Record the result for every protein (the payload is identical
            # whether or not isoform transcripts were found)
            isoforms[p.entry_name] = {'ensembl_gene_id':ensembl_gene_id,'same_gene_id':same_gene_id,'canonical_seq':canon_seq, 'grch37_canonical_seq':grch37_canonical_seq, 'isoforms': transcripts, 'uniprot_lookup': alternative_ids_uniprot, 'lmb_not_found':not_found, 'lmb_not_found_due_to_skipped': not_found_due_to_skipped, 'new_transcripts_than_lmb': new,'skipped_due_to_gtex': transcripts_ids_skipped, 'grch37_canonical':grch37_canonical, 'uniprot_canonical':uniprot_canonical}
            if len(transcripts):
                if grch37_canonical_seq!=canon_seq:
                    isoforms[p.entry_name]['canonical_disagreement'] = True
                    canonical_disagreement_count += 1
                # isoforms[p.entry_name].append(alternative_ids_uniprot)
                # isoforms[p.entry_name].append(not_found)
                total_proteins_with_isoforms += 1
                
            # break
            # checkpoint after each protein: write results gathered so far
            f = open('protein/data/all_isoforms_gtex.json', 'w')
            json.dump(isoforms,f, indent=4, separators=(',', ': '))
            f.close()
            #break

        for seq,ts in sequences_lookup.items():
            if len(ts)>1:
                print('identical sequence',ts)

        for t in total_not_found:
            ts_check = sequences_lookup[all_transcript_seq[t]]
            found = False
            for t_check in ts_check:
                if t_check[0] not in total_not_found:
                    print(t,'found but under another id',t_check[0])
                    found = True

            if not found:
                print('##',t,'in LMB but not in this search')


        # print small summary results
        print('total_proteins_searched',len(ps))
        print('total_proteins_with_isoforms', total_proteins_with_isoforms)
        print('Total transcripts deemed to be isoforms',total_transcripts)
        print('Amount of these not in LMB data',len(total_new_transcripts))
        print(new_proteins)
        # print('Amount in LMB not found',len(total_not_found))
        # print(total_not_found)
        print('Amount in LMB found but skipped due to GTEX data',len(total_not_found_due_to_skipped))
        print(total_not_found_due_to_skipped)
        print('Sequence compare to LMB', lmb_compare_sequences)
        print('canonical_disagreement_count',canonical_disagreement_count)
        print(total_not_found)
        # print('total_transcript_skipped_no_tissue',total_transcript_skipped_no_tissue)
        # print('total_transcript_skipped_no_tissue2 ',len(transcripts_ids_skipped_total))
        # print('total_fetched_transcripts',total_fetched_transcripts)

        # print(gene_to_ensembl)
        # save to file
        f = open('protein/data/all_isoforms_gtex.json', 'w')
        json.dump(isoforms,f, indent=4, separators=(',', ': '))
        f.close()
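
The GTEx gate above keys transcripts as '<transcript id>_<gene id>' and discards anything absent from the parsed annotation or supported by fewer than three subjects. Reduced to its essentials (the ids and counts here are invented):

# Invented GTEx-style records: '<transcript>_<gene>' -> subject count
gtex = {'ENST00000000001_ENSG00000000001.1': {'count': 5},
        'ENST00000000002_ENSG00000000001.1': {'count': 1}}

def keep_transcript(t_id, ensembl_gene_id):
    key = '{}_{}'.format(t_id, ensembl_gene_id)
    return key in gtex and gtex[key]['count'] >= 3

print(keep_transcript('ENST00000000001', 'ENSG00000000001.1'))  # True
print(keep_transcript('ENST00000000002', 'ENSG00000000001.1'))  # False, too few subjects
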
Example #18
0
    def update_from_doi(self, doi):
        logger = logging.getLogger('build')
        # should entrez be tried as a backup?
        try_entrez_on_fail = False
        
        # check whether this data is cached
        cache_dir = ['crossref', 'doi']
        url = 'http://api.crossref.org/works/$index'
        pub = fetch_from_web_api(url, doi, cache_dir)
                
        if pub:
            # update record
            try:
                self.title = pub['message']['title'][0]
                try: 
                    self.year = pub['message']['created']['date-parts'][0][0]
                except:
                    self.year = pub['message']['deposited']['date-parts'][0][0]

                # go from [{'family': 'Gloriam', 'given': 'David E.'}] to ['Gloriam DE']
                authors = ['{} {}'.format(x['family'], ''.join([y[:1] for y in x['given'].split()]))
                    for x in pub['message']['author']]
                self.authors = ', '.join(authors)
            
                # get volume and pages if available
                reference = {}
                fields = ['volume', 'page']
                for f in fields:
                    if f in pub['message']:
                        reference[f] = pub['message'][f]
                    else:
                        reference[f] = 'X'
                self.reference = '{}:{}'.format(reference['volume'], reference['page'])

                # journal
                journal = pub['message']['container-title'][0]
                try:
                    # not all records have the journal abbreviation
                    journal_abbr = pub['message']['container-title'][1]
                except:
                    journal_abbr = slugify(journal)
                try:
                    self.journal, created = PublicationJournal.objects.get_or_create(name=journal,
                        defaults={'slug': journal_abbr})
                    if created:
                        logger.info('Created journal {}'.format(journal))
                except IntegrityError:
                    self.journal = PublicationJournal.objects.get(name=journal)
            except Exception as msg:
                logger.warning('Processing data from CrossRef for {} failed: {}'.format(doi, msg))
                try_entrez_on_fail = True
        else:
            print("Publication not on crossref",doi)
            try_entrez_on_fail = True

        if try_entrez_on_fail:
            # try searching entrez for DOI
            try:
                Entrez.email = '*****@*****.**'
                record = Entrez.read(Entrez.esearch(
                    db='pubmed',
                    retmax=1,
                    term=doi
                    ))
                self.update_from_pubmed_data(record['IdList'][0])
            except:
                return False
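
The author-formatting comprehension in update_from_doi is compact enough to deserve a worked example. With a made-up CrossRef-style author list (the second record is purely illustrative) it behaves like this:

# Made-up CrossRef-style author records, purely for illustration
author_records = [{'family': 'Gloriam', 'given': 'David E.'},
                  {'family': 'Munk', 'given': 'Christian'}]
authors = ['{} {}'.format(x['family'],
                          ''.join(y[:1] for y in x['given'].split()))
           for x in author_records]
print(', '.join(authors))  # -> Gloriam DE, Munk C
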
Example #19
0
    def main_func(self, positions, iteration, count, lock):
        # filenames
        # if not positions[1]:
        #     rows = self.data[positions[0]:]
        # else:
        #     rows = self.data[positions[0]:positions[1]]

        missing_proteins = {}
        mutants_for_proteins = {}
        wrong_uniport_ids = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        rows = self.data_all
        while count.value < len(rows):
            with lock:
                r = rows[count.value]
                count.value += 1
        # for r in rows:
        # print(r['source_file'],c)
        # PRINT IF ERRORS OCCUR
        #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try:  #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'],
                                             web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(
                            index=r['reference'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'],
                                                 web_resource__slug=pub_type)

                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' +
                                          str(r['reference']) + ' ' + pub_type)
                        continue  #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else:  #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        wl = WebLink.objects.get(index=r['review'],
                                                 web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        try:
                            wl = WebLink.objects.create(
                                index=r['review'],
                                web_resource=WebResource.objects.get(
                                    slug=pub_type))
                        except IntegrityError:
                            wl = WebLink.objects.get(
                                index=r['review'], web_resource__slug=pub_type)

                    try:
                        pub_review = Publication.objects.get(web_link=wl)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = wl
                            pub_review.save()
                        except IntegrityError:
                            pub_review = Publication.objects.get(web_link=wl)

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(
                                index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' +
                                              str(r['review']) + ' ' +
                                              pub_type)
                            continue  #if something off with publication, skip.
                    self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(
                        r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                           str(r['ligand_name']))
                except Exception as msg:
                    print(
                        'Something errored with ligand, aborting entry of mutation',
                        r['ligand_name'], r['ligand_type'], r['ligand_id'],
                        r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'], canonical=True
                ).exists(
                ):  #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=True)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=False
                ).exists(
                ):  #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False,
                                               ambigious_alias=False)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=True
                ).exists(
                ):  #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                           canonical=False).exists(
                                           ):  #ambigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']:  #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    try:
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    except IntegrityError:
                        if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                                 canonical=True).exists():
                            l_ref = Ligand.objects.get(
                                name=r['exp_mu_ligand_ref'], canonical=True)
                        else:
                            l_ref = Ligand.objects.get(
                                name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                                   canonical=False)
                        # print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

            protein_id = 0
            residue_id = 0

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            elif r['protein'] not in missing_proteins:

                try:
                    real_uniprot = wrong_uniport_ids[r['protein']]
                    r['protein'] = real_uniprot
                    protein = Protein.objects.get(entry_name=r['protein'])
                    # print('fetched with lookup table',r['protein'])
                except:
                    # look for it as uniprot
                    protein = Protein.objects.filter(
                        web_links__web_resource__slug='uniprot',
                        web_links__index=r['protein'].upper())
                    if protein.exists():
                        protein = protein.get()
                        real_uniprot = protein.entry_name
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url,
                                                             r['protein'],
                                                             cache_dir,
                                                             xml=True)
                        try:
                            real_uniprot = uniprot_protein.find(
                                './/{http://uniprot.org/uniprot}name'
                            ).text.lower()
                            protein = Protein.objects.get(
                                entry_name=real_uniprot)
                        except:
                            skipped += 1
                            if r['protein'] in missing_proteins:
                                missing_proteins[r['protein']] += 1
                            else:
                                missing_proteins[r['protein']] = 1
                                # print('Skipped due to no protein '+ r['protein'])
                                self.logger.error(
                                    'Skipped due to no protein ' +
                                    r['protein'])
                            continue
                    wrong_uniport_ids[r['protein']] = protein.entry_name
                    r['protein'] = real_uniprot
            else:
                missing_proteins[r['protein']] += 1
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  #FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' +
                                  r['protein'] + ' pos:' +
                                  str(r['mutation_pos']) + ' AA:' +
                                  r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

            if r['ligand_class']:
                try:
                    l_role, created = LigandRole.objects.get_or_create(
                        name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]
                                  })  # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(
                            slug=slugify(r['ligand_class'])[:50]).exists():
                        l_role = LigandRole.objects.get(
                            slug=slugify(r['ligand_class'])[:50])
                        if l_role.name == slugify(r['ligand_class'])[:50]:
                            #if name of role is same as slug, then it was created by constructs script, replace it
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with", r['ligand_class'],
                              slugify(r['ligand_class'])[:50])
                        l_role, created = LigandRole.objects.get_or_create(
                            slug=slugify(r['ligand_class'])
                            [:50])  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
            #     exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            # else:
            #     exp_opt_id = None

            try:
                mutation, created = Mutation.objects.get_or_create(
                    amino_acid=r['mutation_to'], protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                                protein=protein,
                                                residue=res)
            logtypes = ['pEC50', 'pIC50', 'pK']

            foldchange = 0
            typefold = ''
            if r['exp_wt_value'] != 0 and r[
                    'exp_mu_value_raw'] != 0:  #fix for new format
                if re.match("(" + ")|(".join(logtypes) + ")",
                            r['exp_type']):  #-log values!
                    try:
                        foldchange = round(
                            math.pow(10, -r['exp_mu_value_raw']) /
                            pow(10, -r['exp_wt_value']), 3)
                    except:
                        print(r)
                    typefold = r['exp_type'] + "_log"
                elif "%" == r['exp_wt_unit']:
                    # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better
                    foldchange = round(
                        r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                    typefold = r['exp_type'] + "_not_log"
                if 0 < foldchange < 1:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                if foldchange < 1: foldchange = -round((1 / foldchange), 3)
            r['fold_effect'] = foldchange

            raw_experiment = self.insert_raw(r)
            # raw_experiment.save()
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                submitting_group=r['submitting_group'],
                data_container=r['data_container'],
                data_container_number=r['data_container_number'],
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                # raw = raw_experiment, #raw_experiment, OR None
                # optional = exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],  #
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
                opt_receptor_expression=r['opt_receptor_expression'],
                opt_basal_activity=r['opt_basal_activity'],
                opt_gain_of_activity=r['opt_gain_of_activity'],
                opt_ligand_emax=r['opt_ligand_emax'],
                opt_agonist=r['opt_agonist'],
            )
            # for line,val in r.items():
            #     val = str(val)
            #     if len(val)>100:
            #         print(line,"too long",val)
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            # try:
            #     bulk.save()
            # except Exception as e:
            #     print(e)
            #     print(r)
            #     break
            #print('saved ',r['source_file'])
            inserted += 1
            end = time.time()
            diff = round(end - current, 2)
            #print(diff)

        self.logger.info('Parsed ' + str(c) +
                         ' mutant data entries. Skipped ' + str(skipped))

        current = time.time()

        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i, me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current, 2)
        # current_sheet
        diff_2 = round(end - current_sheet, 2)
        print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
              str(skipped))
        sorted_missing_proteins = sorted(missing_proteins.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True)
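
The fold-change arithmetic in the loop above is easy to get wrong, so the sketch below restates the same convention as a standalone function. The name and signature are illustrative, not part of the codebase; only the math mirrors the loop.

import math

def fold_change(wt_value, mu_value, exp_type, wt_unit,
                logtypes=('pEC50', 'pIC50', 'pK')):
    # illustrative restatement of the convention above; not project code
    if wt_value == 0 or mu_value == 0:
        return 0
    if exp_type.startswith(logtypes):
        # -log10 scale (e.g. pIC50): convert back to linear before the ratio
        fold = math.pow(10, -mu_value) / math.pow(10, -wt_value)
    elif wt_unit == '%':
        # percentages: lower mutant value = weaker effect
        fold = wt_value / mu_value
    else:
        # concentrations: lower value = more potent
        fold = mu_value / wt_value
    fold = round(fold, 3)
    if 0 < fold < 1:
        # magnitudes are kept >= 1; direction is carried by the sign
        fold = -round(1 / fold, 3)
    return fold

# a pIC50 drop from 9.0 (wt) to 7.0 (mutant) is a 100-fold potency loss:
#   fold_change(9.0, 7.0, 'pIC50', 'M')  ->  100.0
# the reverse change is reported as -100.0 rather than 0.01:
#   fold_change(7.0, 9.0, 'pIC50', 'M')  ->  -100.0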
Example #20
    def load_from_pubchem(self,
                          lookup_type,
                          pubchem_id,
                          ligand_type,
                          ligand_title=False):
        logger = logging.getLogger('build')

        # if ligand title is specified, use that as the name
        if ligand_title:
            ligand_name = ligand_title

        # otherwise, fetch ligand name from pubchem
        else:
            # check cache
            cache_dir = ['pubchem', 'cid', 'synonyms']
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/synonyms/json'.format(
                lookup_type)
            pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

            # get name from response
            try:
                ligand_name = pubchem['InformationList']['Information'][0][
                    'Synonym'][0]
            except Exception:
                logger.warning(
                    'Ligand {} not found in PubChem'.format(pubchem_id))
                return None

        # fetch ligand properties from pubchem
        properties = {}

        # check cache
        cache_dir = ['pubchem', 'cid', 'property']
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/{}/$index/property/CanonicalSMILES,InChIKey/json'.format(
            lookup_type)
        pubchem = fetch_from_web_api(url, pubchem_id, cache_dir)

        # get properties from response
        try:
            properties['smiles'] = pubchem['PropertyTable']['Properties'][0][
                'CanonicalSMILES']
            properties['inchikey'] = pubchem['PropertyTable']['Properties'][0][
                'InChIKey']
        except Exception:
            logger.warning('Ligand {} not found in PubChem'.format(pubchem_id))
            return None

        # pubchem webresource
        web_resource = WebResource.objects.get(slug='pubchem')

        # does a canonical ligand with this name already exist?
        try:
            return Ligand.objects.get(name=ligand_name, canonical=True)
            # FIXME check inchikey
        except Ligand.DoesNotExist:
            pass  # continue

        # does a (canonical) ligand with this inchikey already exist?
        try:
            existing_lp = LigandProperities.objects.get(
                inchikey=properties['inchikey'])
            self.properities = existing_lp
            self.name = ligand_name
            self.canonical = False
            self.ambigious_alias = False

            try:
                self.save()
                return self
            except IntegrityError:
                return Ligand.objects.get(name=ligand_name, canonical=False)
        except LigandProperities.DoesNotExist:
            return self.update_ligand(ligand_name, properties, ligand_type,
                                      web_resource, pubchem_id)
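
All of these loaders lean on fetch_from_web_api(url, index, cache_dir), which substitutes $index into the URL template and caches the parsed response on disk. Its implementation is not shown in these examples; a minimal sketch of the cache-then-fetch pattern might look like the following. The file layout and error handling are assumptions, and the real helper also accepts raw and xml flags (returning raw content or a parsed XML tree instead of JSON, as the DSSP and UniProt examples show).

import json
import os
from urllib.request import urlopen

def fetch_from_web_api_sketch(url, index, cache_dir):
    # minimal cache-then-fetch sketch; details of the real helper may differ
    cache_file = os.path.join('cache', *cache_dir, str(index) + '.json')
    # serve from the on-disk cache when possible
    if os.path.isfile(cache_file):
        with open(cache_file) as f:
            return json.load(f)
    # otherwise hit the web service, substituting the $index placeholder
    try:
        with urlopen(url.replace('$index', str(index))) as response:
            data = json.loads(response.read().decode('utf-8'))
    except Exception:
        return False
    # persist for the next run before returning
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, 'w') as f:
        json.dump(data, f)
    return data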
Example #21
    def main_func(self, positions, iteration, count, lock):
        # filenames
        # if not positions[1]:
        #     rows = self.data[positions[0]:]
        # else:
        #     rows = self.data[positions[0]:positions[1]]


        missing_proteins = {}
        mutants_for_proteins = {}
        wrong_uniprot_ids = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        rows = self.data_all
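        # workers share one counter: each grabs the next row index under the lock,
        # so every row is processed exactly once across processes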
        while count.value < len(rows):
            with lock:
                r = rows[count.value]
                count.value += 1
        # for r in rows:
            # print(r['source_file'],c)
            # PRINT IF ERRORS OCCUR
            #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try: # fix references that were parsed as floats (e.g. 12345.0 -> '12345')
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

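            # get-or-create with an IntegrityError fallback: another worker may insert
            # the same WebLink/Publication between our get() and create(), so each
            # create is retried as a get on conflict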
            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)


                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)


                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except Exception:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue # if something is off with the publication, skip this entry
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else: #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        try:
                            wl = WebLink.objects.create(index=r['review'],
                                    web_resource = WebResource.objects.get(slug=pub_type))
                        except IntegrityError:
                            wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)

                    try:
                        pub_review = Publication.objects.get(web_link=wl)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = wl
                            pub_review.save()
                        except IntegrityError:
                            pub_review = Publication.objects.get(web_link=wl)


                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(index=r['review'])
                        try:
                            pub_review.save()
                        except Exception:
                            self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                            continue # if something is off with the publication, skip this entry
                    self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

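            # resolve the experiment ligand via a two-level cache (name -> id -> Ligand)
            # to avoid repeated database and web-service lookups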
            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                except Exception as msg:
                    print('Ligand lookup failed; skipping mutation entry', r['ligand_name'], r['ligand_type'], r['ligand_id'], r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): # alias with exactly one canonical parent, i.e. unambiguous
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): # alias with several canonical parents; needs investigation, so start from an empty record
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): # ambigious_alias flag not yet set on this alias
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: # neither a canonical name nor an alias exists, so create the records and check canonical/alias status
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    try:
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    except IntegrityError:
                        if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                        else:
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref


            protein_id = 0
            residue_id = 0

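            # resolve the protein: first by entry name, then via the cached UniProt
            # correction table, then by UniProt accession, and finally by querying
            # the UniProt web service to catch typos and name variants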
            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            elif r['protein'] not in missing_proteins:

                try:
                    # consult the correction table first; a KeyError falls through
                    # to the accession and web-service lookups below
                    real_uniprot = wrong_uniprot_ids[r['protein']]
                    r['protein'] = real_uniprot
                    protein = Protein.objects.get(entry_name=r['protein'])
                    # print('fetched with lookup table', r['protein'])
                except (KeyError, Protein.DoesNotExist):
                    # look for it as uniprot
                    protein=Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper())
                    if protein.exists():
                        protein=protein.get()
                        real_uniprot = protein.entry_name
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml = True)
                        try:
                            real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                            protein=Protein.objects.get(entry_name=real_uniprot)
                        except Exception:
                            skipped += 1
                            if r['protein'] in missing_proteins:
                                missing_proteins[r['protein']] += 1
                            else:
                                missing_proteins[r['protein']] = 1
                                # print('Skipped due to no protein '+ r['protein'])
                                self.logger.error('Skipped due to no protein '+ r['protein'])
                            continue
                    wrong_uniprot_ids[r['protein']] = protein.entry_name
                    r['protein'] = real_uniprot
            else:
                missing_proteins[r['protein']] += 1
                continue


            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

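            # map the free-text ligand class onto a LigandRole, creating it on first
            # sight; slugs are truncated to 50 characters to fit the field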
            if r['ligand_class']:
                try:
                    l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists():
                        l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50])
                        if l_role.name == slugify(r['ligand_class'])[:50]:
                            #if name of role is same as slug, then it was created by constructs script, replace it
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with",r['ligand_class'],slugify(r['ligand_class'])[:50] )
                        l_role, created = LigandRole.objects.get_or_create(slug=slugify(r['ligand_class'])[:50]) # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
            #     exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            # else:
            #     exp_opt_id = None

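            # get or create the Mutation record; a parallel worker may create it
            # between the lookup and the insert, hence the IntegrityError fallback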
            try:
                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res)
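            # experiment types recorded on a -log10 scale are matched by prefix and
            # converted back to linear values before the fold change is taken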
            logtypes = ['pEC50','pIC50','pK']

            foldchange = 0
            typefold = ''
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0: # fix for new format
                if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): # -log values!
                    try:
                        foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / math.pow(10, -r['exp_wt_value']), 3)
                    except Exception:
                        print(r)
                    typefold = r['exp_type'] + "_log"
                elif "%" == r['exp_wt_unit']:
                    # for % units a lower value means a weaker effect, so divide wt by mu
                    foldchange = round(r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
                else:
                    # for concentration units a lower value is more potent, so divide mu by wt
                    foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                    typefold = r['exp_type'] + "_not_log"
                # report ratios below 1 as negative reciprocals, so magnitudes stay >= 1
                if 0 < foldchange < 1:
                    foldchange = -round(1 / foldchange, 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                if foldchange < 1:
                    foldchange = -round(1 / foldchange, 3)
            r['fold_effect'] = foldchange
            
            raw_experiment = self.insert_raw(r)
            # raw_experiment.save()
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                submitting_group=r['submitting_group'],
                data_container=r['data_container'],
                data_container_number=r['data_container_number'],
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                # raw = raw_experiment, #raw_experiment, OR None
                # optional = exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
                opt_receptor_expression=r['opt_receptor_expression'],
                opt_basal_activity=r['opt_basal_activity'],
                opt_gain_of_activity=r['opt_gain_of_activity'],
                opt_ligand_emax=r['opt_ligand_emax'],
                opt_agonist=r['opt_agonist'],
            )
            # for line,val in r.items():
            #     val = str(val)
            #     if len(val)>100:
            #         print(line,"too long",val)
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            # try:
            #     bulk.save()
            # except Exception as e:
            #     print(e)
            #     print(r)
            #     break
            #print('saved ',r['source_file'])
            inserted += 1
            end = time.time()
            diff = round(end - current,2)
            #print(diff)

        self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        current = time.time()

        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i,me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current,2)
        diff_2 = round(end - current_sheet,2)
        print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
        sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)
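
A design note on the final bulk insert: pairing raws[i] with bulk_m[i] assumes bulk_create returns the created objects in input order (it does) and that the backend populates their primary keys (PostgreSQL does; backends without RETURNING support leave them unset, which would break the raw foreign key). A minimal restatement of the pattern follows; the transaction.atomic wrapper is an added suggestion, not part of the original code.

from django.db import transaction

def bulk_insert_experiments(raw_rows, experiment_rows):
    # sketch of the paired bulk-insert pattern above; assumes the same
    # MutationRaw / MutationExperiment models and equal-length input lists
    assert len(raw_rows) == len(experiment_rows)
    with transaction.atomic():
        # bulk_create returns the objects in the order they were passed in
        raws = MutationRaw.objects.bulk_create(raw_rows)
        for raw, experiment in zip(raws, experiment_rows):
            # requires raw.pk to be populated, i.e. a backend with RETURNING support
            experiment.raw = raw
        MutationExperiment.objects.bulk_create(experiment_rows)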