Example #1
0
    def fetch_ligand(self, ligand_id, smiles):
        """
        Resolve the ligand for ``ligand_id``, using ``smiles`` as a fallback.

        Lookup order:
          1. ``self.ligand_cache`` (returned as-is on a hit);
          2. the first ``Ligand`` having a web link whose index equals
             ``ligand_id`` -- accepted only when that ligand also has a
             'pubchem' web link, otherwise None is returned;
          3. ``get_or_make_ligand(smiles, 'SMILES', ligand_id)`` when no
             stored ligand matches.

        Any exception raised along the way is swallowed and None is returned.
        """
        try:
            if ligand_id in self.ligand_cache:
                return self.ligand_cache[ligand_id]

            ligand = Ligand.objects.filter(
                properities__web_links__index=ligand_id).first()
            if not ligand:
                # nothing stored under this index: build it from the SMILES
                return get_or_make_ligand(smiles, 'SMILES', ligand_id)

            pubchem_link = ligand.properities.web_links.filter(
                web_resource__slug='pubchem').first()
            if not pubchem_link:
                # stored ligand without a pubchem link is rejected
                return None

            _ = pubchem_link.index  # index is read but not used further
            return ligand
        except Exception:
            return None
Example #2
0
    def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
        """
        Return the ligand for ``ligand_id``, memoised in ``self.ligand_cache``.

        On a cache miss the ligand is obtained from
        ``get_or_make_ligand(ligand_id, ligand_type, ligand_name)`` and the
        result is stored in the cache under ``ligand_id``.
        ``source_file`` is accepted but not used by this method.

        Any exception is swallowed and None is returned.
        """
        try:
            if ligand_id in self.ligand_cache:
                return self.ligand_cache[ligand_id]

            ligand = get_or_make_ligand(ligand_id, ligand_type, ligand_name)
            self.ligand_cache[ligand_id] = ligand
            return ligand
        except Exception:
            return None
Example #3
0
    def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
        """
        Fetch a ligand via ``get_or_make_ligand``, with a two-level cache.

        ``self.ligand_cache`` maps ``str(ligand_id)`` to a dict that maps
        ``ligand_id`` to the ligand. A truthy cached value is returned
        directly; otherwise ``get_or_make_ligand(ligand_id, ligand_type,
        str(ligand_name))`` is called and its result is stored in that dict.

        ligand_id: identifier passed to ``get_or_make_ligand``; cache key.
        ligand_type: identifier type passed to ``get_or_make_ligand``.
        ligand_name: ligand name, passed as ``str``.
        source_file: only used in the error log message.

        Returns the ligand, or None when the lookup raised (the error is
        logged with traceback through ``self.mylog``).
        """
        # setdefault creates the per-id bucket the first time an id is seen
        bucket = self.ligand_cache.setdefault(str(ligand_id), {})
        l = bucket.get(ligand_id)
        if l:
            return l

        try:
            l = get_or_make_ligand(
                ligand_id, ligand_type, str(ligand_name))
        except Exception:
            # one %s placeholder per argument, so the record can be formatted
            self.mylog.exception(
                "Protein fetching error | module: fetch_ligand. Row # is : %s %s %s %s",
                ligand_name, ligand_type, ligand_id, source_file)
            return None

        # store under the same keys the lookup above reads
        bucket[ligand_id] = l
        return l
Example #4
0
 def fetch_ligand(self, ligand_id, ligand_type, ligand_name, source_file):
     """
     Fetch a ligand, falling back to an empty placeholder ligand.

     The ligand is taken from ``self.ligand_cache`` or, on a miss, from
     ``get_or_make_ligand(ligand_id, ligand_type, ligand_name)`` (the result
     is cached). A None result is replaced by
     ``self.create_empty_ligand(ligand_name)``.

     If that raises, the ligand is looked up by its 'pubchem' web link with
     index ``ligand_id``; if this lookup fails too, an empty ligand is
     created instead. ``source_file`` is accepted but not used.

     Returns the resolved or placeholder ligand.
     Raises whatever ``WebResource.objects.get(slug='pubchem')`` raises in
     the fallback path; KeyboardInterrupt/SystemExit are no longer trapped.
     """
     l = None
     try:
         if ligand_id in self.ligand_cache:
             l = self.ligand_cache[ligand_id]
         else:
             l = get_or_make_ligand(ligand_id, ligand_type, ligand_name)
             self.ligand_cache[ligand_id] = l
         if l is None:
             l = self.create_empty_ligand(ligand_name)
     except Exception:
         # `except Exception` rather than a bare except, so interpreter-exit
         # signals propagate instead of triggering the database fallback
         web_resource = WebResource.objects.get(slug='pubchem')
         try:
             l = Ligand.objects.get(
                 properities__web_links__web_resource=web_resource,
                 properities__web_links__index=ligand_id)
         except Exception:
             l = self.create_empty_ligand(ligand_name)
     return l
Example #5
0
 def fetch_ligand(self, ligand_id, smiles):
     """
     Resolve the ligand named ``ligand_id``, using ``smiles`` as a fallback.

     Lookup order:
       1. ``self.ligand_cache``;
       2. ``Ligand.objects.get(name=ligand_id)``;
       3. ``get_or_make_ligand(smiles, 'SMILES', ligand_id)`` when no
          ligand with that name exists.

     Returns the ligand, or None when any other error occurs (including
     several ligands sharing the name); such errors are swallowed.
     """
     try:
         if ligand_id in self.ligand_cache:
             return self.ligand_cache[ligand_id]
         try:
             return Ligand.objects.get(name=ligand_id)
         except Ligand.DoesNotExist:
             # get() raises instead of returning a falsy value, so the
             # SMILES fallback has to live in this handler to be reachable
             return get_or_make_ligand(
                 smiles,
                 'SMILES',
                 ligand_id,
             )
     except Exception:
         return None
Example #6
0
    def main_func(self, positions, iteration,count,lock):
        """
        Worker loop: link ChEMBL compounds to ligands, or load assay rows.

        Work items are claimed one at a time through the shared ``count``
        index while holding ``lock``; presumably several workers run this
        method in parallel -- confirm against the caller.

        iteration == 0: for every id in ``self.chembl_mol_ids`` make sure a
        ligand exists that carries both a 'pubchem' and a 'chembl_ligand'
        web link, then load vendor records from PubChem when the ligand has
        none yet.
        iteration == 1: for every record in ``self.data`` create or update
        the ``AssayExperiment`` of its (protein, ligand, assay) triple.

        positions: not used by this method.
        iteration: selects the pass to run (0 or 1); other values do nothing.
        count: shared counter exposing ``.value``, the next index to claim.
        lock: context manager guarding reads and writes of ``count.value``.
        """
        #####Create chembl compound link and connect it to the corresponding ligand/cid#####

        if iteration==0:
            # First load makes sure ligands are there
            list_of_chembl_ids = self.chembl_mol_ids
            while True:
                with lock:
                    # The bounds check must happen under the lock: checking
                    # outside lets another worker claim the last item in
                    # between, which ends in an IndexError here.
                    if count.value >= len(list_of_chembl_ids):
                        break
                    chembl_ligand = list_of_chembl_ids[count.value]
                    count.value += 1
                    if count.value % 1000 == 0:
                        print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'), count.value, len(list_of_chembl_ids)))

                l = Ligand.objects.filter(properities__web_links__web_resource__slug = 'chembl_ligand', properities__web_links__index=chembl_ligand).first()
                if l:
                    cid = l.properities.web_links.filter(web_resource__slug = 'pubchem').first()
                    if cid:
                        cid = cid.index
                    else:
                        # no pubchem link yet: drop l so the code below runs
                        l = None

                if not l:
                    # if l already has chembl link, assume all is good.
                    if chembl_ligand not in self.chembl_cid_dict:
                        cids, not_found = self.find_cid_for_chembl(chembl_ligand)
                        if not_found:
                            print('SKIPPED: Could not determine CID',chembl_ligand,cids)
                            continue
                    else:
                        cids = self.chembl_cid_dict[chembl_ligand]

                    temp = str(cids).split(';') #perhaps we should load all of the CIDs
                    cid = str(temp[0])

                    l = get_or_make_ligand(cid,'PubChem CID') #call the first cid if there are more than one
                    if not l:
                        print('SKIPPED: Ligand not found in PubChem', cid)
                        continue

                    if not l.properities.web_links.filter(web_resource__slug = 'pubchem',index = cid).exists():
                        # NO CID FOR LIGAND! Rare cases where SMILES was used for initial look up
                        wl, created = WebLink.objects.get_or_create(index=cid, web_resource=self.wr_pubchem)
                        l.properities.web_links.add(wl)

                    if not l.properities.web_links.filter(web_resource__slug = 'chembl_ligand',index = chembl_ligand).exists():
                        wl, created = WebLink.objects.get_or_create(index=chembl_ligand, web_resource=self.wr)
                        l.properities.web_links.add(wl)

                ###### Vendor stuff  ######
                if not len(l.properities.vendors.all()):
                    # If it has some, assume they are all loaded
                    cache_dir = ['pubchem', 'cid', 'vendors']
                    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/$index/JSON/'
                    vendors = fetch_from_web_api(url, cid, cache_dir)

                    if vendors:
                        for vendor_data in vendors['SourceCategories']['Categories'][0]['Sources'] :
                            lv, created = LigandVendors.objects.get_or_create(slug = slugify(vendor_data['SourceName']))
                            lv.name = vendor_data['SourceName']
                            if 'SourceURL' in vendor_data:
                                lv.url = vendor_data['SourceURL']
                            lv.save()

                            if 'SID' in vendor_data:
                                #print (vendor_data['SID'])
                                lvls = LigandVendorLink.objects.filter(sid = vendor_data['SID'] )
                                if not lvls.exists():
                                    lvl = LigandVendorLink()
                                    lvl.vendor = lv
                                    lvl.lp = l.properities
                                    lvl.sid =  vendor_data['SID']
                                    if 'RegistryID' in vendor_data:
                                        lvl.vendor_external_id = vendor_data['RegistryID']
                                    if 'SourceRecordURL' in vendor_data:
                                        lvl.url = vendor_data['SourceRecordURL']
                                    else:
                                        # a link without a record URL is not saved
                                        continue
                                    lvl.save()

        elif iteration==1:
            # Third load loads the exp (based on ligand/assay)
            header = self.header_dict
            skipped = 0
            non_p = []
            wr_chembl_assays = WebResource.objects.get(slug='chembl_assays')
            while True:
                with lock:
                    # bounds check under the lock, same reason as above
                    if count.value >= len(self.data):
                        break
                    record = self.data[count.value]
                    count.value += 1
                    if count.value % 10000 == 0:
                        print('{} Status {} out of {}'.format(
                        datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'), count.value, len(self.data)))


                target = record[header['target_chembl_id']]
                assay_id = record[header['assay_chembl_id']]

                assay, created = ChemblAssay.objects.get_or_create(assay_id=assay_id)
                if created:
                    wl, created = WebLink.objects.get_or_create(index=assay_id, web_resource=wr_chembl_assays)
                    assay.web_links.add(wl)


                ligand =record[header['molecule_chembl_id']]
                p = Protein.objects.filter(web_links__index = target, web_links__web_resource__slug = 'chembl').first()
                if not p:
                    # report every unknown target only once
                    if target not in non_p:
                        non_p.append(target)
                        print('Not found protein!',target)
                    continue

                ls = Ligand.objects.filter(properities__web_links__index=ligand, properities__web_links__web_resource__slug = 'chembl_ligand', canonical=True)
                if not ls.exists():
                    # if no ligand matches this, then ignore -- be sure this works later.
                    skipped += 1
                    continue
                # Leaves `l` bound to the first match; warns when more than
                # one canonical ligand carries this chembl id.
                for l in ls:
                    if len(ls)>1:
                        print('issue with canonical! give to munk',l,l.pk,ligand)
                        break
                assay_experiments = AssayExperiment.objects.filter( protein=p, ligand=l, assay=assay)

                if assay_experiments.exists():
                    assay_experiment = assay_experiments.get()
                else:
                    assay_experiment = AssayExperiment()
                    assay_experiment.assay = assay
                    assay_experiment.ligand = l
                    assay_experiment.protein = p

                # overwrite the measured values on both new and existing rows
                assay_experiment.assay_type = record[header['assay_type']]
                assay_experiment.pchembl_value = record[header['pchembl_value']]
                assay_experiment.assay_description = record[header['assay_description']]
                assay_experiment.published_value = record[header['published_value']]
                assay_experiment.published_relation = record[header['published_relation']]
                assay_experiment.published_type = record[header['published_type']]
                assay_experiment.published_units = record[header['published_units']]

                assay_experiment.standard_value = record[header['standard_value']]
                assay_experiment.standard_relation = record[header['standard_relation']]
                assay_experiment.standard_type = record[header['standard_type']]
                assay_experiment.standard_units = record[header['standard_units']]

                try:
                    assay_experiment.save()
                except IntegrityError:
                    # presumably the same row was inserted concurrently -- confirm
                    assay_experiment = AssayExperiment.objects.get( protein=p, ligand=l, assay=assay)

            print('done, skipped:',skipped)
Example #7
0
    def main_func(self, positions, iteration,count,lock):
        """
        Worker loop over ``self.data_all`` resolving the objects of each row.

        Rows are claimed one at a time through the shared ``count`` index
        while holding ``lock``. For every row the publication (cached in
        ``self.publication_cache``), the ligand (cached in
        ``self.ligand_cache`` by name, then id), the protein and the residue
        are resolved; rows whose publication cannot be saved, whose ligand
        lookup raises, or whose protein or residue is missing are skipped.
        Nothing is stored for a fully resolved row in the code shown here.

        positions: not used by this method.
        iteration: not used by this method.
        count: shared counter exposing ``.value``, the next index to claim.
        lock: context manager guarding reads and writes of ``count.value``.
        """

        missing_proteins = {}

        c = 0
        skipped = 0
        rows = self.data_all

        while True:
            with lock:
                # The bounds check must happen under the lock: checking
                # outside lets another worker claim the last row in between,
                # which ends in an IndexError here.
                if count.value >= len(rows):
                    break
                r = rows[count.value]
                count.value += 1

            c += 1

            # publication
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        # presumably created concurrently by another worker -- confirm
                        wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)


                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except Exception:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]


            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                except Exception as msg:
                    print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
            else:
                # Can contain code to try to figure out what protein it is.
                # Count the miss and skip the row; without this the empty
                # queryset would be handed to the Residue lookup below.
                missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
                continue

            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

        self.logger.info('Parsed '+str(c)+' bias data entries. Skipped '+str(skipped))

        print(missing_proteins)
Example #8
0
    def main_func(self, positions, iteration):
        """
        Build and bulk-insert mutation experiments for a slice of ``self.data``.

        For every row the publication, optional review publication, ligand,
        reference ligand, protein, residue and the small lookup objects
        (role, type, func, qual, optional) are resolved or created, a fold
        change is computed, and a ``MutationExperiment`` plus its raw record
        (from ``self.insert_raw``) are queued. Both queues are written with
        ``bulk_create`` after the loop.

        Rows are skipped when a publication cannot be saved or when the
        protein or residue cannot be found.

        positions: pair ``(start, end)`` of indices into ``self.data``; a
            falsy ``end`` means "to the end of the list".
        iteration: not used by this method.
        """
        # filenames
        if not positions[1]:
            rows = self.data[positions[0]:]
        else:
            rows = self.data[positions[0]:positions[1]]

        missing_proteins = {}

        c = 0
        skipped = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        # experiment types reported as -log values; the pattern does not
        # change between rows, so it is built once
        logtypes = ['pEC50', 'pIC50', 'pK']
        log_type_pattern = "(" + ")|(".join(logtypes) + ")"

        for r in rows:
            c += 1

            # publication
            # Each field is normalised on its own, so a non-numeric
            # reference (a DOI) no longer prevents fixing a float review.
            for field in ('reference', 'review'):
                try:  #fix if it thinks it's float.
                    float(r[field])
                    r[field] = str(int(r[field]))
                except ValueError:
                    pass

            if r['reference'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    pub = Publication.objects.get(
                        web_link__index=r['reference'],
                        web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = WebLink.objects.get(
                            index=r['reference'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=r['reference'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                        pub.web_link = wl

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except Exception:
                        self.logger.error('error with reference ' +
                                          str(r['reference']) + ' ' + pub_type)
                        continue  #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            if r['review'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        pub_review = Publication.objects.get(
                            web_link__index=r['review'],
                            web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = WebLink.objects.get(
                                index=r['review'], web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(
                                index=r['review'],
                                web_resource=WebResource.objects.get(
                                    slug=pub_type))
                            pub_review.web_link = wl

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(
                                index=r['review'])
                        try:
                            pub_review.save()
                        except Exception:
                            self.logger.error('error with review ' +
                                              str(r['review']) + ' ' +
                                              pub_type)
                            continue  #if something off with publication, skip.
                    # cache fetched and newly created reviews alike, the
                    # same way the reference publication is cached above
                    self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            # ligand, cached by name and then by id
            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(
                        r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                       str(r['ligand_name']))
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

            # reference ligand, resolved by name only
            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=True).exists():
                    #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=True)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=False).exists():
                    #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False,
                                               ambigious_alias=False)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=True).exists():
                    #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                           canonical=False).exists():
                    #amigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']:  #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                                   canonical=False)
                        print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
            else:
                skipped += 1
                if r['protein'] in missing_proteins:
                    missing_proteins[r['protein']] += 1
                else:
                    # log each missing protein only the first time it is seen
                    missing_proteins[r['protein']] = 1
                    self.logger.error('Skipped due to no protein ' +
                                      r['protein'])
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  #FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' +
                                  r['protein'] + ' pos:' +
                                  str(r['mutation_pos']) + ' AA:' +
                                  r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]
                              })  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[
                    'opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r[
                        'opt_agonist']:
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'],
                    wt=r['opt_wt'],
                    mu=r['opt_mu'],
                    sign=r['opt_sign'],
                    percentage=r['opt_percentage'],
                    qual=r['opt_qual'],
                    agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            try:
                mutation, created = Mutation.objects.get_or_create(
                    amino_acid=r['mutation_to'], protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                                protein=protein,
                                                residue=res)

            foldchange = 0
            typefold = ''
            if r['exp_wt_value'] != 0 and r[
                    'exp_mu_value_raw'] != 0:  #fix for new format

                if re.match(log_type_pattern, r['exp_type']):  #-log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) /
                        pow(10, -r['exp_wt_value']), 3)
                    typefold = r['exp_type'] + "_log"
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                    typefold = r['exp_type'] + "_not_log"

                # values below 1 are stored as the negative inverse
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # same zero guard as above: a tiny fold_effect rounds to 0
                # and would otherwise raise ZeroDivisionError
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                #raw = raw_experiment, #raw_experiment, OR None
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],  #
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange)
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)

        self.logger.info('Parsed ' + str(c) +
                         ' mutant data entries. Skipped ' + str(skipped))

        current = time.time()

        # raw rows are written first so each experiment can point at its own
        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i, me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current, 2)
        diff_2 = round(end - current_sheet, 2)
        print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
              str(skipped))
Example #9
0
    def main_func(self, positions, iteration, count, lock):
        """Turn the rows of ``self.data_all`` into MutationExperiment records.

        Rows are claimed one at a time through the shared ``count`` counter
        while holding ``lock`` (presumably so that several workers can share
        one row list -- confirm against the caller). For each row the
        publication, review, ligand, reference ligand, protein, residue,
        ligand role, experiment descriptors and mutation are looked up or
        created; rows whose publication, ligand, protein or residue cannot be
        resolved are skipped. The collected raw rows and experiments are
        written with two ``bulk_create`` calls at the end.

        Args:
            positions: Unused; kept so the call signature stays unchanged.
            iteration: Unused; kept so the call signature stays unchanged.
            count: Object whose ``value`` attribute is the index of the next
                unclaimed row in ``self.data_all``.
            lock: Context manager protecting ``count``.
        """
        missing_proteins = {}  # entry name -> number of rows dropped for it
        wrong_uniport_ids = {}  # name as written in the data -> real entry_name

        c = 0
        skipped = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        # Experiment types whose values are -log10 concentrations.
        logtypes = ['pEC50', 'pIC50', 'pK']
        log_type_re = re.compile("(" + ")|(".join(logtypes) + ")")

        rows = self.data_all
        while True:
            # The bounds check has to happen under the lock: checking first
            # and locking afterwards lets another worker advance the counter
            # past the end between the two steps.
            with lock:
                if count.value >= len(rows):
                    break
                r = rows[count.value]
                count.value += 1
            c += 1

            # Numeric ids may arrive as floats (e.g. 12345.0). Normalise each
            # field on its own so a DOI in one does not leave the other
            # unconverted.
            for key in ('reference', 'review'):
                try:
                    float(r[key])
                    r[key] = str(int(r[key]))
                except ValueError:
                    pass

            # publication
            pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'
            if r['reference'] in self.publication_cache:
                pub = self.publication_cache[r['reference']]
            else:
                pub = self._get_or_create_publication(r['reference'], pub_type)
                if pub is None:
                    self.logger.error('error with reference %s %s',
                                      r['reference'], pub_type)
                    continue  # if something off with publication, skip.
                self.publication_cache[r['reference']] = pub

            # review (optional)
            if r['review'].isdigit():  # assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else:  # assume doi
                pub_type = 'doi'

            pub_review = None
            if r['review']:
                if r['review'] in self.publication_cache:
                    pub_review = self.publication_cache[r['review']]
                else:
                    pub_review = self._get_or_create_publication(
                        r['review'], pub_type)
                    if pub_review is None:
                        self.logger.error('error with review %s %s',
                                          r['review'], pub_type)
                        continue  # if something off with publication, skip.
                    # Cache existing publications too, not only new ones.
                    self.publication_cache[r['review']] = pub_review

            # ligand, cached per name and then per id
            ligand_name = str(r['ligand_name'])
            ligands_by_id = self.ligand_cache.setdefault(ligand_name, {})
            l = ligands_by_id.get(r['ligand_id'])
            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                           ligand_name)
                except Exception as msg:
                    print(
                        'Something errored with ligand, aborting entry of mutation',
                        r['ligand_name'], r['ligand_type'], r['ligand_id'],
                        r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                ligands_by_id[r['ligand_id']] = l

            # reference ligand
            ref_key = str(r['exp_mu_ligand_ref'])
            if ref_key in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[ref_key]
            else:
                l_ref = self._resolve_reference_ligand(r['exp_mu_ligand_ref'])
                self.ref_ligand_cache[ref_key] = l_ref

            # protein
            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
            elif r['protein'] not in missing_proteins:
                # First try the corrections learnt from earlier rows. The key
                # must be the name as written in the row, so r['protein'] is
                # only rewritten after the lookup has succeeded.
                protein = None
                real_uniprot = wrong_uniport_ids.get(r['protein'])
                if real_uniprot is not None:
                    try:
                        protein = Protein.objects.get(entry_name=real_uniprot)
                    except Protein.DoesNotExist:
                        protein = None

                if protein is None:
                    # look for it as uniprot
                    by_accession = Protein.objects.filter(
                        web_links__web_resource__slug='uniprot',
                        web_links__index=r['protein'].upper())
                    if by_accession.exists():
                        protein = by_accession.get()
                        real_uniprot = protein.entry_name
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url,
                                                             r['protein'],
                                                             cache_dir,
                                                             xml=True)
                        try:
                            real_uniprot = uniprot_protein.find(
                                './/{http://uniprot.org/uniprot}name'
                            ).text.lower()
                            protein = Protein.objects.get(
                                entry_name=real_uniprot)
                        except Exception:
                            skipped += 1
                            missing_proteins[r['protein']] = 1
                            self.logger.error('Skipped due to no protein %s',
                                              r['protein'])
                            continue

                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
            else:
                missing_proteins[r['protein']] += 1
                continue

            # residue
            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error(
                    'Skipped due to no residue or mismatch AA %s pos:%s AA:%s',
                    r['protein'], r['mutation_pos'], r['mutation_from'])
                skipped += 1
                continue

            # ligand role
            if r['ligand_class']:
                role_slug = slugify(r['ligand_class'])[:50]
                try:
                    l_role, _ = LigandRole.objects.get_or_create(
                        name=r['ligand_class'],
                        defaults={'slug': role_slug
                                  })  # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(slug=role_slug).exists():
                        l_role = LigandRole.objects.get(slug=role_slug)
                        if l_role.name == role_slug:
                            # if name of role is same as slug, then it was created by constructs script, replace it
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with", r['ligand_class'], role_slug)
                        l_role, _ = LigandRole.objects.get_or_create(
                            slug=role_slug)  # FIXME this should not be needed
            else:
                l_role = None

            # experiment descriptors
            if r['exp_type']:
                exp_type_id, _ = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, _ = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, _ = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # mutation
            try:
                mutation, _ = Mutation.objects.get_or_create(
                    amino_acid=r['mutation_to'], protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                                protein=protein,
                                                residue=res)

            # fold change
            foldchange = 0
            if r['exp_wt_value'] != 0 and r[
                    'exp_mu_value_raw'] != 0:  # fix for new format
                if log_type_re.match(r['exp_type']):  # -log values!
                    try:
                        foldchange = round(
                            math.pow(10, -r['exp_mu_value_raw']) /
                            pow(10, -r['exp_wt_value']), 3)
                    except Exception:
                        print(r)
                elif "%" == r['exp_wt_unit']:
                    # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better
                    foldchange = round(
                        r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
            # Ratios between 0 and 1 are stored as negative reciprocals. The
            # strict lower bound keeps a value that rounded to 0 from dividing
            # by zero and leaves already-negative fold values untouched.
            if 0 < foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
            r['fold_effect'] = foldchange

            raw_experiment = self.insert_raw(r)
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                submitting_group=r['submitting_group'],
                data_container=r['data_container'],
                data_container_number=r['data_container_number'],
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
                opt_receptor_expression=r['opt_receptor_expression'],
                opt_basal_activity=r['opt_basal_activity'],
                opt_gain_of_activity=r['opt_gain_of_activity'],
                opt_ligand_emax=r['opt_ligand_emax'],
                opt_agonist=r['opt_agonist'],
            )
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)

        self.logger.info('Parsed %s mutant data entries. Skipped %s', c,
                         skipped)

        current = time.time()

        # Raw rows go in first so each experiment can point at its raw record.
        raws = MutationRaw.objects.bulk_create(bulk_r)
        for me, raw in zip(bulk_m, raws):
            me.raw = raw
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current, 2)
        diff_2 = round(end - current_sheet, 2)
        print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
              str(skipped))

    def _get_or_create_publication(self, index, pub_type):
        """Return the Publication for ``index``, creating it when missing.

        The WebLink for ``index``/``pub_type`` is fetched or created first. A
        newly created Publication is filled from DOI or PubMed data when
        ``pub_type`` is ``'doi'`` or ``'pubmed'``. Returns ``None`` when the
        final save of a newly created Publication fails.
        """
        try:
            wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)
        except WebLink.DoesNotExist:
            try:
                wl = WebLink.objects.create(
                    index=index,
                    web_resource=WebResource.objects.get(slug=pub_type))
            except IntegrityError:
                # Something else created the link in the meantime.
                wl = WebLink.objects.get(index=index,
                                         web_resource__slug=pub_type)

        try:
            return Publication.objects.get(web_link=wl)
        except Publication.DoesNotExist:
            pass

        pub = Publication()
        try:
            pub.web_link = wl
            pub.save()
        except IntegrityError:
            pub = Publication.objects.get(web_link=wl)

        if pub_type == 'doi':
            pub.update_from_doi(doi=index)
        elif pub_type == 'pubmed':
            pub.update_from_pubmed_data(index=index)
        try:
            pub.save()
        except Exception:
            return None
        return pub

    def _resolve_reference_ligand(self, name):
        """Return the Ligand for reference-ligand ``name``; ``None`` if empty.

        Existing records are preferred in this order: canonical name,
        unambiguous alias, ambiguous alias (a fresh record is started for
        it), alias with no ambiguity flag set. When nothing matches and
        ``name`` is non-empty a new canonical Ligand is created.
        """
        named = Ligand.objects.filter(name=name)

        # if this name is canonical and it has a ligand record already
        if named.filter(canonical=True).exists():
            return named.get(canonical=True)

        # if this matches an alias that only has "one" parent canonical name - eg distinct
        if named.filter(canonical=False, ambigious_alias=False).exists():
            return named.get(canonical=False, ambigious_alias=False)

        # alias with several canonical parents, must investigate, start with empty.
        if named.filter(canonical=False, ambigious_alias=True).exists():
            lp = LigandProperities()
            lp.save()
            l_ref = Ligand()
            l_ref.properities = lp
            l_ref.name = name
            l_ref.canonical = False
            l_ref.ambigious_alias = True
            l_ref.save()
            l_ref.load_by_name(name)
            l_ref.save()
            return l_ref

        # ambigious_alias not specified
        if named.filter(canonical=False).exists():
            l_ref = named.get(canonical=False)
            l_ref.ambigious_alias = False
            l_ref.save()
            return l_ref

        if not name:
            return None

        # neither a canonical nor an alias exists, create the records.
        lp = LigandProperities()
        lp.save()
        l_ref = Ligand()
        l_ref.properities = lp
        l_ref.name = name
        l_ref.canonical = True
        l_ref.ambigious_alias = False
        try:
            l_ref.save()
            l_ref.load_by_name(name)
        except IntegrityError:
            # A record with this name appeared meanwhile; use it instead.
            if Ligand.objects.filter(name=name, canonical=True).exists():
                l_ref = Ligand.objects.get(name=name, canonical=True)
            else:
                l_ref = Ligand.objects.get(name=name, canonical=False)
        try:
            l_ref.save()
        except IntegrityError:
            l_ref = Ligand.objects.get(name=name, canonical=False)
        return l_ref
Example #10
0
def add_construct(d):
    """Create a Construct and its related records from the dict ``d``.

    Any Construct already stored under ``d['construct_crystal']['pdb_name']``
    is deleted first. The function then stores, in order: crystal info,
    contributor (``contact_info``), ``mutations``, ``deletions``, auxiliary
    insertions (``auxiliary``), ``modifications``, ``expression``,
    ``solubilization`` with its purification steps, and ``crystallization``
    with its chemical lists and ligand. The sections ``contact_info``,
    ``expression``, ``solubilization`` and ``crystallization`` are optional;
    the others are read unconditionally.

    Note that ``d`` is modified: missing ``type``/``remark`` entries of
    mutations, ``expr_remark`` and ``ligand_activity`` are filled with
    defaults.

    Raises:
        Whatever the ORM raises when the protein or structure named in
        ``d['construct_crystal']`` does not exist, and ``KeyError`` for
        missing required keys.
    """
    crystal_info = d['construct_crystal']

    # delete if already name there
    Construct.objects.filter(name=crystal_info['pdb_name']).delete()

    protein = Protein.objects.get(entry_name=crystal_info['uniprot'])
    structure = Structure.objects.get(
        pdb_code__index=crystal_info['pdb'].upper())

    construct = Construct()
    construct.protein = protein
    construct.name = crystal_info['pdb_name']
    construct.json = json.dumps(d, indent=4, separators=(',', ': '))
    construct.structure = structure

    # CrystalInfo
    crystal = CrystalInfo()
    crystal.resolution = structure.resolution
    crystal.pdb_data = structure.pdb_data
    crystal.pdb_code = structure.pdb_code.index
    crystal.save()
    construct.crystal = crystal

    # Contact INFO
    if 'contact_info' in d:
        contact = d['contact_info']
        construct.contributor, _ = ContributorInfo.objects.get_or_create(
            name=contact['name_cont'],
            pi_email=contact['pi_email'],
            pi_name=contact['pi_name'],
            urls=contact['url'],
            date=datetime.datetime.strptime(
                contact['date'], '%m/%d/%Y').strftime('%Y-%m-%d'),
            address=contact['address'])

    # The construct needs a primary key before relations can be added.
    construct.save()

    # MUTATIONS
    for mutation in d['mutations']:
        mutation.setdefault('type', '')
        mutation.setdefault('remark', '')
        mut = ConstructMutation.objects.create(
            sequence_number=mutation['pos'],
            wild_type_amino_acid=mutation['wt'],
            mutated_amino_acid=mutation['mut'],
            mutation_type=mutation['type'],
            remark=mutation['remark'])
        construct.mutations.add(mut)

    # DELETIONS
    # Range of every deletion that originates from an insertion, keyed by the
    # insertion id. The resolved (start, end) pair is stored rather than the
    # raw dict so single-position deletions (given as 'pos') can be looked up
    # the same way as ranges.
    insert_deletions = {}
    for deletion in d['deletions']:
        if 'start' in deletion:
            start, end = deletion['start'], deletion['end']
        else:
            start = end = deletion['pos']
        dele = ConstructDeletion.objects.create(start=start, end=end)
        construct.deletions.add(dele)
        if deletion['origin'] != 'user':
            insert_id = deletion['origin'].split('_')[1]
            insert_deletions[insert_id] = (start, end)

    # INSERTIONS (AUX)
    for name, aux in d['auxiliary'].items():
        insert_id = name.replace('aux', '')
        aux_type, _ = ConstructInsertionType.objects.get_or_create(
            name=aux['type'], subtype=aux['subtype'])
        insert = ConstructInsertion.objects.create(
            insert_type=aux_type,
            presence=aux['presence'],
            position=aux['position'] + "_" + insert_id)

        if insert.presence == 'YES' and insert.position.startswith('Within Receptor'):
            # need to fetch range
            if 'start' in aux:
                # NOTE(review): the end is set to aux['start'] as well --
                # confirm an 'end' key is never supplied here.
                insert.start = aux['start']
                insert.end = aux['start']
            else:
                insert.start, insert.end = insert_deletions[insert_id]
            insert.save()

        construct.insertions.add(insert)

    # MODIFICATIONS
    for modification in d['modifications']:
        position_type, position_range = modification['position'][0], modification['position'][1]
        mod = ConstructModification.objects.create(
            modification=modification['type'],
            position_type=position_type,
            pos_start=position_range[0],
            pos_end=position_range[1],
            remark=modification['remark'])
        construct.modifications.add(mod)

    # EXPRESSION
    if 'expression' in d and 'expr_method' in d['expression']:
        expression = d['expression']
        expression.setdefault('expr_remark', '')
        construct.expression, _ = ExpressionSystem.objects.get_or_create(
            expression_method=expression['expr_method'],
            host_cell_type=expression['host_cell_type'],
            host_cell=expression['host_cell'],
            remarks=expression['expr_remark'])

    # solubilization
    if 'solubilization' in d and 'deterg_type' in d['solubilization']:
        solub = d['solubilization']
        c_list = _build_chemical_list('Solubilization', [
            ('detergent', solub['deterg_type'],
             solub['deterg_concentr'], solub['deterg_concentr_unit']),
            ('additive', solub['solub_additive'],
             solub['additive_concentr'], solub['addit_concentr_unit']),
        ])
        construct.solubilization = Solubilization.objects.create(
            chemical_list=c_list)
        construct.save()

        # Purification: only the treatment / remark entries become steps.
        purification = Purification.objects.create()
        for key, step in solub.items():
            if key.startswith(('chem_enz_treatment', 'sol_remark')):
                s, _ = PurificationStep.objects.get_or_create(name=step)
                purification.steps.add(s)
        construct.purification = purification
    construct.save()

    # CRYSTALLIZATION
    if 'crystallization' in d and 'crystal_type' in d['crystallization']:
        cryst = d['crystallization']
        c = Crystallization()
        c_type, _ = CrystallizationTypes.objects.get_or_create(
            name=cryst['crystal_type'], sub_name=cryst.get('lcp_lipid', ""))
        c_method, _ = CrystallizationMethods.objects.get_or_create(
            name=cryst['crystal_method'])

        c.crystal_type = c_type
        c.crystal_method = c_method
        if 'crystal_remark' in cryst:
            c.remarks = cryst['crystal_remark']
        c.temp = cryst['temperature']

        if cryst['ph'] == 'single_ph':
            c.ph_start = cryst['ph_single']
            c.ph_end = cryst['ph_single']
        else:
            c.ph_start = cryst['ph_range_one']
            c.ph_end = cryst['ph_range_two']

        c.protein_conc = cryst['protein_concentr']
        c.protein_conc_unit = cryst['protein_conc_unit']
        c.save()

        # MAKE LISTS
        c.chemical_lists.add(_build_chemical_list(
            'crystallization_chemical_components',
            (('crystallization_chemical_components', chemical['component'],
              chemical['value'], chemical['unit'])
             for chemical in cryst['chemical_components'])))

        if cryst['crystal_type'] == 'lipidic cubic phase':  # make list of LCP stuff
            c.chemical_lists.add(_build_chemical_list('LCP', [
                ('LCP Lipid additive', cryst['lcp_add'],
                 cryst['lcp_conc'], cryst['lcp_conc_unit']),
            ]))

        # DETERGENT
        if 'detergent' in cryst:
            c.chemical_lists.add(_build_chemical_list('Detergent', [
                ('detergent', cryst['detergent'],
                 cryst['deterg_conc'], cryst['deterg_conc_unit']),
            ]))

        # LIPID
        if 'lipid' in cryst:
            c.chemical_lists.add(_build_chemical_list('Lipid', [
                ('lipid', cryst['lipid'],
                 cryst['lipid_concentr'], cryst['lipid_concentr_unit']),
            ]))

        # Use ligand function to get ligand if it exists or otherwise create. Lots of checks for inchi/smiles/name
        ligand = get_or_make_ligand(crystal_info['ligand_id'],
                                    crystal_info['ligand_id_type'],
                                    crystal_info['ligand_name'])
        crystal_info.setdefault('ligand_activity', 'unknown')
        if ligand:
            role_slug = slugify(crystal_info['ligand_activity'])
            try:
                lr, _ = LigandRole.objects.get_or_create(
                    slug=role_slug,
                    defaults={'name': crystal_info['ligand_activity']})
            except IntegrityError:
                lr = LigandRole.objects.get(slug=role_slug)

            ligand_c = CrystallizationLigandConc()
            ligand_c.construct_crystallization = c
            ligand_c.ligand = ligand
            if lr:
                ligand_c.ligand_role = lr
            if 'ligand_conc' in crystal_info:
                ligand_c.ligand_conc = crystal_info['ligand_conc']
            if 'ligand_conc_unit' in crystal_info:
                ligand_c.ligand_conc_unit = crystal_info['ligand_conc_unit']
            ligand_c.save()

            c.ligands.add(ligand_c)

        construct.crystallization = c

    construct.save()


def _build_chemical_list(list_name, components):
    """Create and return a saved ChemicalList called ``list_name``.

    ``components`` is an iterable of ``(chemical_type_name, chemical_name,
    concentration, concentration_unit)`` tuples; each becomes a ChemicalConc
    (fetched or created) that is added to the list.
    """
    c_list = ChemicalList()
    c_list.name, _ = ChemicalListName.objects.get_or_create(name=list_name)
    c_list.save()
    for type_name, chemical_name, concentration, unit in components:
        ct, _ = ChemicalType.objects.get_or_create(name=type_name)
        chem, _ = Chemical.objects.get_or_create(name=chemical_name,
                                                 chemical_type=ct)
        cc, _ = ChemicalConc.objects.get_or_create(
            concentration=concentration,
            concentration_unit=unit,
            chemical=chem)
        c_list.chemicals.add(cc)
    return c_list
Example #11
0
    def create_mutant_data(self, filenames):
        """Read mutant annotation files and store each row as a MutationExperiment.

        Every name in ``filenames`` is resolved inside
        ``self.structure_data_dir``; entries that are not regular files or
        whose name starts with ``.`` are ignored. ``.xlsx``/``.xls`` files are
        read with ``self.loaddatafromexcel`` + ``self.analyse_rows``, ``.yaml``
        files with ``yaml.load``; any other extension is logged and skipped.

        For each row the publication, ligand, reference ligand, protein,
        residue, lookup records and mutation are fetched or created, a fold
        change is derived, and the result is stored with
        ``MutationExperiment.objects.get_or_create``. Rows are skipped when the
        publication cannot be saved, the protein is unknown, or no residue
        matches the position / wild-type amino acid.

        Args:
            filenames: file names relative to ``self.structure_data_dir``.
                When falsy, every entry of that directory is considered.

        Returns:
            None.
        """
        self.logger.info('CREATING MUTANT DATA')

        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.structure_data_dir)

        # entry name -> number of skipped rows; also used to log each missing
        # protein only the first time it is seen
        missing_proteins = {}

        # experiment types matching this pattern are handled as -log10 values
        # when the fold change is computed; compiled once instead of per row
        logtypes = ['pEC50','pIC50','pK']
        logtype_re = re.compile("(" + ")|(".join(logtypes) + ")")

        for source_file in filenames:
            source_file_path = os.sep.join([self.structure_data_dir, source_file])
            # only regular, non-hidden files are parsed
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))

            if source_file[-4:]=='xlsx' or source_file[-3:]=='xls':
                rows = self.loaddatafromexcel(source_file_path)
                rows = self.analyse_rows(rows)
            elif source_file[-4:]=='yaml':
                # NOTE(review): yaml.load can build arbitrary Python objects;
                # consider yaml.safe_load if these files are not fully trusted.
                with open(source_file_path, 'r') as yaml_file:
                    rows = yaml.load(yaml_file)
                # map the yaml records onto the row layout used below, with
                # empty defaults for every field the yaml format does not carry
                temp = []
                for r in rows:
                    d = {}
                    d['reference'] = r['pubmed']
                    d['protein'] = r['entry_name'].replace("__","_").lower()
                    d['mutation_pos'] = r['seq']
                    d['mutation_from'] = r['from_res']
                    d['mutation_to'] = r['to_res']
                    d['ligand_name'] = ''
                    d['ligand_type'] = ''
                    d['ligand_id'] = ''
                    d['ligand_class'] = ''
                    d['exp_type'] = ''
                    d['exp_func'] = ''
                    d['exp_wt_value'] = 0
                    d['exp_wt_unit'] = ''
                    d['exp_mu_effect_sign'] = ''
                    d['exp_mu_value_raw'] = 0
                    d['fold_effect'] = 0
                    d['exp_mu_effect_qual'] = ''
                    d['exp_mu_effect_ligand_prop'] = ''
                    d['exp_mu_ligand_ref'] = ''
                    d['opt_type'] = ''
                    d['opt_wt'] = 0
                    d['opt_mu'] = 0
                    d['opt_sign'] = ''
                    d['opt_percentage'] = 0
                    d['opt_qual'] = ''
                    d['opt_agonist'] = ''
                    if len(d['mutation_to'])>1 or len(d['mutation_from'])>1: #if something is off with amino acid
                        continue
                    temp.append(d)
                rows = temp
            else:
                # unsupported extension: report it and move on to the next file
                self.logger.info('unknown format ' + source_file)
                continue

            c = 0
            skipped = 0
            for r in rows:
                c += 1
                if c%1000==0:
                    self.logger.info('Parsed '+str(c)+' mutant data entries')

                # publication
                try: #fix if it thinks it's float.
                    float(r['reference'])
                    r['reference'] = str(int(r['reference']))
                except ValueError:
                    pass

                if r['reference'].isdigit(): #assume pubmed
                    pub_type = 'pubmed'
                else: #assume doi
                    pub_type = 'doi'

                try:
                    pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(index=r['reference'],
                            web_resource = WebResource.objects.get(slug=pub_type))
                        pub.web_link = wl

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except Exception:
                        # best effort: a publication that cannot be stored only
                        # costs this row, not the whole import
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.

                l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))

                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                else:
                    l_ref = None

                protein=Protein.objects.filter(entry_name=r['protein'])
                if protein.exists():
                    protein=protein.get()
                else:
                    skipped += 1
                    if r['protein'] in missing_proteins:
                        missing_proteins[r['protein']] += 1
                    else:
                        missing_proteins[r['protein']] = 1
                        self.logger.error('Skipped due to no protein '+ r['protein'])
                    continue

                res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
                if res.exists():
                    res=res.get()
                else:
                    self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                    skipped += 1
                    continue

                if r['ligand_class']:
                    l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
                else:
                    l_role = None

                if r['exp_type']:
                    exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
                else:
                    exp_type_id = None

                if r['exp_func']:
                    exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
                else:
                    exp_func_id = None

                if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                    exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
                else:
                    exp_qual_id = None

                if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
                    exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
                else:
                    exp_opt_id = None

                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)

                # Fold change: mutant/wild-type ratio when both raw values are
                # present, otherwise the pre-computed fold_effect. A ratio
                # below 1 is stored as the negative of its reciprocal.
                foldchange = 0
                if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format
                    if logtype_re.match(r['exp_type']):  #-log values!
                        foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3)
                    else:
                        foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3)

                    if foldchange<1 and foldchange!=0:
                        foldchange = -round((1/foldchange),3)
                elif r['fold_effect']!=0:
                    foldchange = round(r['fold_effect'],3)
                    # a tiny fold_effect rounds to 0.0; guard the reciprocal
                    # exactly like the branch above to avoid ZeroDivisionError
                    if foldchange<1 and foldchange!=0:
                        foldchange = -round((1/foldchange),3)

                raw_experiment = self.insert_raw(r)
                MutationExperiment.objects.get_or_create(
                    refs=pub,
                    protein=protein,
                    residue=res,
                    ligand=l,
                    ligand_role=l_role,
                    ligand_ref = l_ref,
                    raw = raw_experiment,
                    optional = exp_opt_id,
                    exp_type=exp_type_id,
                    exp_func=exp_func_id,
                    exp_qual = exp_qual_id,

                    mutation=mutation,
                    wt_value=r['exp_wt_value'], #
                    wt_unit=r['exp_wt_unit'],

                    mu_value = r['exp_mu_value_raw'],
                    mu_sign = r['exp_mu_effect_sign'],
                    foldchange = foldchange
                )

            self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        self.logger.info('COMPLETED CREATING MUTANTS')
Example #12
0
    def main_func(self, positions, iteration):
        """Convert a slice of ``self.data`` into bulk-created mutation experiments.

        Each row is resolved to a publication, optional review publication,
        ligand, reference ligand, protein, residue, lookup records and
        mutation. Publications and ligands are memoised in
        ``self.publication_cache``, ``self.ligand_cache`` and
        ``self.ref_ligand_cache``. The raw record returned by
        ``self.insert_raw`` and an unsaved ``MutationExperiment`` are collected
        per row and written with two ``bulk_create`` calls at the end.

        Rows are skipped when a publication cannot be saved, the protein is
        unknown, or no residue matches the position / wild-type amino acid.

        Args:
            positions: two-item sequence ``(start, end)`` used to slice
                ``self.data``; a falsy ``end`` means "up to the last row".
            iteration: not used by this method.
                NOTE(review): presumably a worker index passed by the caller.

        Returns:
            None. A timing summary is printed to stdout.
        """
        if not positions[1]:
            rows = self.data[positions[0]:]
        else:
            rows = self.data[positions[0]:positions[1]]

        # entry name -> number of skipped rows; also used to log each missing
        # protein only the first time it is seen
        missing_proteins = {}

        c = 0
        skipped = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        # experiment types matching this pattern are handled as -log10 values
        # when the fold change is computed; compiled once instead of per row
        logtypes = ['pEC50','pIC50','pK']
        logtype_re = re.compile("(" + ")|(".join(logtypes) + ")")

        for r in rows:
            c += 1

            # publication
            # Numeric ids are normalised to an integer string. The two fields
            # are converted independently so a DOI reference does not prevent
            # a numeric review id from being normalised.
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass
            try:
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(index=r['reference'],
                            web_resource = WebResource.objects.get(slug=pub_type))
                        pub.web_link = wl

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except Exception:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            if r['review'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        pub_review = Publication.objects.get(web_link__index=r['review'], web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(index=r['review'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                            pub_review.web_link = wl

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(index=r['review'])
                        try:
                            pub_review.save()
                        except Exception:
                            self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                            continue #if something off with publication, skip.
                    # cache reviews that already existed as well as new ones,
                    # mirroring how the reference publication is cached above
                    self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            # ligand cache is keyed by name first, then by ligand id
            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #amigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    try:
                        l_ref.save()
                    except IntegrityError:
                        # fall back to the existing non-canonical record
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        print("error failing ligand, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
            else:
                skipped += 1
                if r['protein'] in missing_proteins:
                    missing_proteins[r['protein']] += 1
                else:
                    missing_proteins[r['protein']] = 1
                    self.logger.error('Skipped due to no protein '+ r['protein'])
                continue

            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
                exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            try:
                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res)

            # Fold change: mutant/wild-type ratio when both raw values are
            # present, otherwise the pre-computed fold_effect. A ratio below 1
            # is stored as the negative of its reciprocal.
            foldchange = 0
            if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format
                if logtype_re.match(r['exp_type']):  #-log values!
                    foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3)
                else:
                    foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3)

                if foldchange<1 and foldchange!=0:
                    foldchange = -round((1/foldchange),3)
            elif r['fold_effect']!=0:
                foldchange = round(r['fold_effect'],3)
                # a tiny fold_effect rounds to 0.0; guard the reciprocal
                # exactly like the branch above to avoid ZeroDivisionError
                if foldchange<1 and foldchange!=0:
                    foldchange = -round((1/foldchange),3)

            raw_experiment = self.insert_raw(r)
            # `raw` is attached after the raw records are bulk-created below
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref = l_ref,
                optional = exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual = exp_qual_id,

                mutation=mutation,
                wt_value=r['exp_wt_value'], #
                wt_unit=r['exp_wt_unit'],

                mu_value = r['exp_mu_value_raw'],
                mu_sign = r['exp_mu_effect_sign'],
                foldchange = foldchange
            )
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)

        self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        current = time.time()

        # store the raw records first, then link each experiment to the raw
        # record collected at the same position
        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i,me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current,2)
        diff_2 = round(end - current_sheet,2)
        print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
Example #13
0
    def import_xtal(self):
        """Attach crystallization records to constructs from the annotation workbook.

        Three sheets of ``self.annotation_file`` are read through
        ``self.parse_excel`` ('Xtal-methods', 'Xtal-Chemicals',
        'PDB-ligand-complex') and indexed by the value in column 1 of each
        row. Entries of ``self.all_pdbs`` missing from a sheet are printed.
        For every 'Xtal-methods' row whose construct can be found, a
        ``Crystallization`` is created, its chemical lists and (first) ligand
        are attached, and the construct is saved with it.
        """
        # One row per key for the methods sheet; a repeated key is reported
        # and the later row replaces the earlier one.
        methods_by_pdb = {}
        for row in self.parse_excel(self.annotation_file,'Xtal-methods'):
            code = row[1]
            if code in methods_by_pdb:
                print('pdbcode duplicate?',code)
            methods_by_pdb[code] = row

        # The other two sheets may hold several rows per key.
        chems_by_pdb = {}
        for row in self.parse_excel(self.annotation_file,'Xtal-Chemicals'):
            chems_by_pdb.setdefault(row[1], []).append(row)

        ligands_by_pdb = {}
        for row in self.parse_excel(self.annotation_file,'PDB-ligand-complex'):
            ligands_by_pdb.setdefault(row[1], []).append(row)

        # Report which known entries lack annotation in each sheet.
        coverage_report = (
            (methods_by_pdb, " do not have any Xtal-methods annotated -- add them to sheet with NONE if they have none"),
            (chems_by_pdb, " do not have any Xtal-Chemicals annotated -- add them to sheet with NONE if they have none"),
            (ligands_by_pdb, " do not have any PDB-ligand-complex annotated -- add them to sheet with NONE if they have none"),
        )
        for annotated, message in coverage_report:
            unannotated = set(self.all_pdbs) - set(annotated.keys())
            print(sorted(unannotated), message)

        for pdb, row in methods_by_pdb.items():
            try:
                construct = Construct.objects.get(structure__pdb_code__index=pdb.upper())
            except:
                print(pdb,'cannot find pdb construct')
                continue

            crystallization = Crystallization()
            crystallization.crystal_type = CrystallizationTypes.objects.get_or_create(name=row[3], sub_name=row[4])[0]
            crystallization.crystal_method = CrystallizationMethods.objects.get_or_create(name=row[2])[0]
            crystallization.remarks = row[10]
            crystallization.temp = row[7]

            # Some entries have it wrong here: fall back to 0/0 when either
            # pH bound cannot be converted.
            try:
                ph_bounds = (float(row[8]), float(row[9]))
            except:
                ph_bounds = (0, 0)
            crystallization.ph_start, crystallization.ph_end = ph_bounds

            crystallization.protein_conc = row[5]
            crystallization.protein_conc_unit = row[6]
            crystallization.save()

            if pdb not in chems_by_pdb:
                print('no chems for ',pdb)
            else:
                # group this entry's chemical rows by the value in column 2;
                # each group becomes one ChemicalList
                rows_by_list = {}
                for chem_row in chems_by_pdb[pdb]:
                    rows_by_list.setdefault(chem_row[2], []).append(chem_row)

                for list_label, members in rows_by_list.items():
                    chem_list = ChemicalList()
                    chem_list.name = ChemicalListName.objects.get_or_create(name=list_label)[0]
                    chem_list.save()
                    for member in members:
                        chem_type = ChemicalType.objects.get_or_create(name=member[4])[0]
                        chemical = Chemical.objects.get_or_create(name=member[3], chemical_type=chem_type)[0]
                        conc = ChemicalConc.objects.get_or_create(concentration=member[5], concentration_unit=member[6], chemical=chemical)[0]
                        chem_list.chemicals.add(conc)
                    crystallization.chemical_lists.add(chem_list)

            if pdb in ligands_by_pdb:
                # only the first ligand row of an entry is used
                lig_row = ligands_by_pdb[pdb][0]
                ligand = get_or_make_ligand(lig_row[7],lig_row[6],lig_row[2])
                role_slug = slugify(lig_row[3])
                try:
                    role, _ = LigandRole.objects.get_or_create(slug=role_slug,
                        defaults={'name': lig_row[3]})
                except IntegrityError:
                    role = LigandRole.objects.get(slug=role_slug)
                if ligand:
                    ligand_conc = CrystallizationLigandConc()
                    ligand_conc.construct_crystallization = crystallization
                    ligand_conc.ligand = ligand
                    if role:
                        ligand_conc.ligand_role = role
                    if lig_row[4]:
                        ligand_conc.ligand_conc = lig_row[4]
                    if lig_row[5]:
                        ligand_conc.ligand_conc_unit = lig_row[5]
                    ligand_conc.save()

                    crystallization.ligands.add(ligand_conc)

            construct.crystallization = crystallization
            construct.save()
Example #14
0
    def main_func(self, positions, iteration,count,lock):
        """
        Parse rows of self.data_all and queue them as mutation experiments.

        Rows are claimed one at a time through the shared ``count`` index
        (presumably so several workers can run this method at once -- confirm
        against the caller). For every claimed row the code shown here:

        * resolves or creates the WebLink + Publication for r['reference']
          and, when non-empty, for r['review'];
        * resolves the ligand via get_or_make_ligand and the reference ligand
          via Ligand lookups (both memoised in caches on ``self``);
        * looks up the Protein and Residue; rows without a match are skipped;
        * get-or-creates LigandRole, MutationExperimentalType, MutationFunc,
          MutationQual and Mutation records;
        * computes the fold change and queues a raw record plus a
          MutationExperiment for insertion.

        After the loop the queued objects are written with bulk_create.

        positions: not read by the code shown here (only referenced in
            commented-out code).
        iteration: not read by the code shown here.
        count: object exposing an integer ``value`` attribute; used as the
            index of the next unprocessed row and incremented per claimed row.
        lock: context manager held while a row is claimed and ``count.value``
            is incremented.

        No return statement appears in the part of the method shown here.
        """
        # filenames
        # if not positions[1]:
        #     rows = self.data[positions[0]:]
        # else:
        #     rows = self.data[positions[0]:positions[1]]


        # Per-call bookkeeping, all keyed by the protein identifier of the row.
        missing_proteins = {}       # identifier -> number of rows skipped because no Protein was found
        mutants_for_proteins = {}   # identifier -> number of rows whose Protein was found
        wrong_uniport_ids = {}      # unrecognised identifier -> entry_name it was resolved to

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []  # unsaved MutationExperiment objects
        bulk_r = []  # results of self.insert_raw(), same order as bulk_m
        current_sheet = time.time()

        rows = self.data_all
        # NOTE(review): the bound check below runs outside the lock. If several
        # workers share `count`, one can pass the check and then index past the
        # end of `rows` after another worker has incremented the counter.
        while count.value<len(rows):
            with lock:
                r = rows[count.value]
                count.value +=1 
        # for r in rows:
            # print(r['source_file'],c)
            # PRINT IF ERRORS OCCUR
            #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            # Numeric identifiers are normalised to integer strings (e.g. 123.0 -> '123').
            # NOTE(review): when float(r['reference']) raises ValueError (non-numeric
            # reference), the review normalisation below it is skipped as well.
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            # Resolve the reference publication, memoised per reference string.
            if r['reference'] not in self.publication_cache:
                # Get-or-create the WebLink; on IntegrityError the link is fetched
                # again (presumably it was created concurrently -- confirm).
                try:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)


                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)


                    # Only reached when no Publication existed at lookup time:
                    # fill in its data from the DOI / PubMed index.
                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    # NOTE(review): bare except -- any failure while saving is
                    # logged and the whole row is dropped.
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            # Same classification for the review, with raw links as an extra type.
            # pub_type is computed even when r['review'] is empty, but then unused.
            if r['review'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else: #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        try:
                            wl = WebLink.objects.create(index=r['review'],
                                    web_resource = WebResource.objects.get(slug=pub_type))
                        except IntegrityError:
                            wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)

                    try:
                        pub_review = Publication.objects.get(web_link=wl)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = wl
                            pub_review.save()
                        except IntegrityError:
                            pub_review = Publication.objects.get(web_link=wl)


                        # A 'raw_link' review gets no metadata update here.
                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                            continue #if something off with publication, skip.
                        # NOTE(review): unlike the reference branch above, the cache
                        # is only filled inside the DoesNotExist branch, so a review
                        # that already existed in the database is never cached.
                        self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            # Ligand lookup, memoised in a two-level cache: name -> ligand_id -> ligand.
            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            # Also taken when the cached value is falsy, so such entries are retried.
            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                except Exception as msg:
                    print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            # Reference ligand, resolved by name only and memoised per name
            # (a None result is cached too).
            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #amigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    # On IntegrityError (presumably a duplicate created meanwhile --
                    # confirm) fall back to the existing record, canonical first.
                    try:
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    except IntegrityError:
                        if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                        else:
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref


            # Neither of these two is read again in the code shown here.
            protein_id = 0
            residue_id = 0

            # Protein lookup: first by entry_name, then via the fallbacks below.
            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            elif r['protein'] not in missing_proteins:

                # NOTE(review): the first line rewrites r['protein'], so the second
                # lookup uses the already-translated name as key; unless that name
                # is itself a key of wrong_uniport_ids it raises KeyError and
                # control drops into the bare except below.
                try:
                    r['protein'] = wrong_uniport_ids[r['protein']]
                    real_uniprot = wrong_uniport_ids[r['protein']]
                    protein=Protein.objects.get(entry_name=r['protein'])
                    # print('fetched with lookup table',r['protein'])
                except:
                    # look for it as uniprot
                    protein=Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper())
                    if protein.exists():
                        protein=protein.get()
                        real_uniprot = protein.entry_name
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml = True)
                        # Any failure here (no response, no <name> element, unknown
                        # entry_name) counts the row as skipped.
                        try:
                            real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                            protein=Protein.objects.get(entry_name=real_uniprot)
                        except:
                            skipped += 1
                            if r['protein'] in missing_proteins:
                                missing_proteins[r['protein']] += 1
                            else:
                                missing_proteins[r['protein']] = 1
                                # print('Skipped due to no protein '+ r['protein'])
                                self.logger.error('Skipped due to no protein '+ r['protein'])
                            continue
                    # Remember the translation and continue with the resolved name.
                    wrong_uniport_ids[r['protein']] = protein.entry_name
                    r['protein'] = real_uniprot
            else:
                # Already known to be missing: count it, but `skipped` is not
                # incremented on this path.
                missing_proteins[r['protein']] += 1
                continue


            # The residue must match both the position and the wild-type amino acid.
            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

            # Ligand role: get-or-create by name, falling back to a lookup by the
            # 50-character slug when that raises.
            if r['ligand_class']:
                try:
                    l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists():
                        l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50])
                        if l_role.name == slugify(r['ligand_class'])[:50]:
                            #if name of role is same as slug, then it was created by constructs script, replace it
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with",r['ligand_class'],slugify(r['ligand_class'])[:50] )
                        l_role, created = LigandRole.objects.get_or_create(slug=slugify(r['ligand_class'])[:50]) # FIXME this should not be needed
            else:
                l_role = None

            # Optional lookup records; None when the source column is empty.
            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
            #     exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            # else:
            #     exp_opt_id = None

            try:
                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res)
            # Experiment types starting with one of these are treated as -log10 values.
            logtypes = ['pEC50','pIC50','pK']

            # Fold change of mutant vs. wild type; stays 0 when it cannot be derived.
            # typefold is assigned below but not read in the code shown here.
            foldchange = 0
            typefold = ''
            if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format
                if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']):  #-log values!
                    # 10**-mu / 10**-wt; on any error the row is printed and
                    # foldchange keeps its default of 0.
                    try:
                        foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3);
                    except:
                        print(r)
                    typefold = r['exp_type']+"_log"
                elif "%"==r['exp_wt_unit']:
                    # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better
                    foldchange = round(r['exp_wt_value']/r['exp_mu_value_raw'],3);
                else:
                    foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3);
                    typefold = r['exp_type']+"_not_log"
                # Ratios between 0 and 1 are stored as the negative reciprocal
                # (e.g. 0.5 -> -2.0); the trailing `!=0` test is redundant.
                if foldchange>0 and foldchange<1 and foldchange!=0:
                    foldchange = -round((1/foldchange),3)
            elif r['fold_effect']!=0:
                    # NOTE(review): a non-zero fold_effect that rounds to 0 (e.g. 0.0001)
                    # raises ZeroDivisionError here, and negative inputs are turned
                    # into positive reciprocals -- confirm the intended input range.
                    foldchange = round(r['fold_effect'],3);
                    if foldchange<1: foldchange = -round((1/foldchange),3);
            r['fold_effect'] = foldchange
            
            # insert_raw is defined outside this block; its result is later handed to
            # MutationRaw.objects.bulk_create, so presumably an unsaved MutationRaw.
            raw_experiment = self.insert_raw(r)
            # raw_experiment.save()
            bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group = r['submitting_group'],
            data_container = r['data_container'],
            data_container_number = r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref = l_ref,
            # raw = raw_experiment, #raw_experiment, OR None
            # optional = exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual = exp_qual_id,

            mutation=mutation,
            wt_value=r['exp_wt_value'], #
            wt_unit=r['exp_wt_unit'],

            mu_value = r['exp_mu_value_raw'],
            mu_sign = r['exp_mu_effect_sign'],
            foldchange = foldchange,
            opt_receptor_expression = r['opt_receptor_expression'],
            opt_basal_activity = r['opt_basal_activity'],
            opt_gain_of_activity = r['opt_gain_of_activity'],
            opt_ligand_emax = r['opt_ligand_emax'],
            opt_agonist =  r['opt_agonist'],
            )
            # for line,val in r.items():
            #     val = str(val)
            #     if len(val)>100:
            #         print(line,"too long",val)
            # mut_id = obj.id
            # Both lists are appended in lockstep so index i of one matches index i
            # of the other when the raw records are attached after the loop.
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            # try:
            #     bulk.save()
            # except Exception as e:
            #     print(e)
            #     print(r)
            #     break
            #print('saved ',r['source_file'])
            inserted += 1
            # Per-row timing; only consumed by the commented-out print below.
            end = time.time()
            diff = round(end - current,2)
            #print(diff)

        self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        current = time.time()

        # Write the raw records first, then attach each returned object to its
        # experiment by position.
        # NOTE(review): this relies on bulk_create returning the objects in input
        # order with primary keys set, which depends on the database backend --
        # confirm for the deployment database.
        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i,me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current,2)
        # The bare expression below has no effect.
        current_sheet
        diff_2 = round(end - current_sheet,2)
        print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
        # (identifier, count) pairs, highest count first; not used in the code shown here.
        sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)