Beispiel #1
0
 def handle(self, *args, **options):
     """Merge chemicals whose names are equal when compared case-insensitively.

     Every lower-cased name that occurs more than once is collected, then the
     matching chemicals are walked in lower-cased-name order.  The first
     chemical seen for a name becomes the merge partner for all later
     chemicals with that name; ``self.merge`` is always called with the
     lower-id chemical as its first argument.  All merges run inside a single
     transaction, and the number of merged pairs is printed at the end.
     """
     # Filtering on the aggregate keeps only the duplicated names, so there
     # is no need to fetch every name and test the count in Python.
     duplicated_names = [
         entry["name_lower"]
         for entry in Chemical.objects.annotate(name_lower=Lower("name"))
         .values("name_lower")
         .annotate(cnt=Count("name_lower"))
         .filter(cnt__gt=1)
     ]
     chemicals = Chemical.objects.annotate(name_lower=Lower("name")).filter(
         name_lower__in=duplicated_names).order_by("name_lower")
     remaining_cnt = chemicals.count()
     merging_chemical = None
     name_lower = ""
     cnt = 0
     with transaction.atomic():
         for chemical in chemicals:
             remaining_time(remaining_cnt)
             # ``merging_chemical is None`` covers the very first row: without
             # it a group of empty names would match the "" sentinel and
             # dereference None below.
             if merging_chemical is None or name_lower != chemical.name_lower:
                 name_lower = chemical.name_lower
                 merging_chemical = chemical
                 continue
             # NOTE(review): merging_chemical is not re-pointed after a merge.
             # If self.merge discards its second argument, a group of three or
             # more rows with unordered ids could merge into a stale record --
             # confirm against merge().
             if merging_chemical.id < chemical.id:
                 self.merge(merging_chemical, chemical)
             else:
                 self.merge(chemical, merging_chemical)
             cnt += 1
     print("Successfully merged %s pairs of chemicals" % (cnt, ))
Beispiel #2
0
    def handle(self, *args, **options):
        """Load disease pathways and disease-pathway interactions from a CSV.

        The file named by ``options['file']`` is read, its distinct
        (PathwayName, PathwayID) pairs are bulk-created as ``DiseasePathway``
        rows, and one ``DiseasePathwayInteraction`` is bulk-created per CSV
        row.  Foreign keys are resolved through in-memory lookups and are
        left as ``None`` when a lookup misses.  Everything runs in one
        transaction.
        """
        frame = pandas.read_csv(options.get('file'), header=0, delimiter=',', quoting=csv.QUOTE_ALL)
        frame = frame.replace(np.nan, '', regex=True)

        unique_pathways = frame[["PathwayName", "PathwayID"]].drop_duplicates()
        unique_pathways.columns = ("pathway_name", "pathway_id")
        pathway_records = unique_pathways.to_dict('records')

        rows = frame.to_dict('records')
        total = len(rows)
        with transaction.atomic():
            new_pathways = []
            for record in pathway_records:
                new_pathways.append(DiseasePathway(
                    pathway_name=record["pathway_name"],
                    pathway_id=record["pathway_id"]))
            DiseasePathway.objects.bulk_create(new_pathways)

            # value -> primary key lookups, built once instead of per row
            pathway_pks = dict(DiseasePathway.objects.values_list("pathway_id", "id"))
            disease_pks = dict(DiseaseTrait.objects.filter(category=disease_category).values_list("ctd_id", "id"))
            gene_pks = dict(Gene.objects.values_list("symbol", "id"))

            pending = []
            for row in rows:
                remaining_time(total)
                disease_pk = disease_pks.get(row["DiseaseID"])
                pathway_pk = pathway_pks.get(row["PathwayID"])
                # gene symbols are looked up lower-cased
                gene_pk = gene_pks.get(row["InferenceGeneSymbol"].lower())
                interaction = DiseasePathwayInteraction(
                    disease_id=disease_pk,
                    pathway_id=pathway_pk,
                    gene_id=gene_pk,
                    gene_symbol=row["InferenceGeneSymbol"])
                pending.append(interaction)
            DiseasePathwayInteraction.objects.bulk_create(pending)
Beispiel #3
0
    def handle(self, *args, **options):
        """Create ``ChemicalPathway`` rows from the CSV in ``options['file']``.

        Each CSV row is matched to a chemical by looking up
        ``"MESH:" + ChemicalID`` among the existing ``chemical_number``
        values; rows without a match are skipped.  All matching rows are
        inserted with a single ``bulk_create`` inside one transaction.
        """
        data = pandas.read_csv(options.get('file'),
                               header=0,
                               delimiter=',',
                               quoting=csv.QUOTE_ALL)
        data = data.replace(np.nan, '', regex=True)
        with transaction.atomic():
            existing_chemicals = dict(
                Chemical.objects.filter(
                    chemical_number__isnull=False).values_list(
                        "chemical_number", "id"))
            data = data.to_dict('records')
            count = len(data)
            interactions_to_save = []
            for row in data:
                remaining_time(count)
                chemical = existing_chemicals.get("MESH:" + row["ChemicalID"])
                if not chemical:
                    # no chemical with this MESH number in the database
                    continue
                interactions_to_save.append(
                    ChemicalPathway(
                        chemical_id=chemical,
                        cas_rn=row.get("CasRN"),
                        pathway_name=row.get("PathwayName"),
                        pathway_id=row.get("PathwayID"),
                        p_value=row.get("PValue"),
                        corrected_p_value=row.get("CorrectedPValue"),
                        target_match_qty=row.get("TargetMatchQty"),
                        target_total_qty=row.get("TargetTotalQty"),
                        background_match_qty=row.get("BackgroundMatchQty"),
                        background_total_qty=row.get("BackgroundTotalQty")))
            ChemicalPathway.objects.bulk_create(interactions_to_save)
Beispiel #4
0
    def handle(self, *args, **options):
        """Re-categorise chemicals based on the shape of their names.

        Step 1: chemicals whose name matches either regular expression below
        are added to the "obscure_chemicals" category and removed from the
        families of the listed other categories.
        Step 2: every chemical not in the obscure category's family is added
        to "chemical_of_bilological_interest".
        Each step runs in its own transaction.
        """
        name_filter = (Q(name__iregex=r"(.+\-|.+\ ){4,}.+")
                       | Q(name__iregex=r".*[\d].*[\-].*"))
        matching = Chemical.objects.filter(name_filter)
        obscure_category = SubstanceCategory.objects.filter(
            slug="obscure_chemicals").first()

        other_slugs = [
            "popular-drugs", "natural-treatments", "beneficial-substances",
            "important-natural-compounds", "gras",
            "chemical_of_bilological_interest"
        ]
        # flatten the family of every listed category into one list
        rest = list(itertools.chain.from_iterable(
            category.get_family()
            for category in SubstanceCategory.objects.filter(
                slug__in=other_slugs)))

        obscure_chemicals = list(set(matching))
        count = len(obscure_chemicals)
        with transaction.atomic():
            for chemical in obscure_chemicals:
                remaining_time(count)
                chemical.categories.add(obscure_category)
                chemical.categories.remove(*rest)

        bio_interest = SubstanceCategory.objects.filter(
            slug="chemical_of_bilological_interest").first()
        non_obscure = Chemical.objects.exclude(
            categories__in=obscure_category.get_family())
        count = len(non_obscure)
        with transaction.atomic():
            for chemical in non_obscure:
                remaining_time(count)
                chemical.categories.add(bio_interest)
Beispiel #5
0
    def handle(self, *args, **options):
        """Regenerate the slug of every ``HealthEffect`` from its name."""
        total = HealthEffect.objects.count()

        for health_effect in HealthEffect.objects.all():
            remaining_time(total)
            new_slug = slugify(health_effect.name)
            health_effect.slug = new_slug
            health_effect.save()
Beispiel #6
0
 def handle(self, *args, **options):
     """Rebuild every organism's slug from "<id>-<latin_name>".

     Existing slugs are overwritten unconditionally; all saves share one
     transaction.
     """
     queryset = Organism.objects.all()
     total = queryset.count()
     with transaction.atomic():
         for organism in queryset:
             remaining_time(total)
             slug_source = "-".join((str(organism.id), organism.latin_name))
             organism.slug = slugify(slug_source)
             organism.save()
Beispiel #7
0
 def handle(self, *args, **options):
     """Link each ``ChemicalPathway`` to the ``Pathway`` with the same pathway_id.

     Rows whose ``pathway_id`` has no matching ``Pathway`` are left
     untouched.  All saves share one transaction.
     """
     queryset = ChemicalPathway.objects.all()
     # pathway_id -> Pathway primary key, loaded once up front
     pathway_pk_by_id = dict(Pathway.objects.values_list("pathway_id", "id"))
     total = queryset.count()
     with transaction.atomic():
         for link in queryset:
             remaining_time(total)
             pathway_pk = pathway_pk_by_id.get(link.pathway_id)
             if not pathway_pk:
                 continue
             link.related_pathway_id = pathway_pk
             link.save()
     print("Done")
Beispiel #8
0
 def handle(self, *args, **options):
     """Copy CTD identifiers from a CSV onto the existing ``Gene`` rows.

     The CSV named by ``options['file']`` is read with every column as a
     string.  Each gene is matched first by its symbol (the CSV symbols are
     lower-cased and stripped) and, failing that, by its name occurring as a
     literal substring of a row's synonyms.  The first matching row supplies
     the CTD id, alternative gene ids, BioGRID ids, PharmGKB ids and UniProt
     ids.  Genes without any match are skipped.  All saves share one
     transaction.
     """
     headers = {
         "GeneSymbol": str,
         "GeneName": str,
         "GeneID": str,
         "AltGeneIDs": str,
         "Synonyms": str,
         "BioGRIDIDs": str,
         "PharmGKBIDs": str,
         "UniprotIDs": str
     }
     data = pandas.read_csv(options.get('file'),
                            header=0,
                            delimiter=',',
                            quoting=csv.QUOTE_ALL,
                            dtype=headers)
     data.columns = [
         "gene_symbol", "gene_name", "gene_id", "alt_gene_id", "synonyms",
         "bio_grid_ids", "pharm_gkbid_ids", "uniprot_ids"
     ]
     data = data.replace(np.nan, '', regex=True)
     data.gene_symbol = data.gene_symbol.str.lower().str.strip()
     existing_genes = Gene.objects.all()
     count = existing_genes.count()
     with transaction.atomic():
         for gene in existing_genes:
             remaining_time(count)
             # ``.loc`` replaces ``.ix``, which pandas removed in 1.0.
             matches = data.loc[data.gene_symbol == gene.symbol]
             if matches.index.size == 0:
                 if not gene.name:
                     # an empty name is a substring of every synonyms cell
                     # and would match the whole file
                     continue
                 # regex=False: gene names may contain regex metacharacters
                 # and must be compared literally.
                 by_synonym = data.synonyms.str.contains(gene.name,
                                                         regex=False)
                 matches = data.loc[by_synonym]
                 if matches.index.size == 0:
                     continue
             first = matches.iloc[0]
             gene.ctd_id = first["gene_id"]
             gene.ctd_alt_gene_ids = first["alt_gene_id"]
             gene.ctd_bio_grid_ids = first["bio_grid_ids"]
             gene.ctd_pharm_gkb_ids = first["pharm_gkbid_ids"]
             gene.ctd_uniprot_ids = first["uniprot_ids"]
             gene.save()
Beispiel #9
0
 def handle(self, *args, **options):
     """Fill ``references`` with one PubMed URL per id in ``pub_med_ids``.

     ``pub_med_ids`` is split on "|" and each id is appended to the PubMed
     base URL; the URLs are joined with newlines.  Interactions without
     PubMed ids are not saved.  Rows are walked in chunks of 10000 inside a
     single transaction.
     """
     queryset = ChemicalGeneInteraction.objects.all()
     total = queryset.count()
     with transaction.atomic():
         for batch in chunks(queryset, 10000):
             for interaction in batch:
                 remaining_time(total)
                 if not interaction.pub_med_ids:
                     continue
                 urls = []
                 for pub_med_id in interaction.pub_med_ids.split("|"):
                     urls.append(
                         "https://www.ncbi.nlm.nih.gov/pubmed/" + pub_med_id)
                 interaction.references = "\n".join(urls)
                 interaction.save()
Beispiel #10
0
    def create_chemical_organism_interactions(self, file_path):
        """Bulk-create ``ChemicalConcentration`` rows from a tab-separated file.

        NaN cells and the text 'NULL' are replaced with empty strings before
        the rows are converted.  Integer and float columns fall back to 0
        when the cell is empty.  ``organism_id`` / ``preparation_id`` are
        only set when ``rel_type`` is "organism" / "preparation" and
        ``related_item_id`` is non-empty.  Rows are inserted in batches of
        25000, outside of any enclosing transaction.
        """
        frame = pandas.read_csv(file_path, header=0, delimiter='\t', quoting=csv.QUOTE_ALL)
        frame = frame.replace(np.nan, '', regex=True).replace('NULL', '', regex=True)
        rows = frame.to_dict('records')
        cnt = len(rows)

        # columns copied verbatim / coerced to int / coerced to float
        copied_columns = (
            "rel_type", "orig_food_id", "orig_food_common_name",
            "orig_food_scientific_name", "orig_food_part", "orig_compound_id",
            "orig_compound_name", "conc_unit", "citation", "citation_type",
            "orig_method", "orig_unit_expression", "ref_compound", "ref_food",
        )
        int_columns = ("source_compound_id", "source_food_id", "compound_id",
                       "related_item_id")
        float_columns = ("conc", "conc_min", "conc_max")

        pending = []
        for row in rows:
            remaining_time(cnt)
            rel_type = row["rel_type"]
            related_item = row["related_item_id"]
            organism_id = None
            preparation_id = None
            if related_item:
                if rel_type == "organism":
                    # ids up to and including 566 are shifted down by one;
                    # the reason for this offset is not visible here
                    if related_item > 566:
                        organism_id = related_item
                    else:
                        organism_id = related_item - 1
                elif rel_type == "preparation":
                    preparation_id = related_item

            field_values = {column: row[column] for column in copied_columns}
            for column in int_columns:
                field_values[column] = int(row[column] or 0)
            for column in float_columns:
                field_values[column] = float(row[column] or 0)
            pending.append(ChemicalConcentration(
                preparation_id=preparation_id,
                organism_id=organism_id,
                **field_values))

        print("Start bulk creation")
        for batch in chunks(pending, 25000):
            remaining_time(30)
            ChemicalConcentration.objects.bulk_create(batch)
        print("Finished")
Beispiel #11
0
    def handle(self, *args, **options):
        """Attach chemicals to concentrations by case-insensitive name.

        Each concentration's ``orig_compound_name`` is lower-cased and looked
        up among the lower-cased chemical names; on a hit the chemical is
        added to ``concentration.chemical``.  Concentrations are processed in
        chunks of 2000, one transaction per chunk.
        """
        queryset = ChemicalConcentration.objects.only("orig_compound_name")
        # lower-cased chemical name -> chemical primary key
        chemical_pk_by_name = dict(
            Chemical.objects.annotate(name_lower=Lower('name')).values_list(
                'name_lower', 'id'))
        total = queryset.count()
        for batch in chunks(queryset, 2000):
            with transaction.atomic():
                for concentration in batch:
                    remaining_time(total)
                    lookup_name = concentration.orig_compound_name.lower()
                    chemical_pk = chemical_pk_by_name.get(lookup_name)
                    if not chemical_pk:
                        continue
                    concentration.chemical.add(chemical_pk)
        print("Success!")
Beispiel #12
0
 def handle(self, *args, **options):
     """Add a fixed set of categories to chemicals named in a text file.

     The file named by ``options['file']`` holds one name per line; blank
     lines are skipped.  A chemical matches a line when its name equals it
     case-insensitively, or when the line is one of the "|"-separated
     entries of its ``synonyms`` field.  Every match receives all four
     categories.  The whole run shares one transaction.
     """
     target_slugs = [
         "natural-treatments", "beneficial-substances",
         "important-natural-compounds", "chemical_of_bilological_interest"
     ]
     categories = SubstanceCategory.objects.filter(slug__in=target_slugs)
     with open(options.get('file')) as f:
         lines = f.read().splitlines()
     count = len(lines)
     with transaction.atomic():
         for raw_line in lines:
             remaining_time(count)
             wanted = raw_line.strip()
             if not wanted:
                 continue
             # the synonym may be the only entry, the first, the last, or
             # somewhere in the middle of the "|"-separated list
             matcher = Q(name__iexact=wanted)
             matcher = matcher | Q(synonyms__iexact=wanted)
             matcher = matcher | Q(synonyms__istartswith=wanted + "|")
             matcher = matcher | Q(synonyms__iendswith="|" + wanted)
             matcher = matcher | Q(synonyms__icontains="|" + wanted + "|")
             for chemical in Chemical.objects.filter(matcher):
                 chemical.categories.add(*categories)
Beispiel #13
0
    def handle(self, *args, **options):
        """Compute ``unified_concentration`` for every concentration row.

        The value is ``conc`` (when positive) or otherwise ``conc_max`` (when
        set), multiplied by the ``multiplier`` entry for the row's
        ``conc_unit``.  Rows with neither value keep their current
        ``unified_concentration`` but are still saved.  Rows are processed in
        chunks of 2000, one transaction per chunk.
        """
        queryset = ChemicalConcentration.objects.only(
            "conc", "conc_unit", "conc_max")
        total = queryset.count()
        for batch in chunks(queryset, 2000):
            with transaction.atomic():
                for concentration in batch:
                    remaining_time(total)
                    # prefer the measured value, fall back to the maximum
                    base_value = None
                    if concentration.conc > 0:
                        base_value = concentration.conc
                    elif concentration.conc_max:
                        base_value = concentration.conc_max
                    if base_value is not None:
                        factor = multiplier.get(concentration.conc_unit)
                        concentration.unified_concentration = base_value * factor
                    concentration.save()
        print("Success!")
Beispiel #14
0
 def upload_chemical_disease_interactions(self, file):
     """Bulk-create ``ChemicalDiseaseInteraction`` rows from a directory of CSV parts.

     ``file`` is a directory path; every entry in it is read as a
     header-less CSV (lines starting with "#" are ignored) with the ten
     columns named below.  A row is imported only when both its disease id
     and its lower-cased inference gene symbol resolve to existing records;
     the chemical is looked up by ``"MESH:" + ChemicalID`` and may be
     ``None``.  Each part is inserted with its own ``bulk_create``; all
     parts share one transaction.
     """
     import os

     existing_chemicals = dict(Chemical.objects.filter(chemical_number__isnull=False).values_list("chemical_number", "id"))
     existing_diseases = dict(DiseaseTrait.objects.filter(ctd_id__isnull=False).values_list("ctd_id", "id"))
     existing_genes = dict(Gene.objects.filter(symbol__isnull=False).values_list("symbol", "id"))
     files = os.listdir(file)
     # progress estimate only -- presumably each part holds about a million
     # rows; verify against how the parts are produced
     count = len(files) * 1000000
     with transaction.atomic():
         for part in files:
             # os.path.join works whether or not ``file`` ends with a path
             # separator; plain concatenation required the trailing slash
             data = pandas.read_csv(os.path.join(file, part), header=None, comment="#", delimiter=',', quoting=csv.QUOTE_ALL)
             data.columns = ["ChemicalName", "ChemicalID", "CasRN", "DiseaseName", "DiseaseID",
                             "DirectEvidence", "InferenceGeneSymbol", "InferenceScore", "OmimIDs", "PubMedIDs"]
             data = data.replace(np.nan, '', regex=True)
             data = data.to_dict('records')
             interactions_to_create = []
             for row in data:
                 remaining_time(count)
                 chemical = existing_chemicals.get("MESH:" + row["ChemicalID"])
                 disease = existing_diseases.get(row["DiseaseID"])
                 gene = existing_genes.get(row["InferenceGeneSymbol"].lower())
                 if not (gene and disease):
                     continue
                 interactions_to_create.append(
                     ChemicalDiseaseInteraction(
                         chemical_id=chemical,
                         disease_id=disease,
                         inference_gene_id=gene,
                         direct_evidence=row["DirectEvidence"],
                         inference_score=float(row["InferenceScore"]) if row["InferenceScore"] else 0,
                         omim_ids=row["OmimIDs"],
                         pub_med_ids=row["PubMedIDs"],
                         cas_rn=row["CasRN"]
                     )
                 )
             ChemicalDiseaseInteraction.objects.bulk_create(interactions_to_create)
0
 def handle(self, *args, **options):
     """Create or update ``DiseaseTrait`` rows from a CSV of diseases.

     The CSV named by ``options['file']`` is read with every column as a
     string and its columns are renamed to the ``ctd_*`` names below.  For
     each row, diseases in ``disease_category`` whose name equals the row's
     name or any of its "|"-separated synonyms (case-insensitively) are
     passed to ``self.update_existing_disease``; when there is no such
     disease, ``self.create_new_disease`` is called with the row.
     Everything runs in one transaction.
     """
     headers = {
         "DiseaseName": str,
         "DiseaseID": str,
         "AltDiseaseIDs": str,
         "Definition": str,
         "ParentIDs": str,
         "TreeNumbers": str,
         "ParentTreeNumbers": str,
         "Synonyms": str,
         "SlimMapping": str
     }
     data = pandas.read_csv(options.get('file'),
                            header=0,
                            delimiter=',',
                            quoting=csv.QUOTE_ALL,
                            dtype=headers)
     data.columns = [
         "ctd_name", "ctd_id", "ctd_alt_id", "ctd_definition",
         "ctd_parent_ids", "ctd_tree_numbers", "ctd_parent_tree_numbers",
         "ctd_synonyms", "ctd_slim_mapping"
     ]
     data = data.replace(np.nan, '', regex=True)
     count = len(data)
     with transaction.atomic():
         for _, row in data.iterrows():
             remaining_time(count)
             query = Q(name__iexact=row["ctd_name"])
             for synonym in row.ctd_synonyms.split("|"):
                 # An empty synonyms cell splits to [""]; matching on ""
                 # would pull in every disease that has an empty name.
                 if synonym:
                     query |= Q(name__iexact=synonym)
             diseases = DiseaseTrait.objects.filter(
                 category=disease_category).filter(query)
             if diseases.exists():
                 for disease in diseases:
                     self.update_existing_disease(disease, row)
             else:
                 self.create_new_disease(row)
Beispiel #16
0
    def handle(self, *args, **options):
        """Import gene-ontology terms and their disease associations from a CSV.

        The distinct (GOName, GOID) pairs of the file named by
        ``options['file']`` are bulk-created as ``GeneOntology`` rows of type
        ``options['go_type']``.  Then one ``DiseaseGOAssociations`` row is
        created for every CSV row whose GOID and ``"MESH:" + DiseaseID`` both
        resolve to existing records.  Everything runs in one transaction.
        """
        go_type = options.get('go_type')
        data = pandas.read_csv(options.get('file'),
                               header=0,
                               delimiter=',',
                               quoting=csv.QUOTE_ALL)
        data = data.replace(np.nan, '', regex=True)
        # ``DataFrame.sort`` was removed from pandas; ``sort_values`` is the
        # supported way to sort by a column.
        go_data = data[["GOName", "GOID"]].sort_values(
            "GOID").drop_duplicates().to_dict('records')
        with transaction.atomic():
            GeneOntology.objects.bulk_create([
                GeneOntology(go_id=record.get("GOID"),
                             go_name=record.get("GOName"),
                             type=go_type) for record in go_data
            ])
            go_objects = dict(GeneOntology.objects.values_list("go_id", "id"))
            disease_objects = dict(
                DiseaseTrait.objects.filter(ctd_id__isnull=False).values_list(
                    "ctd_id", "id"))
            data = data.to_dict('records')
            count = len(data)
            interactions_to_save = []
            for row in data:
                remaining_time(count)
                go = go_objects.get(row["GOID"])
                disease = disease_objects.get("MESH:" + row["DiseaseID"])
                if not (go and disease):
                    # either side of the association is unknown
                    continue
                interactions_to_save.append(
                    DiseaseGOAssociations(
                        disease_id=disease,
                        gene_ontology_id=go,
                        inference_gene=row["InferenceGeneSymbols"],
                        inference_gene_qty=row["InferenceGeneQty"]))
            DiseaseGOAssociations.objects.bulk_create(interactions_to_save)