def update_search():
    """Populate fulltext indexes.

    Walks every stored Gene, Study, Disease and Snp entity, in that order,
    hands each one to its matching Add*Document function and finally prints
    a confirmation line.
    """
    indexers = (
        (Gene, AddGeneDocument),
        (Study, AddStudyDocument),
        (Disease, AddDiseaseDocument),
        (Snp, AddSNPDocument),
    )
    for model, add_document in indexers:
        for entity in model.all().run():
            add_document(entity)
    print("Fulltext docs updated")
def _group_rows_by_pubmed_id(path):
    """Read the tab-separated catalog at *path* and group its rows by PUBMEDID."""
    grouped = {}
    # 'rb' is the mode the Python 2 csv module expects; the with-block makes
    # sure the handle is closed once every row has been read.
    with open(path, 'rb') as catalog:
        for row in csv.DictReader(catalog, dialect='excel-tab'):
            grouped.setdefault(row["PUBMEDID"].strip(), []).append(row)
    return grouped


def _append_missing(keys, key):
    """Append *key* to the list *keys* unless it is already in there."""
    if key not in keys:
        keys.append(key)


def _p_value_exponent(p_text):
    """Return the digits following "E-" in *p_text* as an int ("6E-8" => 8).

    Returns 0 when *p_text* is not a float or carries no "E-" exponent, which
    makes p-value filters rank such a row lowest.
    """
    try:
        # make sure the text really is a float before slicing the exponent out
        float(p_text)
        return int(p_text.split("E-")[1])
    except (ValueError, IndexError):
        return 0


def _upsert_study(pid, first_row):
    """Create or fetch the Study keyed by *pid*, fill it from *first_row*, save it."""
    study = Study.get_or_insert(pid, name=first_row["Study"].strip(),
                                pubmed_id=pid)
    study.date = datetime.strptime(first_row["Date"].strip(),
                                   "%m/%d/%Y").date()
    study.pubmed_url = first_row["Link"].strip()
    study.init_sample = first_row["Initial Sample Size"].strip()
    study.repl_sample = first_row["Replication Sample Size"].strip()
    study.platform = first_row["Platform [SNPs passing QC]"].strip()
    study.put()
    return study


def _import_gwas_row(study, disease, line):
    """Store the genes, SNP and GWAS entity described by one catalog row.

    Prints "skipping gwas" and stores no GWAS entity when the row yields no
    SNP (see the review note at the skip check below).
    """
    # A gwas has either a direct gene or a down-stream and up-stream gene.
    gene = None
    up_gene = None
    down_gene = None

    # NOTE(review): the original comment said the catalog encodes
    # 1 == intergenic and 2 == not intergenic, yet the flag below is True
    # for "2" - confirm against the data before relying on either reading.
    intergenic = line["Intergenic"].strip() == "2"
    if not intergenic:
        geneid = line["Snp_gene_ids"].strip()
        # a direct gene is only recorded when Mapped_gene holds a single name
        if geneid != "" and len(line["Mapped_gene"].split(" - ")) == 1:
            gene = Gene.get_or_insert(geneid, name=line["Mapped_gene"],
                                      geneid=geneid)
            _append_missing(gene.studies, study.key())
            _append_missing(gene.diseases, disease.key())
            gene.put()
    else:
        # both the up- and the down-stream gene id must be set
        down_id = line["Downstream_gene_id"].strip()
        up_id = line["Upstream_gene_id"].strip()
        if up_id != "" and down_id != "":
            up_down_names = line["Mapped_gene"].split(" - ")
            if len(up_down_names) < 2:
                # Mapped_gene is not an "up - down" pair (NR / NS or similar)
                up_down_names = ["N/A", "N/A"]
            up_gene = Gene.get_or_insert(up_id, name=up_down_names[0],
                                         geneid=up_id)
            down_gene = Gene.get_or_insert(down_id, name=up_down_names[1],
                                           geneid=down_id)

    snp = None
    snpid = line["Snp_id_current"].strip()
    if snpid != "":
        # Create the relation only if a single, numeric snp id is given.
        # Only the numeric check is guarded, so datastore errors are no
        # longer silently turned into "N/A".
        try:
            int(snpid)
        except ValueError:
            # haplotype?
            snpid = "N/A"
        else:
            snp = Snp.get_or_insert(snpid, snpid=snpid)
            if gene:
                snp.gene = gene
            _append_missing(snp.studies, study.key())
            _append_missing(snp.diseases, disease.key())
            snp.put()

    # NOTE(review): gene and up_gene are never both set, so the left operand
    # is always true and every row without a SNP is skipped. Confirm whether
    # "gene is None and up_gene is None" was intended.
    if (gene is None or up_gene is None) and snp is None:
        print("skipping gwas")
        return

    gwas = GWAS(study=study, intergenic=intergenic, disease=disease, snp=snp)
    # if the SNP is intergenic, save up/down-stream genes, else the gene
    if intergenic:
        if down_gene is not None and up_gene is not None:
            gwas.downstream = down_gene
            gwas.upstream = up_gene
    else:
        # could be None
        gwas.gene = gene

    gwas.p_string = line["p-Value"].strip()
    # "N/A" when the id was not numeric, "" when the column was blank
    gwas.snps = snpid
    gwas.p_val = _p_value_exponent(line["p-Value"])
    # could be interesting
    gwas.strongest_snp_risk_allele = line["Strongest SNP-Risk Allele"].strip()
    gwas.OR_beta = line["OR or beta"].strip()
    gwas.riscAlleleFrequency = line["Risk Allele Frequency"].strip()
    gwas.put()


def populate(path="gwascatalog.txt", limit=100):
    """Populate the database with data from gwascatalog.txt.

    Reads the tab-separated (excel-tab) catalog at *path*, groups its rows by
    PUBMEDID and, per group, creates or updates one Study plus the Disease,
    Gene, Snp and GWAS entities its rows describe.

    Columns read from each row:
        PUBMEDID, Study, Date, Link, Initial Sample Size,
        Replication Sample Size, Platform [SNPs passing QC], Disease/Trait,
        Intergenic, Snp_gene_ids, Mapped_gene, Downstream_gene_id,
        Upstream_gene_id, Snp_id_current, p-Value,
        Strongest SNP-Risk Allele, OR or beta, Risk Allele Frequency

    Args:
        path: location of the catalog file.
        limit: NOTE(review): currently unused - the loop stops at the
            hard-coded cap below instead. Confirm the intended default
            before wiring this parameter in.
    """
    # The loop breaks when the counter reaches this value, so at most
    # study_cap - 1 studies are imported.
    study_cap = 200

    rows_by_pubmed_id = _group_rows_by_pubmed_id(path)
    for seen, (pid, lines) in enumerate(rows_by_pubmed_id.items(), 1):
        if seen == study_cap:
            break

        # the first row of the group supplies the study's static data
        study = _upsert_study(pid, lines[0])

        disease_name = None
        disease = None
        for line in lines:
            # if the disease in this row differs from the previous row of
            # this study, create a new disease relation
            row_disease = line["Disease/Trait"].strip().lower()
            if disease_name != row_disease:
                disease_name = row_disease
                disease = Disease.get_or_insert(disease_name,
                                                name=disease_name)
                study.add_disease(disease)
            _import_gwas_row(study, disease, line)