Beispiel #1
0
def update_search():
    """Feed every stored Gene, Study, Disease and Snp entity to its indexer.

    Each model is queried with ``all().run()`` and every entity returned is
    passed to the matching ``Add*Document`` function, one model after the
    other. A confirmation line is printed once all four models are done.
    """
    # (model, indexer) pairs, processed in this order.
    indexers = (
        (Gene, AddGeneDocument),
        (Study, AddStudyDocument),
        (Disease, AddDiseaseDocument),
        (Snp, AddSNPDocument),
    )
    for model, add_document in indexers:
        for entity in model.all().run():
            add_document(entity)
    print("Fulltext docs updated")
Beispiel #2
0
def _group_rows_by_pubmed_id(path):
    """Read the tab-separated catalog at *path* and group rows by PUBMEDID.

    Returns a dict mapping each stripped PUBMEDID value to the list of row
    dicts (as produced by ``csv.DictReader``) carrying it, in file order.
    """
    rows_by_pubid = {}
    # The with-block guarantees the handle is closed, even when a row is
    # malformed and the loop raises. Binary mode is kept from the original.
    with open(path, 'rb') as handle:
        for doc in csv.DictReader(handle, dialect='excel-tab'):
            rows_by_pubid.setdefault(doc["PUBMEDID"].strip(), []).append(doc)
    return rows_by_pubid


def _p_value_exponent(p_string):
    """Return the exponent magnitude of a p-value string: "6E-8" => 8.

    Returns 0 when *p_string* is not a float or has no "E-" part, so that
    such rows are ranked lowest by anything filtering on the exponent.
    """
    try:
        # make sure the value really is a float before picking it apart
        float(p_string)
        return int(p_string.split("E-")[1])
    except (ValueError, IndexError):
        return 0


def populate(path="gwascatalog.txt", limit=100):
    """Populate the database with data from the GWAS catalog TSV file.

    Creates Study, Disease, Gene, Snp and GWAS models, and the relations
    between them, from the rows of the file. Rows are grouped by PUBMEDID;
    each group yields one Study, and each row of the group yields at most
    one GWAS.

    Args:
        path: path of the tab-separated ("excel-tab") catalog file.
        limit: currently not consulted by the import; kept so existing
            callers keep working.
            NOTE(review): the loop stops at a hard-coded count instead
            (199 studies are processed) - confirm whether ``limit`` was
            meant to drive that cap.

    Raises:
        KeyError: if a column read by the import is missing from the file.
        ValueError: if a study's "Date" is not in %m/%d/%Y format.

    The docstring of the original import listed these field names of the
    TSV (excel) file:
    Date Added to Catalog
    PUBMEDID
    First Author
    Date
    Journal
    Link
    Study
    Disease/Trait
    Initial Sample Size
    Replication Sample Size
    Region
    Chr_id
    Chr_pos
    Reported Gene(s)
    Mapped_gene
    Upstream_gene_id
    Downstream_gene_id
    Snp_gene_ids
    Upstream_gene_distance
    Downstream_gene_distance
    Strongest SNP-Risk Allele
    SNPs
    Merged
    Snp_id_current
    Context
    Intergenic
    Risk Allele Frequency
    p-Value
    Pvalue_mlog
    p-Value (text)
    OR or beta
    95% CI (text)
    Platform [SNPs passing QC]
    CNV
    """
    # All GWAS rows belonging to the same study end up under the same key.
    pubids = _group_rows_by_pubmed_id(path)

    for i, (pid, lines) in enumerate(pubids.items(), 1):
        # Hard stop inherited from the original import: the 200th study
        # and everything after it are not processed.
        if i == 200:
            break

        # The first row of the group supplies the study-level fields.
        init = lines[0]

        # create or get the study, keyed by its pubmed id
        study = Study.get_or_insert(pid,
            name=init["Study"].strip(),
            pubmed_id=pid)

        # populate study with static data
        study.date = datetime.strptime(init["Date"].strip(), "%m/%d/%Y").date()
        study.pubmed_url = init["Link"].strip()
        study.init_sample = init["Initial Sample Size"].strip()
        study.repl_sample = init["Replication Sample Size"].strip()
        study.platform = init["Platform [SNPs passing QC]"].strip()
        study.put()

        disease_name = None
        disease = None
        for line in lines:
            # If the disease in this row differs from the previous row of
            # this study, create/fetch it and relate it to the study. The
            # first row always takes this branch, so `disease` is set
            # before it is used below.
            tmp = line["Disease/Trait"].strip().lower()
            if disease_name != tmp:
                disease_name = tmp
                disease = Disease.get_or_insert(disease_name,
                    name=disease_name)
                study.add_disease(disease)

            # A row has either one direct gene or an upstream/downstream
            # gene pair.
            gene = None
            up_gene = None
            down_gene = None

            # Rows whose "Intergenic" column equals "2" take the
            # up/downstream branch below; all others take the direct-gene
            # branch.
            # NOTE(review): the original comment here stated "1 ==
            # intergenic, 2 == not intergenic", which is the opposite of
            # what this comparison implements - confirm against the data.
            intergenic = line["Intergenic"].strip() == "2"

            if not intergenic:
                geneid = line["Snp_gene_ids"].strip()
                if geneid != "":
                    # Only rows mapping to exactly one gene name get a
                    # direct gene.
                    names = line["Mapped_gene"].split(" - ")
                    if len(names) == 1:
                        gene = Gene.get_or_insert(geneid,
                            name=line["Mapped_gene"],
                            geneid=geneid)
                        if study.key() not in gene.studies:
                            gene.studies.append(study.key())
                        if disease.key() not in gene.diseases:
                            gene.diseases.append(disease.key())
                        gene.put()
            else:
                # both the up- and the downstream gene id must be present
                down_id = line["Downstream_gene_id"].strip()
                up_id = line["Upstream_gene_id"].strip()

                if up_id != "" and down_id != "":
                    up_down_names = line["Mapped_gene"].split(" - ")
                    if len(up_down_names) < 2:
                        # gene = NR / NS or whatever..
                        up_down_names = ["N/A", "N/A"]

                    up_name = up_down_names[0]
                    down_name = up_down_names[1]

                    up_gene = Gene.get_or_insert(up_id,
                        name=up_name,
                        geneid=up_id)
                    down_gene = Gene.get_or_insert(down_id,
                        name=down_name,
                        geneid=down_id)

            # init snps..
            snp = None
            snpid = line["Snp_id_current"].strip()

            if snpid != "":
                # Create the SNP relation only if the id is a single
                # integer. Only the parse is guarded: a datastore error
                # must not be mistaken for a non-numeric id.
                try:
                    int(snpid)
                except ValueError:
                    # haplotype?
                    snpid = "N/A"
                else:
                    snp = Snp.get_or_insert(snpid,
                        snpid=snpid)
                    if gene:
                        snp.gene = gene
                    if study.key() not in snp.studies:
                        snp.studies.append(study.key())
                    if disease.key() not in snp.diseases:
                        snp.diseases.append(disease.key())
                    snp.put()

            # If no gene or snp relation is mentioned, ignore the row and
            # keep just the study.
            # NOTE(review): `gene` and `up_gene` are never both set for one
            # row, so the parenthesised test is always true and this skips
            # every row without a usable SNP. If rows with a gene but no
            # SNP should be kept, `or` probably wants to be `and` - confirm.
            if (gene is None or up_gene is None) and snp is None:
                print("skipping gwas")
                continue

            # init gwas
            gwas = GWAS(study=study,
                intergenic=intergenic,
                disease=disease,
                snp=snp)

            # if SNP is intergenic, save up/down-stream, else gene
            if intergenic:
                if down_gene is not None and up_gene is not None:
                    gwas.downstream = down_gene
                    gwas.upstream = up_gene
            else:
                # could be None
                gwas.gene = gene

            # parse remaining gwas information
            gwas.p_string = line["p-Value"].strip()
            # snp id as read, or "N/A" when it was not a single integer
            gwas.snps = snpid

            # exponent of the p-value (6E-8 => 8); 0 downgrades the row
            # for anything filtering on p-value
            gwas.p_val = _p_value_exponent(line["p-Value"])

            # could be interesting
            gwas.strongest_snp_risk_allele = \
                line["Strongest SNP-Risk Allele"].strip()
            gwas.OR_beta = line["OR or beta"].strip()
            gwas.riscAlleleFrequency = \
                line["Risk Allele Frequency"].strip()
            gwas.put()