Exemple #1
0
def task_create_riskreport(
        risk_report_id, genome_id
):  # NOTE: arguments for celery task should be JSON serializable
    """Build per-phenotype risk estimates for one genome and store them.

    For every GWAS Catalog phenotype, the SNP records that belong to the most
    recent ``date_downloaded`` and match the genome's population are looked
    up, one evidence article is chosen, and a ``SnpRiskReport`` is saved for
    each SNP of that article.  The per-SNP estimates are then combined with
    ``cumulative_risk`` and stored on the phenotype's ``PhenotypeRiskReport``.

    Args:
        risk_report_id: Primary key of the ``RiskReport`` to fill in.
        genome_id: Primary key of the ``Genome`` whose genotypes are used.

    Raises:
        RiskReport.DoesNotExist: if ``risk_report_id`` matches no row.
        Genome.DoesNotExist: if ``genome_id`` matches no row.
    """
    risk_report = RiskReport.objects.get(id=risk_report_id)
    genome = Genome.objects.get(id=genome_id)

    log.info('Creating riskreport ...')

    # TODO: Check for updates
    latest_date = GwasCatalogSnp.objects.aggregate(
        Max('date_downloaded'))['date_downloaded__max']

    phenotypes = GwasCatalogPhenotype.objects.all()
    log.info('#phenotypes: {}'.format(len(phenotypes)))

    if latest_date is None:
        # Max() over an empty table is None.  Without any catalog record no
        # phenotype can have evidence SNPs, and filtering on the string
        # 'None' below would be meaningless, so stop here.
        log.info('No GWAS Catalog snps found; nothing to report')
        log.info('Done')
        return

    population = [genome.population]

    for phenotype in phenotypes:
        # Sanity check only (asserts are stripped under -O).
        assert isinstance(phenotype, GwasCatalogPhenotype)
        gwas_snps = GwasCatalogSnp.objects.filter(
            phenotype=phenotype,
            population__contains=population,
            date_downloaded=str(latest_date))

        # exists() asks the database a yes/no question instead of loading
        # every matching row just to test for emptiness.
        if not gwas_snps.exists():
            continue

        # Select only one article for one phenotype
        #
        # TODO: add conditions
        # - risk alleles are present
        # - odds ratios are present
        # - (beta-coeff is not present)
        # - lower than minimum p-value
        #
        # NOTE(review): when every matching record has a null pubmed_id,
        # first() returns None and the filter below then matches exactly
        # those null-pubmed_id records -- confirm this fallback is intended.
        evidence_article_1st = gwas_snps.exclude(
            pubmed_id__isnull=True).order_by('reliability_rank').values_list(
                'pubmed_id', flat=True).distinct().first()
        evidence_snps = gwas_snps.filter(pubmed_id=evidence_article_1st)
        evidence_snp_ids = evidence_snps.values_list('snp_id_current',
                                                     flat=True)

        freqs = get_freqs(evidence_snp_ids, population=population)
        genotypes = Genotype.objects.filter(genome__id=genome.id,
                                            rs_id_current__in=evidence_snp_ids)

        phenotype_risk_report, _ = PhenotypeRiskReport.objects.get_or_create(
            risk_report=risk_report, phenotype=phenotype)

        # Calculate cumulative risk
        estimated_snp_risks = []

        # Genotype specific risks for each SNP.  The per-SNP rows and the
        # phenotype-level total are written in one transaction.
        with transaction.atomic():
            for evidence_snp in evidence_snps:
                # Risk allele and its frequency
                risk_allele_forward = evidence_snp.risk_allele_forward
                risk_allele_freq = freqs.get(evidence_snp.snp_id_current,
                                             {}).get(risk_allele_forward)
                odds_ratio = evidence_snp.odds_ratio

                # My genotype (None when this genome has no call for the SNP)
                try:
                    genotype = ''.join(
                        genotypes.get(rs_id_current=evidence_snp.snp_id_current
                                      ).genotype)
                    zygosities = zyg(genotype, risk_allele_forward)
                except Genotype.DoesNotExist:
                    zygosities = None

                # Genotype specific risks: only computable when frequency,
                # odds ratio and zygosity are all known.
                if None not in (risk_allele_freq, odds_ratio, zygosities):
                    genotype_specific_risks = genotype_specific_risks_relative_to_population(
                        risk_allele_freq, odds_ratio)
                    my_estimated_risk = estimated_risk(genotype_specific_risks,
                                                       zygosities)
                else:
                    my_estimated_risk = None

                SnpRiskReport(phenotype_risk_report=phenotype_risk_report,
                              evidence_snp=evidence_snp,
                              estimated_risk=my_estimated_risk).save()

                estimated_snp_risks.append(my_estimated_risk)

            phenotype_risk_report.estimated_risk = cumulative_risk(
                estimated_snp_risks)
            phenotype_risk_report.save()

    log.info('Done')
    def handle(self, *args, **options):
        """Download the GWAS Catalog dumps and import them into the database.

        Steps, in order:

        1. Fetch the catalog and the allele-frequency TSV archives into
           ``settings.GWASCATALOG_DIR`` (``get_url_content`` is called with
           ``if_not_exists=True``).
        2. Create or update one ``Snp`` row per allele-frequency record.
        3. Build one ``GwasCatalogSnp`` per catalog record and insert them
           all with a single ``bulk_create``.

        NOTE(review): the signature looks like a Django management command
        entry point, but the enclosing class is not visible here -- confirm.

        Raises:
            GwasCatalogParseError: if a record's ``date_downloaded`` is
                already present in ``GwasCatalogSnp``.
        """
        current_tz = timezone.get_current_timezone()

        if not os.path.exists(settings.GWASCATALOG_DIR):
            os.makedirs(settings.GWASCATALOG_DIR)

        # TODO: automatically choose latest version
        log.info('Fetching latest gwascatalog...')
        catalog_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz')
        get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz',
                        dst=catalog_path,
                        if_not_exists=True)

        log.info('Fetching latest gwascatalog allele freq...')
        catalog_freq_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz')
        get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz',
                        dst=catalog_freq_path,
                        if_not_exists=True)

        # - Gwas Catalog Allele Freq
        log.info('Updating snp allele freq records for gwascatalog...')
        num_created = 0
        num_updated = 0

        # NOTE(review): csv.DictReader over a binary-mode handle only works on
        # Python 2; on Python 3 the archives would have to be opened in text
        # mode ('rt').  The mode is left unchanged -- confirm the interpreter.
        with transaction.atomic():
            # The "with" closes the archive even if a row fails to import.
            with gzip.open(catalog_freq_path, 'rb') as freq_file:
                freq_records = csv.DictReader(
                    freq_file, delimiter='\t',
                    fieldnames=['snp_id_current', 'allele', 'freq', 'populations'])
                for record in freq_records:
                    _, created = Snp.objects.update_or_create(
                        snp_id_current=record['snp_id_current'],
                        population=record['populations'],
                        defaults={'allele': text2pg_array(record['allele']),
                                  'freq': text2pg_array(record['freq'])}
                    )
                    if created:
                        num_created += 1
                    else:
                        num_updated += 1

        log.info('updated: {} records'.format(num_updated))
        log.info('created: {} records'.format(num_created))

        # - Gwas Catalog
        log.info('Importing gwascatalog...')
        model_fields_map = dict(
            (field.name, field)
            for field in GwasCatalogSnp._meta.get_fields()
            if field.name not in ('id', 'created_at'))

        def blank_value(field_name):
            # "Blank or null" rule shared by every empty value: text columns
            # get '', anything else gets None.  The exact-type comparison is
            # deliberate so that subclasses keep being treated as before.
            field = model_fields_map.get(field_name)
            if type(field) in (models.fields.CharField, models.fields.TextField):
                return ''
            return None

        gwascatalog_snps = []
        num_phenotype_created = 0

        # Dates already verified as "not imported yet".  No GwasCatalogSnp row
        # is written before the bulk_create below, so one query per distinct
        # date is enough.
        checked_dates = set()

        with gzip.open(catalog_path, 'rb') as catalog_file:
            for record in csv.DictReader(catalog_file, delimiter='\t'):
                data = {}

                # If date_downloaded is already imported, abort.
                date_downloaded = record['date_downloaded']
                if date_downloaded not in checked_dates:
                    if GwasCatalogSnp.objects.filter(date_downloaded=date_downloaded).exists():
                        raise GwasCatalogParseError('Already imported date_downloaded: {}'.format(date_downloaded))
                    checked_dates.add(date_downloaded)

                # Import only pre-defined model fields
                for k, v in record.items():
                    if k not in model_fields_map:
                        continue

                    # Set blank or null
                    if v == '':
                        v = blank_value(k)

                    # Set datetime with timezone.  A missing value stays None
                    # instead of being handed to parse_date().
                    if v is not None and type(model_fields_map[k]) == models.DateTimeField:
                        v = current_tz.localize(datetime(*(parse_date(v).timetuple()[:5])))

                    data[k] = v

                # Parse population
                population = get_population(record['initial_sample'])

                # Calculate reliability rank
                reliability_rank = 1.0  # TODO: get_reliability_rank()

                # Reset the values that are only assigned inside the try block.
                # Otherwise a parse error would either hit an unbound name
                # (first record) or silently reuse the previous record's
                # unit / risk allele for this row.
                # NOTE(review): AMBIGOUS is reused as the fallback because it is
                # what the no-snp-id path already stores -- confirm it is the
                # right marker for records that failed validation.
                unit = blank_value('beta_coeff_unit')
                risk_allele_forward = AMBIGOUS

                try:
                    # Parse and validate odds_ratio, beta_coeff
                    _, unit = get_ci_and_unit(record['confidence_interval_95_percent'])
                    odds_ratio, beta_coeff = get_odds_ratio_or_beta_coeff(record['odds_ratio_or_beta_coeff'], unit)

                    # Validate risk_allele
                    #
                    # Strands of risk alleles in GWAS Catalog are not set to forward strands with respect to
                    # the human reference genome b37. So we get forward strand alleles by checking consistency of
                    # allele frequencies between reported risk alleles and 1000 Genomes Project alleles.
                    if data['snp_id_current']:
                        snp_id = int(data['snp_id_current'])
                        database_freq = get_freqs([snp_id], population).get(snp_id)
                        risk_allele_forward = get_database_strand_allele(record['risk_allele'], record['risk_allele_freq_reported'],
                                                                         database_freq, freq_diff_thrs=settings.GWASCATALOG_FREQ_DIFF_THRS)
                    else:
                        risk_allele_forward = AMBIGOUS

                    is_active = True

                except GwasCatalogParseError as e:
                    # Keep the record, but flag it as inactive.
                    log.error(e)
                    odds_ratio, beta_coeff = None, None
                    is_active = False

                # - Phenotype
                phenotype, phenotype_created = GwasCatalogPhenotype.objects.get_or_create(name=record['disease_or_trait'])
                if phenotype_created:
                    num_phenotype_created += 1

                data.update({'population':          population,
                             'reliability_rank':    reliability_rank,
                             'odds_ratio':          odds_ratio,
                             'beta_coeff':          beta_coeff,
                             'beta_coeff_unit':     unit,
                             'risk_allele_forward': risk_allele_forward,
                             'phenotype':           phenotype,
                             'is_active':           is_active})

                gwascatalog_snps.append(GwasCatalogSnp(**data))

        with transaction.atomic():
            GwasCatalogSnp.objects.bulk_create(gwascatalog_snps)

        log.info('GWAS Catalog snps processed: {} records'.format(len(gwascatalog_snps)))
        log.info('GWAS Catalog phenotypes newly created: {} records'.format(num_phenotype_created))

        log.info('Done.')
Exemple #3
0
def task_create_riskreport(risk_report_id, genome_id):  # NOTE: arguments for celery task should be JSON serializable
    """Build per-phenotype risk estimates for one genome and store them.

    For every GWAS Catalog phenotype, the SNP records that belong to the most
    recent ``date_downloaded`` and match the genome's population are looked
    up, one evidence article is chosen, and a ``SnpRiskReport`` is saved for
    each SNP of that article.  The per-SNP estimates are then combined with
    ``cumulative_risk`` and stored on the phenotype's ``PhenotypeRiskReport``.

    Args:
        risk_report_id: Primary key of the ``RiskReport`` to fill in.
        genome_id: Primary key of the ``Genome`` whose genotypes are used.

    Raises:
        RiskReport.DoesNotExist: if ``risk_report_id`` matches no row.
        Genome.DoesNotExist: if ``genome_id`` matches no row.
    """
    risk_report = RiskReport.objects.get(id=risk_report_id)
    genome = Genome.objects.get(id=genome_id)

    log.info('Creating riskreport ...')

    # TODO: Check for updates
    latest_date = GwasCatalogSnp.objects.aggregate(Max('date_downloaded'))['date_downloaded__max']

    phenotypes = GwasCatalogPhenotype.objects.all()
    log.info('#phenotypes: {}'.format(len(phenotypes)))

    if latest_date is None:
        # Max() over an empty table is None.  Without any catalog record no
        # phenotype can have evidence SNPs, and filtering on the string
        # 'None' below would be meaningless, so stop here.
        log.info('No GWAS Catalog snps found; nothing to report')
        log.info('Done')
        return

    population = [genome.population]

    for phenotype in phenotypes:
        # Sanity check only (asserts are stripped under -O).
        assert isinstance(phenotype, GwasCatalogPhenotype)
        gwas_snps = GwasCatalogSnp.objects.filter(phenotype=phenotype,
                                                  population__contains=population,
                                                  date_downloaded=str(latest_date))

        # exists() asks the database a yes/no question instead of loading
        # every matching row just to test for emptiness.
        if not gwas_snps.exists():
            continue

        # Select only one article for one phenotype
        #
        # TODO: add conditions
        # - risk alleles are present
        # - odds ratios are present
        # - (beta-coeff is not present)
        # - lower than minimum p-value
        #
        # NOTE(review): when every matching record has a null pubmed_id,
        # first() returns None and the filter below then matches exactly
        # those null-pubmed_id records -- confirm this fallback is intended.
        evidence_article_1st = gwas_snps.exclude(pubmed_id__isnull=True).order_by('reliability_rank').values_list('pubmed_id', flat=True).distinct().first()
        evidence_snps = gwas_snps.filter(pubmed_id=evidence_article_1st)
        evidence_snp_ids = evidence_snps.values_list('snp_id_current', flat=True)

        freqs = get_freqs(evidence_snp_ids, population=population)
        genotypes = Genotype.objects.filter(genome__id=genome.id, rs_id_current__in=evidence_snp_ids)

        phenotype_risk_report, _ = PhenotypeRiskReport.objects.get_or_create(risk_report=risk_report, phenotype=phenotype)

        # Calculate cumulative risk
        estimated_snp_risks = []

        # Genotype specific risks for each SNP.  The per-SNP rows and the
        # phenotype-level total are written in one transaction.
        with transaction.atomic():
            for evidence_snp in evidence_snps:
                # Risk allele and its frequency
                risk_allele_forward = evidence_snp.risk_allele_forward
                risk_allele_freq = freqs.get(evidence_snp.snp_id_current, {}).get(risk_allele_forward)
                odds_ratio = evidence_snp.odds_ratio

                # My genotype (None when this genome has no call for the SNP)
                try:
                    genotype = ''.join(genotypes.get(rs_id_current=evidence_snp.snp_id_current).genotype)
                    zygosities = zyg(genotype, risk_allele_forward)
                except Genotype.DoesNotExist:
                    zygosities = None

                # Genotype specific risks: only computable when frequency,
                # odds ratio and zygosity are all known.
                if None not in (risk_allele_freq, odds_ratio, zygosities):
                    genotype_specific_risks = genotype_specific_risks_relative_to_population(risk_allele_freq, odds_ratio)
                    my_estimated_risk = estimated_risk(genotype_specific_risks, zygosities)
                else:
                    my_estimated_risk = None

                SnpRiskReport(phenotype_risk_report=phenotype_risk_report,
                              evidence_snp=evidence_snp,
                              estimated_risk=my_estimated_risk).save()

                estimated_snp_risks.append(my_estimated_risk)

            phenotype_risk_report.estimated_risk = cumulative_risk(estimated_snp_risks)
            phenotype_risk_report.save()

    log.info('Done')