# Shared imports for the management commands below. Project-specific helpers
# (log, get_url_content, text2pg_array, parse_date, get_population, get_freqs,
# get_ci_and_unit, get_odds_ratio_or_beta_coeff, get_database_strand_allele,
# AMBIGOUS, and the Snp/GwasCatalog* models) are assumed to be importable
# from the surrounding app.
import csv
import glob
import gzip
import os
import platform
import shutil
import tarfile
from datetime import datetime

from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import models, transaction
from django.utils import timezone


# Management command: fetch the go-vcf-tools binaries and the dbSNP
# RsMergeArch table. Reconstructed here as a standard Django management
# command class; in the original project each command lives in its own module.
class Command(BaseCommand):

    def handle(self, *args, **options):
        tmp_dir = os.path.join(settings.BASE_DIR, 'tmp')
        bin_dir = os.path.join(settings.BASE_DIR, 'bin')
        # Make sure the download and install directories exist (the second
        # command below creates its own directory the same way).
        for path in (tmp_dir, bin_dir):
            if not os.path.exists(path):
                os.makedirs(path)

        log.info('Fetching go-vcf-tools ...')
        url = '{repo}/releases/download/{tag}/go-vcf.{os_platform}-amd64.tar.gz'.format(
            repo='https://github.com/knmkr/go-vcf-tools',
            tag='0.1.0',
            os_platform=platform.system().lower())
        tar_gz = os.path.join(tmp_dir, 'go-vcf.tar.gz')
        get_url_content(url, tar_gz, if_not_exists=True)

        log.info('Extracting go-vcf-tools ...')
        dst = os.path.join(tmp_dir, 'go-vcf')
        with tarfile.open(tar_gz, 'r') as tar:
            tar.extractall(dst)
        # Install the extracted binaries into bin/.
        for tool in glob.glob(os.path.join(dst, '*')):
            shutil.copy(tool, bin_dir)

        os.remove(tar_gz)
        shutil.rmtree(dst)

        log.info('Fetching RsMergeArch ...')
        url = 'http://ftp.ncbi.nih.gov/snp/organisms/human_9606_b144_GRCh37p13/database/organism_data/RsMergeArch.bcp.gz'
        get_url_content(url,
                        settings.RS_MERGE_ARCH_PATH,
                        if_not_exists=True,
                        md5='836289e6fe867bd5a6754802f05b2fb8')

        log.info('Done.')
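
# `get_url_content` is a project helper not shown on this page. A minimal
# sketch of what it plausibly does, using only the standard library; the real
# helper's signature and behavior may differ:
import hashlib
import urllib.request

def get_url_content(url, dst, if_not_exists=False, md5=None):
    """Download `url` to `dst`; optionally skip existing files and verify MD5."""
    if if_not_exists and os.path.exists(dst):
        return dst
    urllib.request.urlretrieve(url, dst)  # no retry/cleanup, for brevity
    if md5:
        with open(dst, 'rb') as fin:
            digest = hashlib.md5(fin.read()).hexdigest()
        if digest != md5:
            raise ValueError('MD5 mismatch for {}: got {}'.format(dst, digest))
    return dst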
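
# `text2pg_array` (used below) renders a delimited text field as a PostgreSQL
# array literal for the Snp model's array columns. A minimal sketch, assuming
# a comma-separated input such as 'A,T' (the real delimiter and edge-case
# handling are project-specific):
def text2pg_array(text):
    """Render 'A,T' as the PostgreSQL array literal '{A,T}'."""
    return '{' + ','.join(item.strip() for item in text.split(',')) + '}'

# Example: text2pg_array('0.42, 0.58') == '{0.42,0.58}'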
# Management command: import a cleaned GWAS Catalog release and its allele
# frequencies (in the original project this is a separate command module).
class Command(BaseCommand):

    def handle(self, *args, **options):
        current_tz = timezone.get_current_timezone()

        if not os.path.exists(settings.GWASCATALOG_DIR):
            os.makedirs(settings.GWASCATALOG_DIR)

        # TODO: automatically choose latest version
        log.info('Fetching latest gwascatalog...')
        catalog_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz')
        get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-cleaned.tsv.gz',
                        dst=catalog_path,
                        if_not_exists=True)

        log.info('Fetching latest gwascatalog allele freq...')
        catalog_freq_path = os.path.join(settings.GWASCATALOG_DIR, 'dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz')
        get_url_content(url='https://github.com/knmkr/dbsnp-pg-min/releases/download/0.5.2/dbsnp-pg-min-0.5.2-b144-GRCh37.gwascatalog-snps-allele-freq.tsv.gz',
                        dst=catalog_freq_path,
                        if_not_exists=True)

        # - Gwas Catalog Allele Freq
        log.info('Updating snp allele freq records for gwascatalog...')
        num_created = 0
        num_updated = 0

        with transaction.atomic():
            # Open in text mode ('rt') so csv iterates over str lines on Python 3.
            for record in csv.DictReader(gzip.open(catalog_freq_path, 'rt'), delimiter='\t',
                                         fieldnames=['snp_id_current', 'allele', 'freq', 'populations']):
                snp, created = Snp.objects.update_or_create(
                    snp_id_current=record['snp_id_current'],
                    population=record['populations'],
                    defaults={'allele': text2pg_array(record['allele']),
                              'freq': text2pg_array(record['freq'])}
                )
                if created:
                    num_created += 1
                else:
                    num_updated += 1

        log.info('updated: {} records'.format(num_updated))
        log.info('created: {} records'.format(num_created))

        # - Gwas Catalog
        log.info('Importing gwascatalog...')
        model_fields = [field for field in GwasCatalogSnp._meta.get_fields() if field.name not in ('id', 'created_at')]
        model_field_names = [field.name for field in model_fields]
        model_fields_map = dict(zip(model_field_names, model_fields))

        gwascatalog_snps = []
        num_phenotype_created = 0

        for record in csv.DictReader(gzip.open(catalog_path, 'rt'), delimiter='\t'):
            data = {}

            # If date_downloaded is already imported, abort.
            date_downloaded = record['date_downloaded']
            if GwasCatalogSnp.objects.filter(date_downloaded=date_downloaded).exists():
                raise GwasCatalogParseError('Already imported date_downloaded: {}'.format(date_downloaded))

            # Import only pre-defined model fields
            for k, v in record.items():
                if k in model_field_names:
                    # Set blank or null
                    if v == '':
                        if type(model_fields_map[k]) in (models.fields.CharField, models.fields.TextField):
                            v = ''
                        else:
                            v = None

                    # Set datetime with timezone
                    if type(model_fields_map[k]) == models.DateTimeField:
                        v = current_tz.localize(datetime(*(parse_date(v).timetuple()[:5])))

                    data[k] = v

            # Parse population
            population       = get_population(record['initial_sample'])

            # Calculate reliability rank
            reliability_rank = 1.0  # TODO: get_reliability_rank()

            try:
                # Parse and validate odds_ratio, beta_coeff
                _, unit                = get_ci_and_unit(record['confidence_interval_95_percent'])
                odds_ratio, beta_coeff = get_odds_ratio_or_beta_coeff(record['odds_ratio_or_beta_coeff'], unit)

                # Validate risk_allele
                #
                # Strands of risk alleles in the GWAS Catalog are not normalized to the forward strand
                # of the human reference genome (b37). We therefore recover the forward-strand allele by
                # checking the consistency of allele frequencies between the reported risk allele and the
                # 1000 Genomes Project alleles (a standalone sketch of this check follows this command).
                if data['snp_id_current']:
                    snp_id = int(data['snp_id_current'])
                    database_freq = get_freqs([snp_id], population).get(snp_id)
                    risk_allele_forward = get_database_strand_allele(record['risk_allele'], record['risk_allele_freq_reported'],
                                                                     database_freq, freq_diff_thrs=settings.GWASCATALOG_FREQ_DIFF_THRS)
                else:
                    risk_allele_forward = AMBIGOUS

                is_active = True

            except GwasCatalogParseError as e:
                log.error(e)
                # `unit` and `risk_allele_forward` may be unset (or stale from a
                # previous iteration) if parsing failed above, so reset them too.
                odds_ratio, beta_coeff, unit = None, None, None
                risk_allele_forward = None
                is_active = False

            # - Phenotype
            phenotype, phenotype_created = GwasCatalogPhenotype.objects.get_or_create(name=record['disease_or_trait'])
            if phenotype_created:
                num_phenotype_created += 1

            data.update({'population':          population,
                         'reliability_rank':    reliability_rank,
                         'odds_ratio':          odds_ratio,
                         'beta_coeff':          beta_coeff,
                         'beta_coeff_unit':     unit,
                         'risk_allele_forward': risk_allele_forward,
                         'phenotype':           phenotype,
                         'is_active':           is_active})

            gwascatalog_snps.append(GwasCatalogSnp(**data))

        with transaction.atomic():
            GwasCatalogSnp.objects.bulk_create(gwascatalog_snps)

        log.info('GWAS Catalog snps processed: {} records'.format(len(gwascatalog_snps)))
        log.info('GWAS Catalog phenotypes newly created: {} records'.format(num_phenotype_created))

        log.info('Done.')
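
# The strand check referenced in the risk-allele comment above compares the
# reported risk-allele frequency with the database (1000 Genomes) frequency
# to decide whether the reported allele is already on the forward strand or
# must be complemented. A minimal sketch of that idea with hypothetical data
# shapes; the project's get_database_strand_allele may differ:
COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

def strand_consistent_allele(allele, reported_freq, database_freq, freq_diff_thrs=0.1):
    """Return the forward-strand allele, or None if the call is ambiguous.

    `database_freq` is assumed to map allele -> forward-strand frequency,
    e.g. {'A': 0.31, 'G': 0.69}.
    """
    if not database_freq or reported_freq is None:
        return None
    # Try the reported allele first, then its complement; accept whichever
    # frequency agrees with the reported frequency within the threshold.
    for candidate in (allele, COMPLEMENT.get(allele)):
        freq = database_freq.get(candidate)
        if freq is not None and abs(freq - reported_freq) <= freq_diff_thrs:
            return candidate
    return None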