Ejemplo n.º 1
0
    def test_download_file(self, mock_urllib, mock_getsize, mock_isfile, mock_logger):
        """Exercise download_file(): invalid-url rejection, re-use of an
        already-downloaded file, and a fresh FTP download written to disk.
        """

        # Test bad url
        with self.assertRaises(ValueError) as ve:
            download_file("bad_url")
        # NOTE(review): `exception.message` is Python 2 only — on Python 3 this would
        # need str(ve.exception); confirm which interpreter runs this suite
        self.assertEqual(ve.exception.message, "Invalid url: bad_url")

        # Test already downloaded
        # Register HEAD responses so download_file can compare the remote
        # Content-Length against the (mocked) local file size.
        responses.add(responses.HEAD, 'https://mock_url/test_file.gz',
                      headers={"Content-Length": "1024"}, status=200)
        responses.add(responses.HEAD, 'ftp://mock_url/test_file.txt',
                      headers={"Content-Length": "1024"}, status=200)
        # Local file "exists" with a size matching Content-Length -> download is skipped
        mock_isfile.return_value = True
        mock_getsize.return_value = 1024
        result = download_file('https://mock_url/test_file.gz')
        mock_logger.info.assert_called_with('Re-using {} previously downloaded from https://mock_url/test_file.gz'.format(result))

        # Now simulate a missing local file so a real (mocked) download happens
        mock_isfile.return_value = False
        mock_getsize.return_value = 0
        mock_logger.reset_mock()
        mock_urllib.urlopen.return_value = "test data\nanother line\n"
        result = download_file('ftp://mock_url/test_file.txt', self.test_dir)
        mock_logger.info.assert_called_with("Downloading ftp://mock_url/test_file.txt to {}/test_file.txt".format(self.test_dir))
        mock_urllib.urlopen.assert_called_with('ftp://mock_url/test_file.txt')
        self.assertEqual(result, "{}/test_file.txt".format(self.test_dir))

        # The mocked payload should have been written verbatim to the target path
        with open("{}/test_file.txt".format(self.test_dir), 'r') as f:
            line1 = f.readline()
            line2 = f.readline()
        self.assertEqual(line1, "test data\n")
        self.assertEqual(line2, "another line\n")
Ejemplo n.º 2
0
def update_gtex(gtex_expression_data_path=None, gtex_sample_annotations_path=None):
    """Populate the GeneExpression table from GTEx per-gene tissue expression values.

    Args:
        gtex_expression_data_path (str): optional local path to the GTEx expression
            data file; downloaded from GTEX_EXPRESSION_DATA when not provided.
        gtex_sample_annotations_path (str): optional local path to the GTEx sample
            annotations file; downloaded from GTEX_SAMPLE_ANNOTATIONS when not provided.
    """
    # fall back to downloading either input that wasn't supplied
    gtex_expression_data_path = gtex_expression_data_path or download_file(GTEX_EXPRESSION_DATA)
    gtex_sample_annotations_path = gtex_sample_annotations_path or download_file(GTEX_SAMPLE_ANNOTATIONS)

    total_records = 0
    new_records = 0
    record_iter = get_tissue_expression_values_by_gene(
        gtex_expression_data_path,
        gtex_sample_annotations_path,
    )
    for gene_id, expression_array in record_iter:
        total_records += 1
        gene = GeneInfo.objects.filter(gene_id=gene_id).only('id').first()
        if gene is None:
            # GTEx gene not in our gencode-derived GeneInfo table - skip it
            logger.info("GTEx gene id not found: %s", gene_id)
            continue
        _, was_created = GeneExpression.objects.get_or_create(
            gene=gene,
            expression_values=expression_array)

        if was_created:
            new_records += 1

    logger.info("Done. Parsed %s records from %s. Created %s new GeneExpression entries.",
        total_records, gtex_expression_data_path, new_records)
Ejemplo n.º 3
0
def update_omim(omim_key=None, genemap2_file_path=None):
    """Updates the OMIM table, using either the genemap2_file_path to load an existing local genemap2.txt file, or
    if an omim_key is provided instead, using the omim_key to download the file from https://www.omim.org

    Args:
        omim_key (str): OMIM download key obtained by filling in a form at https://www.omim.org/downloads/
        genemap2_file_path (str): path of a local genemap2.txt file

    Raises:
        CommandError: if neither arg is provided, or if the file can't be parsed
    """
    if not genemap2_file_path:
        if not omim_key:
            raise CommandError("Must provide --omim-key or genemap2.txt file path")
        genemap2_file_path = download_file(url=GENEMAP2_URL.format(omim_key=omim_key))

    logger.info("Parsing genemap2 file")
    # context manager ensures the file handle is closed (the original leaked it)
    with open(genemap2_file_path) as genemap2_file:
        try:
            genemap2_records = [r for r in parse_genemap2_table(tqdm(genemap2_file, unit=" lines"))]
        except Exception as e:
            # report the path, not the repr of the file object
            raise CommandError("Unable to parse {}: {}".format(genemap2_file_path, e))

    logger.info("Deleting {} existing OMIM records".format(Omim.objects.count()))
    Omim.objects.all().delete()

    logger.info("Creating {} OMIM gene-phenotype association records".format(len(genemap2_records)))
    gene_symbol_to_gene_id = collections.defaultdict(set)  # lookup for symbols that have a 1-to-1 mapping to ENSG ids in gencode
    gene_id_to_gene_info = {}
    for gene_info in GeneInfo.objects.all().only('gene_id', 'gene_symbol'):
        gene_symbol_to_gene_id[gene_info.gene_symbol].add(gene_info.gene_id)
        gene_id_to_gene_info[gene_info.gene_id] = gene_info

    skip_counter = 0
    for omim_record in tqdm(genemap2_records, unit=" records"):
        gene_id = omim_record["gene_id"]
        gene_symbol = omim_record["gene_symbol"]
        if not gene_id and len(gene_symbol_to_gene_id.get(gene_symbol, [])) == 1:
            # record lacks an ENSG id, but its symbol maps unambiguously to exactly one
            # next(iter(...)) works on both py2 and py3; the original py2-only
            # iter(...).next() raises AttributeError on py3
            gene_id = next(iter(gene_symbol_to_gene_id[gene_symbol]))
            omim_record["gene_id"] = gene_id
            logger.info("Mapped gene symbol {} to gene_id {}".format(gene_symbol, gene_id))

        gene = gene_id_to_gene_info.get(gene_id)
        if not gene:
            skip_counter += 1
            # logger.warn() is deprecated in favor of logger.warning()
            logger.warning(("OMIM gene id '{}' not found in GeneInfo table. "
                            "Running ./manage.py update_gencode to update the gencode version might fix this. "
                            "Full OMIM record: {}").format(gene_id, omim_record))
            continue

        # replace the id/symbol keys with the resolved GeneInfo foreign key
        del omim_record["gene_id"]
        del omim_record["gene_symbol"]

        omim_record['gene'] = gene
        Omim.objects.create(**omim_record)

    logger.info("Done")
    logger.info("Loaded {} OMIM records from {}. Skipped {} records with unrecognized gene id".format(
        Omim.objects.count(), genemap2_file_path, skip_counter))
Ejemplo n.º 4
0
def update_records(reference_data_handler, file_path=None):
    """Load a reference-data table into the handler's model.

    Args:
        reference_data_handler: handler object supplying url, model_cls, parsing hooks,
            and the keep_existing_records / post_process_models options.
        file_path (str): optional local file path. If not specified, or the path doesn't exist, the table will be downloaded.
    """

    # download the source table unless a usable local copy was given
    if not (file_path and os.path.isfile(file_path)):
        file_path = download_file(reference_data_handler.url)

    model_cls = reference_data_handler.model_cls
    model_name = model_cls.__name__
    model_objects = getattr(model_cls, 'objects')

    if not reference_data_handler.keep_existing_records:
        logger.info("Deleting {} existing {} records".format(model_objects.count(), model_name))
        model_objects.all().delete()

    models = []
    skip_counter = 0
    logger.info('Parsing file')
    # transparently handle gzipped input; text mode either way
    is_gzipped = file_path.endswith('.gz')
    opener = gzip.open if is_gzipped else open
    mode = 'rt' if is_gzipped else 'r'
    with opener(file_path, mode) as f:
        header_fields = reference_data_handler.get_file_header(f)

        for line in tqdm(f, unit=" records"):
            raw_record = dict(zip(header_fields, line.rstrip('\r\n').split('\t')))
            # a single input line may expand to several (or zero) parsed records
            for parsed in reference_data_handler.parse_record(raw_record):
                if parsed is None:
                    continue

                try:
                    parsed['gene'] = reference_data_handler.get_gene_for_record(parsed)
                except ValueError as e:
                    # unrecognized gene - count and move on
                    skip_counter += 1
                    logger.debug(e)
                    continue

                models.append(model_cls(**parsed))

    if reference_data_handler.post_process_models:
        reference_data_handler.post_process_models(models)

    logger.info("Creating {} {} records".format(len(models), model_name))
    model_objects.bulk_create(models)

    logger.info("Done")
    logger.info("Loaded {} {} records from {}. Skipped {} records with unrecognized genes.".format(
        model_objects.count(), model_name, file_path, skip_counter))
    if skip_counter > 0:
        logger.info('Running ./manage.py update_gencode to update the gencode version might fix missing genes')
Ejemplo n.º 5
0
def update_gene_constraint(gene_constraint_path=None):
    """Reload the GeneConstraint table from a constraint-scores file.

    Args:
        gene_constraint_path (str): optional local constraint table path. If not specified, or the path doesn't exist,
            the table will be downloaded.

    Raises:
        CommandError: if the TranscriptInfo table hasn't been populated yet
    """
    if TranscriptInfo.objects.count() == 0:
        raise CommandError(
            "TranscriptInfo table is empty. Run './manage.py update_gencode' before running this command."
        )

    if not gene_constraint_path or not os.path.isfile(gene_constraint_path):
        gene_constraint_path = download_file(GENE_CONSTRAINT_SCORES_URL)

    logger.info("Deleting {} existing GeneConstraint records".format(
        GeneConstraint.objects.count()))
    GeneConstraint.objects.all().delete()

    constraint_records = parse_gene_constraint_table(gene_constraint_path)

    # add _rank fields: rank 0 = most constrained (highest score) for each metric
    for field in ['mis_z', 'pLI']:
        for i, record in enumerate(
                sorted(constraint_records,
                       key=lambda record: -1 * record[field])):
            record['{}_rank'.format(field)] = i

    logger.info("Creating {} GeneConstraint records".format(
        len(constraint_records)))
    skip_counter = 0
    for record in tqdm(constraint_records, unit=" records"):
        transcript_id = record["transcript_id"]
        del record["transcript_id"]

        # single .first() query instead of .exists() followed by .first()
        transcript = TranscriptInfo.objects.filter(
            transcript_id=transcript_id).first()
        if transcript is None:
            skip_counter += 1
            # logger.warn() is deprecated in favor of logger.warning()
            logger.warning((
                "transcript id '{}' not found in TranscriptInfo table. "
                "Running ./manage.py update_gencode to update the gencode version might fix this. "
                "Full record: {}").format(transcript_id, record))
            continue

        record['gene'] = transcript.gene
        GeneConstraint.objects.create(**record)

    logger.info("Done")
    logger.info(
        "Loaded {} GeneConstraint records from {}. Skipped {} records with unrecognized transcript id."
        .format(GeneConstraint.objects.count(), gene_constraint_path,
                skip_counter))
Ejemplo n.º 6
0
    def __init__(self,
                 gtex_sample_annotations_path=None,
                 keep_existing_records=False,
                 **kwargs):
        """Initialize the handler, downloading the GTEx sample annotations if no local path is given.

        Args:
            gtex_sample_annotations_path (str): optional local path to the GTEx sample
                annotations file; downloaded from GTEX_SAMPLE_ANNOTATIONS when not provided.
            keep_existing_records (bool): if True, existing GeneExpression gene ids are
                collected so they can be skipped rather than reloaded.
        """
        if not gtex_sample_annotations_path:
            gtex_sample_annotations_path = download_file(
                GTEX_SAMPLE_ANNOTATIONS)
        # maps sample ids to tissue types, used when parsing the expression matrix
        self.tissue_type_map = _get_tissue_type_map(
            gtex_sample_annotations_path)
        self.tissues_by_columns = None
        self.keep_existing_records = keep_existing_records
        if keep_existing_records:
            self.existing_gtex_gene_ids = {
                ge.gene.gene_id
                for ge in GeneExpression.objects.all().only(
                    'gene').prefetch_related('gene')
            }
        else:
            # use set() rather than {} so the empty case has the same type (set,
            # not dict) as the populated set comprehension above
            self.existing_gtex_gene_ids = set()

        super(GtexReferenceDataHandler, self).__init__()
Ejemplo n.º 7
0
def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, reset=False):
    """Update GeneInfo and TranscriptInfo tables.

    Args:
        gencode_release (int): the gencode release to load (eg. 25)
        gencode_gtf_path (str): optional local file path of gencode GTF file. If not provided, it will be downloaded.
        genome_version (str): '37' or '38'. Required only if gencode_gtf_path is specified.
        reset (bool): If True, all records will be deleted from GeneInfo and TranscriptInfo before loading the new data.
            Setting this to False can be useful to sequentially load more than one gencode release so that data in the
            tables represents the union of multiple gencode releases.
    """
    if gencode_gtf_path and genome_version and os.path.isfile(gencode_gtf_path):
        # validate that the given local file is consistent with the release/genome version
        if gencode_release == 19 and genome_version != GENOME_VERSION_GRCh37:
            raise CommandError("Invalid genome_version: {}. gencode v19 only has a GRCh37 version".format(genome_version))
        elif 19 < gencode_release <= 22 and genome_version != GENOME_VERSION_GRCh38:
            # the "19 <" lower bound is required: without it a *valid* (v19, GRCh37)
            # combination would fall through to this branch and raise
            raise CommandError("Invalid genome_version: {}. gencode v20, v21, v22 only have a GRCh38 version".format(genome_version))
        elif gencode_release > 22 and genome_version != GENOME_VERSION_GRCh38 and "lift" not in gencode_gtf_path.lower():
            # the lift-over requirement only applies to v23+ GRCh37 files
            raise CommandError("Invalid genome_version for file: {}. gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38".format(gencode_gtf_path))

        gencode_gtf_paths = {genome_version: gencode_gtf_path}
    elif gencode_gtf_path and not genome_version:
        raise CommandError("The genome version must also be specified after the gencode GTF file path")
    else:
        # no usable local file - download the GTF(s) appropriate for this release
        if gencode_release == 19:
            urls = [('37', GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        elif gencode_release <= 22:
            urls = [('38', GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        else:
            urls = [
                ('37', GENCODE_LIFT37_GTF_URL.format(gencode_release=gencode_release)),
                ('38', GENCODE_GTF_URL.format(gencode_release=gencode_release)),
            ]
        gencode_gtf_paths = {}
        for genome_version, url in urls:
            local_filename = download_file(url)
            gencode_gtf_paths.update({genome_version: local_filename})

    if reset:
        logger.info("Dropping the {} existing TranscriptInfo entries".format(TranscriptInfo.objects.count()))
        TranscriptInfo.objects.all().delete()
        logger.info("Dropping the {} existing GeneInfo entries".format(GeneInfo.objects.count()))
        GeneInfo.objects.all().delete()

    # ids already loaded (possibly from a previous release) are skipped below
    existing_gene_ids = {gene.gene_id for gene in GeneInfo.objects.all().only('gene_id')}
    existing_transcript_ids = {
        transcript.transcript_id for transcript in TranscriptInfo.objects.all().only('transcript_id')
    }

    counters = collections.defaultdict(int)
    new_genes = collections.defaultdict(dict)
    new_transcripts = collections.defaultdict(dict)

    for genome_version, gencode_gtf_path in gencode_gtf_paths.items():
        coding_region_size_field_name = "coding_region_size_grch{}".format(genome_version)

        logger.info("Loading {} (genome version: {})".format(gencode_gtf_path, genome_version))
        with gzip.open(gencode_gtf_path, 'rt') as gencode_file:

            for i, line in enumerate(tqdm(gencode_file, unit=' gencode records')):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')

                if len(fields) != len(GENCODE_FILE_HEADER):
                    raise ValueError("Unexpected number of fields on line #%s: %s" % (i, fields))

                record = dict(zip(GENCODE_FILE_HEADER, fields))

                if record['feature_type'] not in ('gene', 'transcript', 'CDS'):
                    continue

                # parse the semicolon-delimited key "value" pairs in the GTF info column
                info_fields = [x.strip().split() for x in record['info'].split(';') if x != '']
                info_fields = {k: v.strip('"') for k, v in info_fields}
                record.update(info_fields)

                # strip gencode version suffixes (eg. ENSG00000223972.5 => ENSG00000223972)
                record['gene_id'] = record['gene_id'].split('.')[0]
                if 'transcript_id' in record:
                    record['transcript_id'] = record['transcript_id'].split('.')[0]
                record['chrom'] = record['chrom'].replace("chr", "").upper()
                record['start'] = int(record['start'])
                record['end'] = int(record['end'])

                if len(record["chrom"]) > 2:
                    continue  # skip super-contigs

                if record['feature_type'] == 'gene':
                    if record["gene_id"] in existing_gene_ids:
                        counters["genes_skipped"] += 1
                        continue

                    new_genes[record['gene_id']].update({
                        "gene_id": record["gene_id"],
                        "gene_symbol": record["gene_name"],

                        "chrom_grch{}".format(genome_version): record["chrom"],
                        "start_grch{}".format(genome_version): record["start"],
                        "end_grch{}".format(genome_version): record["end"],
                        "strand_grch{}".format(genome_version): record["strand"],

                        "gencode_gene_type": record["gene_type"],
                        "gencode_release": int(gencode_release),
                    })

                elif record['feature_type'] == 'transcript':
                    if record["transcript_id"] in existing_transcript_ids:
                        counters["transcripts_skipped"] += 1
                        continue

                    new_transcripts[record['transcript_id']].update({
                        "gene_id": record["gene_id"],
                        "transcript_id": record["transcript_id"],
                        "chrom_grch{}".format(genome_version): record["chrom"],
                        "start_grch{}".format(genome_version): record["start"],
                        "end_grch{}".format(genome_version): record["end"],
                        "strand_grch{}".format(genome_version): record["strand"],
                    })

                elif record['feature_type'] == 'CDS':
                    if record["transcript_id"] in existing_transcript_ids:
                        continue

                    # add + 1 because GTF has 1-based coords. (https://useast.ensembl.org/info/website/upload/gff.html)
                    transcript_size = record["end"] - record["start"] + 1
                    transcript_size += new_transcripts[record['transcript_id']].get(coding_region_size_field_name, 0)
                    new_transcripts[record['transcript_id']][coding_region_size_field_name] = transcript_size

                    # a gene's coding_region_size is the max over its transcripts
                    if record['gene_id'] not in existing_gene_ids and \
                            transcript_size > new_genes[record['gene_id']].get(coding_region_size_field_name, 0):
                        new_genes[record['gene_id']][coding_region_size_field_name] = transcript_size

    logger.info('Creating {} GeneInfo records'.format(len(new_genes)))
    counters["genes_created"] = len(new_genes)
    GeneInfo.objects.bulk_create([GeneInfo(**record) for record in new_genes.values()])
    gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.all().only('gene_id')}

    logger.info('Creating {} TranscriptInfo records'.format(len(new_transcripts)))
    counters["transcripts_created"] = len(new_transcripts)
    TranscriptInfo.objects.bulk_create([
        TranscriptInfo(gene=gene_id_to_gene_info[record.pop('gene_id')], **record) for record in new_transcripts.values()
    ], batch_size=50000)

    logger.info("Done")
    logger.info("Stats: ")
    for k, v in counters.items():
        logger.info("  %s: %s" % (k, v))
Ejemplo n.º 8
0
def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, reset=False):
    """Update GeneInfo and TranscriptInfo tables.

    Args:
        gencode_release (int): the gencode release to load (eg. 25)
        gencode_gtf_path (str): optional local file path of gencode GTF file. If not provided, it will be downloaded.
        genome_version (str): '37' or '38'. Required only if gencode_gtf_path is specified.
        reset (bool): If True, all records will be deleted from GeneInfo and TranscriptInfo before loading the new data.
            Setting this to False can be useful to sequentially load more than one gencode release so that data in the
            tables represents the union of multiple gencode releases.
    """
    if gencode_gtf_path and genome_version and os.path.isfile(gencode_gtf_path):
        # validate that the given local file is consistent with the release/genome version
        if gencode_release == 19 and genome_version != GENOME_VERSION_GRCh37:
            raise CommandError("Invalid genome_version: {}. gencode v19 only has a GRCh37 version".format(genome_version))
        elif 19 < gencode_release <= 22 and genome_version != GENOME_VERSION_GRCh38:
            # the "19 <" lower bound is required: without it a *valid* (v19, GRCh37)
            # combination would fall through to this branch and raise
            raise CommandError("Invalid genome_version: {}. gencode v20, v21, v22 only have a GRCh38 version".format(genome_version))
        elif gencode_release > 22 and genome_version != GENOME_VERSION_GRCh38 and "lift" not in gencode_gtf_path.lower():
            # the lift-over requirement only applies to v23+ GRCh37 files
            raise CommandError("Invalid genome_version for file: {}. gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38".format(gencode_gtf_path))

        gencode_gtf_paths = {genome_version: gencode_gtf_path}
    elif gencode_gtf_path and not genome_version:
        raise CommandError("The genome version must also be specified after the gencode GTF file path")
    else:
        # no usable local file - download the GTF(s) appropriate for this release
        if gencode_release == 19:
            urls = [('37', GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        elif gencode_release <= 22:
            urls = [('38', GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        else:
            urls = [
                ('37', GENCODE_LIFT37_GTF_URL.format(gencode_release=gencode_release)),
                ('38', GENCODE_GTF_URL.format(gencode_release=gencode_release)),
            ]
        gencode_gtf_paths = {}
        for genome_version, url in urls:
            local_filename = download_file(url)
            gencode_gtf_paths.update({genome_version: local_filename})

    if reset:
        logger.info("Dropping the {} existing TranscriptInfo entries".format(TranscriptInfo.objects.count()))
        TranscriptInfo.objects.all().delete()
        logger.info("Dropping the {} existing GeneInfo entries".format(GeneInfo.objects.count()))
        GeneInfo.objects.all().delete()

    # ids already loaded (possibly from a previous release) are skipped below
    existing_gene_ids = {gene.gene_id for gene in GeneInfo.objects.all().only('gene_id')}
    existing_transcript_ids = {
        transcript.transcript_id for transcript in TranscriptInfo.objects.all().only('transcript_id')
    }

    counters = collections.defaultdict(int)
    new_genes = collections.defaultdict(dict)
    new_transcripts = collections.defaultdict(dict)

    for genome_version, gencode_gtf_path in gencode_gtf_paths.items():
        coding_region_size_field_name = "coding_region_size_grch{}".format(genome_version)

        logger.info("Loading {} (genome version: {})".format(gencode_gtf_path, genome_version))
        # open in text mode ('rt'): without it gzip.open yields bytes on Python 3
        # and line.rstrip('\r\n') below raises TypeError
        with gzip.open(gencode_gtf_path, 'rt') as gencode_file:

            for i, line in enumerate(tqdm(gencode_file, unit=' gencode records')):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                fields = line.split('\t')

                if len(fields) != len(GENCODE_FILE_HEADER):
                    raise ValueError("Unexpected number of fields on line #%s: %s" % (i, fields))

                record = dict(zip(GENCODE_FILE_HEADER, fields))

                if record['feature_type'] not in ('gene', 'transcript', 'CDS'):
                    continue

                # parse the semicolon-delimited key "value" pairs in the GTF info column
                info_fields = [x.strip().split() for x in record['info'].split(';') if x != '']
                info_fields = {k: v.strip('"') for k, v in info_fields}
                record.update(info_fields)

                # strip gencode version suffixes (eg. ENSG00000223972.5 => ENSG00000223972)
                record['gene_id'] = record['gene_id'].split('.')[0]
                if 'transcript_id' in record:
                    record['transcript_id'] = record['transcript_id'].split('.')[0]
                record['chrom'] = record['chrom'].replace("chr", "").upper()
                record['start'] = int(record['start'])
                record['end'] = int(record['end'])

                if len(record["chrom"]) > 2:
                    continue  # skip super-contigs

                if record['feature_type'] == 'gene':
                    if record["gene_id"] in existing_gene_ids:
                        counters["genes_skipped"] += 1
                        continue

                    new_genes[record['gene_id']].update({
                        "gene_id": record["gene_id"],
                        "gene_symbol": record["gene_name"],

                        "chrom_grch{}".format(genome_version): record["chrom"],
                        "start_grch{}".format(genome_version): record["start"],
                        "end_grch{}".format(genome_version): record["end"],
                        "strand_grch{}".format(genome_version): record["strand"],

                        "gencode_gene_type": record["gene_type"],
                        "gencode_release": int(gencode_release),
                    })

                elif record['feature_type'] == 'transcript':
                    if record["transcript_id"] in existing_transcript_ids:
                        counters["transcripts_skipped"] += 1
                        continue

                    new_transcripts[record['transcript_id']].update({
                        "gene_id": record["gene_id"],
                        "transcript_id": record["transcript_id"],
                        "chrom_grch{}".format(genome_version): record["chrom"],
                        "start_grch{}".format(genome_version): record["start"],
                        "end_grch{}".format(genome_version): record["end"],
                        "strand_grch{}".format(genome_version): record["strand"],
                    })

                elif record['feature_type'] == 'CDS':
                    if record["transcript_id"] in existing_transcript_ids:
                        continue

                    # add + 1 because GTF has 1-based coords. (https://useast.ensembl.org/info/website/upload/gff.html)
                    transcript_size = record["end"] - record["start"] + 1
                    transcript_size += new_transcripts[record['transcript_id']].get(coding_region_size_field_name, 0)
                    new_transcripts[record['transcript_id']][coding_region_size_field_name] = transcript_size

                    # a gene's coding_region_size is the max over its transcripts
                    if record['gene_id'] not in existing_gene_ids and \
                            transcript_size > new_genes[record['gene_id']].get(coding_region_size_field_name, 0):
                        new_genes[record['gene_id']][coding_region_size_field_name] = transcript_size

    logger.info('Creating {} GeneInfo records'.format(len(new_genes)))
    counters["genes_created"] = len(new_genes)
    GeneInfo.objects.bulk_create([GeneInfo(**record) for record in new_genes.values()])
    gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.all().only('gene_id')}

    logger.info('Creating {} TranscriptInfo records'.format(len(new_transcripts)))
    counters["transcripts_created"] = len(new_transcripts)
    TranscriptInfo.objects.bulk_create([
        TranscriptInfo(gene=gene_id_to_gene_info[record.pop('gene_id')], **record) for record in new_transcripts.values()
    ], batch_size=50000)

    logger.info("Done")
    logger.info("Stats: ")
    for k, v in counters.items():
        logger.info("  %s: %s" % (k, v))
Ejemplo n.º 9
0
def update_gencode(gencode_release,
                   gencode_gtf_path=None,
                   genome_version=None,
                   reset=False):
    """Update GeneInfo and TranscriptInfo tables.

    Args:
        gencode_release (int): the gencode release to load (eg. 25)
        gencode_gtf_path (str): optional local file path of gencode GTF file. If not provided, it will be downloaded.
        genome_version (str): '37' or '38'. Required only if gencode_gtf_path is specified.
        reset (bool): If True, all records will be deleted from GeneInfo and TranscriptInfo before loading the new data.
            Setting this to False can be useful to sequentially load more than one gencode release so that data in the
            tables represents the union of multiple gencode releases.
    """
    if gencode_gtf_path and genome_version and os.path.isfile(
            gencode_gtf_path):
        # validate that the given local file is consistent with the release/genome version
        if gencode_release == 19 and genome_version != GENOME_VERSION_GRCh37:
            raise CommandError(
                "Invalid genome_version: {}. gencode v19 only has a GRCh37 version"
                .format(genome_version))
        elif 19 < gencode_release <= 22 and genome_version != GENOME_VERSION_GRCh38:
            # the "19 <" lower bound is required: without it a *valid* (v19, GRCh37)
            # combination would fall through to this branch and raise
            raise CommandError(
                "Invalid genome_version: {}. gencode v20, v21, v22 only have a GRCh38 version"
                .format(genome_version))
        elif gencode_release > 22 and genome_version != GENOME_VERSION_GRCh38 \
                and "lift" not in gencode_gtf_path.lower():
            # the lift-over requirement only applies to v23+ GRCh37 files
            raise CommandError(
                "Invalid genome_version for file: {}. gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38"
                .format(gencode_gtf_path))

        gencode_gtf_paths = {genome_version: gencode_gtf_path}
    elif gencode_gtf_path and not genome_version:
        raise CommandError(
            "The genome version must also be specified after the gencode GTF file path"
        )
    else:
        # no usable local file - download the GTF(s) appropriate for this release
        if gencode_release == 19:
            urls = [('37',
                     GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        elif gencode_release <= 22:
            urls = [('38',
                     GENCODE_GTF_URL.format(gencode_release=gencode_release))]
        else:
            urls = [
                ('37',
                 GENCODE_LIFT37_GTF_URL.format(
                     gencode_release=gencode_release)),
                ('38',
                 GENCODE_GTF_URL.format(gencode_release=gencode_release)),
            ]
        gencode_gtf_paths = {}
        for genome_version, url in urls:
            local_filename = download_file(url)
            gencode_gtf_paths.update({genome_version: local_filename})

    if reset:
        logger.info("Dropping the {} existing TranscriptInfo entries".format(
            TranscriptInfo.objects.count()))
        TranscriptInfo.objects.all().delete()
        logger.info("Dropping the {} existing GeneInfo entries".format(
            GeneInfo.objects.count()))
        GeneInfo.objects.all().delete()

    # delegate the per-file parsing/loading to the shared helper
    for genome_version, gencode_gtf_path in gencode_gtf_paths.items():
        load_gencode_gtf_file(gencode_gtf_path, genome_version,
                              gencode_release)
Ejemplo n.º 10
0
def update_dbnsfp_gene(dbnsfp_gene_table_path=None):
    """Reload the dbNSFPGene table from a dbNSFP gene table file.

    Drops all existing dbNSFPGene records, parses the tab-delimited dbNSFP
    gene table, links each row to its GeneInfo record by Ensembl gene id, and
    bulk-inserts the result.

    Args:
        dbnsfp_gene_table_path (str): optional local dbNSFP gene file path. If not specified,
            or the path doesn't exist, the table will be downloaded.

    Raises:
        CommandError: if the GeneInfo table is empty ('./manage.py update_gencode'
            must be run first so rows can be linked to genes).
    """

    if GeneInfo.objects.count() == 0:
        raise CommandError(
            "GeneInfo table is empty. Run './manage.py update_gencode' before running this command."
        )

    if not dbnsfp_gene_table_path or not os.path.isfile(dbnsfp_gene_table_path):
        dbnsfp_gene_table_path = download_file(DBNSFP_GENE_URL)

    # Only the gene_id column is needed for the lookup, so defer everything else.
    gene_id_to_gene_info = {
        g.gene_id: g
        for g in GeneInfo.objects.all().only('gene_id')
    }

    # Maps dbNSFPGene model field name -> dbNSFP gene table column name.
    # Column semantics are described in the dbNSFP_gene schema README:
    # https://drive.google.com/file/d/0B60wROKy6OqcNGJ2STJlMTJONk0/view
    field_to_column = [
        ('uniprot_acc', 'Uniprot_acc'),
        ('uniprot_id', 'Uniprot_id'),
        ('entrez_gene_id', 'Entrez_gene_id'),
        ('ccds_id', 'CCDS_id'),
        ('refseq_id', 'Refseq_id'),
        ('ucsc_id', 'ucsc_id'),
        ('pathway_uniprot', 'Pathway(Uniprot)'),
        ('pathway_biocarta_short', 'Pathway(BioCarta)_short'),  # short BioCarta pathway name(s)
        ('pathway_biocarta_full', 'Pathway(BioCarta)_full'),  # full BioCarta pathway name(s)
        ('pathway_consensus_path_db', 'Pathway(ConsensusPathDB)'),
        ('pathway_kegg_id', 'Pathway(KEGG)_id'),
        ('pathway_kegg_full', 'Pathway(KEGG)_full'),
        ('trait_association_gwas', 'Trait_association(GWAS)'),  # from GWAS catalog
        ('go_biological_process', 'GO_biological_process'),
        ('go_cellular_component', 'GO_cellular_component'),
        ('go_molecular_function', 'GO_molecular_function'),
        ('tissue_specificity', 'Tissue_specificity(Uniprot)'),
        ('expression_egenetics', 'Expression(egenetics)'),  # egenetics data from BioMart
        ('expression_gnf_atlas', 'Expression(GNF/Atlas)'),  # GNF/Atlas data from BioMart
        ('rvis_exac', 'RVIS_ExAC'),
        ('ghis', 'GHIS'),
        ('essential_gene', 'Essential_gene'),  # "E" essential / "N" non-essential phenotype-changing (MGI-based)
        ('mgi_mouse_gene', 'MGI_mouse_gene'),  # homolog mouse gene name from MGI
        ('mgi_mouse_phenotype', 'MGI_mouse_phenotype'),
        ('zebrafish_gene', 'ZFIN_zebrafish_gene'),  # homolog zebrafish gene name from ZFIN
        ('zebrafish_structure', 'ZFIN_zebrafish_structure'),
        ('zebrafish_phenotype_quality', 'ZFIN_zebrafish_phenotype_quality'),
        ('zebrafish_phenotype_tag', 'ZFIN_zebrafish_phenotype_tag'),
    ]

    records = []
    with open(dbnsfp_gene_table_path) as f:
        header = next(f).rstrip('\r\n').split('\t')
        logger.info("Header: ")
        logger.info(", ".join(header))
        logger.info(
            "Parsing gene records from {}".format(dbnsfp_gene_table_path))

        dbNSFPGene.objects.all().delete()

        for line in tqdm(f, unit=' genes'):
            fields = line.rstrip('\r\n').split('\t')
            # dbNSFP uses '.' as its missing-value marker; store empty string instead.
            fields = [field if field != '.' else '' for field in fields]

            fields = dict(zip(header, fields))

            gene_id = fields['Ensembl_gene']
            if not gene_id:
                continue

            gene = gene_id_to_gene_info.get(gene_id)
            if not gene:
                logger.warning((
                    "dbNSFP gene id '{}' not found in GeneInfo table. "
                    "Running ./manage.py update_gencode to update the gencode version might fix this. "
                    "Full dbNSFP record: {}").format(gene_id, fields))
                continue

            record = {
                model_field: fields[column]
                for model_field, column in field_to_column
            }
            record['gene'] = gene
            # Uniprot prefixes each function description with "FUNCTION: "; strip it.
            record['function_desc'] = fields['Function_description'].replace(
                "FUNCTION: ", "")
            # NOTE(review): stripping "FUNCTION: " here looks copy-pasted from the line
            # above -- dbNSFP disease entries are conventionally prefixed "DISEASE:".
            # Behavior kept unchanged; confirm against the dbNSFP schema before changing.
            record['disease_desc'] = fields['Disease_description'].replace(
                "FUNCTION: ", "")
            records.append(record)

    logger.info("Parsed {} records. Inserting them into dbNSFPGene".format(
        len(records)))

    # Generator keeps memory flat; batch_size avoids one giant INSERT.
    dbNSFPGene.objects.bulk_create(
        (dbNSFPGene(**record) for record in tqdm(records, unit=' genes')),
        batch_size=1000)

    logger.info("Done loading {} records into dbNSFPGene".format(
        dbNSFPGene.objects.count()))