Exemple #1
0
    def __init__(self,
                 database: Database,
                 host='ftp.ncbi.nlm.nih.gov',
                 data_dir='gene/DATA/',
                 src_data_dir=APP_ROOT / 'data' / 'ncbi'):
        """Set up the NCBI ETL object.

        :param Database database: gene database that new data is added to
        :param str host: name of the FTP host
        :param str data_dir: directory on the FTP host to use
        :param Path src_data_dir: data directory for NCBI files
        """
        super().__init__(database, host, data_dir, src_data_dir)

        # Source bookkeeping: today's date is formatted as YYYYMMDD, and the
        # assembly starts out unset.
        self._date_today = datetime.today().strftime('%Y%m%d')
        self._data_url = f"ftp://{host}"
        self._assembly = None

        # Location helper objects
        self._sequence_location = SequenceLocation()
        self._chromosome_location = ChromosomeLocation()
Exemple #2
0
    def __init__(self,
                 database: Database,
                 host='ftp.ebi.ac.uk',
                 data_dir='pub/databases/genenames/hgnc/json/',
                 src_data_dir=APP_ROOT / 'data' / 'hgnc',
                 fn='hgnc_complete_set.json'):
        """Set up the HGNC ETL object.

        :param Database database: DynamoDB database
        :param str host: name of the FTP host
        :param str data_dir: directory on the FTP host to use
        :param Path src_data_dir: data directory for HGNC files
        :param str fn: name of the data file to download
        """
        super().__init__(database, host, data_dir, src_data_dir)

        # Remote file details; the version starts out unset.
        self._fn = fn
        self._data_url = f"ftp://{host}/{data_dir}{fn}"
        self._version = None

        # Location helper object
        self._chromosome_location = ChromosomeLocation()
Exemple #3
0
class NCBI(Base):
    """ETL class for NCBI source"""
    def __init__(self,
                 database: Database,
                 host='ftp.ncbi.nlm.nih.gov',
                 data_dir='gene/DATA/',
                 src_data_dir=APP_ROOT / 'data' / 'ncbi'):
        """Set up the NCBI ETL object.

        :param Database database: gene database that new data is added to
        :param str host: name of the FTP host
        :param str data_dir: directory on the FTP host to use
        :param Path src_data_dir: data directory for NCBI files
        """
        super().__init__(database, host, data_dir, src_data_dir)

        # Source bookkeeping: today's date is formatted as YYYYMMDD; the
        # assembly stays None until _download_gff finds a matching GFF file.
        self._date_today = datetime.today().strftime('%Y%m%d')
        self._data_url = f"ftp://{host}"
        self._assembly = None

        # Location helper objects
        self._sequence_location = SequenceLocation()
        self._chromosome_location = ChromosomeLocation()

    def perform_etl(self):
        """Perform ETL methods.

        :return: Concept IDs of concepts successfully loaded
        """
        self._extract_data()
        self._transform_data()
        self._database.flush_batch()
        return self._processed_ids

    def _download_data(self):
        """Download the NCBI gene info, gene history, and GFF files.

        The info and history downloads are delegated to ``_ftp_download``
        with a local file name stamped with today's date; the GFF download is
        delegated to ``_download_gff``.
        """
        # Gene info sits in a subdirectory of the configured FTP data dir
        info_dir = f'{self._data_dir}GENE_INFO/Mammalia/'
        info_fn = f'ncbi_info_{self._date_today}.tsv'
        logger.info('Downloading NCBI gene_info....')
        self._ftp_download(self._host, info_dir, info_fn, self.src_data_dir,
                           'Homo_sapiens.gene_info.gz')
        logger.info('Successfully downloaded NCBI gene_info.')

        # Gene history sits directly in the configured FTP data dir
        history_fn = f'ncbi_history_{self._date_today}.tsv'
        logger.info('Downloading NCBI gene_history...')
        self._ftp_download(self._host, self._data_dir, history_fn,
                           self.src_data_dir, 'gene_history.gz')
        logger.info('Successfully downloaded NCBI gene_history.')

        # GFF annotation file
        self._download_gff()

    def _download_gff(self) -> None:
        """Download the GFF file found under ``latest_assembly_versions``.

        Logs in to the FTP host, enters the first entry listed in the
        ``latest_assembly_versions`` directory, and stops at the first file
        whose name matches the genomic GFF pattern. The assembly name
        captured from that file name is stored in ``self._assembly``. The
        file is fetched only when ``ncbi_<assembly>.gff`` does not already
        exist in ``self.src_data_dir``.
        """
        gff_name = re.compile(
            r"GCF_\d+\.\d+_(?P<assembly>GRCh\d+\.\S+)_genomic.gff.gz")
        with FTP(self._host) as ftp:
            ftp.login()
            ftp.cwd("genomes/refseq/vertebrate_mammalian/Homo_sapiens/"
                    "latest_assembly_versions")
            assembly_dir = ftp.nlst()[0]
            ftp.cwd(assembly_dir)
            for remote_fn in ftp.nlst():
                found = gff_name.match(remote_fn)
                if not found:
                    continue
                self._assembly = found.groupdict()["assembly"]
                local_fn = f"ncbi_{self._assembly}.gff"
                if (self.src_data_dir / local_fn).exists():
                    logger.info(f"NCBI {remote_fn} already exists.")
                else:
                    self._ftp_download_file(ftp, remote_fn,
                                            self.src_data_dir, local_fn)
                    logger.info(
                        f"Successfully downloaded NCBI {remote_fn} data.")
                # Only the first matching file is considered
                break

    def _files_downloaded(self, data_dir: Path) -> bool:
        """Check whether needed source files exist.

        :param Path data_dir: source data directory
        :return: true if all needed files exist, false otherwise
        """
        files = data_dir.iterdir()

        info_downloaded: bool = False
        history_downloaded: bool = False
        gff_downloaded: bool = False

        for f in files:
            if f.name.startswith(f'ncbi_info_{self._date_today}'):
                info_downloaded = True
            elif f.name.startswith(f'ncbi_history_{self._date_today}'):
                history_downloaded = True
            elif f.name.startswith('ncbi_GRCh38.p13'):
                gff_downloaded = True
        return info_downloaded and history_downloaded and gff_downloaded

    def _extract_data(self):
        """Gather data from local files or download from source.
        - Data is expected to be in <PROJECT ROOT>/data/ncbi.
        - For now, data files should all be from the same source data version.
        """
        self._create_data_directory()
        if not self._files_downloaded(self.src_data_dir):
            self._download_data()
        local_files = [
            f for f in self.src_data_dir.iterdir() if f.name.startswith('ncbi')
        ]
        local_files.sort(key=lambda f: f.name.split('_')[-1], reverse=True)
        self._info_src = [
            f for f in local_files if f.name.startswith('ncbi_info')
        ][0]
        self._history_src = [
            f for f in local_files if f.name.startswith('ncbi_history')
        ][0]
        self._gff_src = [
            f for f in local_files if f.name.startswith('ncbi_GRCh')
        ][0]
        self._version = self._info_src.stem.split('_')[-1]

    def _get_prev_symbols(self):
        """Store a gene's symbol history.

        :return: A dictionary of a gene's previous symbols
        """
        # get symbol history
        history_file = open(self._history_src, 'r')
        history = csv.reader(history_file, delimiter='\t')
        next(history)
        prev_symbols = {}
        with self._database.genes.batch_writer() as batch:
            for row in history:
                # Only interested in rows that have h**o sapiens tax id
                if row[0] == '9606':
                    if row[1] != '-':
                        gene_id = row[1]
                        if gene_id in prev_symbols.keys():
                            prev_symbols[gene_id].append(row[3])
                        else:
                            prev_symbols[gene_id] = [row[3]]
                    else:
                        # Load discontinued genes
                        params = {
                            'concept_id':
                            f'{NamespacePrefix.NCBI.value.lower()}:'
                            f'{row[2]}',
                            'symbol': row[3],
                            'symbol_status': SymbolStatus.DISCONTINUED.value
                        }
                        self._load_gene(params, batch)
        history_file.close()
        return prev_symbols

    def _add_xrefs_associated_with(self, val, params):
        """Sort a gene's source ids into xrefs and associated_with refs.

        Each entry is split on ``:``; the first piece (upper-cased) is the
        source name and the last piece is the source id. A ``GENEID`` entry
        sets the record's concept_id instead. Empty ``xrefs`` /
        ``associated_with`` lists are removed from the record at the end.

        :param list val: A list of source ids for a given gene
        :param dict params: A transformed gene record
        """
        xrefs = params['xrefs'] = []
        associated_with = params['associated_with'] = []

        for src in val:
            pieces = src.split(':')
            src_name = pieces[0].upper()
            src_id = pieces[-1]

            if src_name == "GENEID":
                params['concept_id'] = f"{NamespacePrefix.NCBI.value}:{src_id}"
                continue

            if src_name in NamespacePrefix.__members__ and \
                    NamespacePrefix[src_name].value in PREFIX_LOOKUP:
                xrefs.append(f"{NamespacePrefix[src_name].value}:{src_id}")
                continue

            # Source names that do not map directly onto a member name
            if src_name.startswith("MIM"):
                prefix = NamespacePrefix.OMIM.value
            elif src_name.startswith("IMGT/GENE-DB"):
                prefix = NamespacePrefix.IMGT_GENE_DB.value
            elif src_name.startswith("MIRBASE"):
                prefix = NamespacePrefix.MIRBASE.value
            else:
                prefix = None

            if prefix:
                associated_with.append(f"{prefix}:{src_id}")
            else:
                logger.info(f"{src_name} is not in NameSpacePrefix.")

        if not xrefs:
            del params['xrefs']
        if not associated_with:
            del params['associated_with']

    def _get_gene_info(self, prev_symbols):
        """Build transformed gene records from the NCBI info file.

        Reads the tab-delimited file at ``self._info_src`` and skips its
        header row. Rows for which ``_get_vrs_chr_location`` reports
        ``'exclude'`` are skipped.

        :param dict prev_symbols: A dictionary of a gene's previous symbols
        :return: A dictionary of transformed gene records keyed by symbol
        """
        info_genes = dict()
        # The with-statement closes the info file on every exit path; the
        # file was previously left open.
        with open(self._info_src, 'r') as info_file:
            info = csv.reader(info_file, delimiter='\t')
            next(info, None)  # skip headers; tolerate an empty file
            for row in info:
                if not row:
                    # csv.reader yields an empty list for blank lines
                    continue
                params = dict()
                params['concept_id'] = f"{NamespacePrefix.NCBI.value}:{row[1]}"
                # get symbol
                params['symbol'] = row[2]
                # get aliases
                if row[4] != '-':
                    params['aliases'] = row[4].split('|')
                else:
                    params['aliases'] = []
                # get xrefs / associated_with
                if row[5] != '-':
                    self._add_xrefs_associated_with(row[5].split('|'), params)
                # get chromosome location
                vrs_chr_location = self._get_vrs_chr_location(row, params)
                if 'exclude' in vrs_chr_location:
                    # Exclude genes with multiple distinct locations (e.g. OMS)
                    continue
                params['locations'] = vrs_chr_location or []
                # get label
                if row[8] != '-':
                    params['label'] = row[8]
                # add prev symbols
                if row[1] in prev_symbols:
                    params['previous_symbols'] = prev_symbols[row[1]]
                # get type
                params['gene_type'] = row[9]
                info_genes[params['symbol']] = params
        return info_genes

    def _get_gene_gff(self, db, info_genes, sr):
        """Merge gene features from the NCBI gff file into ``info_genes``.

        Only features whose first ``ID`` attribute starts with ``gene`` are
        used. A gene already present in ``info_genes`` (looked up by its
        ``Name`` attribute) only gains a sequence location; otherwise a new
        record is built from the feature and added under its symbol.

        :param FeatureDB db: GFF database
        :param dict info_genes: A dictionary of gene's from the NCBI info file.
        :param SeqRepo sr: Access to the seqrepo
        """
        for feature in db.all_features():
            ids = feature.attributes.get('ID')
            if not ids:
                continue
            f_id = ids[0]
            if not f_id.startswith('gene'):
                continue

            symbol = feature.attributes['Name'][0]
            if symbol not in info_genes:
                # Need to add entire gene
                gene = self._add_gff_gene(db, feature, sr, f_id)
                info_genes[gene['symbol']] = gene
                continue

            # Just need to add SequenceLocation
            params = info_genes.get(symbol)
            vrs_sq_location = self._get_vrs_sq_location(db, sr, params, f_id)
            if vrs_sq_location:
                params['locations'].append(vrs_sq_location)

    def _add_gff_gene(self, db, f, sr, f_id):
        """Create a transformed gene record from a gff gene feature.

        The record's ``label_and_type`` is derived from ``concept_id``, so
        ``_add_attributes`` must have set a concept_id for the feature.

        :param FeatureDB db: GFF database
        :param Feature f: A gene from the gff data file
        :param SeqRepo sr: Access to the seqrepo
        :param str f_id: The feature's ID
        :return: A gene dictionary
        """
        params = {'src_name': SourceName.NCBI.value}
        self._add_attributes(f, params)

        sq_loc = self._get_vrs_sq_location(db, sr, params, f_id)
        params['locations'] = [sq_loc] if sq_loc else list()

        concept_id = params['concept_id'].lower()
        params['label_and_type'] = f"{concept_id}##identity"
        return params

    def _add_attributes(self, f, gene):
        """Add concept_id, symbol, and xrefs/associated_with to a gene record.

        :param gffutils.feature.Feature f: A gene from the data
        :param gene: A transformed gene record
        """
        attributes = ['ID', 'Name', 'description', 'Dbxref']

        for attribute in f.attributes.items():
            key = attribute[0]
            if key in attributes:
                val = attribute[1]

                if len(val) == 1 and key != 'Dbxref':
                    val = val[0]

                if key == 'Dbxref':
                    self._add_xrefs_associated_with(val, gene)
                elif key == 'Name':
                    gene['symbol'] = val

    def _get_vrs_sq_location(self, db, sr, params, f_id):
        """Store GA4GH VRS SequenceLocation in a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation

        :param FeatureDB db: GFF database
        :param SeqRepo sr: Access to the seqrepo
        :param dict params: A transformed gene record
        :param str f_id: The feature's ID
        :return: A GA4GH VRS SequenceLocation
        """
        gene = db[f_id]
        params['strand'] = gene.strand
        return self._sequence_location.add_location(gene.seqid, gene, params,
                                                    sr)

    def _get_xref_associated_with(self, src_name, src_id):
        """Build an xref or associated_with entry for one source reference.

        HGNC and NCBI references are returned under ``xrefs``; UniProt,
        miRBase, and RFAM references under ``associated_with``. Any other
        source name yields an empty dict.

        :param str src_name: Source name
        :param src_id: The source's accession number
        :return: A dict containing an xref or associated_with ref
        """
        source = dict()
        if src_name.startswith('HGNC'):
            field, prefix = 'xrefs', NamespacePrefix.HGNC.value
        elif src_name.startswith('NCBI'):
            field, prefix = 'xrefs', NamespacePrefix.NCBI.value
        elif src_name.startswith('UniProt'):
            field, prefix = 'associated_with', NamespacePrefix.UNIPROT.value
        elif src_name.startswith('miRBase'):
            field, prefix = 'associated_with', NamespacePrefix.MIRBASE.value
        elif src_name.startswith('RFAM'):
            field, prefix = 'associated_with', NamespacePrefix.RFAM.value
        else:
            return source

        source[field] = [f"{prefix}:{src_id}"]
        return source

    def _get_vrs_chr_location(self, row, params):
        """Store GA4GH VRS ChromosomeLocation in a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#chromosomelocation

        :param list row: A row in NCBI data file
        :param dict params: A transformed gene record
        :return: A list of GA4GH VRS ChromosomeLocations
        """
        params['location_annotations'] = list()
        chromosomes_locations = self._set_chromsomes_locations(row, params)
        locations = chromosomes_locations['locations']
        chromosomes = chromosomes_locations['chromosomes']
        if chromosomes_locations['exclude']:
            return ['exclude']

        location_list = list()
        if chromosomes and not locations:
            for chromosome in chromosomes:
                if chromosome == 'MT':
                    params['location_annotations'].append(
                        Chromosome.MITOCHONDRIA.value)
                else:
                    params['location_annotations'].append(chromosome.strip())
        elif locations:
            self._add_chromosome_location(locations, location_list, params)
        if not params['location_annotations']:
            del params['location_annotations']
        return location_list

    def _set_chromsomes_locations(self, row, params):
        """Set chromosomes and locations for a given gene record.

        :param list row: A gene row in the NCBI data file
        :param dict params: A transformed gene record
        :return: A dictionary containing a gene's chromosomes and locations
        """
        chromosomes = None
        if row[6] != '-':
            if '|' in row[6]:
                chromosomes = row[6].split('|')
            else:
                chromosomes = [row[6]]

            if len(chromosomes) >= 2:
                if chromosomes and 'X' not in chromosomes and \
                        'Y' not in chromosomes:
                    logger.info(f'{row[2]} contains multiple distinct '
                                f'chromosomes: {chromosomes}.')
                    chromosomes = None

        locations = None
        exclude = False
        if row[7] != '-':
            if '|' in row[7]:
                locations = row[7].split('|')
            elif ';' in row[7]:
                locations = row[7].split(';')
            elif 'and' in row[7]:
                locations = row[7].split('and')
            else:
                locations = [row[7]]

            # Sometimes locations will store the same location twice
            if len(locations) == 2:
                if locations[0] == locations[1]:
                    locations = [locations[0]]

            # Exclude genes where there are multiple distinct locations
            # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3'
            if len(locations) > 2:
                logger.info(f'{row[2]} contains multiple distinct '
                            f'locations: {locations}.')
                locations = None
                exclude = True

            # NCBI sometimes contains invalid map locations
            if locations:
                for i in range(len(locations)):
                    loc = locations[i].strip()
                    if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc):
                        logger.info(f'{row[2]} contains invalid map location:'
                                    f'{loc}.')
                        params['location_annotations'].append(loc)
                        del locations[i]
        return {
            'locations': locations,
            'chromosomes': chromosomes,
            'exclude': exclude
        }

    def _add_chromosome_location(self, locations, location_list, params):
        """Turn NCBI map locations into chromosome locations.

        Each map location is parsed into a dict with ``chr``/``start``/``end``
        and passed to ``self._chromosome_location.get_location``; truthy
        results are appended to ``location_list``. Locations that give no
        arm and no centromere are recorded as location annotations.

        :param list locations: NCBI map locations for a gene record.
        :param list location_list: A list to store chromosome locations.
        :param dict params: A transformed gene record
        """
        for raw_loc in locations:
            loc = raw_loc.strip()
            location = dict()

            # Strip the alternate-locus marker and note it as an annotation
            alt_loc = Annotation.ALT_LOC.value
            if alt_loc in loc:
                loc = loc.split(f"{alt_loc}")[0].strip()
                params['location_annotations'].append(alt_loc)

            arm_match = re.search("[pq]", loc)
            if 'cen' in loc:
                self._set_centromere_location(loc, location)
            elif arm_match:
                arm_ix = arm_match.start()
                chromosome = loc[:arm_ix].strip()

                # NCBI sometimes stores invalid map locations
                # i.e. 7637 stores 'map from Rosati ref via FISH [AFS]'
                if not re.match("^([1-9][0-9]?|X|Y|MT)$", chromosome):
                    continue
                location['chr'] = chromosome

                if arm_ix == len(loc) - 1:
                    # Only arm is included
                    location['start'] = loc[arm_ix]
                    location['end'] = loc[arm_ix]
                elif '-' in loc:
                    # A band / sub band range is included
                    self._chromosome_location.set_interval_range(
                        loc, arm_ix, location)
                else:
                    # Location only gives start
                    band = loc[arm_ix:]
                    location['start'] = band
                    location['end'] = band
            else:
                # Location only gives chr
                params['location_annotations'].append(loc)

            chr_location = \
                self._chromosome_location.get_location(location, params)
            if chr_location:
                location_list.append(chr_location)

    def _set_centromere_location(self, loc, location):
        """Set centromere location for a gene.

        :param str loc: A gene location
        :param dict location: GA4GH location
        """
        centromere_ix = re.search("cen", loc).start()
        if '-' in loc:
            # Location gives both start and end
            range_ix = re.search('-', loc).start()
            if 'q' in loc:
                location['chr'] = loc[:centromere_ix].strip()
                location['start'] = "cen"
                location['end'] = loc[range_ix + 1:]
            elif 'p' in loc:
                p_ix = re.search("p", loc).start()
                location['chr'] = loc[:p_ix].strip()
                location['end'] = "cen"
                location['start'] = loc[:range_ix]
        else:
            location['chr'] = loc[:centromere_ix].strip()
            location['start'] = "cen"
            location['end'] = "cen"

    def _transform_data(self):
        """Transform the extracted files and hand each gene to the loader.

        Metadata is added first. Gene records are then built from the
        history and info files, extended with features from the GFF file,
        and written through a database batch writer.
        """
        logger.info('Transforming NCBI...')
        self._add_meta()
        info_genes = self._get_gene_info(self._get_prev_symbols())

        # In-memory gffutils database built from the gff file
        gff_db = gffutils.create_db(str(self._gff_src),
                                    dbfn=":memory:",
                                    force=True,
                                    merge_strategy="create_unique",
                                    keep_order=True)
        self._get_gene_gff(gff_db, info_genes, self.seqrepo)

        with self._database.genes.batch_writer() as batch:
            for record in info_genes.values():
                self._load_gene(record, batch)
        logger.info('Successfully transformed NCBI.')

    def _add_meta(self):
        """Build the NCBI source metadata and pass it to ``_load_meta``."""
        license_url = ("https://www.ncbi.nlm.nih.gov/home/"
                       "about/policies/")
        license_attributes = {
            'non_commercial': False,
            'share_alike': False,
            'attribution': False
        }
        metadata = SourceMeta(
            data_license="custom",
            data_license_url=license_url,
            version=self._version,
            data_url=self._data_url,
            rdp_url="https://reusabledata.org/ncbi-gene.html",
            data_license_attributes=license_attributes,
            genome_assemblies=[self._assembly])

        self._load_meta(self._database, metadata, SourceName.NCBI.value)
Exemple #4
0
class HGNC(Base):
    """ETL the HGNC source into the normalized database."""
    def __init__(self,
                 database: Database,
                 host='ftp.ebi.ac.uk',
                 data_dir='pub/databases/genenames/hgnc/json/',
                 src_data_dir=APP_ROOT / 'data' / 'hgnc',
                 fn='hgnc_complete_set.json'):
        """Set up the HGNC ETL object.

        :param Database database: DynamoDB database
        :param str host: name of the FTP host
        :param str data_dir: directory on the FTP host to use
        :param Path src_data_dir: data directory for HGNC files
        :param str fn: name of the data file to download
        """
        super().__init__(database, host, data_dir, src_data_dir)

        # Remote file details; the version is filled in by _download_data.
        self._fn = fn
        self._data_url = f"ftp://{host}/{data_dir}{fn}"
        self._version = None

        # Location helper object
        self._chromosome_location = ChromosomeLocation()

    def _download_data(self, *args, **kwargs):
        """Download the HGNC JSON file and rename it to include its version.

        The value returned by ``_ftp_download`` is stored as the version and
        used in the final file name ``hgnc_<version>.json``.
        """
        logger.info('Downloading HGNC data file...')
        self._create_data_directory()

        tmp_fn = 'hgnc_version.json'
        self._version = self._ftp_download(
            self._host, self._data_dir, tmp_fn, self.src_data_dir, self._fn)

        downloaded = f"{self.src_data_dir}/{tmp_fn}"
        versioned = f"{self.src_data_dir}/hgnc_{self._version}.json"
        shutil.move(downloaded, versioned)
        logger.info('Successfully downloaded HGNC data file.')

    def _extract_data(self, *args, **kwargs):
        """Extract data from the HGNC source."""
        if 'data_path' in kwargs:
            self._data_src = kwargs['data_path']
        else:
            self._data_src = sorted(list(self.src_data_dir.iterdir()))[-1]

    def _transform_data(self, *args, **kwargs):
        """Transform the HGNC JSON records and pass each to the loader.

        Records are read from ``response.docs`` of the JSON file at
        ``self._data_src``.
        """
        logger.info('Transforming HGNC...')
        with open(self._data_src, 'r') as f:
            records = json.load(f)['response']['docs']

        with self._database.genes.batch_writer() as batch:
            for r in records:
                concept_id = r['hgnc_id'].lower()
                gene = {
                    'concept_id': concept_id,
                    'label_and_type': f"{concept_id}##identity",
                    'item_type': 'identity',
                    'symbol': r['symbol'],
                    'label': r['name'],
                    'src_name': SourceName.HGNC.value,
                }

                # Any other status leaves symbol_status unset
                status = r['status']
                if status == 'Approved':
                    gene['symbol_status'] = SymbolStatus.APPROVED.value
                elif status == 'Entry Withdrawn':
                    gene['symbol_status'] = SymbolStatus.WITHDRAWN.value

                # store alias, xref, associated_with, prev_symbols, location
                self._get_aliases(r, gene)
                self._get_xrefs_associated_with(r, gene)
                if 'prev_symbol' in r:
                    self._get_previous_symbols(r, gene)
                if 'location' in r:
                    self._get_location(r, gene)
                if "locus_type" in r:
                    gene["gene_type"] = r["locus_type"]
                self._load_gene(gene, batch)
        logger.info('Successfully transformed HGNC.')

    def _get_aliases(self, r, gene):
        """Store aliases in a gene record.

        :param dict r: A gene record in the HGNC data file
        :param dict gene: A transformed gene record
        """
        alias_symbol = list()
        enzyme_id = list()
        if 'alias_symbol' in r:
            alias_symbol = r['alias_symbol']

        if 'enzyme_id' in r:
            enzyme_id = r['enzyme_id']

        if alias_symbol or enzyme_id:
            gene['aliases'] = list(set(alias_symbol + enzyme_id))

    def _get_previous_symbols(self, r, gene):
        """Store previous symbols in a gene record.

        :param dict r: A gene record in the HGNC data file
        :param dict gene: A transformed gene record
        """
        prev_symbols = r['prev_symbol']
        if prev_symbols:
            gene['previous_symbols'] = list(set(prev_symbols))

    def _get_xrefs_associated_with(self, r, gene):
        """Store xrefs and/or associated_with refs in a gene record.

        For each known HGNC source field present in the record, the part of
        the field name before its first ``-``, ``.`` or ``_`` is looked up
        (upper-cased) in ``NamespacePrefix``. Prefixes found in
        ``PREFIX_LOOKUP`` go to ``xrefs``; the rest go to
        ``associated_with``.

        :param dict r: A gene record in the HGNC data file
        :param dict gene: A transformed gene record
        """
        xrefs = list()
        associated_with = list()
        sources = [
            'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id', 'ccds_id',
            'uniprot_ids', 'pubmed_id', 'cosmic', 'omim_id', 'mirbase',
            'homeodb', 'snornabase', 'orphanet', 'horde_id', 'merops', 'imgt',
            'iuphar', 'kznf_gene_catalog', 'mamit-trnadb', 'cd', 'lncrnadb',
            'ena', 'pseudogene.org', 'refseq_accession'
        ]

        for src in sources:
            if src not in r:
                continue

            # The first separator found, in this priority order, is used
            for separator in ('-', '.', '_'):
                if separator in src:
                    key = src.split(separator)[0]
                    break
            else:
                key = src

            if key.upper() not in NamespacePrefix.__members__:
                logger.warning(f"{key} not in schemas.py")
                continue

            if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys():
                target = xrefs
            else:
                target = associated_with
            self._get_xref_associated_with(key, src, r, target)

        if xrefs:
            gene['xrefs'] = xrefs
        if associated_with:
            gene['associated_with'] = associated_with

    def _get_xref_associated_with(self, key, src, r, src_type):
        """Append the reference(s) stored under one HGNC field to a list.

        List values add one entry per element. A string value containing
        ``:`` is first reduced to the text after its last ``:``; note that
        this reduced value is also written back into ``r``.

        :param str key: The source's name
        :param str src: HGNC's source field
        :param dict r: A gene record in the HGNC data file
        :param list src_type: Either xrefs or associated_with list
        """
        value = r[src]
        if type(value) is list:
            src_type.extend(
                f"{NamespacePrefix[key.upper()].value}:{xref}"
                for xref in value)
            return

        if isinstance(value, str) and ':' in value:
            value = r[src] = value.split(':')[-1].strip()
        src_type.append(f"{NamespacePrefix[key.upper()].value}:{value}")

    def _get_location(self, r, gene):
        """Store GA4GH VRS ChromosomeLocation in a gene record.
        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#chromosomelocation

        The record's ``location`` is split on ``and`` into individual map
        locations. ``locations`` is only set when at least one chromosome
        location was produced, and ``location_annotations`` is removed again
        if it ends up empty.

        :param dict r: A gene record in the HGNC data file
        :param dict gene: A transformed gene record
        """
        raw_location = r['location']
        if 'and' in raw_location:
            locations = raw_location.split('and')
        else:
            locations = [raw_location]

        location_list = list()
        gene['location_annotations'] = list()
        for raw_loc in locations:
            loc = self._set_annotation(raw_loc.strip(), gene)
            if not loc:
                continue

            if loc == 'mitochondria':
                gene['location_annotations'].append(
                    Chromosome.MITOCHONDRIA.value)
                continue

            location = dict()
            self._set_location(loc, location, gene)
            chr_location = \
                self._chromosome_location.get_location(location, gene)
            if chr_location:
                location_list.append(chr_location)

        if location_list:
            gene['locations'] = location_list
        if not gene['location_annotations']:
            del gene['location_annotations']

    def _set_annotation(self, loc, gene):
        """Record any annotations found in a location and strip them off.

        Every ``Annotation`` value contained in ``loc`` is appended to the
        gene's ``location_annotations``, and ``loc`` is cut down to the
        stripped text before that annotation.

        :param str loc: A gene location
        :param dict gene: A transformed gene record
        :return: The remaining location text, or ``None`` when nothing is
            left once an annotation has been removed
        """
        annotations = {
            member.value for member in Annotation.__members__.values()
        }

        for annotation in annotations:
            if annotation not in loc:
                continue
            gene['location_annotations'].append(annotation)
            # Check if location is also included
            loc = loc.split(annotation)[0].strip()
            if not loc:
                return None
        return loc

    def _set_location(self, loc, location, gene):
        """Set a gene's location.

        :param str loc: A gene location
        :param dict location: GA4GH location
        :param dict gene: A transformed gene record
        """
        arm_match = re.search('[pq]', loc)

        if arm_match:
            # Location gives arm and sub / sub band
            arm_ix = arm_match.start()
            location['chr'] = loc[:arm_ix]

            if '-' in loc:
                # Location gives both start and end
                self._chromosome_location.set_interval_range(
                    loc, arm_ix, location)
            else:
                # Location only gives start
                start = loc[arm_ix:]
                location['start'] = start
                location['end'] = start
        else:
            # Only gives chromosome
            gene['location_annotations'].append(loc)

    def perform_etl(self, *args, **kwargs):
        """Extract, Transform, and Load data into DynamoDB database.

        :return: Concept IDs of concepts successfully loaded
        """
        self._download_data()
        self._extract_data()
        self._add_meta()
        self._transform_data()
        self._database.flush_batch()
        return self._processed_ids

    def _add_meta(self, *args, **kwargs):
        """Build the HGNC source metadata and pass it to ``_load_meta``."""
        license_attributes = {
            'non_commercial': False,
            'share_alike': False,
            'attribution': False
        }
        metadata = SourceMeta(
            data_license='custom',
            data_license_url='https://www.genenames.org/about/',
            version=self._version,
            data_url=self._data_url,
            rdp_url=None,
            data_license_attributes=license_attributes,
            genome_assemblies=[])

        self._load_meta(self._database, metadata, SourceName.HGNC.value)