def test_build_exon_missing_key(parsed_exon, key):
    """Check that build_exon() raises KeyError when ``key`` is missing."""
    ## GIVEN a dictionary with exon information where key has been removed
    del parsed_exon[key]

    ## WHEN building an exon object from the incomplete dictionary
    ## THEN a KeyError is raised
    with pytest.raises(KeyError):
        build_exon(parsed_exon)
def test_build_exon_inappropriate_type(parsed_exon, key):
    """Check that build_exon() raises TypeError when ``key`` holds None."""
    ## GIVEN a dictionary with exon information where the value of key is None
    parsed_exon[key] = None

    ## WHEN building an exon object from that dictionary
    ## THEN a TypeError is raised
    with pytest.raises(TypeError):
        build_exon(parsed_exon)
def test_build_exon_no_hgnc(parsed_exon):
    """Check that build_exon() raises KeyError when 'hgnc_id' is missing."""
    ## GIVEN a dictionary with exon information that lacks the hgnc_id
    del parsed_exon['hgnc_id']

    ## WHEN building an exon object
    ## THEN assert that an exception is raised since there is no hgnc_id
    with pytest.raises(KeyError):
        build_exon(parsed_exon)
def test_build_exon(parsed_exon):
    """Check that build_exon() returns a dict for the parsed_exon fixture."""
    ## GIVEN a dictionary with exon information

    ## WHEN building an exon object
    built = build_exon(parsed_exon)

    ## THEN the result is a dictionary
    assert isinstance(built, dict)
def load_exons(adapter, exon_lines, build='37', nr_exons=None):
    """Build and load all the exons of a build

    Exon lines are parsed with parse_ensembl_exons(). An exon is only loaded
    when the gene it refers to is found among the genes fetched from the
    adapter and its transcript is one of the 'id_transcripts' of that gene.
    Exons are inserted in bulks rather than one at a time.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines
        build(str)
        nr_exons(int): Number of exons expected in exon_lines. Used as the
            length of the progress bar and reported in the summary log.
            Falls back to 100000 when not given (or 0).
    """
    # Insert once the pending bulk holds more than this many exon objects
    bulk_size = 10000

    nr_exons = nr_exons or 100000
    # Fetch all genes with ensemblid as keys
    ensembl_genes = adapter.ensembl_genes(build=build, id_transcripts=True)

    LOG.debug("Parsing ensembl exons from iterable")
    exons = parse_ensembl_exons(exon_lines)

    start_insertion = datetime.now()
    loaded_exons = 0
    exon_bulk = []
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            # Skip exons of genes that were not fetched from the database
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # Skip exons that do not belong to an identifier transcript
            if enst_id not in gene_obj.get('id_transcripts', set()):
                continue

            exon['hgnc_id'] = hgnc_id

            exon_bulk.append(build_exon(exon, build))
            if len(exon_bulk) > bulk_size:
                adapter.load_exon_bulk(exon_bulk)
                exon_bulk = []
            loaded_exons += 1

    # Insert whatever is left after the last full bulk
    if exon_bulk:
        adapter.load_exon_bulk(exon_bulk)

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
def load_exons(self, exons, genes=None, build='37'):
    """Build an exon object for each exon and insert it into the database

    Exons for which build_exon() returns a falsy value are skipped.

    Args:
        exons(iterable(dict)): exon information, one dictionary per exon
        genes: gene information handed to build_exon(). Fetched with
            self.ensembl_genes(build) when not given (or empty).
        build(str): build used when the genes have to be fetched
    """
    if not genes:
        genes = self.ensembl_genes(build)

    for exon_info in exons:
        built = build_exon(exon_info, genes)
        # Only store exons that could actually be built
        if built:
            self.exon_collection.insert_one(built)
def load_exons(adapter, exon_lines, build='37', ensembl_genes=None):
    """Load all the exons

    Exon lines are parsed with parse_ensembl_exon_request() when given as a
    DataFrame and with parse_ensembl_exons() otherwise. An exon is only
    loaded when the gene it refers to is found in ensembl_genes and its
    transcript is among the id transcripts fetched for that gene.

    Args:
        adapter(MongoAdapter)
        exon_lines(iterable): iterable with ensembl exon lines
        build(str)
        ensembl_genes(dict): Existing genes with ensembl gene id as keys.
            Fetched with adapter.ensembl_genes(build) when not given
            (or empty).
    """
    # Fetch all genes with ensemblid as keys
    ensembl_genes = ensembl_genes or adapter.ensembl_genes(build)
    hgnc_id_transcripts = adapter.id_transcripts_by_gene(build=build)

    if isinstance(exon_lines, DataFrame):
        exons = parse_ensembl_exon_request(exon_lines)
        nr_exons = exon_lines.shape[0]
    else:
        exons = parse_ensembl_exons(exon_lines)
        # No row count is taken from a plain iterable; a fixed number is
        # used for the progress bar length and the summary log instead
        nr_exons = 1000000

    start_insertion = datetime.now()
    loaded_exons = 0
    LOG.info("Loading exons...")
    with progressbar(exons, label="Loading exons", length=nr_exons) as bar:
        for exon in bar:
            ensg_id = exon['gene']
            enst_id = exon['transcript']
            gene_obj = ensembl_genes.get(ensg_id)

            # Skip exons of genes that are not among the existing genes
            if not gene_obj:
                continue

            hgnc_id = gene_obj['hgnc_id']

            # Skip exons that do not belong to an id transcript of the gene
            if enst_id not in hgnc_id_transcripts[hgnc_id]:
                continue

            exon['hgnc_id'] = hgnc_id

            exon_obj = build_exon(exon, build)
            adapter.load_exon(exon_obj)
            loaded_exons += 1

    LOG.info('Number of exons in build {0}: {1}'.format(build, nr_exons))
    LOG.info('Number loaded: {0}'.format(loaded_exons))
    LOG.info('Time to load exons: {0}'.format(datetime.now() - start_insertion))
def test_build_exon_no_hgnc():
    """Check that build_exon() raises KeyError when 'hgnc_id' is missing."""
    ## GIVEN a dictionary with exon information that lacks the hgnc_id
    exon_without_hgnc = dict(
        exon_id='1',
        chrom='1',
        start=10,
        end=100,
        transcript='12',
        rank=2,  # Order of exon in transcript
    )

    ## WHEN building an exon object
    ## THEN assert that an exception is raised since there is no hgnc_id
    with pytest.raises(KeyError):
        build_exon(exon_without_hgnc)
def test_build_exon():
    """Check that build_exon() returns a dict for the exon info below."""
    ## GIVEN a dictionary with exon information
    exon_fields = dict(
        exon_id='1',
        chrom='1',
        start=10,
        end=100,
        transcript='12',
        hgnc_id=11,
        rank=2,  # Order of exon in transcript
    )

    ## WHEN building an exon object
    built = build_exon(exon_fields)

    ## THEN the result is a dictionary
    assert isinstance(built, dict)