def test_open_known_genes_missing_lines(self):
    ''' check that open_known_genes() rejects a file holding only a header
    '''

    columns = ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech',
        'hgnc_id']
    header_row = '\t'.join(columns) + '\n'
    self.temp.write(header_row.encode('utf8'))
    self.temp.flush()

    # a gene file without a single gene row most likely means the data file
    # is damaged (line-endings being the usual suspect), so a ValueError is
    # expected here rather than an empty result
    with self.assertRaises(ValueError):
        open_known_genes(self.temp.name)
def test_open_known_genes_missing_lines(self):
    ''' open_known_genes() should raise when the file lists no genes at all
    '''

    fields = [
        'gene', 'chr', 'start', 'stop',
        'type', 'mode', 'mech', 'hgnc_id',
    ]

    # only the header line goes into the temporary file
    encoded_header = ('\t'.join(fields) + '\n').encode('utf8')
    self.temp.write(encoded_header)
    self.temp.flush()

    # once the file has been read and no genes were found, an error is the
    # expected outcome: the likeliest cause is a broken data file, probably
    # with unexpected line-endings
    with self.assertRaises(ValueError):
        open_known_genes(self.temp.name)
def test_open_known_genes_wrong_status(self):
    ''' check that open_known_genes() drops genes lacking a good status
    '''

    rows = [
        ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech', 'hgnc_id'],
        # 'possible dd gene' status: expected to be excluded
        ['TEST', '1', '1000', '2000', 'possible dd gene', 'Monoallelic',
            'Loss-of-function', '1001'],
        # 'confirmed dd gene' status: expected to be retained
        ['TEST2', '1', '3000', '4000', 'confirmed dd gene', 'Monoallelic',
            'Loss-of-function', '2001'],
    ]

    # header first, then the two gene rows, each tab-separated
    for row in rows:
        self.temp.write(('\t'.join(row) + '\n').encode('utf8'))
    self.temp.flush()

    observed = open_known_genes(self.temp.name)
    expected = {
        '2001': {
            'chrom': '1',
            'start': 3000,
            'end': 4000,
            'symbol': 'TEST2',
            'status': {'confirmed dd gene'},
            'inh': {'Monoallelic': {'Loss-of-function'}},
        },
    }

    self.assertEqual(observed, expected)
def test_open_known_genes_multimechs(self):
    ''' check open_known_genes() merges mechanisms for a gene listed twice
    '''

    rows = [
        ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech', 'hgnc_id'],
        # the same gene appears on two rows, differing only in mechanism
        ['TEST', '1', '1000', '2000', 'confirmed dd gene', 'Monoallelic',
            'Loss-of-function', '1001'],
        ['TEST', '1', '1000', '2000', 'confirmed dd gene', 'Monoallelic',
            'Activating', '1001'],
    ]

    for row in rows:
        self.temp.write(('\t'.join(row) + '\n').encode('utf8'))
    self.temp.flush()

    observed = open_known_genes(self.temp.name)

    # a single entry is expected, with both mechanisms under the one mode
    expected = {
        '1001': {
            'chrom': '1',
            'start': 1000,
            'end': 2000,
            'symbol': 'TEST',
            'status': {'confirmed dd gene'},
            'inh': {'Monoallelic': {'Loss-of-function', 'Activating'}},
        },
    }

    self.assertEqual(observed, expected)
def test_open_known_genes(self):
    ''' check that open_known_genes() parses a single-gene file correctly
    '''

    columns = ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech',
        'hgnc_id']
    gene_row = ['TEST', '1', '1000', '2000', 'confirmed dd gene',
        'Monoallelic', 'Loss-of-function', '1001']

    # header line followed by the single gene line
    for fields in (columns, gene_row):
        self.temp.write(('\t'.join(fields) + '\n').encode('utf8'))
    self.temp.flush()

    observed = open_known_genes(self.temp.name)

    # the result is expected to be keyed by the hgnc_id column, with the
    # coordinates converted to integers
    expected = {
        '1001': {
            'chrom': '1',
            'start': 1000,
            'end': 2000,
            'symbol': 'TEST',
            'status': {'confirmed dd gene'},
            'inh': {'Monoallelic': {'Loss-of-function'}},
        },
    }

    self.assertEqual(observed, expected)
def __init__(self, population_tags=None, count=0, known_genes=None,
        date=None, regions=None, lof_sites=None, pp_filter=0.0,
        sum_x_lr2_file=None, output_path=None, export_vcf=None,
        debug_chrom=None, debug_pos=None):
    """ set up the object: store settings, load reference data, make reporter

    Args:
        population_tags: list of population ID tags that might be present
            in the INFO field, or None.
        count: number of probands to be analysed, useful for tracking
            progress in the output logs.
        known_genes: path to a table of genes known to be associated with
            genetic disorders, or None.
        date: date of the known_genes file, or None if unused/unknown.
        regions: path to a table of regions for DECIPHER CNV syndromes.
        lof_sites: path to a json file of [chrom, position] genome
            coordinates, for modifying to a loss-of-function consequence
            if required. Can be None if unneeded.
        pp_filter: threshold from 0 to 1 for the pp_dnm value; candidate
            DNMs falling below this value are filtered out.
        sum_x_lr2_file: file holding the sum of l2r values on the X
            chromosome for each person.
        output_path: path to write the tab-separated output file to.
        export_vcf: path to a file or folder to write VCFs to.
        debug_chrom: chromosome for debugging purposes.
        debug_pos: position at which to debug variant filtering.
    """

    # progress tracking: 'total' is the expected number of probands, while
    # 'count' always starts from zero
    self.total = count
    self.count = 0

    # plain settings, stored as given
    self.populations = population_tags
    self.pp_filter = pp_filter
    self.debug_chrom = debug_chrom
    self.debug_pos = debug_pos

    # load the reference datasets, the loaders return None for a None path
    self.known_genes = open_known_genes(known_genes)
    self.cnv_regions = open_cnv_regions(regions)
    self.last_base = open_last_base_sites(lof_sites)

    # load the sums of mean log 2 ratios on X, this loader returns an empty
    # dict for a None path
    self.sum_x_lr2 = open_x_lr2_file(sum_x_lr2_file)

    self.reporter = Report(output_path, export_vcf, date)
def test_open_known_genes_wrong_status(self):
    ''' open_known_genes() should exclude genes that lack a good status
    '''

    columns = ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech',
        'hgnc_id']
    # only a 'possible' gene, so it should not appear in the result
    possible = ['TEST', '1', '1000', '2000', 'possible dd gene',
        'Monoallelic', 'Loss-of-function', '1001']
    # a 'confirmed' gene, so it should be kept
    confirmed = ['TEST2', '1', '3000', '4000', 'confirmed dd gene',
        'Monoallelic', 'Loss-of-function', '2001']

    for fields in (columns, possible, confirmed):
        text = '\t'.join(fields) + '\n'
        self.temp.write(text.encode('utf8'))
    self.temp.flush()

    observed = open_known_genes(self.temp.name)
    expected = {
        '2001': {
            'chrom': '1',
            'start': 3000,
            'end': 4000,
            'symbol': 'TEST2',
            'status': {'confirmed dd gene'},
            'inh': {'Monoallelic': {'Loss-of-function'}},
        },
    }

    self.assertEqual(observed, expected)
def test_open_known_genes_multimechs(self):
    ''' open_known_genes() should combine mechanisms for genes with >1 mech
    '''

    columns = ['gene', 'chr', 'start', 'stop', 'type', 'mode', 'mech']
    # two rows for one gene, identical apart from the mechanism column
    lof_row = ['TEST', '1', '1000', '2000', 'confirmed dd gene',
        'Monoallelic', 'Loss-of-function']
    activating_row = ['TEST', '1', '1000', '2000', 'confirmed dd gene',
        'Monoallelic', 'Activating']

    for fields in (columns, lof_row, activating_row):
        text = '\t'.join(fields) + '\n'
        self.temp.write(text.encode('utf8'))
    self.temp.flush()

    observed = open_known_genes(self.temp.name)

    # the file has no hgnc_id column, and the expected entry is keyed by
    # the gene symbol with both mechanisms under the single mode
    expected = {
        'TEST': {
            'chrom': '1',
            'start': 1000,
            'end': 2000,
            'status': {'confirmed dd gene'},
            'inh': {'Monoallelic': {'Loss-of-function', 'Activating'}},
        },
    }

    self.assertEqual(observed, expected)
def load_definitions_files(self):
    """ load the optional reference files named in the script options

    Sets self.known_genes, self.ID_mapper and self.cnv_regions. Each
    attribute is left as None when its option was not provided.
    """

    # genes known to be involved with disorders, so that variants can be
    # screened for falling within them
    self.known_genes = None
    genes_path = self.options.genes
    if genes_path is not None:
        self.known_genes = open_known_genes(genes_path)

    # person IDs and their alternate IDs, so that we can convert between
    # different ID schemes
    self.ID_mapper = None
    ids_path = self.options.alternate_ids
    if ids_path is not None:
        self.ID_mapper = create_person_ID_mapper(ids_path)

    # regions associated with DECIPHER syndromes
    self.cnv_regions = None
    regions_path = self.options.regions
    if regions_path is not None:
        self.cnv_regions = open_cnv_regions(regions_path)
def load_definitions_files(self):
    """ read in the config files for the script (eg gene table, ID mapping)

    Populates self.known_genes, self.ID_mapper and self.cnv_regions; any
    of these stays as None if the matching option is None.
    """

    # a named gene file gives a dictionary of genes known to be involved
    # with disorders, which variants can later be screened against
    self.known_genes = None
    gene_file = self.options.genes
    if gene_file is not None:
        self.known_genes = open_known_genes(gene_file)

    # a named ID mapping file gives a dictionary of IDs and alternate IDs,
    # for converting between different ID schemes
    self.ID_mapper = None
    mapping_file = self.options.alternate_ids
    if mapping_file is not None:
        self.ID_mapper = create_person_ID_mapper(mapping_file)

    # a named regions file gives the regions associated with DECIPHER
    # syndromes
    self.cnv_regions = None
    regions_file = self.options.regions
    if regions_file is not None:
        self.cnv_regions = open_cnv_regions(regions_file)