def test_writing(self):
    """Writes two lines through gfile and checks they read back unchanged."""
    expected = ['test\n', 'end\n']
    tmp_path = test_utils.test_tmpfile('test_gfile')

    # Write each expected line with its own write() call.
    with gfile.Open(tmp_path, 'w') as writer:
        for text in expected:
            writer.write(text)

    with gfile.Open(tmp_path, 'r') as reader:
        observed = reader.readlines()

    self.assertEqual(expected, observed)
def save_to_png(arr,
                path=None,
                image_mode=None,
                show=True,
                labels=None,
                scale=None):
    """Make a PNG and show it from a numpy array of dtype=np.uint8.

    Args:
      arr: numpy array. Input array to save.
      path: str. File path at which to save the image. A .png suffix is added
        if the path does not already have one. Leave empty to save at
        /tmp/tmp.png, which is useful when only temporarily showing the image
        in a Colab notebook.
      image_mode: "RGB" or "L". Leave as default=None to choose based on image
        dimensions.
      show: bool. Whether to display the image using IPython (for notebooks).
      labels: list of str. Labels to show across the top of the image.
      scale: integer. Number of pixels wide and tall to show each cell in the
        array. This sizes up the image while keeping exactly the same number
        of pixels for every cell in the array, preserving resolution and
        preventing any interpolation or overlapping of pixels. Default None
        adapts to the size of the image to multiply it up until a limit of
        500 pixels, a convenient size for use in notebooks. If saving to a
        file for automated processing, scale=1 is recommended to keep output
        files small and simple while still retaining all the information
        content.

    Returns:
      None. Saves an image at path and optionally shows it with
      IPython.display.
    """
    if image_mode is None:
        image_mode = _get_image_type_from_array(arr)

    img = Image.fromarray(arr, mode=image_mode)

    if labels is not None:
        img = add_header(img, labels)

    if scale is None:
        scale = max(1, int(500 / max(arr.shape)))

    if scale != 1:
        # Request nearest-neighbour resampling explicitly. Without it the
        # library default filter is used, which may interpolate between
        # cells and break the "no interpolation" contract documented above.
        img = img.resize((img.size[0] * scale, img.size[1] * scale),
                         resample=Image.NEAREST)

    # Saving to a temporary file is needed even when showing in a notebook.
    if path is None:
        path = '/tmp/tmp.png'
    elif not path.endswith('.png'):
        # Only PNG is supported because JPEG files are unnecessarily 3 times
        # larger.
        path = '{}.png'.format(path)

    # The path always ends in ".png" at this point, so the format is fixed.
    with gfile.Open(path, 'wb') as fout:
        img.save(fout, format='png')

    # Show image (great for notebooks).
    if show:
        display.display(display.Image(path))
def _parse_read_with_aux_tags(self, tag_string):
    """Parses a single stock read whose trailing AUX columns are tag_string.

    Args:
      tag_string: str. Text appended after the final tab of the stock read.

    Returns:
      list of everything yielded by SamReader.iterate() when the temporary
      file is opened with parse_aux_fields=True.
    """
    # Minimal header needed to create a valid SAM file.
    sam_header = '@HD\tVN:1.3\tSO:coordinate\n@SQ\tSN:chr1\tLN:248956422\n'
    # One stock read; the AUX fields under test follow its last tab.
    read_line = ('read_name\t0\tchr1\t1\t0\t3M\t*\t0\t0\tCCC\tAAA\t' +
                 tag_string + '\n')

    # NOTE(review): SAM text is written to a ".bam"-named file; presumably the
    # reader detects the format from content rather than extension -- confirm.
    tmp_path = test_utils.test_tmpfile('aux_tags.bam')
    with gfile.Open(tmp_path, 'w') as sam_out:
        sam_out.write(sam_header)
        sam_out.write(read_line)

    with sam.SamReader(tmp_path, parse_aux_fields=True) as sam_in:
        parsed_reads = list(sam_in.iterate())
    return parsed_reads
def assertWrittenVCFRecordsEqual(self, path, expected_lines):
    """Asserts the non-header lines of the file at path equal expected_lines.

    Args:
      path: str. Path of the written file to read back.
      expected_lines: iterable whose items are either strings, used as-is, or
        lists/tuples, whose elements are str()-converted and tab-joined.
    """

    def _as_text(entry):
        # Only list/tuple entries need flattening into a tab-separated line.
        if not isinstance(entry, (list, tuple)):
            return entry
        return '\t'.join(map(str, entry))

    wanted = [_as_text(entry) for entry in expected_lines]

    written = []
    with gfile.Open(path, 'r') as vcf_in:
        for raw_line in vcf_in.readlines():
            # Lines starting with '#' are header lines and are not compared.
            if raw_line.startswith('#'):
                continue
            written.append(raw_line.strip())

    self.assertEqual(written, wanted)
def test_tmpfile(name, contents=None):
    """Returns a path to a tempfile named name in the test_tmpdir.

    Args:
      name: str; the name of the file, should not contain any slashes.
      contents: bytes, or None. If not None, the file at the returned path is
        written with contents (binary mode) before the path is returned.

    Returns:
      str path to a tmpfile with filename name in our test tmpfile directory.
    """
    tmp_path = os.path.join(absltest.get_default_test_tmpdir(), name)
    if contents is None:
        # Nothing to write; the caller only wants the path.
        return tmp_path

    with gfile.Open(tmp_path, 'wb') as sink:
        sink.write(contents)
    return tmp_path
def test_writing_canned_records(self):
    """Writes the canned BED records and compares the output file's lines."""
    # The canned records are stored in TFRecord format.
    canned_path = test_utils.genomics_core_testdata(
        'test_regions.bed.tfrecord')
    bed_header = bed_pb2.BedHeader(num_fields=12)
    options = bed_pb2.BedWriterOptions()
    canned_records = list(
        tfrecord.read_tfrecords(canned_path, proto=bed_pb2.BedRecord))

    output_path = test_utils.test_tmpfile('output.bed')
    with bed_writer.BedWriter.to_file(output_path, bed_header,
                                      options) as bed_out:
        for canned in canned_records:
            bed_out.write(canned)

    with gfile.Open(output_path, 'r') as written:
        actual_lines = written.readlines()
    self.assertEqual(actual_lines, self.expected_bed_content)
def test_writing_canned_records(self):
    """Writes the canned FASTQ records and compares the output file's lines."""
    # The canned records are stored in TFRecord format.
    canned_path = test_utils.genomics_core_testdata(
        'test_reads.fastq.tfrecord')
    options = fastq_pb2.FastqWriterOptions()
    canned_records = list(
        tfrecord.read_tfrecords(canned_path, proto=fastq_pb2.FastqRecord))

    output_path = test_utils.test_tmpfile('output.fastq')
    with fastq_writer.FastqWriter.to_file(output_path, options) as fastq_out:
        for canned in canned_records:
            fastq_out.write(canned)

    with gfile.Open(output_path, 'r') as written:
        actual_lines = written.readlines()
    self.assertEqual(actual_lines, self.expected_fastq_content)
def bedpe_parser(filename):
    """Parses Range objects from a BEDPE-formatted file.

    See http://bedtools.readthedocs.org/en/latest/content/general-usage.html
    for more information on the BEDPE format.

    Skips events that span across chromosomes. For example, if the starting
    location is on chr1 and the ending location is on chr2, that record will
    not appear in the output. Blank lines are skipped as well.

    Args:
      filename: file name of a BEDPE-formatted file.

    Yields:
      nucleus.genomics.v1.Range protobuf objects.
    """
    # Open via a context manager so the handle is released once the generator
    # is exhausted or closed, instead of being left open.
    with gfile.Open(filename) as bedpe_file:
        for line in bedpe_file:
            if not line.strip():
                # A blank line has no columns; indexing it would raise.
                continue
            parts = line.split('\t')
            # Columns 0 and 3 are the two chromosome names; only keep events
            # on the same chromosome.
            if parts[0] == parts[3]:
                yield make_range(parts[0], int(parts[1]), int(parts[5]))
def test_reading(self):
    """Checks every line of headerless.sam starts with 'SRR3656745'."""
    sam_path = test_utils.genomics_core_testdata('headerless.sam')
    with gfile.Open(sam_path) as sam_file:
        for record_line in sam_file:
            self.assertTrue(record_line.startswith('SRR3656745'))
def test_writing_canned_variants(self):
    """Writes the first five canned variants and checks the full VCF text."""
    # The golden variants are stored in TFRecord format.
    golden_path = test_utils.genomics_core_testdata(
        'test_samples.vcf.golden.tfrecord')
    options = variants_pb2.VcfWriterOptions()

    # (name, n_bases) for each contig declared in the header.
    contig_sizes = (
        ('chr1', 248956422),
        ('chr2', 242193529),
        ('chr3', 198295559),
        ('chrX', 156040895),
    )
    # Filters that are declared with an id only (no description argument).
    id_only_filters = (
        'VQSRTrancheINDEL95.00to96.00',
        'VQSRTrancheINDEL96.00to97.00',
        'VQSRTrancheINDEL97.00to99.00',
        'VQSRTrancheINDEL99.00to99.50',
        'VQSRTrancheINDEL99.50to99.90',
        'VQSRTrancheINDEL99.90to99.95',
        'VQSRTrancheINDEL99.95to100.00+',
        'VQSRTrancheINDEL99.95to100.00',
        'VQSRTrancheSNP99.50to99.60',
        'VQSRTrancheSNP99.60to99.80',
        'VQSRTrancheSNP99.80to99.90',
        'VQSRTrancheSNP99.90to99.95',
        'VQSRTrancheSNP99.95to100.00+',
        'VQSRTrancheSNP99.95to100.00',
    )

    described_filters = [
        variants_pb2.VcfFilterInfo(
            id='PASS', description='All filters passed'),
        variants_pb2.VcfFilterInfo(id='LowQual', description=''),
    ]
    vcf_header = variants_pb2.VcfHeader(
        contigs=[
            reference_pb2.ContigInfo(name=contig_name, n_bases=contig_len)
            for contig_name, contig_len in contig_sizes
        ],
        sample_names=['NA12878_18_99'],
        filters=described_filters + [
            variants_pb2.VcfFilterInfo(id=filter_id)
            for filter_id in id_only_filters
        ],
        infos=[
            variants_pb2.VcfInfo(
                id='END',
                number='1',
                type='Integer',
                description='Stop position of the interval'),
        ],
        formats=[
            variants_pb2.VcfFormatInfo(
                id='GT', number='1', type='String', description='Genotype'),
            variants_pb2.VcfFormatInfo(
                id='GQ',
                number='1',
                type='Integer',
                description='Genotype Quality'),
            variants_pb2.VcfFormatInfo(
                id='DP',
                number='1',
                type='Integer',
                description='Read depth of all passing filters reads.'),
            variants_pb2.VcfFormatInfo(
                id='MIN_DP',
                number='1',
                type='Integer',
                description='Minimum DP observed within the GVCF block.'),
            variants_pb2.VcfFormatInfo(
                id='AD',
                number='R',
                type='Integer',
                description=
                'Read depth of all passing filters reads for each allele.'),
            variants_pb2.VcfFormatInfo(
                id='VAF',
                number='A',
                type='Float',
                description='Variant allele fractions.'),
            variants_pb2.VcfFormatInfo(
                id='PL',
                number='G',
                type='Integer',
                description='Genotype likelihoods, Phred encoded'),
        ],
    )

    golden_variants = list(
        tfrecord.read_tfrecords(golden_path, proto=variants_pb2.Variant))
    output_path = test_utils.test_tmpfile('output.vcf')

    # Only the first five golden variants are written and checked.
    with vcf_writer.VcfWriter.to_file(output_path, vcf_header,
                                      options) as vcf_out:
        for variant in golden_variants[:5]:
            vcf_out.write(variant)

    # Check: are the variants written as expected?
    # pylint: disable=line-too-long
    expected_vcf_content = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=PASS,Description="All filters passed">\n',
        '##FILTER=<ID=LowQual,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL95.00to96.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL96.00to97.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL97.00to99.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.00to99.50,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.50to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheINDEL99.95to100.00,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.60to99.80,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="">\n',
        '##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="">\n',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of '
        'the interval">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth of all '
        'passing filters reads.">\n',
        '##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP '
        'observed within the GVCF block.">\n',
        '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth of all '
        'passing filters reads for each allele.">\n',
        '##FORMAT=<ID=VAF,Number=A,Type=Float,Description=\"Variant allele '
        'fractions.">\n',
        '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Genotype '
        'likelihoods, Phred encoded">\n',
        '##contig=<ID=chr1,length=248956422>\n',
        '##contig=<ID=chr2,length=242193529>\n',
        '##contig=<ID=chr3,length=198295559>\n',
        '##contig=<ID=chrX,length=156040895>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878_18_99\n',
        'chr1\t13613\t.\tT\tA\t39.88\tVQSRTrancheSNP99.90to99.95\t.\tGT:GQ:DP:AD:PL\t0/1:16:4:1,3:68,0,16\n',
        'chr1\t13813\t.\tT\tG\t90.28\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:9:3:0,3:118,9,0\n',
        'chr1\t13838\trs28428499\tC\tT\t62.74\tPASS\t.\tGT:GQ:DP:AD:PL\t1/1:6:2:0,2:90,6,0\n',
        'chr1\t14397\trs756427959\tCTGT\tC\t37.73\tPASS\t.\tGT:GQ:DP:AD:PL\t0/1:75:5:3,2:75,0,152\n',
        'chr1\t14522\t.\tG\tA\t49.77\tVQSRTrancheSNP99.60to99.80\t.\tGT:GQ:DP:AD:PL\t0/1:78:10:6,4:78,0,118\n'
    ]
    # pylint: enable=line-too-long

    with gfile.Open(output_path, 'r') as written:
        actual_lines = written.readlines()
    self.assertEqual(actual_lines, expected_vcf_content)