def test_filter_with_just_field_name_excludes_missing_rows(self, standard_gwas_parser_basic):
    """A filter given only a field name should drop rows where that field has no value."""
    source = readers.IterableReader(
        ["1\t100\tnull\tNone\t0.05", "2\t200\tA\tC\t5e-8"],
        parser=standard_gwas_parser_basic,
    )
    source.add_filter('ref')
    # One filter should now be registered on the reader
    assert len(source._filters) == 1  # File will act on it
    kept = list(source)
    assert len(kept) == 1, "output was restricted to the expected rows"
def test_writer_defaults_to_parser_columns(self, tmpdir, standard_gwas_parser_basic):
    """When no explicit columns are given, the writer should emit every parser column."""
    source = readers.IterableReader(
        ['1\t100\tA\tC\t0.05', '2\t200\tA\tC\t5e-8'],
        parser=standard_gwas_parser_basic,
    )
    dest = tmpdir / 'test.txt'
    written_fn = source.write(dest)
    with open(written_fn, 'r') as handle:
        header = handle.readline()
    assert header == '#chrom\tpos\trsid\tref\talt\tneg_log_pvalue\tbeta\tstderr_beta\talt_allele_freq\n'
def test_can_write_output(self, tmpdir, standard_gwas_parser_basic):
    """Parsed rows can be written to a plain-text file restricted to chosen columns."""
    source = readers.IterableReader(
        ["1\t100\tA\tC\t0.05", "2\t200\tA\tC\t5e-8"],
        parser=standard_gwas_parser_basic,
    )
    dest = tmpdir / 'test.txt'
    written_fn = source.write(dest, columns=['chrom'], make_tabix=False)
    assert dest == written_fn
    assert os.path.isfile(written_fn), "Output filename exists"
    with open(written_fn, 'r') as handle:
        contents = handle.readlines()
    assert contents == ["#chrom\n", "1\n", "2\n"]
def test_writer_represents_missing_data_correctly(self, tmpdir, standard_gwas_parser_basic):
    """The writer should represent explicit missing values as `.` (instead of eg Python None)"""
    source = readers.IterableReader(
        ["1\t100\tA\tC\tNone", "2\t200\tA\tC\t."],
        parser=standard_gwas_parser_basic,
    )
    dest = tmpdir / 'test.txt'
    written_fn = source.write(dest, columns=['neg_log_pvalue'], make_tabix=False)
    assert dest == written_fn
    assert os.path.isfile(written_fn), "Output filename exists"
    with open(written_fn, 'r') as handle:
        contents = handle.readlines()
    # Both missing-value spellings in the input should serialize as "."
    assert contents == ["#neg_log_pvalue\n", ".\n", ".\n"]
def test_can_write_tabixed_output(self, tmpdir, standard_gwas_parser_basic):
    """Writing with make_tabix=True should yield a compressed, indexed file usable by TabixReader."""
    source = readers.IterableReader(
        ["1\t100\tA\tC\t0.05", "2\t200\tA\tC\t5e-8"],
        parser=standard_gwas_parser_basic,
    )
    dest = tmpdir / 'test.gz'
    written_fn = source.write(str(dest), columns=['chrom', 'pos'], make_tabix=True)
    # The writer chooses its own output name when tabixing, so it differs from the request
    assert dest != written_fn
    assert written_fn.endswith('.gz')
    assert os.path.exists('{}.tbi'.format(written_fn)), "Tabix index exists"
    assert os.path.isfile(written_fn), "Output filename exists"
    # Now try to use the file that was written
    check_output = readers.TabixReader(written_fn)
    fetched = list(check_output.fetch('1', 1, 300))
    assert len(fetched) == 1, 'Output file can be read with tabix features'
def test_can_optionally_iterate_sans_parsing(self):
    """With parser=None, the reader should yield raw lines untouched."""
    raw_reader = readers.IterableReader(["walrus", "carpenter"], parser=None)
    assert list(raw_reader) == ["walrus", "carpenter"], "Returns unparsed data"
def test_skips_empty_rows_padding_file(self):
    """Blank lines in the input should be silently skipped, not yielded."""
    padded = readers.IterableReader(["", ""])
    rows = list(padded)
    assert len(rows) == 0, "Skipped empty lines"
def test_can_specify_iterable_as_source(self):
    """Any row-based iterable can serve as the source and will be parsed as expected."""
    source = readers.IterableReader(["X\t1\tA\tG"])
    first_row = next(iter(source))
    assert first_row[0] == "X"
def test_transforms_can_work_with_any_parser(self):
    """Transforms bypass the mechanisms for data validation, so they don't perform sanity
    checks on field names. A side effect is that they can be used even if the parser
    doesn't support field names."""
    source = readers.IterableReader([[1, 2, 3]], parser=None)
    source.add_transform(lambda row: row)
    all_lists = all(isinstance(item, list) for item in source)
    assert all_lists, 'Transforms work even without named fields'
def test_named_fields_require_named_field_parser(self):
    """Lookups address fields by name, which requires a parser that exposes field names."""
    nameless = readers.IterableReader([[1, 2, 3]], parser=None)
    with pytest.raises(exceptions.ConfigurationException, match='name-based'):
        nameless.add_lookup('chrom', lambda parsed: 'Y')
def test_add_filter_fails_with_too_many_arguments(self):
    """add_filter should reject calls with extra positional arguments."""
    source = readers.IterableReader(["X\t1\tA\tG"])
    with pytest.raises(exceptions.ConfigurationException, match='Invalid filter format'):
        source.add_filter('afield', 42, 'superfluous argument')
def test_no_headers_in_short_file(self):
    """Header sniffing should fail loudly when it exhausts the whole file without finding data."""
    short_file = readers.IterableReader(['walrus', 'carpenter'], parser=None)
    with pytest.raises(exceptions.SnifferException, match='entire file'):
        sniffers.get_headers(short_file, delimiter='\t')
def test_warns_if_file_is_unreadable(self):
    """Once max_errors is exceeded, the reader should stop and raise, while keeping the errors seen."""
    doomed = readers.IterableReader(
        ['mwa', 'ha', 'ha'],
        parser=doomed_parser,
        skip_errors=True,
        max_errors=2,
    )
    with pytest.raises(exceptions.TooManyBadLinesException):
        list(doomed)
    assert len(doomed.errors) == 2, "Reader gave up after two lines, but tracked the errors"
def test_can_track_errors(self):
    """With skip_errors and a generous error budget, bad lines are recorded rather than raised."""
    doomed = readers.IterableReader(
        ['mwa', 'ha', 'ha'],
        parser=doomed_parser,
        skip_errors=True,
        max_errors=10,
    )
    rows = list(doomed)
    assert len(rows) == 0, "No data could actually be read!"
    assert len(doomed.errors) == 3, "Three lines could not be parsed"
def test_can_fail_on_first_error(self):
    """With skip_errors=False, the very first unparseable line should raise immediately."""
    doomed = readers.IterableReader(['mwa', 'ha', 'ha'], parser=doomed_parser, skip_errors=False)
    with pytest.raises(exceptions.LineParseException):
        list(doomed)
def test_writer_validates_options_when_sending_to_console(self, standard_gwas_parser_basic):
    """Tabix output makes no sense for a console stream, so write() should reject the combination."""
    source = readers.IterableReader(
        ['1\t100\tA\tC\t0.05', '2\t200\tA\tC\t5e-8'],
        parser=standard_gwas_parser_basic,
    )
    with pytest.raises(exceptions.ConfigurationException, match='stream'):
        source.write(make_tabix=True)
def test_writer_can_send_to_console_stdout(self, capsys, standard_gwas_parser_basic):
    """Calling write() with no filename should stream the output (with header) to stdout."""
    source = readers.IterableReader(
        ['1\t100\tA\tC\t0.05', '2\t200\tA\tC\t5e-8'],
        parser=standard_gwas_parser_basic,
    )
    source.write()
    captured = capsys.readouterr()
    first_line = captured.out.splitlines()[0]
    assert first_line == '#chrom\tpos\trsid\tref\talt\tneg_log_pvalue\tbeta\tstderr_beta\talt_allele_freq'
def test_writer_needs_to_know_column_names(self, tmpdir):
    """A parser that yields bare tuples exposes no field names, so write() cannot build a header."""
    source = readers.IterableReader(
        ['1\t100\tA\tC\t0.05', '2\t200\tA\tC\t5e-8'],
        parser=lambda line: ('A', 'B'),
    )
    dest = tmpdir / 'test.txt'
    with pytest.raises(exceptions.ConfigurationException, match='column names'):
        source.write(dest)
def test_handles_lack_of_headers(self):
    """A file that starts directly with data rows has zero header lines and no header content."""
    source = readers.IterableReader(['X\t100', 'X\t101'], parser=None)
    count, header_text = sniffers.get_headers(source, delimiter='\t')
    assert count == 0, 'File has no header rows'
    assert header_text is None, 'No header row, so headers are blank'
def test_stops_header_search_after_limit(self):
    """The sniffer should give up once max_check lines have been examined without success."""
    source = readers.IterableReader(['walrus', 'carpenter'], parser=None)
    with pytest.raises(exceptions.SnifferException, match='after limit'):
        sniffers.get_headers(source, delimiter='\t', max_check=1)
def test_add_filter_validates_one_argument_syntax(self):
    """A single argument to add_filter must be either a callable or a field name string."""
    source = readers.IterableReader(["X\t1\tA\tG"])
    with pytest.raises(exceptions.ConfigurationException, match='function or a field name'):
        source.add_filter(42)
def test_can_find_headers(self):
    """The sniffer should count leading comment lines and return the last one as the header row."""
    source = readers.IterableReader(
        ["#Comment line", '#Header\tLabels', 'X\t100'],
        parser=None,
    )
    count, header_text = sniffers.get_headers(source, delimiter='\t')
    assert count == 2, 'Skipped two header rows'
    assert header_text == '#Header\tLabels', 'Found correct header row'