def test_sniffer_warns_if_cant_find_pval(self):
    """An rsid/marker-only fixture must raise SnifferException matching 'pvalue'."""
    header = ['rsid', 'marker']
    first_row = ['rs1234', '0.5']
    lines = _fixture_to_strings([header, first_row])

    with pytest.raises(exceptions.SnifferException, match='pvalue'):
        sniffers.guess_gwas_generic(lines)
def test_warns_if_file_lacks_required_fields(self):
    """An rsid/pval-only fixture must make the sniffer raise SnifferException."""
    header = ['rsid', 'pval']
    first_row = ['rs1234', '0.5']
    lines = _fixture_to_strings([header, first_row])

    with pytest.raises(exceptions.SnifferException):
        sniffers.guess_gwas_generic(lines)
def test_can_guess_rvtests(self):
    """Sniff a fixture named after rvtests and check the detected columns.

    Expects chrom/pos/ref/alt, p-value and beta columns to be found, and no
    stderr column. Positions are compared through the `h` helper.
    """
    header = [
        'CHROM', 'POS', 'REF', 'ALT', 'N_INFORMATIVE', 'AF',
        'INFORMATIVE_ALT_AC', 'CALL_RATE', 'HWE_PVALUE', 'N_REF', 'N_HET',
        'N_ALT', 'U_STAT', 'SQRT_V_STAT', 'ALT_EFFSIZE', 'PVALUE',
    ]
    first_row = [
        '1', '761893', 'G', 'T', '19292', '2.59624e-05:0.000655308:0',
        '1:1:0', '0.998289:0.996068:0.998381', '1:1:1', '19258:759:18499',
        '1:1:0', '0:0:0', '1.33113', '0.268484', '18.4664', '7.12493e-07',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 2, 'Found index of pos col'
    assert h(detected._ref_col) == 3, 'Found index of ref col'
    assert h(detected._alt_col) == 4, 'Found index of alt col'

    assert h(detected._pvalue_col) == 16, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 15, 'beta field detected'
    assert detected._stderr_col is None, 'No stderr_beta field detected'
def test_can_guess_a_mystery_format(self):
    """Sniff a fixture from an unidentified tool and check the detected columns.

    The header uses 'allele1'/'allele0'; the test pins allele1 as ref and
    allele0 as alt (see the FIXME below about that ordering).
    """
    # TODO: Identify the program that produced this layout and make the test more explicit.
    # FIXME: This test underscores how hard reliable ref/alt detection is: a1 comes before
    #   a0, but it might be more valid to switch the order of these columns. The meaning
    #   is left up to the user.
    header = [
        'chr', 'rs', 'ps', 'n_mis', 'n_obs', 'allele1', 'allele0', 'af',
        'beta', 'se', 'p_score',
    ]
    first_row = [
        '1', 'rs75333668', '762320', '0', '3610', 'T', 'C', '0.013',
        '-5.667138e-02', '1.027936e-01', '5.814536e-01',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 3, 'Found index of pos col'
    assert h(detected._ref_col) == 6, 'Found index of ref col'
    assert h(detected._alt_col) == 7, 'Found index of alt col'

    assert h(detected._pvalue_col) == 11, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 9, 'beta field detected'
    assert h(detected._stderr_col) == 10, 'stderr_beta field detected'
def test_can_guess_output_of_alisam_pipeline(self):
    """Sniff a fixture whose first column is a marker name and check the detected columns.

    The test pins the separate chr/pos/ref/alt columns (2-5), plus p-value,
    beta and stderr, as the ones the sniffer reports.
    """
    header = [
        'MarkerName', 'chr', 'pos', 'ref', 'alt', 'minor.allele', 'maf',
        'mac', 'n', 'pvalue', 'SNPID', 'BETA', 'SE', 'ALTFreq', 'SNPMarker',
    ]
    first_row = [
        'chr1-281876-AC-A', 'chr1', '281876', 'AC', 'A', 'alt',
        '0.231428578495979', '1053', '2275', '0.447865946615285',
        'rs72502741', '-0.0872936159370696', '0.115014743551501',
        '0.231428578495979', 'chr1:281876_AC/A',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 2, 'Found index of chr col'
    assert h(detected._pos_col) == 3, 'Found index of pos col'
    assert h(detected._ref_col) == 4, 'Found index of ref col'
    assert h(detected._alt_col) == 5, 'Found index of alt col'

    assert h(detected._pvalue_col) == 10, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 12, 'beta field detected'
    assert h(detected._stderr_col) == 13, 'stderr_beta field detected'
def test_can_provide_extra_options_for_parser(self):
    """A caller-supplied `allele_freq_col` option must end up on the guessed parser."""
    header = [
        '#chrom', 'pos', 'ref', 'alt', 'neg_log_pvalue', 'alt_allele_freq',
    ]
    # NOTE(review): this row carries seven values against six header names.
    first_row = ['1', '762320', 'C', 'T', '0.36947042857317597', '0.5', '0.1']
    lines = _fixture_to_strings([header, first_row])

    extra_options = {'allele_freq_col': 6}
    reader = sniffers.guess_gwas_generic(lines, parser_options=extra_options)

    # NOTE(review): the option is passed as 6 while the stored attribute is compared
    #   to 5 directly (no `h`) -- presumably 1-based input vs 0-based storage; confirm.
    assert reader._parser._allele_freq_col == 5, (
        'Sniffer used an option that it could not have auto-detected'
    )
def test_can_guess_lipidgenetics_glgc_format_with_help(self):
    """Sniff a hybrid marker/allele fixture when marker, ref and alt columns are supplied.

    The sniffer won't try to guess hybrid marker/allele formats (4 items in
    3 columns), but with explicit column options it can parse the file.
    """
    header = [
        'SNP_hg18', 'SNP_hg19', 'rsid', 'A1', 'A2', 'beta', 'se', 'N',
        'P-value', 'Freq.A1.1000G.EUR',
    ]
    first_row = [
        'chr10:10000135', 'chr10:9960129', 'rs4747841', 'g', 'a', '0.0026',
        '0.0048', '93561.00', '0.7538', '0.5092',
    ]
    lines = _fixture_to_strings([header, first_row])

    hints = {'marker_col': 2, 'ref_col': 4, 'alt_col': 5}
    detected = sniffers.guess_gwas_generic(lines, parser_options=hints)._parser

    assert h(detected._marker_col) == 2, 'Found index of marker col'
    assert detected._chrom_col is None, (
        'Prefers marker and does not try to guess chrom col'
    )
    assert h(detected._ref_col) == 4, 'Found index of ref col'

    assert h(detected._pvalue_col) == 9, 'Found index of pvalue col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 6, 'beta field detected'
    assert h(detected._stderr_col) == 7, 'stderr_beta field detected'
def test_can_guess_whatever_diagram_was_using(self):
    """Sniff a fixture with a combined 'Chr:Position' column and check the detected columns."""
    # FIXME: If this format turns out to be common, improve detection to fetch all four
    #   values, instead of just the two that the marker will provide.
    header = [
        'Chr:Position', 'Allele1', 'Allele2', 'Effect', 'StdErr', 'P-value',
        'TotalSampleSize',
    ]
    first_row = ['5:29439275', 'T', 'C', '-0.0003', '0.015', '0.99', '111309']
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._marker_col) == 1, 'Found index of marker col'

    assert h(detected._pvalue_col) == 6, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 4, 'beta field detected'
    assert h(detected._stderr_col) == 5, 'stderr_beta field detected'
def main(source: ty.Union[str, ty.Iterable],
         out_fn: ty.Union[str, None],
         parser_options: dict,
         auto_config=False,
         skip_rows=None,
         skip_errors=True,
         max_errors=100,
         make_tabix: bool = False):
    """Convert a GWAS file using explicit and/or auto-detected parser settings.

    Args:
        source: Input handed to ``sniffers.guess_gwas_generic``; ``None`` is
            replaced with ``sys.stdin``.
        out_fn: Destination passed to ``reader.write``. When ``write`` returns a
            falsy value, the success message reports ``'console'``.
        parser_options: Keyword arguments used to try building a
            ``GenericGwasLineParser``. If that raises ``ConfigurationException``,
            the options are forwarded to the sniffer instead.
        auto_config: When False, both ``skip_rows`` and a successfully built
            parser are required; otherwise an error is logged and the process
            exits with status 1.
        skip_rows: Forwarded to the sniffer.
        skip_errors: Forwarded to the sniffer.
        max_errors: Forwarded to the sniffer.
        make_tabix: Forwarded to ``reader.write``.

    Raises:
        SystemExit: If ``auto_config`` is False and the configuration is incomplete.
    """
    try:
        parser = parsers.GenericGwasLineParser(**parser_options)
    except exceptions.ConfigurationException:
        # The options alone do not describe a complete parser; the sniffer may fill gaps.
        parser = None

    if source is None:
        source = sys.stdin

    if not auto_config and (skip_rows is None or parser is None):
        logger.error(
            'Please provide all options required to parse the file, or use the --auto flag to guess'
        )
        sys.exit(1)

    # Guess how to read the file. If no parser was provided, try to guess columns.
    sniffer_kwargs = {
        'skip_rows': skip_rows,
        'parser': parser,
        'skip_errors': skip_errors,
        'max_errors': max_errors,
    }
    if parser is None:
        # The sniffer rejects `parser` together with `parser_options` as mutually
        # exclusive. A parser that built successfully already embodies the options,
        # so they are only forwarded when there is no parser.
        sniffer_kwargs['parser_options'] = parser_options
    reader = sniffers.guess_gwas_generic(source, **sniffer_kwargs)

    try:
        dest_fn = reader.write(out_fn, make_tabix=make_tabix) or 'console'
    except exceptions.TooManyBadLinesException:
        logger.error('ERROR: Too many lines failed to parse; stopping.')
    except Exception:
        # Top-level boundary: log the traceback instead of crashing the caller.
        logger.exception('Conversion failed due to unknown error')
    else:
        logger.info(
            'Conversion succeeded! Results written to: {}'.format(dest_fn))
    finally:
        # Report every row the reader recorded as an error, whatever the outcome.
        for n, reason, _ in reader.errors:
            logger.error(
                'Excluded row {} from output due to parse error: {}'.format(
                    n, reason))
def test_can_guess_standard_format(self):
    """Sniff the "standard format" fixture and check the detected columns.

    Tracks the "standard format" defined as a convenience parser. The p-value
    column must be flagged as -log10, and allele frequency must be left undetected.
    """
    header = [
        '#chrom', 'pos', 'ref', 'alt', 'neg_log_pvalue', 'beta',
        'stderr_beta', 'alt_allele_freq',
    ]
    first_row = [
        '1', '762320', 'C', 'T', '0.36947042857317597', '0.5', '0.1', '0.5',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 2, 'Found index of pos col'
    assert h(detected._ref_col) == 3, 'Found index of ref col'
    assert h(detected._alt_col) == 4, 'Found index of alt col'

    assert h(detected._pvalue_col) == 5, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is True, 'Determined whether is log'

    assert h(detected._beta_col) == 6, 'beta field detected'
    assert h(detected._stderr_col) == 7, 'stderr_beta field detected'

    assert detected._allele_freq_col is None, (
        'Sniffer does not try to detect allele freq'
    )
def test_can_guess_bolt_lmm(self):
    """Sniff a fixture named after BOLT-LMM and check marker, p-value, beta and stderr columns."""
    header = [
        'SNP', 'CHR', 'BP', 'A1', 'A0', 'MAF', 'HWEP', 'INFO', 'BETA', 'SE',
        'P',
    ]
    first_row = [
        '10:48698435_A_G', '10', '48698435', 'A', 'G', '0.01353', '0.02719',
        '0.960443', '0.0959329', '0.0941266', '3.3E-01',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._marker_col) == 1, 'Found index of marker col'

    assert h(detected._pvalue_col) == 11, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 9, 'beta field detected'
    assert h(detected._stderr_col) == 10, 'stderr_beta field detected'
def test_can_guess_plink(self):
    """Sniff a fixture named after PLINK association output and check the detected columns."""
    # Format: https://www.cog-genomics.org/plink2/formats
    # Sample: https://github.com/babelomics/babelomics/wiki/plink.assoc
    # h/t Josh Weinstock
    header = ['CHR', 'SNP', 'BP', 'A1', 'F_A', 'F_U', 'A2', 'CHISQ', 'P']
    # NOTE(review): this row carries ten values against nine header names.
    first_row = [
        '1', 'rs3094315', '742429', 'C', '0.1509', '0.1394', 'T', '0.0759',
        '0.782', '1.097',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of col'
    assert h(detected._pos_col) == 3, 'Found index of pos col'
    assert h(detected._ref_col) == 4, 'Found index of ref col'
    assert h(detected._alt_col) == 7, 'Found index of alt col'

    assert h(detected._pvalue_col) == 9, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert detected._beta_col is None, 'No beta field detected'
    assert detected._stderr_col is None, 'No stderr_beta field detected'
def test_can_guess_epacts(self):
    """Sniff a fixture named after EPACTS: marker and p-value found, no beta or stderr."""
    header = [
        '#CHROM', 'BEGIN', 'END', 'MARKER_ID', 'NS', 'AC', 'CALLRATE', 'MAF',
        'PVALUE', 'SCORE', 'N.CASE', 'N.CTRL', 'AF.CASE', 'AF.CTRL',
    ]
    first_row = [
        '20', '1610894', '1610894', '20:1610894_G/A_Synonymous:SIRPG', '266',
        '138.64', '1', '0.26061', '6.9939e-05', '3.9765', '145', '121',
        '0.65177', '0.36476',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._marker_col) == 4, 'Found index of marker col'

    assert h(detected._pvalue_col) == 9, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert detected._beta_col is None, 'No beta field detected'
    assert detected._stderr_col is None, 'No stderr_beta field detected'
def test_can_guess_emmax_epacts(self):
    """Fileformat sample provided by multiple tools.

    Checks that marker, p-value, beta and stderr columns are all detected.
    """
    header = [
        '#CHROM', 'BEG', 'END', 'MARKER_ID', 'NS', 'AC', 'CALLRATE',
        'GENOCNT', 'MAF', 'STAT', 'PVALUE', 'BETA', 'SEBETA', 'R2',
    ]
    first_row = [
        '1', '762320', '762320', '1:762320_C/T_rs75333668', '3805', '100.00',
        '1.00000', '3707/96/2', '0.01314', '0.7942', '0.4271', '0.08034',
        '0.1012', '0.0001658',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._marker_col) == 4, 'Found index of marker col'

    assert h(detected._pvalue_col) == 11, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 12, 'beta field detected'
    assert h(detected._stderr_col) == 13, 'stderr_beta field detected'
def test_can_guess_saige(self):
    """Sniff a fixture named after SAIGE and check marker, p-value, beta and stderr columns."""
    header = [
        'CHR', 'POS', 'SNPID', 'Allele1', 'Allele2', 'AC_Allele2',
        'AF_Allele2', 'N', 'BETA', 'SE', 'Tstat', 'p.value', 'p.value.NA',
        'Is.SPA.converge', 'varT', 'varTstar',
    ]
    first_row = [
        'chr1', '76792', 'chr1:76792:A:C', 'A', 'C', '57',
        '0.00168639048933983', '16900', '0.573681678183941',
        '0.663806747906141', '1.30193005902619', '0.387461577915637',
        '0.387461577915637', '1', '2.2694293866027', '2.41152256615949',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._marker_col) == 3, 'Found index of marker col'

    assert h(detected._pvalue_col) == 12, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 9, 'beta field detected'
    assert h(detected._stderr_col) == 10, 'stderr_beta field detected'
def test_can_guess_raremetalworker(self):
    """Sniff a fixture named after RAREMETALWORKER and check the detected columns.

    The sample row holds 'NA' in its last four fields, including the p-value
    and effect-size columns that the test still expects to be located.
    """
    header = [
        '#CHROM', 'POS', 'REF', 'ALT', 'N_INFORMATIVE', 'FOUNDER_AF',
        'ALL_AF', 'INFORMATIVE_ALT_AC', 'CALL_RATE', 'HWE_PVALUE', 'N_REF',
        'N_HET', 'N_ALT', 'U_STAT', 'SQRT_V_STAT', 'ALT_EFFSIZE', 'PVALUE',
    ]
    first_row = [
        '9', '400066155', 'T', 'C', '432', '0', '0', '0', '1', '1', '432',
        '0', '0', 'NA', 'NA', 'NA', 'NA',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 2, 'Found index of pos col'
    assert h(detected._ref_col) == 3, 'Found index of ref col'
    assert h(detected._alt_col) == 4, 'Found index of alt col'

    assert h(detected._pvalue_col) == 17, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 16, 'beta field detected'
    assert detected._stderr_col is None, 'No stderr_beta field detected'
def test_can_guess_raremetal(self):
    """Sniff a fixture named after RAREMETAL and check the detected columns."""
    header = [
        '#CHROM', 'POS', 'REF', 'ALT', 'N', 'POOLED_ALT_AF',
        'DIRECTION_BY_STUDY', 'EFFECT_SIZE', 'EFFECT_SIZE_SD', 'H2', 'PVALUE',
    ]
    # Kept as one literal so the fixture value stays exactly as recorded.
    direction_by_study = '?-????????????????-????+???????????????????????????????????????????????????????????????????-????????????????????????????????????????????????????????????????????????????????'  # noqa: E501
    first_row = [
        '1', '10177', 'A', 'AC', '491984', '0.00511094', direction_by_study,
        '-0.0257947', '0.028959', '1.61266e-06', '0.373073',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 2, 'Found index of pos col'
    assert h(detected._ref_col) == 3, 'Found index of ref col'
    assert h(detected._alt_col) == 4, 'Found index of alt col'

    assert h(detected._pvalue_col) == 11, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'

    assert h(detected._beta_col) == 8, 'Beta field detected'
    assert h(detected._stderr_col) == 9, 'stderr_beta field detected'
def test_can_guess_gwas_catalog_mostly(self):
    """Sniff a GWAS-catalog-style fixture; ref/alt must be left undetected."""
    header = [
        'chromosome', 'base_pair_location', 'effect_allele', 'other_allele',
        'odds_ratio', 'ci_lower', 'ci_upper', 'standard_error', 'p_value',
    ]
    first_row = [
        '1', '1108138', 'A', 'G', '1.081', '0.8822', '1.325', '0.1038',
        '0.4517',
    ]
    lines = _fixture_to_strings([header, first_row])

    detected = sniffers.guess_gwas_generic(lines)._parser

    assert h(detected._chrom_col) == 1, 'Found index of chr col'
    assert h(detected._pos_col) == 2, 'Found index of pos col'

    # The EBI GWAS catalog uses "effect" and "non_effect", whose meaning varies from one
    # analysis to another. Users must decide for themselves how to handle the reference
    # genome.
    assert detected._ref_col is None, (
        'Did NOT identify ref col, b/c GWAS catalog uses ambiguous "effect"'
    )
    assert detected._alt_col is None, (
        'Did NOT identify alt col, b/c GWAS catalog uses ambiguous "effect"'
    )

    assert h(detected._stderr_col) == 8, 'stderr_beta field detected'

    assert h(detected._pvalue_col) == 9, 'Found index of pval col'
    assert detected._is_neg_log_pvalue is False, 'Determined whether is log'
def test_sniffer_validates_options(self):
    """Passing both `parser` and `parser_options` must raise a 'exclusive' config error."""
    rows = ['1', '2']
    conflicting_options = {'option': 1}

    expected_failure = pytest.raises(
        exceptions.ConfigurationException, match='exclusive')
    with expected_failure:
        # The parser is built inside the block, as in the original call expression.
        sniffers.guess_gwas_generic(
            rows,
            parser=parsers.TupleLineParser(),
            parser_options=conflicting_options,
        )