Beispiel #1
0
 def test_sniffer_warns_if_cant_find_pval(self):
     """Sniffing must raise a SnifferException mentioning 'pvalue' for this header."""
     header = ['rsid', 'marker']
     row = ['rs1234', '0.5']
     lines = _fixture_to_strings([header, row])

     expected_failure = pytest.raises(exceptions.SnifferException,
                                      match='pvalue')
     with expected_failure:
         sniffers.guess_gwas_generic(lines)
Beispiel #2
0
 def test_warns_if_file_lacks_required_fields(self):
     """Sniffing a file that only offers 'rsid' and 'pval' columns must raise a SnifferException."""
     header = ['rsid', 'pval']
     row = ['rs1234', '0.5']
     lines = _fixture_to_strings([header, row])

     expected_failure = pytest.raises(exceptions.SnifferException)
     with expected_failure:
         sniffers.guess_gwas_generic(lines)
Beispiel #3
0
    def test_can_guess_rvtests(self):
        """Sniff an rvtests-style header and verify each column the sniffer picks.

        A beta column is expected; no stderr column is expected.
        """
        header = [
            'CHROM', 'POS', 'REF', 'ALT', 'N_INFORMATIVE', 'AF',
            'INFORMATIVE_ALT_AC', 'CALL_RATE', 'HWE_PVALUE', 'N_REF',
            'N_HET', 'N_ALT', 'U_STAT', 'SQRT_V_STAT', 'ALT_EFFSIZE',
            'PVALUE',
        ]
        row = [
            '1', '761893', 'G', 'T', '19292', '2.59624e-05:0.000655308:0',
            '1:1:0', '0.998289:0.996068:0.998381', '1:1:1',
            '19258:759:18499', '1:1:0', '0:0:0', '1.33113', '0.268484',
            '18.4664', '7.12493e-07',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 2, 'Found index of pos col'
        assert h(parser._ref_col) == 3, 'Found index of ref col'
        assert h(parser._alt_col) == 4, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 16, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 15, 'beta field detected'
        assert parser._stderr_col is None, 'No stderr_beta field detected'
Beispiel #4
0
    def test_can_guess_a_mystery_format(self):
        """Sniff a file produced by an unidentified program and verify each detected column."""
        # TODO: Identify the program used and make test more explicit
        # FIXME: This fixture shows how hard reliable ref/alt detection is: a1 comes before a0, but it
        #   might be more valid to switch the order of these columns. Leave meaning up to the user.
        header = [
            'chr', 'rs', 'ps', 'n_mis', 'n_obs', 'allele1', 'allele0',
            'af', 'beta', 'se', 'p_score',
        ]
        row = [
            '1', 'rs75333668', '762320', '0', '3610', 'T', 'C', '0.013',
            '-5.667138e-02', '1.027936e-01', '5.814536e-01',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 3, 'Found index of pos col'
        assert h(parser._ref_col) == 6, 'Found index of ref col'
        assert h(parser._alt_col) == 7, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 11, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 9, 'beta field detected'
        assert h(parser._stderr_col) == 10, 'stderr_beta field detected'
Beispiel #5
0
    def test_can_guess_output_of_alisam_pipeline(self):
        """Sniff the header used by this pipeline's output and verify each detected column."""
        header = [
            'MarkerName', 'chr', 'pos', 'ref', 'alt', 'minor.allele',
            'maf', 'mac', 'n', 'pvalue', 'SNPID', 'BETA', 'SE', 'ALTFreq',
            'SNPMarker',
        ]
        row = [
            'chr1-281876-AC-A', 'chr1', '281876', 'AC', 'A', 'alt',
            '0.231428578495979', '1053', '2275', '0.447865946615285',
            'rs72502741', '-0.0872936159370696', '0.115014743551501',
            '0.231428578495979', 'chr1:281876_AC/A',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 2, 'Found index of chr col'
        assert h(parser._pos_col) == 3, 'Found index of pos col'
        assert h(parser._ref_col) == 4, 'Found index of ref col'
        assert h(parser._alt_col) == 5, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 10, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 12, 'beta field detected'
        assert h(parser._stderr_col) == 13, 'stderr_beta field detected'
Beispiel #6
0
 def test_can_provide_extra_options_for_parser(self):
     """Explicit `parser_options` are applied to the parser built by the sniffer."""
     header = [
         '#chrom', 'pos', 'ref', 'alt', 'neg_log_pvalue', 'alt_allele_freq'
     ]
     # NOTE(review): this row carries seven values against six header names -- confirm that is intended
     row = ['1', '762320', 'C', 'T', '0.36947042857317597', '0.5', '0.1']
     lines = _fixture_to_strings([header, row])

     reader = sniffers.guess_gwas_generic(
         lines, parser_options={'allele_freq_col': 6})

     # The option is supplied as 6; the attribute stored on the parser is expected to be 5
     stored_col = reader._parser._allele_freq_col
     assert stored_col == 5, 'Sniffer used an option that it could not have auto-detected'
Beispiel #7
0
    def test_can_guess_lipidgenetics_glgc_format_with_help(self):
        """Sniff a GLGC-style header when marker/ref/alt columns are supplied as options."""
        # Hybrid marker/allele formats (4 items in 3 columns) are not something the sniffer tries to guess,
        #   but with help it can parse one
        header = [
            'SNP_hg18', 'SNP_hg19', 'rsid', 'A1', 'A2', 'beta', 'se', 'N',
            'P-value', 'Freq.A1.1000G.EUR',
        ]
        row = [
            'chr10:10000135', 'chr10:9960129', 'rs4747841', 'g', 'a',
            '0.0026', '0.0048', '93561.00', '0.7538', '0.5092',
        ]
        hints = {'marker_col': 2, 'ref_col': 4, 'alt_col': 5}
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]), parser_options=hints)
        parser = reader._parser

        # Columns that were provided as hints
        assert h(parser._marker_col) == 2, 'Found index of marker col'
        assert parser._chrom_col is None, 'Prefers marker and does not try to guess chrom col'
        assert h(parser._ref_col) == 4, 'Found index of ref col'

        # Columns the sniffer had to find on its own
        assert h(parser._pvalue_col) == 9, 'Found index of pvalue col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        assert h(parser._beta_col) == 6, 'beta field detected'
        assert h(parser._stderr_col) == 7, 'stderr_beta field detected'
Beispiel #8
0
    def test_can_guess_whatever_diagram_was_using(self):
        """Sniff a header whose first column is 'Chr:Position' and verify the detected columns."""
        # FIXME: Should this format prove common, improve the sniffer to fetch all four values rather than
        #   only the two that the marker will provide
        header = [
            'Chr:Position', 'Allele1', 'Allele2', 'Effect', 'StdErr',
            'P-value', 'TotalSampleSize',
        ]
        row = ['5:29439275', 'T', 'C', '-0.0003', '0.015', '0.99', '111309']
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._marker_col) == 1, 'Found index of marker col'
        assert h(parser._pvalue_col) == 6, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 4, 'beta field detected'
        assert h(parser._stderr_col) == 5, 'stderr_beta field detected'
Beispiel #9
0
def main(source: ty.Union[str, ty.Iterable],
         out_fn: ty.Union[str, None],
         parser_options: dict,
         auto_config=False,
         skip_rows=None,
         skip_errors=True,
         max_errors=100,
         make_tabix: bool = False):
    """Convert a GWAS file using explicit parser options and/or auto-detection.

    Args:
        source: Input handed to the sniffer; ``sys.stdin`` is used when None.
        out_fn: Destination passed to ``reader.write``.
        parser_options: Keyword arguments for ``GenericGwasLineParser``.
        auto_config: When true, missing configuration may be guessed by the
            sniffer instead of being treated as an error.
        skip_rows: Forwarded to the sniffer; must be given unless
            ``auto_config`` is set.
        skip_errors: Forwarded to the sniffer.
        max_errors: Forwarded to the sniffer.
        make_tabix: Forwarded to ``reader.write``.

    Exits with status 1 when ``auto_config`` is off and either ``skip_rows``
    is None or no parser could be built from ``parser_options``. Failures
    while writing are logged rather than raised, and every entry in
    ``reader.errors`` is logged afterwards regardless of the outcome.
    """
    try:
        parser = parsers.GenericGwasLineParser(**parser_options)
    except exceptions.ConfigurationException:
        # The options alone are not a valid configuration; the sniffer may still fill the gaps.
        parser = None

    if source is None:
        source = sys.stdin

    if not auto_config and (skip_rows is None or parser is None):
        logger.error(
            'Please provide all options required to parse the file, or use the --auto flag to guess'
        )
        sys.exit(1)

    # The sniffer treats a parser instance and parser options as mutually exclusive. Once a parser has
    #   been built, the options are already part of it, so they are only forwarded when there is no parser.
    sniffer_config = {'parser': parser}
    if parser is None:
        sniffer_config['parser_options'] = parser_options

    # Guess how to read the file. If no parser was provided, try to guess columns.
    reader = sniffers.guess_gwas_generic(source,
                                         skip_rows=skip_rows,
                                         skip_errors=skip_errors,
                                         max_errors=max_errors,
                                         **sniffer_config)

    try:
        # A falsy return value from write() is reported as output to the console.
        dest_fn = reader.write(out_fn, make_tabix=make_tabix) or 'console'
    except exceptions.TooManyBadLinesException:
        logger.error('ERROR: Too many lines failed to parse; stopping.')
    except Exception:
        logger.exception('Conversion failed due to unknown error')
    else:
        logger.info('Conversion succeeded! Results written to: %s', dest_fn)
    finally:
        # Report the rows that were skipped, whether or not the conversion as a whole succeeded.
        for n, reason, _ in reader.errors:
            logger.error(
                'Excluded row %s from output due to parse error: %s', n,
                reason)
Beispiel #10
0
    def test_can_guess_standard_format(self):
        """Sniff the "standard format" header and verify each detected column.

        The p-value is expected to be flagged as -log; allele frequency is expected to stay undetected.
        """
        # Mirrors the "standard format" that is defined as a convenience parser
        header = [
            '#chrom', 'pos', 'ref', 'alt', 'neg_log_pvalue', 'beta',
            'stderr_beta', 'alt_allele_freq',
        ]
        row = [
            '1', '762320', 'C', 'T', '0.36947042857317597', '0.5', '0.1', '0.5',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 2, 'Found index of pos col'
        assert h(parser._ref_col) == 3, 'Found index of ref col'
        assert h(parser._alt_col) == 4, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 5, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is True, 'Determined whether is log'

        # Effect-size and frequency columns
        assert h(parser._beta_col) == 6, 'beta field detected'
        assert h(parser._stderr_col) == 7, 'stderr_beta field detected'
        assert parser._allele_freq_col is None, 'Sniffer does not try to detect allele freq'
Beispiel #11
0
    def test_can_guess_bolt_lmm(self):
        """Sniff a BOLT-LMM-style header and verify the marker, p-value and effect columns."""
        header = [
            'SNP', 'CHR', 'BP', 'A1', 'A0', 'MAF', 'HWEP', 'INFO', 'BETA',
            'SE', 'P',
        ]
        row = [
            '10:48698435_A_G', '10', '48698435', 'A', 'G', '0.01353',
            '0.02719', '0.960443', '0.0959329', '0.0941266', '3.3E-01',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._marker_col) == 1, 'Found index of marker col'
        assert h(parser._pvalue_col) == 11, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 9, 'beta field detected'
        assert h(parser._stderr_col) == 10, 'stderr_beta field detected'
Beispiel #12
0
    def test_can_guess_plink(self):
        """Sniff a PLINK association header and verify each detected column.

        Neither a beta nor a stderr column is expected to be found.
        """
        # Format: https://www.cog-genomics.org/plink2/formats
        # Sample: https://github.com/babelomics/babelomics/wiki/plink.assoc
        # h/t Josh Weinstock
        # NOTE(review): the data row has one more value than the header has names (presumably the
        #   sample's trailing OR column) -- confirm whether the header should list it.
        data = _fixture_to_strings(
            [['CHR', 'SNP', 'BP', 'A1', 'F_A', 'F_U', 'A2', 'CHISQ', 'P'],
             [
                 '1', 'rs3094315', '742429', 'C', '0.1509', '0.1394', 'T',
                 '0.0759', '0.782', '1.097'
             ]])
        actual = sniffers.guess_gwas_generic(data)
        parser = actual._parser

        # Variant-identifying columns. The first message previously omitted the column name.
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 3, 'Found index of pos col'
        assert h(parser._ref_col) == 4, 'Found index of ref col'
        assert h(parser._alt_col) == 7, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 9, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert parser._beta_col is None, 'No beta field detected'
        assert parser._stderr_col is None, 'No stderr_beta field detected'
Beispiel #13
0
    def test_can_guess_epacts(self):
        """Sniff an EPACTS-style header and verify the marker and p-value columns.

        Neither a beta nor a stderr column is expected to be found.
        """
        header = [
            '#CHROM', 'BEGIN', 'END', 'MARKER_ID', 'NS', 'AC', 'CALLRATE',
            'MAF', 'PVALUE', 'SCORE', 'N.CASE', 'N.CTRL', 'AF.CASE',
            'AF.CTRL',
        ]
        row = [
            '20', '1610894', '1610894', '20:1610894_G/A_Synonymous:SIRPG',
            '266', '138.64', '1', '0.26061', '6.9939e-05', '3.9765', '145',
            '121', '0.65177', '0.36476',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._marker_col) == 4, 'Found index of marker col'
        assert h(parser._pvalue_col) == 9, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert parser._beta_col is None, 'No beta field detected'
        assert parser._stderr_col is None, 'No stderr_beta field detected'
Beispiel #14
0
    def test_can_guess_emmax_epacts(self):
        """Sniff a file format whose sample is provided by multiple tools."""
        header = [
            '#CHROM', 'BEG', 'END', 'MARKER_ID', 'NS', 'AC', 'CALLRATE',
            'GENOCNT', 'MAF', 'STAT', 'PVALUE', 'BETA', 'SEBETA', 'R2',
        ]
        row = [
            '1', '762320', '762320', '1:762320_C/T_rs75333668', '3805',
            '100.00', '1.00000', '3707/96/2', '0.01314', '0.7942',
            '0.4271', '0.08034', '0.1012', '0.0001658',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._marker_col) == 4, 'Found index of marker col'
        assert h(parser._pvalue_col) == 11, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 12, 'beta field detected'
        assert h(parser._stderr_col) == 13, 'stderr_beta field detected'
Beispiel #15
0
    def test_can_guess_saige(self):
        """Sniff a SAIGE-style header and verify the marker, p-value and effect columns."""
        header = [
            'CHR', 'POS', 'SNPID', 'Allele1', 'Allele2', 'AC_Allele2',
            'AF_Allele2', 'N', 'BETA', 'SE', 'Tstat', 'p.value',
            'p.value.NA', 'Is.SPA.converge', 'varT', 'varTstar',
        ]
        row = [
            'chr1', '76792', 'chr1:76792:A:C', 'A', 'C', '57',
            '0.00168639048933983', '16900', '0.573681678183941',
            '0.663806747906141', '1.30193005902619', '0.387461577915637',
            '0.387461577915637', '1', '2.2694293866027', '2.41152256615949',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._marker_col) == 3, 'Found index of marker col'
        assert h(parser._pvalue_col) == 12, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 9, 'beta field detected'
        assert h(parser._stderr_col) == 10, 'stderr_beta field detected'
Beispiel #16
0
    def test_can_guess_raremetalworker(self):
        """Sniff a RAREMETALWORKER-style header; the sample row holds 'NA' in its last four fields.

        A beta column is expected; no stderr column is expected.
        """
        header = [
            '#CHROM', 'POS', 'REF', 'ALT', 'N_INFORMATIVE', 'FOUNDER_AF',
            'ALL_AF', 'INFORMATIVE_ALT_AC', 'CALL_RATE', 'HWE_PVALUE',
            'N_REF', 'N_HET', 'N_ALT', 'U_STAT', 'SQRT_V_STAT',
            'ALT_EFFSIZE', 'PVALUE',
        ]
        row = [
            '9', '400066155', 'T', 'C', '432', '0', '0', '0', '1', '1',
            '432', '0', '0', 'NA', 'NA', 'NA', 'NA',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 2, 'Found index of pos col'
        assert h(parser._ref_col) == 3, 'Found index of ref col'
        assert h(parser._alt_col) == 4, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 17, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 16, 'beta field detected'
        assert parser._stderr_col is None, 'No stderr_beta field detected'
Beispiel #17
0
    def test_can_guess_raremetal(self):
        """Sniff a RAREMETAL-style header and verify each detected column."""
        header = [
            '#CHROM', 'POS', 'REF', 'ALT', 'N', 'POOLED_ALT_AF',
            'DIRECTION_BY_STUDY', 'EFFECT_SIZE', 'EFFECT_SIZE_SD', 'H2',
            'PVALUE',
        ]
        row = [
            '1', '10177', 'A', 'AC', '491984', '0.00511094',
            '?-????????????????-????+???????????????????????????????????????????????????????????????????-????????????????????????????????????????????????????????????????????????????????',  # noqa: E501
            '-0.0257947', '0.028959', '1.61266e-06', '0.373073',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        # Variant-identifying columns
        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 2, 'Found index of pos col'
        assert h(parser._ref_col) == 3, 'Found index of ref col'
        assert h(parser._alt_col) == 4, 'Found index of alt col'

        # P-value column and its scale
        assert h(parser._pvalue_col) == 11, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'

        # Effect-size columns
        assert h(parser._beta_col) == 8, 'Beta field detected'
        assert h(parser._stderr_col) == 9, 'stderr_beta field detected'
Beispiel #18
0
    def test_can_guess_gwas_catalog_mostly(self):
        """Sniff a GWAS-catalog-style header; ref and alt are expected to stay undetected."""
        header = [
            'chromosome', 'base_pair_location', 'effect_allele',
            'other_allele', 'odds_ratio', 'ci_lower', 'ci_upper',
            'standard_error', 'p_value',
        ]
        row = [
            '1', '1108138', 'A', 'G', '1.081', '0.8822', '1.325', '0.1038',
            '0.4517',
        ]
        reader = sniffers.guess_gwas_generic(
            _fixture_to_strings([header, row]))
        parser = reader._parser

        assert h(parser._chrom_col) == 1, 'Found index of chr col'
        assert h(parser._pos_col) == 2, 'Found index of pos col'

        # "effect" and "non_effect" are what the EBI GWAS catalog uses, and their meaning varies from one
        #   analysis to another. Handling of the reference genome is a decision left to the user.
        assert parser._ref_col is None, 'Did NOT identify ref col, b/c GWAS catalog uses ambiguous "effect"'
        assert parser._alt_col is None, 'Did NOT identify alt col, b/c GWAS catalog uses ambiguous "effect"'

        assert h(parser._stderr_col) == 8, 'stderr_beta field detected'
        assert h(parser._pvalue_col) == 9, 'Found index of pval col'
        assert parser._is_neg_log_pvalue is False, 'Determined whether is log'
Beispiel #19
0
 def test_sniffer_validates_options(self):
     """Passing both a parser instance and parser options must raise a ConfigurationException ('exclusive')."""
     expect_conflict = pytest.raises(exceptions.ConfigurationException,
                                     match='exclusive')
     with expect_conflict:
         conflicting = {
             'parser': parsers.TupleLineParser(),
             'parser_options': {'option': 1},
         }
         sniffers.guess_gwas_generic(['1', '2'], **conflicting)