예제 #1
0
def test_process_windowcalc_s_snps():
    """Check which rows ``process_windowcalc`` keeps with default settings.

    Nine window rows are fed in; only the rows for msp_110, msp_117 and
    msp_118 are expected back, each tagged with a haplotype and an msp_ID
    that combines the haplotype with the chromosome.
    """
    header = (
        'chrom\twinstart\twinend\tn_region_ind_snps\tind_id\tpop'
        '\ts_star\tnum_s_star_snps\tn_s_star_snps_hap1\tn_s_star_snps_hap2\n')
    rows = (
        '363104291\t0\t50000\t179\tmsp_110\tEUR\t85315\t8\t2\t8\n'
        '363104291\t0\t50000\t173\tmsp_111\tEUR\t47145\t2\t2\t2\n'
        '363104291\t0\t50000\t174\tmsp_112\tASN\t22028\t4\t2\t2\n'
        '363104291\t0\t50000\t180\tmsp_113\tEUR\t82882\t12\t7\t4\n'
        '363104291\t0\t50000\t184\tmsp_114\tEUR\t111004\t15\t8\t7\n'
        '363104291\t0\t50000\t174\tmsp_116\tEUR\t50976\t15\t4\t1\n'
        '363104291\t0\t50000\t176\tmsp_117\tASN\t70315\t7\t7\t0\n'
        '363104292\t0\t50000\t175\tmsp_118\tEUR\t67973\t3\t0\t3\n'
        '363104291\t0\t50000\t171\tmsp_115\tASN\t0\t0\t0\t0\n')
    windows = Sstar_ECDF.process_windowcalc(StringIO(header + rows))

    # msp_116 no longer passes because its num_s_star_snps is too high.
    # msp_111 should not pass either: it matches both haplotypes
    # (high overlap).
    aae(windows.columns, [
        'start', 'end', 'n_region_ind_snps', 'pop', 's_star', 'haplotype',
        'msp_ID'
    ])

    # Expected per-column values for the three surviving rows.
    expected_by_column = {
        'start': [0, 0, 0],
        'end': [50000, 50000, 50000],
        'n_region_ind_snps': [179, 176, 175],
        'pop': ['EUR', 'ASN', 'EUR'],
        's_star': [85315, 70315, 67973],
        'haplotype': ['msp_110:2', 'msp_117:1', 'msp_118:2'],
        'msp_ID': [
            'msp_110:2_363104291', 'msp_117:1_363104291',
            'msp_118:2_363104292'
        ],
    }
    for column, expected in expected_by_column.items():
        aae(windows[column].values, expected)
예제 #2
0
def test_filter_by_match():
    """Check ``filter_by_match`` at two p-value thresholds.

    The window table is produced by ``process_windowcalc`` (called with
    0.5 as its second argument) and then filtered against a two-row match
    table. At threshold 0.02 one row is expected; at 0.01 none.
    """
    window_text = (
        'chrom\twinstart\twinend\tn_region_ind_snps\tind_id\tpop'
        '\ts_star\tnum_s_star_snps\tn_s_star_snps_hap1\tn_s_star_snps_hap2\n'
        '363104291\t0\t50000\t179\tmsp_110\tEUR\t85315\t10\t2\t8\n'
        '363104291\t0\t50000\t173\tmsp_111\tEUR\t47145\t4\t2\t2\n'
        '363104291\t0\t50000\t174\tmsp_112\tASN\t22028\t4\t2\t2\n'
        '363104291\t0\t50000\t180\tmsp_113\tEUR\t82882\t12\t7\t4\n'
        '363104291\t0\t50000\t184\tmsp_114\tEUR\t111004\t15\t8\t7\n'
        '363104291\t0\t50000\t174\tmsp_116\tEUR\t50976\t5\t4\t1\n'
        '363104291\t0\t50000\t176\tmsp_117\tASN\t70315\t7\t7\t0\n'
        '363104292\t0\t50000\t175\tmsp_118\tEUR\t67973\t3\t0\t3\n'
        '363104291\t0\t50000\t171\tmsp_115\tASN\t0\t0\t0\t0\n')
    windows = Sstar_ECDF.process_windowcalc(StringIO(window_text), 0.5)

    match_text = (
        'chr\tstart\tend\tisc\thaplotype\tpopulation'
        '\tmatch_pct\tpvalue\tmatching_windows\n'
        '363104291\t0\t50000\t199\tmsp_110:1\tEUR\t0.8\t0.01\t2217\n'
        '363104291\t0\t50000\t199\tmsp_110:2\tEUR\t0.8\t0.02\t2217\n')

    # Threshold 0.02: the msp_110:2 row is expected to survive.
    single_hit = pd.DataFrame({
        'start': [0],
        'end': [50000],
        'n_region_ind_snps': [179],
        'pop': ['EUR'],
        's_star': [85315],
        'msp_ID': ['msp_110:2_363104291'],
    })
    filtered = Sstar_ECDF.filter_by_match(windows, StringIO(match_text), 0.02)
    afe(filtered, single_hit)

    # Threshold 0.01: nothing is expected back -- presumably because the
    # msp_110:2 match has pvalue 0.02, which is above this threshold.
    output_columns = [
        'start', 'end', 'n_region_ind_snps', 'pop', 's_star', 'msp_ID'
    ]
    no_hits = pd.DataFrame({column: [] for column in output_columns})
    filtered = Sstar_ECDF.filter_by_match(windows, StringIO(match_text), 0.01)
    # An empty frame carries no meaningful dtypes or index type, so both
    # checks are relaxed.
    afe(filtered, no_hits, check_index_type=False, check_dtype=False)
예제 #3
0
def test_get_chromosomes(mocker):
    """Check ``get_chromosomes`` against a patched ``Sstar_ECDF.open``.

    Args:
        mocker: pytest-mock fixture used to replace ``Sstar_ECDF.open`` with
            a mock returning an in-memory buffer.
    """
    listing = StringIO(
        '1063938750\n'
        '1222356006\n'
        '1634154403\n')
    opener = mocker.patch('Sstar_ECDF.open', return_value=listing)

    chromosomes = list(Sstar_ECDF.get_chromosomes('mock'))

    # The file must be opened exactly once, in read mode.
    opener.assert_called_once_with('mock', 'r')
    assert chromosomes == ['1063938750', '1222356006', '1634154403']

    opener.reset_mock()

    # An empty file yields no chromosomes.
    opener = mocker.patch('Sstar_ECDF.open', return_value=StringIO(''))
    assert list(Sstar_ECDF.get_chromosomes('mock_empty')) == []

    opener.assert_called_once_with('mock_empty', 'r')
예제 #4
0
def test_null_db_combine(null_db):
    """Check ``Null_DB.read_windowcalc`` counts and ``Null_DB.combine``.

    A first database is filled from nine window rows, a second from two
    rows, and the second is then combined into the first. The expected
    series are indexed by (pop, n_region_ind_snps, s_star).

    Args:
        null_db: an empty ``Sstar_ECDF.Null_DB`` supplied by the caller
            (the ``null_db`` function in this file).
    """
    header = ('chrom\twinstart\twinend\tn_snps\tn_ind_snps\t'
              'n_region_ind_snps\tind_id\tpop\ts_star\n')
    index_names = ['pop', 'n_region_ind_snps', 's_star']

    # Rows are written space-separated for readability and converted to
    # tabs in one pass; the header contains no spaces so it is left alone.
    first_rows = (
        '1 0 50000 333 4 161 msp_110 EUR 25211\n'
        '1 0 50000 333 4 162 msp_111 EUR 25211\n'
        '1 0 50000 333 4 162 msp_112 EUR 25212\n'
        '1 0 50000 333 4 162 msp_113 EUR 25212\n'
        '1 0 50000 333 4 162 msp_114 EUR 25212\n'
        '1 0 50000 333 4 162 msp_115 EUR 25212\n'
        '1 0 50000 333 4 162 msp_116 EUR 25212\n'
        '1 0 50000 333 4 162 msp_116 ASN 25212\n'
        # The s_star == 0 row has no entry in the expected index below.
        '1 0 50000 333 4 162 msp_116 EUR 0\n'
    ).replace(' ', '\t')
    null_db.read_windowcalc(StringIO(header + first_rows))

    full_index = pd.MultiIndex.from_tuples(
        [
            ('ASN', 162, 25212),
            ('EUR', 161, 25211),
            ('EUR', 162, 25211),
            ('EUR', 162, 25212),
        ],
        names=index_names)
    ase(null_db.DB, pd.Series([1, 1, 1, 5], index=full_index))

    second_rows = (
        '1 0 50000 333 4 161 msp_110 EUR 25211\n'
        '1 0 50000 333 4 162 msp_111 EUR 25211\n'
    ).replace(' ', '\t')
    null_db2 = Sstar_ECDF.Null_DB()
    null_db2.read_windowcalc(StringIO(header + second_rows))

    small_index = pd.MultiIndex.from_tuples(
        [
            ('EUR', 161, 25211),
            ('EUR', 162, 25211),
        ],
        names=index_names)
    ase(null_db2.DB, pd.Series([1, 1], index=small_index))

    # Combining adds the second database's counts onto the matching keys of
    # the first; keys absent from the second keep their counts.
    null_db.combine(null_db2)
    ase(null_db.DB, pd.Series([1, 2, 2, 5], index=full_index))
예제 #5
0
def test_region_files():
    """Check the paths produced by ``region_files``.

    Covers the default file pattern, an explicit ``{chrom}`` pattern with an
    empty base directory, and an empty chromosome list.
    """
    # Default pattern: <base>/<chrom>.windowcalc.gz
    default_paths = list(Sstar_ECDF.region_files('base', 'a b c'.split()))
    assert default_paths == [
        os.path.join('base', 'a.windowcalc.gz'),
        os.path.join('base', 'b.windowcalc.gz'),
        os.path.join('base', 'c.windowcalc.gz'),
    ]

    # Custom pattern with an empty base directory.
    custom_paths = list(
        Sstar_ECDF.region_files('', '1 2 3'.split(), '{chrom}.txt'))
    assert custom_paths == ['1.txt', '2.txt', '3.txt']

    # No chromosomes means no paths.
    assert list(Sstar_ECDF.region_files('base', [])) == []
예제 #6
0
def null_db():
    """Return a newly constructed ``Sstar_ECDF.Null_DB``.

    NOTE(review): ``test_null_db_combine`` takes a ``null_db`` argument, so
    this is presumably registered as a pytest fixture; no decorator is
    visible in this view -- confirm against the full file.
    """
    database = Sstar_ECDF.Null_DB()
    return database
예제 #7
0
def test_filter_by_sstar():
    """Check ``filter_by_sstar`` across several thresholds.

    Three windows are filtered against a hand-built null database at
    thresholds -1, 0, 0.2 and 0.4, then an empty window table is filtered
    at 0.05. Expected frames only carry ``start``, ``end`` and ``msp_ID``.
    """
    windows = pd.DataFrame({
        'start': [0, 0, 0],
        'end': [50000, 50000, 50000],
        'n_region_ind_snps': [179, 179, 178],
        'pop': ['EUR', 'EUR', 'EUR'],
        's_star': [85315, 85314, 85313],
        'msp_ID':
        ['msp_110:2_363104292', 'msp_110:2_363104293', 'msp_110:2_363104291'],
    })

    db = Sstar_ECDF.Null_DB()
    db.DB = pd.Series(
        [2, 2, 2, 2, 2, 2],
        index=pd.MultiIndex.from_tuples(
            [
                ('EUR', 179, 25211),
                ('EUR', 179, 85313),
                ('EUR', 179, 85314),
                ('EUR', 179, 85315),
                ('EUR', 179, 85316),
                ('EUR', 178, 25211),
            ],
            names=['pop', 'n_region_ind_snps', 's_star']))

    def expected_frame(msp_ids):
        # Every expected row shares the same window coordinates.
        return pd.DataFrame({
            'start': [0] * len(msp_ids),
            'end': [50000] * len(msp_ids),
            'msp_ID': msp_ids,
        })

    # (threshold, msp_IDs expected to pass), evaluated in this order.
    cases = [
        # Nothing passes.
        (-1, []),
        # Only the last input row passes.
        (0, ['msp_110:2_363104291']),
        (0.2, ['msp_110:2_363104291', 'msp_110:2_363104292']),
        # Note: groupby sorts by pop, then n_region.
        (0.4, [
            'msp_110:2_363104291', 'msp_110:2_363104292',
            'msp_110:2_363104293'
        ]),
    ]
    for threshold, msp_ids in cases:
        passing = Sstar_ECDF.filter_by_sstar(windows, db, threshold)
        afe(passing,
            expected_frame(msp_ids),
            check_like=True,
            check_dtype=False)

    # An empty window table gives an empty result.
    no_windows = pd.DataFrame({
        'start': [],
        'end': [],
        'n_region_ind_snps': [],
        'pop': [],
        's_star': [],
        'msp_ID': [],
    })
    passing = Sstar_ECDF.filter_by_sstar(no_windows, db, 0.05)
    afe(passing, expected_frame([]), check_like=True, check_dtype=False)
예제 #8
0
def test_main_generate_bed():
    """Run the ``generate-bed`` command end to end in a scratch directory.

    A null database, two gzipped windowcalc files, two gzipped match tsv
    files and a chromosome list are written to an isolated filesystem. The
    command is invoked with only the chromosome list, output pattern and
    null database given explicitly; the windowcalc and tsv inputs are not
    passed on the command line, so they are presumably located through the
    command's default file patterns.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        # build up null db
        index = pd.MultiIndex.from_tuples(
            [
                ('ASN', 161, 25211),
                ('ASN', 162, 25212),
                ('ASN', 164, 25211),
                ('ASN', 164, 25212),
                ('EUR', 161, 25211),
                ('EUR', 162, 25211),
                ('EUR', 162, 25212),
                ('EUR', 164, 25211),
                ('EUR', 164, 25212),
                ('EUR', 164, 25213),
                ('EUR', 164, 25214),
                ('EUR', 164, 25215),
            ],
            names=['pop', 'n_region_ind_snps', 's_star'])
        dat = list(range(12))
        null_db = Sstar_ECDF.Null_DB()
        null_db.DB = pd.Series(dat, index=index)
        null_db.save('null.pkl')

        window_header = (
            'chrom\twinstart\twinend\tn_region_ind_snps\tind_id\tpop'
            '\ts_star\tnum_s_star_snps\tn_s_star_snps_hap1\tn_s_star_snps_hap2\n')
        match_header = ('chr\tstart\tend\tisc\thaplotype\tpopulation'
                        '\tmatch_pct\tpvalue\tmatching_windows\n')

        # build up admixed window calc files
        with gzip.open('1.windowcalc.gz', 'wt') as window:
            window.write(
                window_header +
                '1\t0\t50000\t179\tmsp_110\tEUR\t85315\t10\t2\t8\n')

        with gzip.open('2.windowcalc.gz', 'wt') as window:
            window.write(
                window_header +
                '2\t0\t50000\t179\tmsp_110\tEUR\t85315\t10\t2\t8\n')

        # build tsv files
        with gzip.open('1.tsv.gz', 'wt') as tsv:
            tsv.write(match_header +
                      '1\t0\t50000\t199\tmsp_110:1\tEUR\t0.8\t0.03\t2217\n'
                      '1\t0\t50000\t199\tmsp_110:2\tEUR\t0.8\t0.01\t2217\n')

        with gzip.open('2.tsv.gz', 'wt') as tsv:
            tsv.write(match_header +
                      '2\t0\t50000\t199\tmsp_110:1\tEUR\t0.8\t0.01\t2217\n')

        # chrom list
        with open('chroms.txt', 'w') as chrom:
            chrom.write('1\n2\n')

        # run main
        result = runner.invoke(
            Sstar_ECDF.main, 'generate-bed '
            '--chr-list chroms.txt '
            '--outfile {chrom}.test.bed '
            '--null-db null.pkl ')
        assert result.exit_code == 0

        # check output files; context managers make sure the handles are
        # closed before the isolated directory is torn down
        with open('1.test.bed') as bed:
            out1 = bed.readlines()
        assert out1 == ['msp_ID\tstart\tend\n', 'msp_110:2_1\t0\t50000\n']

        # chromosome 2 is expected to produce a header-only bed file
        with open('2.test.bed') as bed:
            out2 = bed.readlines()
        assert out2 == ['msp_ID\tstart\tend\n']
예제 #9
0
def test_main_build_null_db():
    """Run the ``build-null-db`` command end to end in a scratch directory.

    Windowcalc files are written for chromosomes 10, 12, 16 and 18, but the
    chromosome list only names 10, 12 and 16. The 18 file contains a
    malformed line and is expected to be left unread. The saved database is
    then loaded and compared against the expected counts.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        # build chromosome file
        with open('chroms.txt', 'w') as output:
            output.write('10\n' '12\n' '16\n')

        header = ('chrom\twinstart\twinend\tn_snps\tn_ind_snps\t'
                  'n_region_ind_snps\tind_id\tpop\ts_star\n')

        # make fake window files; rows are written space-separated and
        # converted to tabs (the header has no spaces and is left as is)
        with gzip.open('10.windowcalc.gz', 'wt') as output:
            output.write(header + (
                '2 0 50000 333 4 161 msp_110 EUR 25211\n'
                '2 0 50000 333 4 162 msp_111 EUR 25211\n'
                '2 0 50000 333 4 162 msp_112 EUR 25212\n'
                '2 0 50000 333 4 162 msp_113 EUR 25212\n'
                '2 0 50000 333 4 162 msp_114 EUR 25212\n'
                '2 0 50000 333 4 162 msp_115 EUR 25212\n'
                '2 0 50000 333 4 162 msp_116 EUR 25212\n'
                '2 0 50000 333 4 162 msp_116 ASN 25212\n'
            ).replace(' ', '\t'))

        with gzip.open('12.windowcalc.gz', 'wt') as output:
            output.write(header + (
                '3 0 50000 333 4 161 msp_110 ASN 25211\n'
                '3 0 50000 333 4 162 msp_111 EUR 25211\n'
                '3 0 50000 333 4 162 msp_112 ASN 25212\n'
                '3 0 50000 333 4 162 msp_113 EUR 25212\n'
                '3 0 50000 333 4 162 msp_114 ASN 25212\n'
                '3 0 50000 333 4 162 msp_115 EUR 25212\n'
                '3 0 50000 333 4 162 msp_116 EUR 25212\n'
                '3 0 50000 333 4 162 msp_116 ASN 25212\n'
            ).replace(' ', '\t'))

        with gzip.open('16.windowcalc.gz', 'wt') as output:
            output.write(header + (
                '1 0 50000 333 4 164 msp_110 EUR 25211\n'
                '1 0 50000 333 4 164 msp_111 ASN 25211\n'
                '1 0 50000 333 4 164 msp_112 EUR 25212\n'
                '1 0 50000 333 4 164 msp_113 ASN 25212\n'
                '1 0 50000 333 4 164 msp_114 ASN 25212\n'
                '1 0 50000 333 4 164 msp_115 EUR 25212\n'
                '1 0 50000 333 4 164 msp_116 EUR 25212\n'
                '1 0 50000 333 4 164 msp_116 ASN 25212\n'
            ).replace(' ', '\t'))

        # Chromosome 18 is not in chroms.txt. The sentinel line is joined
        # with an explicit ``+`` so that the space-to-tab conversion of the
        # neighbouring rows does not also rewrite it.
        with gzip.open('18.windowcalc.gz', 'wt') as output:
            output.write(
                header +
                '1 0 50000 333 4 161 msp_110 EUR 25211\n'.replace(' ', '\t') +
                'NOT READ\n' +
                '1 0 50000 333 4 162 msp_116 ASN 25212\n'.replace(' ', '\t'))

        # build db
        result = runner.invoke(
            Sstar_ECDF.main, ' build-null-db '
            '--chr-list chroms.txt '
            '--outfile test.pkl')

        assert result.exit_code == 0

        # check db
        assert os.path.exists('test.pkl')

        db = Sstar_ECDF.Null_DB()
        db.load('test.pkl')
        index = pd.MultiIndex.from_tuples(
            [
                ('ASN', 161, 25211),
                ('ASN', 162, 25212),
                ('ASN', 164, 25211),
                ('ASN', 164, 25212),
                ('EUR', 161, 25211),
                ('EUR', 162, 25211),
                ('EUR', 162, 25212),
                ('EUR', 164, 25211),
                ('EUR', 164, 25212),
            ],
            names=['pop', 'n_region_ind_snps', 's_star'])
        # Counts come from files 10, 12 and 16 only.
        ase(db.DB, pd.Series([1, 4, 1, 3, 1, 2, 8, 1, 3], index=index))
예제 #10
0
def test_main_combine_null_dbs():
    """Run the ``combine-null-dbs`` command end to end in a scratch directory.

    Three databases sharing one index are saved with values that are
    multiples of 1, 2 and 3. The combined database is expected to hold
    multiples of 6 on that same index.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        # All databases, inputs and expected output alike, use this index.
        shared_index = pd.MultiIndex.from_tuples(
            [
                ('ASN', 161, 25211),
                ('ASN', 162, 25212),
                ('ASN', 164, 25211),
                ('ASN', 164, 25212),
                ('EUR', 161, 25211),
                ('EUR', 162, 25211),
                ('EUR', 162, 25212),
                ('EUR', 164, 25211),
                ('EUR', 164, 25212),
                ('EUR', 164, 25213),
                ('EUR', 164, 25214),
                ('EUR', 164, 25215),
            ],
            names=['pop', 'n_region_ind_snps', 's_star'])
        entry_count = 12

        # build up null dbs: file k stores 0, k, 2k, ... for k = 1, 2, 3
        inputs = (('null.pkl', 1), ('null2.pkl', 2), ('null3.pkl', 3))
        for path, step in inputs:
            source_db = Sstar_ECDF.Null_DB()
            counts = list(range(0, entry_count * step, step))
            source_db.DB = pd.Series(counts, index=shared_index)
            source_db.save(path)

        # run main
        outcome = runner.invoke(
            Sstar_ECDF.main, 'combine-null-dbs '
            '--outfile test.pkl '
            'null.pkl null2.pkl null3.pkl')
        assert outcome.exit_code == 0

        # check output files: 1k + 2k + 3k == 6k at every position
        combined = Sstar_ECDF.Null_DB()
        combined.load('test.pkl')
        ase(combined.DB, pd.Series(range(0, 72, 6), index=shared_index))