Python txt_to_df Examples, preprocessing.txt_to_df Python Examples

Example #1

0

Show file

File: test_composition.py Project: shoz/ProtLearn

def test_composition():
    """Test relative and absolute amino acid compositions."""

    # load data
    df = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # test relative composition
    comp_rel = composition(df, 'relative')

    # test if the frequencies of every sequence (row) add to 1;
    # index with i so each row is checked, not just the first one
    for i in range(df.shape[0]):
        assert round(comp_rel.iloc[i, :].sum()) == 1

    # test decimal places: reversing the string and locating '.' yields
    # the number of characters after the decimal point
    comp_dec2 = composition(df, 'relative', round_fraction=2)
    x = str(comp_dec2['A'][0])
    assert x[::-1].find('.') == 2
    comp_dec5 = composition(df, 'relative', round_fraction=5)
    y = str(comp_dec5['A'][0])
    assert y[::-1].find('.') == 5

    # test absolute composition
    comp_abs = composition(df, 'absolute')

    # test if the summed counts of each row equal the sequence length
    all_lengths = [6, 9, 7, 6]
    for i in range(df.shape[0]):
        assert comp_abs.iloc[i, :].sum() == all_lengths[i]

Example #2

0

Show file

def test_aaindex3():
    """Test AAIndex3 values, shape, and standardization options."""

    # load data
    df = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # get aaindex3
    aaind3 = aaindex3(df)

    # test shape
    assert aaind3.shape == (4, 43)

    # test some triangular indices
    TANS760101 = np.array([-4.72, -5.975, -4.18333, -4.04])
    ZHAC000106 = np.array([.196, -.34875, .46666, .972])
    np.testing.assert_equal(np.round(aaind3['TANS760101'], 3),
                            np.round(TANS760101, 3))
    # GODA950101 contains NaNs and is expected to be absent from the output
    assert 'GODA950101' not in aaind3
    np.testing.assert_equal(np.round(aaind3['ZHAC000106'], 3),
                            np.round(ZHAC000106, 3))

    # test some square indices
    ZHAC000102 = np.array([-.408, -1.415, .475, 1.532])
    ZHAC000103 = np.array([-.052, -.625, .59166, 1.096])
    ZHAC000105 = np.array([-.242, -.72, .17, .952])
    np.testing.assert_equal(np.round(aaind3['ZHAC000102'], 3),
                            np.round(ZHAC000102, 3))
    np.testing.assert_equal(np.round(aaind3['ZHAC000103'], 3),
                            np.round(ZHAC000103, 3))
    np.testing.assert_equal(np.round(aaind3['ZHAC000105'], 3),
                            np.round(ZHAC000105, 3))

    # NOTE(review): the loops below are bounded by the row count
    # (shape[0]) but index columns, so only the first shape[0] columns
    # are checked -- confirm whether shape[1] was intended.

    # test standardization (zscore)
    aaind3_z = aaindex3(df, 'zscore')
    # test mean = 0 (check column i, not a fixed column)
    for i in range(aaind3_z.shape[0]):
        assert abs(round(aaind3_z.iloc[:, i].mean())) == 0
    # test std: every checked column has the same spread as column 0
    for i in range(aaind3_z.shape[0]):
        assert round(aaind3_z.iloc[:, i].std(), 1) == \
               round(aaind3_z.iloc[:, 0].std(), 1)

    # test standardization (minmax)
    aaind3_mm = aaindex3(df, 'minmax')
    # test minimum and maximum
    for i in range(aaind3_mm.shape[0]):
        assert round(aaind3_mm.iloc[:, i].min()) == 0
        assert round(aaind3_mm.iloc[:, i].max()) == 1

Example #3

0

Show file

File: test_aaindex2.py Project: ushareng/ProtLearn

def test_aaindex2():
    """Test AAIndex2 values, shape, and standardization options."""

    # load data
    df = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # get aaindex2
    aaind2 = aaindex2(df)

    # test shape
    assert aaind2.shape == (4, 94)

    # test some triangular indices
    ALTS910101 = np.array([-2, -.125, .333, -2])
    VOGG950101 = np.array([4.28, 5.2, 6.05, 4.32])
    CROG050101 = np.array([-1.8, .625, .5, -.4])
    np.testing.assert_equal(np.round(aaind2['ALTS910101'], 3),
                            np.round(ALTS910101, 3))
    np.testing.assert_equal(np.round(aaind2['VOGG950101'], 3),
                            np.round(VOGG950101, 3))
    np.testing.assert_equal(np.round(aaind2['CROG050101'], 3),
                            np.round(CROG050101, 3))

    # test some square indices
    LINK010101 = np.array([.0266, .0955, .13, .1276])
    KOSJ950108 = np.array([1.62, 18.0875, 16.05, 14.68])
    DOSZ010101 = np.array([1.32, 15.7625, -1.1833, -5.12])
    np.testing.assert_equal(np.round(aaind2['LINK010101'], 3),
                            np.round(LINK010101, 3))
    np.testing.assert_equal(np.round(aaind2['KOSJ950108'], 3),
                            np.round(KOSJ950108, 3))
    np.testing.assert_equal(np.round(aaind2['DOSZ010101'], 3),
                            np.round(DOSZ010101, 3))

    # NOTE(review): the loops below are bounded by the row count
    # (shape[0]) but index columns, so only the first shape[0] columns
    # are checked -- confirm whether shape[1] was intended.

    # test standardization (zscore)
    aaind2_z = aaindex2(df, 'zscore')
    # test mean = 0 (check column i, not a fixed column)
    for i in range(aaind2_z.shape[0]):
        assert abs(round(aaind2_z.iloc[:, i].mean())) == 0
    # test std: every checked column has the same spread as column 0
    for i in range(aaind2_z.shape[0]):
        assert round(aaind2_z.iloc[:, i].std(), 1) == \
               round(aaind2_z.iloc[:, 0].std(), 1)

    # test standardization (minmax)
    aaind2_mm = aaindex2(df, 'minmax')
    # test minimum and maximum
    for i in range(aaind2_mm.shape[0]):
        assert round(aaind2_mm.iloc[:, i].min()) == 0
        assert round(aaind2_mm.iloc[:, i].max()) == 1

Example #4

0

Show file

File: test_length.py Project: ushareng/ProtLearn

def test_lengths():
    """Check integer and one-hot-encoded sequence lengths."""

    # read the example sequences
    sequences = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # plain integer lengths, one entry per sequence
    expected_int = np.array([6, 9, 7, 6])
    observed_int = length(sequences, 'int')
    assert np.array_equal(observed_int, expected_int)

    # one-hot encoding; columns correspond to the lengths [6, 7, 9]
    expected_ohe = np.array([
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
    ])
    observed_ohe = length(sequences, 'ohe')
    assert np.array_equal(observed_ohe, expected_ohe)

Example #5

0

Show file

File: test_txt_to_df.py Project: ushareng/ProtLearn

def test_conversion():
    """Check that txt_to_df yields the expected columns, rows and labels."""

    # read the example sequences
    frame = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # column names, in order, followed by the overall shape
    for position, column_name in enumerate(('Sequence', 'Label')):
        assert frame.columns[position] == column_name
    assert frame.shape == (4, 2)

    # sequences must come back in file order
    expected_sequences = ('AGTYLK', 'VCIMMMPFP', 'LRSAHHN', 'AQEEWD')
    for row, sequence in enumerate(expected_sequences):
        assert frame['Sequence'][row] == sequence

    # every row carries the label 0
    row = 0
    while row < frame.shape[0]:
        assert frame['Label'][row] == 0
        row += 1

Example #6

0

Show file

File: test_aaindex1.py Project: ushareng/ProtLearn

def test_aaindex1():
    """Test AAIndex1 values, shape, and standardization options."""

    # load data
    df = txt_to_df(path + '/tests/docs/test_seq.txt', 0)

    # get aaindex1
    aaind1 = aaindex1(df)

    # test shape
    assert aaind1.shape == (4, 553)

    # test some indices
    ANDN920101 = np.array([4.3, 4.40555, 4.48714, 4.46])
    QIAN880126 = np.array([.01166, -.17111, .05857, -.04333])
    KARS160122 = np.array([2.014, 5.48522, 2.789, 1.751])
    np.testing.assert_equal(np.round(aaind1['ANDN920101'], 3),
                            np.round(ANDN920101, 3))
    np.testing.assert_equal(np.round(aaind1['QIAN880126'], 3),
                            np.round(QIAN880126, 3))
    np.testing.assert_equal(np.round(aaind1['KARS160122'], 3),
                            np.round(KARS160122, 3))

    # NOTE(review): the loops below are bounded by the row count
    # (shape[0]) but index columns, so only the first shape[0] columns
    # are checked -- confirm whether shape[1] was intended.

    # test standardization (zscore)
    aaind1_z = aaindex1(df, 'zscore')
    # test mean = 0 (check column i, not a fixed column)
    for i in range(aaind1_z.shape[0]):
        assert abs(round(aaind1_z.iloc[:, i].mean())) == 0
    # test std: every checked column has the same spread as column 0
    for i in range(aaind1_z.shape[0]):
        assert round(aaind1_z.iloc[:, i].std(), 1) == \
               round(aaind1_z.iloc[:, 0].std(), 1)

    # test standardization (minmax)
    aaind1_mm = aaindex1(df, 'minmax')
    # test minimum and maximum
    for i in range(aaind1_mm.shape[0]):
        assert round(aaind1_mm.iloc[:, i].min()) == 0
        assert round(aaind1_mm.iloc[:, i].max()) == 1

Example #7

0

Show file

def test_integer_encode():
    """Test integer encoding of sequences, with and without padding."""

    # load data
    df = txt_to_df(path + '/tests/docs/test_seq.txt', 0)
    enc = integer_encode(df)

    # test array shape and type
    assert enc.shape == (4, )
    assert isinstance(enc, np.ndarray)

    # test array contents
    assert np.array_equal(enc[0], np.array([1, 6, 17, 20, 10, 9]))
    assert np.array_equal(enc[1], np.array([18, 2, 8, 11, 11, 11, 13, 5, 13]))
    assert np.array_equal(enc[2], np.array([10, 15, 16, 1, 7, 7, 12]))
    assert np.array_equal(enc[3], np.array([1, 14, 4, 4, 19, 3]))

    # test padding: positions past each sequence's own length must be 0.
    # all(...) is required here -- asserting a bare non-empty list is
    # always truthy and would never fail.
    enc = integer_encode(df, padding=True)
    assert enc.shape == (4, 9)
    assert all(enc[0][i] == 0 for i in [6, 7, 8])
    assert all(enc[2][i] == 0 for i in [7, 8])
    assert all(enc[3][i] == 0 for i in [6, 7, 8])