Exemple #1
0
def test_make_kmers(kmer_lengths):
    """Check the output of ``kvector.make_kmers`` against the expected k-mers.

    ``kmer_lengths`` is passed straight to ``make_kmers`` (presumably it is
    provided by a pytest fixture -- confirm in the surrounding test setup).
    When it equals ``3`` the expected value is every 3-letter word over
    ``ACGT`` in lexicographic order; for any other value the expected value
    is those 3-mers followed by every 4-letter word, also in lexicographic
    order.
    """
    from kvector import make_kmers

    observed = make_kmers(kmer_lengths)

    bases = 'ACGT'
    # Leftmost letter varies slowest: AAA, AAC, AAG, AAT, ACA, ..., TTT.
    expected = [a + b + c for a in bases for b in bases for c in bases]
    if not (kmer_lengths == 3):
        # Anything other than 3 is checked against 3-mers plus 4-mers.
        expected += [a + b + c + d
                     for a in bases
                     for b in bases
                     for c in bases
                     for d in bases]

    pdt.assert_equal(observed, expected)
Exemple #2
0
def test_make_kmers(kmer_lengths):
    """Verify that ``make_kmers`` yields exactly the expected k-mers, in order.

    ``kmer_lengths`` is forwarded unchanged to ``kvector.make_kmers``
    (presumably supplied by a pytest fixture -- confirm in the test setup).
    When it equals ``3`` the result must be the 64 DNA 3-mers; for any other
    value it must be the 3-mers followed by the 256 DNA 4-mers.

    Both the number of items and every individual item are checked, so a
    truncated or over-long result fails instead of passing silently.
    """
    from kvector import make_kmers

    # Materialize the result so its length can be checked even if
    # make_kmers returns a lazy iterable.
    test = list(make_kmers(kmer_lengths))

    three_mers = [
        'AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA',
        'AGC', 'AGG', 'AGT', 'ATA', 'ATC', 'ATG', 'ATT', 'CAA', 'CAC',
        'CAG', 'CAT', 'CCA', 'CCC', 'CCG', 'CCT', 'CGA', 'CGC', 'CGG',
        'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA', 'GAC', 'GAG', 'GAT',
        'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG', 'GGT', 'GTA',
        'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA', 'TCC',
        'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG',
        'TTT'
    ]
    four_mers = [
        'AAAA', 'AAAC', 'AAAG', 'AAAT', 'AACA', 'AACC', 'AACG',
        'AACT', 'AAGA', 'AAGC', 'AAGG', 'AAGT', 'AATA', 'AATC', 'AATG',
        'AATT', 'ACAA', 'ACAC', 'ACAG', 'ACAT', 'ACCA', 'ACCC', 'ACCG',
        'ACCT', 'ACGA', 'ACGC', 'ACGG', 'ACGT', 'ACTA', 'ACTC', 'ACTG',
        'ACTT', 'AGAA', 'AGAC', 'AGAG', 'AGAT', 'AGCA', 'AGCC', 'AGCG',
        'AGCT', 'AGGA', 'AGGC', 'AGGG', 'AGGT', 'AGTA', 'AGTC', 'AGTG',
        'AGTT', 'ATAA', 'ATAC', 'ATAG', 'ATAT', 'ATCA', 'ATCC', 'ATCG',
        'ATCT', 'ATGA', 'ATGC', 'ATGG', 'ATGT', 'ATTA', 'ATTC', 'ATTG',
        'ATTT', 'CAAA', 'CAAC', 'CAAG', 'CAAT', 'CACA', 'CACC', 'CACG',
        'CACT', 'CAGA', 'CAGC', 'CAGG', 'CAGT', 'CATA', 'CATC', 'CATG',
        'CATT', 'CCAA', 'CCAC', 'CCAG', 'CCAT', 'CCCA', 'CCCC', 'CCCG',
        'CCCT', 'CCGA', 'CCGC', 'CCGG', 'CCGT', 'CCTA', 'CCTC', 'CCTG',
        'CCTT', 'CGAA', 'CGAC', 'CGAG', 'CGAT', 'CGCA', 'CGCC', 'CGCG',
        'CGCT', 'CGGA', 'CGGC', 'CGGG', 'CGGT', 'CGTA', 'CGTC', 'CGTG',
        'CGTT', 'CTAA', 'CTAC', 'CTAG', 'CTAT', 'CTCA', 'CTCC', 'CTCG',
        'CTCT', 'CTGA', 'CTGC', 'CTGG', 'CTGT', 'CTTA', 'CTTC', 'CTTG',
        'CTTT', 'GAAA', 'GAAC', 'GAAG', 'GAAT', 'GACA', 'GACC', 'GACG',
        'GACT', 'GAGA', 'GAGC', 'GAGG', 'GAGT', 'GATA', 'GATC', 'GATG',
        'GATT', 'GCAA', 'GCAC', 'GCAG', 'GCAT', 'GCCA', 'GCCC', 'GCCG',
        'GCCT', 'GCGA', 'GCGC', 'GCGG', 'GCGT', 'GCTA', 'GCTC', 'GCTG',
        'GCTT', 'GGAA', 'GGAC', 'GGAG', 'GGAT', 'GGCA', 'GGCC', 'GGCG',
        'GGCT', 'GGGA', 'GGGC', 'GGGG', 'GGGT', 'GGTA', 'GGTC', 'GGTG',
        'GGTT', 'GTAA', 'GTAC', 'GTAG', 'GTAT', 'GTCA', 'GTCC', 'GTCG',
        'GTCT', 'GTGA', 'GTGC', 'GTGG', 'GTGT', 'GTTA', 'GTTC', 'GTTG',
        'GTTT', 'TAAA', 'TAAC', 'TAAG', 'TAAT', 'TACA', 'TACC', 'TACG',
        'TACT', 'TAGA', 'TAGC', 'TAGG', 'TAGT', 'TATA', 'TATC', 'TATG',
        'TATT', 'TCAA', 'TCAC', 'TCAG', 'TCAT', 'TCCA', 'TCCC', 'TCCG',
        'TCCT', 'TCGA', 'TCGC', 'TCGG', 'TCGT', 'TCTA', 'TCTC', 'TCTG',
        'TCTT', 'TGAA', 'TGAC', 'TGAG', 'TGAT', 'TGCA', 'TGCC', 'TGCG',
        'TGCT', 'TGGA', 'TGGC', 'TGGG', 'TGGT', 'TGTA', 'TGTC', 'TGTG',
        'TGTT', 'TTAA', 'TTAC', 'TTAG', 'TTAT', 'TTCA', 'TTCC', 'TTCG',
        'TTCT', 'TTGA', 'TTGC', 'TTGG', 'TTGT', 'TTTA', 'TTTC', 'TTTG',
        'TTTT'
    ]

    if kmer_lengths == 3:
        true = three_mers
    else:
        true = three_mers + four_mers

    # zip() stops at the shorter sequence, so compare the sizes first;
    # otherwise an empty or truncated result would pass the loop below.
    assert len(test) == len(true), (
        'make_kmers({!r}) returned {} k-mers, expected {}'.format(
            kmer_lengths, len(test), len(true)))

    for position, (test_item, true_item) in enumerate(zip(test, true)):
        assert test_item == true_item, (
            'k-mer #{} is {!r}, expected {!r}'.format(
                position, test_item, true_item))
Exemple #3
0
def test_score_kmers(pwm):
    """Compare ``kvector.score_kmers`` on all 3-mers with stored scores.

    ``pwm`` is handed to ``score_kmers`` unchanged (presumably a position
    weight matrix provided by a pytest fixture -- confirm in the test
    setup). The scores are wrapped in a Series indexed by the k-mers from
    ``kvector.make_kmers(3)`` and compared with the reference values
    embedded below as ``kmer,score`` CSV rows.
    """
    import kvector

    kmers = kvector.make_kmers(3)

    test = pd.Series(kvector.score_kmers(pwm, kmers),
                     index=kmers)
    s = '''AAA,0.4421139794502956
AAC,0.3010679195832866
AAG,0.32337240282664487
AAT,0.42448485149540616
ACA,0.3128897488745921
ACC,0.1718436890075831
ACG,0.19414817225094133
ACT,0.2952606209197027
AGA,0.3128897488745921
AGC,0.1718436890075831
AGG,0.19414817225094133
AGT,0.2952606209197027
ATA,0.5067260947381473
ATC,0.3656800348711383
ATG,0.38798451811449663
ATT,0.48909696678325787
CAA,0.2935687551893772
CAC,0.15252269532236817
CAG,0.17482717856572644
CAT,0.2759396272344877
CCA,0.16434452461367371
CCC,0.023298464746664645
CCG,0.045602947990022916
CCT,0.1467153966587842
CGA,0.16434452461367371
CGC,0.023298464746664645
CGG,0.045602947990022916
CGT,0.1467153966587842
CTA,0.3581808704772289
CTC,0.2171348106102199
CTG,0.23943929385357818
CTT,0.3405517425223395
GAA,0.2935687551893772
GAC,0.15252269532236817
GAG,0.17482717856572644
GAT,0.2759396272344877
GCA,0.16434452461367371
GCC,0.023298464746664645
GCG,0.045602947990022916
GCT,0.1467153966587842
GGA,0.16434452461367371
GGC,0.023298464746664645
GGG,0.045602947990022916
GGT,0.1467153966587842
GTA,0.3581808704772289
GTC,0.2171348106102199
GTG,0.23943929385357818
GTT,0.3405517425223395
TAA,0.4420016217023659
TAC,0.30095556183535693
TAG,0.3232600450787152
TAT,0.4243724937474765
TCA,0.3127773911266625
TCC,0.1717313312596534
TCG,0.1940358145030117
TCT,0.29514826317177295
TGA,0.3127773911266625
TGC,0.1717313312596534
TGG,0.1940358145030117
TGT,0.29514826317177295
TTA,0.5066137369902177
TTC,0.3655676771232087
TTG,0.387872160366567
TTT,0.48898460903532825
'''
    # read_csv(squeeze=True) was deprecated in pandas 1.4 and removed in
    # 2.0. With index_col=0 the frame has exactly one data column, so
    # selecting it positionally gives the same Series on every version.
    true = pd.read_csv(six.StringIO(s), index_col=0, header=None).iloc[:, 0]
    # Drop the auto-generated integer labels so the metadata matches `test`.
    true.name = None
    true.index.name = None

    pdt.assert_series_equal(test, true)
Exemple #4
0
def test_score_kmers(pwm):
    """Check ``kvector.score_kmers`` for every 3-mer against reference data.

    ``pwm`` is passed through to ``score_kmers`` as-is (presumably a
    position weight matrix coming from a pytest fixture -- confirm in the
    test setup). The returned scores are placed in a Series keyed by the
    k-mers from ``kvector.make_kmers(3)`` and compared with the expected
    scores stored below, one ``kmer,score`` CSV row per k-mer.
    """
    import kvector

    kmers = kvector.make_kmers(3)

    test = pd.Series(kvector.score_kmers(pwm, kmers), index=kmers)
    s = '''AAA,0.4421139794502956
AAC,0.3010679195832866
AAG,0.32337240282664487
AAT,0.42448485149540616
ACA,0.3128897488745921
ACC,0.1718436890075831
ACG,0.19414817225094133
ACT,0.2952606209197027
AGA,0.3128897488745921
AGC,0.1718436890075831
AGG,0.19414817225094133
AGT,0.2952606209197027
ATA,0.5067260947381473
ATC,0.3656800348711383
ATG,0.38798451811449663
ATT,0.48909696678325787
CAA,0.2935687551893772
CAC,0.15252269532236817
CAG,0.17482717856572644
CAT,0.2759396272344877
CCA,0.16434452461367371
CCC,0.023298464746664645
CCG,0.045602947990022916
CCT,0.1467153966587842
CGA,0.16434452461367371
CGC,0.023298464746664645
CGG,0.045602947990022916
CGT,0.1467153966587842
CTA,0.3581808704772289
CTC,0.2171348106102199
CTG,0.23943929385357818
CTT,0.3405517425223395
GAA,0.2935687551893772
GAC,0.15252269532236817
GAG,0.17482717856572644
GAT,0.2759396272344877
GCA,0.16434452461367371
GCC,0.023298464746664645
GCG,0.045602947990022916
GCT,0.1467153966587842
GGA,0.16434452461367371
GGC,0.023298464746664645
GGG,0.045602947990022916
GGT,0.1467153966587842
GTA,0.3581808704772289
GTC,0.2171348106102199
GTG,0.23943929385357818
GTT,0.3405517425223395
TAA,0.4420016217023659
TAC,0.30095556183535693
TAG,0.3232600450787152
TAT,0.4243724937474765
TCA,0.3127773911266625
TCC,0.1717313312596534
TCG,0.1940358145030117
TCT,0.29514826317177295
TGA,0.3127773911266625
TGC,0.1717313312596534
TGG,0.1940358145030117
TGT,0.29514826317177295
TTA,0.5066137369902177
TTC,0.3655676771232087
TTG,0.387872160366567
TTT,0.48898460903532825
'''
    # The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4
    # and removed in 2.0. Parse to a one-column frame and take that column
    # by position instead; this works on old and new pandas alike.
    expected_frame = pd.read_csv(six.StringIO(s), index_col=0, header=None)
    true = expected_frame.iloc[:, 0]
    # Clear the auto-generated integer labels so the metadata matches `test`.
    true.name = None
    true.index.name = None

    pdt.assert_series_equal(test, true)