Example #1
    def test_multiple(self):
        # first test, align string, no output, no input
        output = self.run_cli('multiple -s woldemort waldemar walter')
        self.assertIn('w\ta\tl\tt\te\t-\t-\tr\t-', output)

        # second test, test output as file, no input, vary method as sca
        mlt = main('multiple', '-s', 'woldemort', 'waldemar', 'walter',
                   '--method', 'sca', '--output-file',
                   self.tmp_path('out.msa').as_posix())
        assert mlt[0] == list('woldemort')

        # third test, test output and input
        mlt = main('multiple', '-i', test_data('harryp.msa'),
                   '--method', 'sca', '--output-file',
                   self.tmp_path('out2.msa').as_posix(),
                   '--align-method', 'library')
        assert len(mlt[0]) == 7

        # fourth test, test output and input with method=basic
        mlt = main('multiple', '-i', test_data('harryp.msa'),
                   '--method', 'basic', '--output-file',
                   self.tmp_path('out2.msa').as_posix())
        assert len(mlt[0]) == 7
        assert len([x for x in mlt[1][-1] if x != '-']) == 4
Example #2
    def test_pairwise(self):
        # first test, align string, no output, no input
        output = self.run_cli('pairwise -s woldemort waldemar')
        self.assertEqual(
            [line.split('\t') for line in output.split('\n')][:2],
            [
                ['w', 'o', 'l', 'd', 'e', 'm', 'o', 'r', 't'],
                ['w', 'a', 'l', 'd', 'e', 'm', 'a', 'r', '-'],
            ])

        # second test, test output as file, no input, vary method as sca
        tmp = self.tmp_path('test1')
        self.run_cli(
            'pairwise -s woldemort waldemar --method sca -o {0} --distance'.format(
                tmp.as_posix()))
        assert tmp.exists()

        # third test, test output and input
        tmp = self.tmp_path('test2')
        self.run_cli('pairwise -i {0} --method sca -o {1} -m overlap'.format(
            test_data('harry_potter.psa'), tmp.as_posix()))
        #
        # FIXME: It should not be the case that an output file is specified,
        # but the output is actually written to a different file!
        #
        assert tmp.parent.joinpath(tmp.name + '.psa').exists()

        # fourth test, test output and input with method=basic
        tmp = self.tmp_path('test3')
        self.run_cli('pairwise -i {0} --method basic -o {1}'.format(
            test_data('harry_potter.psa'), tmp.as_posix()))
        assert tmp.parent.joinpath(tmp.name + '.psa').exists()
Example #3
    def test_get_GLS(self):
        phy = PhyBo(self.inputfile, output_dir=self.tmp.as_posix())
        _ = PhyBo(test_data('phybo2.qlc'), output_dir=self.tmp.as_posix(),
                  tree=test_data('phylogeny.tre'))
        _ = PhyBo(test_data('phybo2.qlc'), output_dir=self.tmp.as_posix())

        # test default scenario
        phy.get_GLS()

        # check for weight in one of the scenarios
        assert phy.gls['w-1-1']['2:1'][1] == 9
        assert phy.gls['w-1-1']['8:1'][1] == 2

        # test restriction scenario
        phy.get_GLS(mode='restriction', force=True)
        assert phy.gls['r-3']['12:1'][1] == 3
        assert phy.gls['r-3']['8:1'][1] == 2

        # test topdown; the algorithmic ordering currently leads to unstable
        # outputs. This should be fixed, but until it is, we use a less-than
        # check instead of an exact equality for the affected assertion.
        phy.get_GLS(mode='topdown', force=True)
        assert phy.gls['t-3']['29:3'][1] < 3
        assert phy.gls['t-3']['8:1'][1] == 2
        phy.get_GLS(mode='weighted', force=True)

        glm = list(phy.stats.keys())[0]
        phy.get_stats(glm)
Example #4
 def test_init(self):
     self._make_one({0: ['ID', 'doculect', 'concept', 'IPA'],
                     1: ['1', 'deu', 'hand', 'hant']}, model='sca')
     ls = self._make_one({0: ['ID', 'doculect', 'concept', 'IPA'],
                          1: ['1', 'deu', 'hand', 'hant']})
     self.assertIn('lexstat', repr(ls))
     self._make_one(ls)
     self._make_one({0: ['ID', 'doculect', 'concept', 'tokens'],
                     1: ['1', 'deu', 'hand', 'hant']})
     self.assertRaises(AssertionError, LexStat, {0: ['ID', 'doculect',
                                                     'concept'],
                                                 1: ['1', 'deu', 'hand']})
     self._make_one(test_data('phybo.qlc'), check=True)
     with patch('lingpy.compare.lexstat.log', self.log):
         self._make_one(test_data('KSL.qlc'), check=True)
         assert self.log.info.called
     error_log = self.tmp_path('errors')
     with patch('lingpy.util.confirm', Mock(return_value=True)):
         lex = self._make_one({
             0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
             1: ['1', 'deu', 'hand', 'hand', ['']],
             2: ['2', 'eng', 'hand', 'hand', ['abc']],
             3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
         }, check=True, errors='%s' % error_log)
         assert error_log.exists()
         self.assertTrue(lex.filename.endswith('_cleaned.qlc'))
         self.assertTrue(os.path.exists(lex.filename))
         os.remove(lex.filename)
         self.assertEqual(len(lex._meta['errors']), 2)
Example #5
def test_csv2dict():

    if_path1 = test_data('test_csv.csv')
    if_path2 = test_data('test_csv')

    # check default setting
    dat1 = csv2dict(if_path1)

    # pass data type and header
    dat2 = csv2dict(
        if_path2,
        fileformat='csv',
        dtype=[text_type, text_type, text_type, int, text_type],
        sep='\t',
        header=True)

    # pass another separator
    dat3 = csv2dict(if_path1, sep='_')

    # modify the comment char
    dat4 = csv2dict(if_path1, comment="?")

    # check for correct results
    assert 'This' in dat1
    assert 'This' not in dat2
    assert dat2['We'][2] == 2
    assert dat3['This\tis\tthe'][0] == 'head\tline'
    assert len(dat4) == 4 and '#I' in dat4
Example #6
 def test_get_wordlist(self):
     from lingpy.basic.wordlist import get_wordlist
     wl1 = get_wordlist(test_data('mycsvwordlist.csv'))
     wl2 = get_wordlist(test_data('mycsvwordlistwithoutids.csv'))
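     # the second file lacks explicit IDs, so they are generated on load;
     # both files should describe the same wordlist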
     assert wl1.height == wl2.height
     for k in wl1:
         assert wl1[k, 'concept'] == wl2[k, 'concept']
Example #7
    def test_csv2list(self):
        if_path1 = test_data('test_csv.csv')
        if_path2 = test_data('test_csv')

        # check default setting
        dat1 = csv2list(if_path1)

        # pass data type and header
        dat2 = csv2list(
            if_path2,
            fileformat='csv',
            dtype=[text_type, text_type, text_type, int, text_type],
            sep='\t',
            header=True
        )

        # pass another separator
        dat3 = csv2list(
            if_path1,
            sep='_'
        )

        # modify the comment char
        dat4 = csv2list(
            if_path1,
            comment="?"
        )

        # check for correct parsing
        assert dat1[0][1] == 'is'
        assert dat2[0][1] == 'are'
        assert sum([x[3] for x in dat2]) == 8
        assert dat3[0][0] == 'This\tis\tthe'
        assert dat4[3][0] == '#I'
Example #8
    def test_alignments(self):
        tmp = self.tmp_path('alignments')

        def cmd(i, rem=''):
            return 'alignments -i {0} -c cogid -o {1} {2}'.format(i, tmp.as_posix(), rem)

        self.run_cli(cmd(test_data('KSL.qlc')))
        self.run_cli(cmd(test_data('KSL3.qlc'), ' --format html --use-logodds'))
Example #9
 def setUp(self):
     WithTempDir.setUp(self)
     self.lex = LexStat(test_data('KSL.qlc'))
     self.part = Partial(test_data('partial_cognates.tsv'),
                         segments='segments')
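     # add two partial-cognate columns: 'pid1' copies the annotated cognate
     # sets verbatim, 'pid2' parses space-separated IDs into lists of ints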
     self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
     self.part.add_entries('pid2', 'partialids2',
                           lambda x: [int(y) for y in x.split(' ')])
Example #10
 def test_conceptlists(self):
     comparison1 = compare_conceptlists(test_data('listA.tsv'), test_data('listB.tsv'))
     comparison2 = compare_conceptlists(
         test_data('listA.tsv'), test_data('listB.tsv'), output="tsv",
         filename=text_type(self.tmp_path('out')))

     assert comparison2 is None
     assert isinstance(comparison1, list)
Example #11
 def test_EvalPSA(self):
     obj = EvalPSA(
         PSA(test_data('harry_potter.psa')),
         PSA(test_data('harry_potter_misaligned.psa')))
     obj.c_score()
     obj.r_score()
     obj.sp_score()
     obj.jc_score()
     obj.diff(filename='%s' % self.tmp_path('test_EvalPSA.diff'))
Example #12
def test_parse_gloss():
    data = csv2list(test_data('Ringe-2002-421.tsv'))[1:]
    target_list = [x[0] for x in csv2list(test_data('target_list_ringe.tsv'))]
    for line, target in zip(data, target_list):
        datum = line[1]
        glosses = parse_gloss(datum)
        # each gloss is a 9-tuple; its first element is the main gloss part
        for a, b, c, d, e, f, g, h, i in glosses:
            print(datum, '=>',
                  ','.join([x for x in [a, b, c, d, e, f, g, ''.join(h), i]]),
                  '\t', a)
            assert a == target
Example #13
 def test_colexification_network(self):
     graph = colexification_network(Wordlist(test_data('colexification.tsv')))
     assert "hand" in graph and "arm" in graph
     
     graph = colexification_network(
         Wordlist(test_data('colexification.tsv')), bipartite=True)
     assert 'arm' in graph.edge['l4.4'] and 'hand' in graph.edge['l4.4']

     graph = colexification_network(
         Wordlist(test_data('colexification.tsv')),
         output="gml", filename=text_type(self.tmp_path("test")))
Example #14
    def test_EvalPSA(self):
        from lingpy.evaluate.apa import EvalPSA

        obj = EvalPSA(
            PSA(test_data('harry_potter.psa')),
            PSA(test_data('harry_potter_misaligned.psa')))
        obj.c_score()
        obj.r_score()
        obj.sp_score()
        obj.jc_score()
        obj.diff(filename='%s' % self.tmp_path('test_EvalPSA.diff'))
Example #15
    def test_colexification_network(self):
        graph = colexification_network(
            Wordlist(test_data('colexification.tsv')))
        assert "hand" in graph and "arm" in graph

        graph = colexification_network(
            Wordlist(test_data('colexification.tsv')), bipartite=True)
        assert 'arm' in graph.edge['l4.4'] and 'hand' in graph.edge['l4.4']

        graph = colexification_network(
            Wordlist(test_data('colexification.tsv')),
            output="gml",
            filename=text_type(self.tmp_path("test")))
Example #16
    def test_msa2str(self):
        aranger = '{body}{meta}'

        # read msa traditionally into an object
        msaA = lingpy.MSA(test_data('harry.msa'))

        # read msa from dictionary
        msaB = lingpy.read.qlc.read_msa(test_data('harry.msa'))

        # read msa with IDs
        msaC = lingpy.read.qlc.read_msa(test_data('harry_with_ids.msa'),
                                        ids=True,
                                        header=False)

        # we adjust the dataset and the seq_id since otherwise we won't have
        # similar output
        msaC['seq_id'] = 'test'
        msaC['dataset'] = 'file'

        # when converting these different objects to string with the same
        # arrangement, they should be identical, so we check this here
        strA = msa2str(msaA, _arange=aranger)
        strB = msa2str(msaB, _arange=aranger)
        strC = msa2str(msaC, _arange=aranger, wordlist=False)

        assert strA == strB == strC

        # we next test for converting with the merging attribute
        strD = msa2str(msaC, _arange=aranger, wordlist=True, merge=True)
        strE = msa2str(msaC, _arange=aranger, wordlist=True, merge=False)

        # remove tabstops for checking similar strings
        strDst = strD.replace('\t', '')
        strEst = strE.replace('\t', '')

        # get index up to 'COLUMNID'
        idx = strDst.index('COLUMNID')
        assert strD != strE and strDst[:idx] == strEst[:idx]

        # add a consensus string to all msa objects
        consensusA = lingpy.align.sca.get_consensus(lingpy.align.sca.MSA(msaB),
                                                    gaps=True)
        consensusB = lingpy.align.sca.get_consensus(lingpy.align.sca.MSA(msaC),
                                                    gaps=True)

        msaB['consensus'] = consensusA
        msaC['consensus'] = consensusB

        assert msa2str(msaB) == msa2str(msaC, wordlist=False)
Example #17
 def test_renumber(self):
     from lingpy.basic.ops import renumber
     tmp = Wordlist(test_data('good_file.tsv'))
     tmp.renumber('cogid', 'newcogid')
     assert 'newcogid' in tmp.header
     tmp.renumber('mock')
     assert 'mockid' in tmp.header
Example #18
    def test_load_tree(self):
        # test to load a given tree-file
        tree = LoadTree(test_data('phylogeny.tre'))

        taxa = sorted(["Beijing", "Changsha", "Chengdu", "Fuzhou", "Guangzhou",
                       "Guiyang", "Haerbin", "Haikou", "Hangzhou", "Hefei",
                       "Huhehaote", "Jian\u2019ou", "Jinan", "Kunming",
                       "Lanzhou",
                       "Meixian", "Nanchang", "Nanjing", "Nanning", "Pingyao",
                       "Qingdao", "Shanghai", "Shantou", "Shexian", "Suzhou",
                       "Taibei", "Taiyuan", "Taoyuan", "Tianjin", "Tunxi",
                       "Wenzhou", "Wuhan", "Wulumuqi", "Xi\u2019an", "Xiamen",
                       "Xianggang", "Xiangtan", "Xining", "Yinchuan",
                       "Zhengzhou"])

        for a, b in zip(sorted(tree.taxa), taxa):
            assert a == b

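        # the same tree passed as a newick string should yield identical taxa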
        tree = LoadTree("((((((((Taiyuan,Pingyao,Huhehaote),"
                        "((((Xi’an,Xining,Zhengzhou),(Lanzhou,Yinchuan,"
                        "Wulumuqi)),"
                        "(((Tianjin,Jinan),Qingdao),Beijing,Haerbin)),"
                        "(((Guiyang,Kunming),Chengdu,Wuhan),(Nanjing,Hefei)))),"
                        "(Xiangtan,Changsha)),Nanchang),(Shexian,Tunxi)),"
                        "((Shanghai,Suzhou,Hangzhou),Wenzhou)),"
                        "(((Xianggang,Guangzhou),Nanning),(Meixian,Taoyuan))),"
                        "((((Xiamen,Taibei),Shantou,Haikou),Fuzhou),Jian’ou));")

        for a, b in zip(sorted(tree.taxa), taxa):
            assert a == b
Example #19
    def test_csv2multidict(self):
        if_path1 = test_data('test_csv.csv')

        md = csv2multidict(if_path1)

        assert md['We']['is'] == 'are'
        assert sum([int(md[x]['head']) for x in md]) == 8
Example #20
    def test_get_consensus(self):
        # align all sequences using standard params
        self.alm.align()

        tree = TreeNode(Name='root',
                        Children=[
                            TreeNode(Name=line.split('\t')[1])
                            for line in read_config_file(test_data('KSL2.qlc'))
                        ])

        self.alm.get_consensus(consensus="consensus", tree=tree)
        self.alm.get_consensus(consensus="consensus", classes=True)
        self.alm.get_consensus(consensus="consensus")

        # check whether Turkish strings are identical
        consensus = self.alm.get_list(
            language="Turkish", entry="consensus", flat=True)
        tokens = self.alm.get_list(
            language="Turkish", entry="tokens", flat=True)
        assert consensus == [''.join(x) for x in tokens]
Example #22
    def test_get_consensus(self):
        # align all sequences using standard params
        self.alm.align()

        tree = TreeNode(
            Name='root',
            Children=[TreeNode(Name=line.split('\t')[1]) for line in
                      read_config_file(test_data('KSL2.qlc'))])

        self.alm.get_consensus(consensus="consensus", tree=tree)
        self.alm.get_consensus(consensus="consensus", classes=True)
        self.alm.get_consensus(consensus="consensus")

        # check whether Turkish strings are identical
        consensus = self.alm.get_list(
            language="Turkish", entry="consensus", flat=True)
        tokens = self.alm.get_list(
            language="Turkish", entry="tokens", flat=True)
        assert consensus == [''.join(x) for x in tokens]
Example #24
 def test_get_subset(self):
     self.lex.get_subset([])
     self.assertEqual([v for v in self.lex.subsets.values() if v], [])
     pairs = jsonlib.load(test_data('KSL.pairs.json'))
     self.assertEqual(
         sorted('---'.join(k) for k in self.lex.subsets.keys()),
         sorted(pairs.keys()))
Example #25
 def test_reduce_msa(self):
     msa = MSA(read_msa(test_data('test_reduce.msa')))
     reduced_alignment = reduce_alignment(msa.alignment)
     for i, line in enumerate(reduced_alignment):
         assert len(line) == 4
         assert ''.join(line) == \
             ''.join(msa.alignment[i])[:msa.alignment[i].index('(')]
Example #26
 def test_get_subset(self):
     self.lex.get_subset([])
     self.assertEqual([v for v in self.lex.subsets.values() if v], [])
     pairs = jsonload(test_data('KSL.pairs.json'))
     self.assertEqual(
         sorted('---'.join(k) for k in self.lex.subsets.keys()),
         sorted(pairs.keys()))
Example #27
    def test_wl2qlc(self):
        stamp = 'test-stamp'
        out = self.tmp_path('test')

        wl2qlc(self.wordlist.header, self.wordlist._data,
               filename=out.as_posix(), stamp=stamp)
        out = self.tmp_path('test.qlc')

        with out.open(encoding='utf8') as fp:
            self.assertTrue(fp.read().endswith(stamp))

        # load a wordlist with alignments and output it as a string with
        # msapairs
        tmp = Alignments(test_data('good_file.tsv'), ref='cogid')
        tmp.align(ref="cogid")

        wl2qlc(tmp.header, tmp._data, meta=tmp._meta, filename=out.as_posix(),
               stamp='stampo', ignore=[])
        tmp.get_consensus(ref="cogid")

        wl2qlc([h.upper()
                for h in sorted(tmp.header, key=lambda x: tmp.header[x])],
               tmp._data, meta=tmp._meta, filename=out.as_posix(),
               stamp='stampo', ignore=[], formatter="doculect,concept")
        wl2qlc([h.upper()
                for h in sorted(tmp.header, key=lambda x: tmp.header[x])],
               tmp._data, meta=tmp._meta, filename=out.as_posix(),
               stamp='stampo', ignore=[], formatter="doculect")
Example #28
    def test_msa2str(self):
        aranger = '{body}{meta}'

        # read msa traditionally into an object
        msa_a = MSA(test_data('harry.msa'))

        # read msa from dictionary
        msa_b = qlc.read_msa(test_data('harry.msa'))

        # read msa with IDs
        msa_c = qlc.read_msa(test_data('harry_with_ids.msa'),
                             ids=True,
                             header=False)

        # we adjust the dataset and the seq_id since otherwise we won't have
        # similar output
        msa_c['seq_id'] = 'test'
        msa_c['dataset'] = 'file'

        # when converting these different objects to string with the same
        # arrangement, they should be identical, so we check this here
        str_a = msa2str(msa_a, _arange=aranger)
        str_b = msa2str(msa_b, _arange=aranger)
        str_c = msa2str(msa_c, _arange=aranger, wordlist=False)

        assert str_a == str_b == str_c

        # we next test for converting with the merging attribute
        str_d = msa2str(msa_c, _arange=aranger, wordlist=True, merge=True)
        str_e = msa2str(msa_c, _arange=aranger, wordlist=True, merge=False)

        # remove tabstops for checking similar strings
        str_d_st = str_d.replace('\t', '')
        str_e_st = str_e.replace('\t', '')

        # get index up to 'COLUMNID'
        idx = str_d_st.index('COLUMNID')
        assert str_d != str_e and str_d_st[:idx] == str_e_st[:idx]

        # add a consensus string to all msa objects
        consensus_a = get_consensus(MSA(msa_b), gaps=True)
        consensus_b = get_consensus(MSA(msa_c), gaps=True)

        msa_b['consensus'] = consensus_a
        msa_c['consensus'] = consensus_b

        assert msa2str(msa_b) == msa2str(msa_c, wordlist=False)
Example #29
    def test_msa2str(self):
        aranger = '{body}{meta}'

        # read msa traditionally into an object
        msaA = lingpy.MSA(test_data('harry.msa'))

        # read msa from dictionary
        msaB = lingpy.read.qlc.read_msa(test_data('harry.msa'))

        # read msa with IDs
        msaC = lingpy.read.qlc.read_msa(
            test_data('harry_with_ids.msa'), ids=True, header=False)

        # we adjust the dataset and the seq_id since otherwise we won't have
        # similar output
        msaC['seq_id'] = 'test'
        msaC['dataset'] = 'file'

        # when converting these different objects to string with the same
        # arrangement, they should be identical, so we check this here
        strA = msa2str(msaA, _arange=aranger)
        strB = msa2str(msaB, _arange=aranger)
        strC = msa2str(msaC, _arange=aranger, wordlist=False)

        assert strA == strB == strC

        # we next test for converting with the merging attribute
        strD = msa2str(msaC, _arange=aranger, wordlist=True, merge=True)
        strE = msa2str(msaC, _arange=aranger, wordlist=True, merge=False)

        # remove tabstops for checking similar strings
        strDst = strD.replace('\t', '')
        strEst = strE.replace('\t', '')

        # get index up to 'COLUMNID'
        idx = strDst.index('COLUMNID')
        assert strD != strE and strDst[:idx] == strEst[:idx]

        # add a consensus string to all msa objects
        consensusA = lingpy.align.sca.get_consensus(lingpy.align.sca.MSA(msaB), gaps=True)
        consensusB = lingpy.align.sca.get_consensus(lingpy.align.sca.MSA(msaC), gaps=True)

        msaB['consensus'] = consensusA
        msaC['consensus'] = consensusB

        assert msa2str(msaB) == msa2str(msaC, wordlist=False)
Example #30
    def test_EvalMSA(self):
        msa = MSA(test_data('harry.msa'))
        msa2 = MSA(test_data('harryp.msa'))

        for test in [msa, msa2]:
            obj = EvalMSA(msa, test)
            for mode in range(1, 5):
                obj.c_score(mode=mode)
                if hasattr(obj, 'pic'):
                    del obj.pic
            self.assertRaises(ValueError, obj.c_score, 10)
            res = obj.r_score()
            if test == msa:
                self.assertAlmostEqual(res, 1.0)
            obj.sp_score()
            obj.jc_score()
            obj.check_swaps()
Example #32
 def test__cache(self):
     parser = QLCParserWithRowsAndCols(test_data('KSL.qlc'), 'gloss', 'cogid', {})
     idx = list(parser._data.keys())[0]
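     # fetch the same row twice; the second call should be served from the
     # cache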
     parser._get_cached(idx)
     parser._get_cached(idx)
     parser._clean_cache()
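     # once the cache is cleared and the row removed, the lookup must fail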
     parser._data.pop(idx)
     self.assertRaises(KeyError, parser._get_cached, idx)
Example #33
def test_from_cldf():
    from lingpy.basic.wordlist import from_cldf
    wl = from_cldf(test_data('cldf/test-metadata.json'),
                   language='Name',
                   concept='Name',
                   concepticon="Concepticon_ID")
    assert wl.width == 29
    assert wl.height == 1
Example #34
    def test_get_confidence(self):

        lex = LexStat(test_data('KSL3.qlc'))
        tmpDict = dict([(k, lex[k, 'numbers']) for k in lex])
        self.alm.add_entries('numbers', tmpDict, lambda x: x)
        corrs = self.alm.get_confidence(lex.rscorer, ref='cogid')
        self.alm.output('html', filename=text_type(self.tmp_path('alm')),
                        confidence=True)
Example #35
 def test_get_confidence(self):
     lex = LexStat(test_data('KSL3.qlc'))
     tmp_dict = dict([(k, lex[k, 'numbers']) for k in lex])
     self.alm.add_entries('numbers', tmp_dict, lambda x: x)
     # Run get_confidence to populate the output variable.
     # TODO: Check and document side-effects of this.
     _ = self.alm.get_confidence(lex.rscorer, ref='cogid')
     self.alm.output('html', filename=text_type(self.tmp_path('alm')),
                     confidence=True)
Example #36
    def test_Workflow(self):
        from lingpy.basic.workflow import Workflow

        outfile = self.tmp_path('test')
        wf = Workflow(test_data('KSL.qlc'))
        wf.cognate_detection(export='tsv,html', outfile=str(outfile))
        self.assertTrue(self.tmp_path('test.tsv').exists())
        self.assertTrue(self.tmp_path('test.html').exists())
        wf.cognate_detection(cognate_method='lexstat')
Example #37
    def test_get_confidence(self):

        lex = LexStat(test_data('KSL3.qlc'))
        tmpDict = dict([(k, lex[k, 'numbers']) for k in lex])
        self.alm.add_entries('numbers', tmpDict, lambda x: x)
        corrs = self.alm.get_confidence(lex.rscorer, ref='cogid')
        self.alm.output('html',
                        filename=text_type(self.tmp_path('alm')),
                        confidence=True)
Example #39
 def test_init3(self):  # with kw check=True
     bad_file = Path(test_data('bad_file.tsv'))
     assert_raises(ValueError, LexStat, bad_file.as_posix())
     ls = self._make_one(bad_file.as_posix(), check=True, apply_checks=True)
     assert hasattr(ls, 'errors')
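     # check=True writes a cleaned copy of the bad input next to the original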
     cleaned = bad_file.parent.joinpath(bad_file.name + '_cleaned.qlc')
     self.assertTrue(cleaned.exists())
     os.remove(cleaned.as_posix())
     assert_raises(ValueError, LexStat, {0: ['concept', 'language', 'ipa']})
Example #40
 def test_output(self):
     msa = MSA(test_data('harry.msa'))
     msa.ipa2cls()
     # well, it is a list, but the code apparently wants a dict ...
     msa.merge = {'a': 'x', 'b': 'x'}
     fname = text_type(self.tmp_path('test'))
     for fmt in 'msa psa msq html tex'.split():
         for s, u in product([True, False], [True, False]):
             msa.output(fileformat=fmt, filename=fname, sorted_seqs=s, unique_seqs=u)
Example #41
 def test_init3(self):  # with kw check=True
     bad_file = Path(test_data('bad_file.tsv'))
     assert_raises(ValueError, LexStat, bad_file.as_posix())
     ls = self._make_one(bad_file.as_posix(), check=True, apply_checks=True)
     assert hasattr(ls, 'errors')
     cleaned = bad_file.parent.joinpath(bad_file.name + '_cleaned.tsv')
     self.assertTrue(cleaned.exists())
     os.remove(cleaned.as_posix())
     assert_raises(ValueError, LexStat, {0: ['concept', 'language', 'ipa']})
Example #42
 def test_get_confidence(self):
     lex = LexStat(test_data('KSL3.qlc'))
     tmp_dict = dict([(k, lex[k, 'numbers']) for k in lex])
     self.alm.add_entries('numbers', tmp_dict, lambda x: x)
     # Run get_confidence to populate the output variable.
     # TODO: Check and document side-effects of this.
     _ = self.alm.get_confidence(lex.rscorer, ref='cogid')
     self.alm.output('html',
                     filename=text_type(self.tmp_path('alm')),
                     confidence=True)
Example #43
    def test_load_from_cldf_metadata(self):
        wl = Wordlist.from_cldf(test_data('cldf/test-metadata.json'),
                                col="Language_ID".lower(),
                                row="Parameter_ID".lower())

        assert wl.width == 29
        assert wl.height == 1
        assert wl.entries[0] == 'alignment'
        assert wl.cols[0] == 'anuta'.lower()
        assert wl.cols[28] == 'wallisian'
Example #44
def test_read_asjp():
    lex = LexStat(read_asjp(
        test_data('asjp_test_list.csv'), family="CELTIC", classification="wls_gen"))
    assert len(lex) == 249
    
    evaluate = lambda x, y, z: x[y[1]].startswith(z)

    lex = LexStat(read_asjp(
        test_data('asjp_test_list.csv'), family='GERMANIC',
        classification='wls_fam,wls_gen', evaluate=evaluate))
    
    assert len(lex) == 1429

    # check if loans have been traced and if at least one word is represented
    # as expected
    entry = lex.get_dict(doculect="YIDDISH_EASTERN")
    idx = entry['person'][0]
    assert lex[idx, 'known_borrowings'] == 1
    assert lex[idx, 'counterpart'] == "pErzon"
Example #45
    def test_ipa2tokens(self):
        seq = 'ˈtʲʰoɔːix_tərp͡f¹¹'
        assert len(ipa2tokens(seq)) != len(list(seq))

        seq = 'ʰto͡i'
        assert len(ipa2tokens(seq)) == 2

        seq = 'th o x t a'
        assert len(ipa2tokens(seq)) == len(seq.split(' '))

        seq = '# b l a #'
        assert len(ipa2tokens(seq)) == len(seq.split(' ')) - 2

        # now check with all possible data we have so far, but only on cases
        # where tokenization doesn't require the merge_vowels = False flag
        tokens = csv2list(test_data('test_tokenization.tsv'))

        for a, b in tokens:
            tks = ' '.join(ipa2tokens(a))
            # compare against the expected tokenization from the test data
            assert tks == b

        # now test on smaller set with unmerged vowels
        tokens = csv2list(test_data('test_tokenization_mv.tsv'))

        for a, b in tokens:
            tks = ' '.join(
                ipa2tokens(a, merge_vowels=False, merge_geminates=False))
            # compare against the expected tokenization from the test data
            assert tks == b

        tokens = csv2list(test_data('test_tokenization_nasals.tsv'))
        for a, b in tokens:
            tks = ' '.join(
                ipa2tokens(a,
                           merge_vowels=True,
                           merge_geminates=True,
                           expand_nasals=True,
                           semi_diacritics='h'))
            assert tks == b
Example #46
    def test_read_dst(self):
        t1, m1 = read_dst(test_data('phylip_basic.dst'))
        t2, m2 = read_dst(test_data('phylip_tabstop.dst'), taxlen=0)
        t3, m3 = read_dst(MATRIX, taxlen=0)

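        # all three inputs encode the same distance matrix, so taxa and
        # column sums must agree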
        assert t1 == t2 == t3

        ma0 = sum([m[0] for m in m1])  # 1.9
        ma1 = sum([m[1] for m in m1])  # 2.1
        ma2 = sum([m[2] for m in m1])  # 1.2
        ma3 = sum([m[3] for m in m1])  # 1.8
        mb0 = sum([m[0] for m in m2])  # 1.9
        mb1 = sum([m[1] for m in m2])  # 2.1
        mb2 = sum([m[2] for m in m2])  # 1.2
        mb3 = sum([m[3] for m in m2])  # 1.8

        assert round(ma0, 2) == round(mb0, 2) == 1.9
        assert round(ma1, 2) == round(mb1, 2) == 2.1
        assert round(ma2, 2) == round(mb2, 2) == 1.2
        assert round(ma3, 2) == round(mb3, 2) == 1.8
Example #47
 def test_init(self):
     self._make_one(
         {
             0: ['ID', 'doculect', 'concept', 'IPA'],
             1: ['1', 'deu', 'hand', 'hant']
         },
         model='sca')
     ls = self._make_one({
         0: ['ID', 'doculect', 'concept', 'IPA'],
         1: ['1', 'deu', 'hand', 'hant']
     })
     self.assertIn('lexstat', repr(ls))
     self._make_one(ls)
     self._make_one({
         0: ['ID', 'doculect', 'concept', 'tokens'],
         1: ['1', 'deu', 'hand', ['h', 'a', 'n', 't']]
     })
     self.assertRaises(AssertionError, LexStat, {
         0: ['ID', 'doculect', 'concept'],
         1: ['1', 'deu', 'hand']
     })
     self._make_one(test_data('phybo.qlc'), check=True)
     with patch('lingpy.compare.lexstat.log', self.log):
         self._make_one(test_data('KSL.qlc'), check=True)
         assert self.log.info.called
     error_log = self.tmp_path('errors')
     with patch('lingpy.util.confirm', Mock(return_value=True)):
         lex = self._make_one(
             {
                 0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
                 1: ['1', 'deu', 'hand', 'hand', ['']],
                 2: ['2', 'eng', 'hand', 'hand', ['abc']],
                 3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
             },
             check=True,
             errors='%s' % error_log)
         assert error_log.exists()
         self.assertTrue(lex.filename.endswith('_cleaned.tsv'))
         self.assertTrue(os.path.exists(lex.filename))
         os.remove(lex.filename)
         self.assertEqual(len(lex._meta['errors']), 2)
Example #48
    def test_cache(self):
        filename = 'lingpy_test.qlc'
        self.parser.pickle(filename=filename)
        from_cache = QLCParser.unpickle(filename)
        self.assertEqual(self.parser.header, from_cache.header)
        os.remove(str(path(filename)))

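        # the same pickle/unpickle round-trip should work for a full Wordlist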
        wl = Wordlist(test_data('KSL.qlc'))
        wl.pickle(filename=filename)
        from_cache = Wordlist.unpickle(filename)
        self.assertTrue(from_cache._class)
        os.remove(str(path(filename)))
Example #49
 def test_output(self):
     msa = MSA(test_data('harry.msa'))
     msa.ipa2cls()
     # well, it is a list, but the code apparently wants a dict ...
     msa.merge = {'a': 'x', 'b': 'x'}
     fname = text_type(self.tmp_path('test'))
     for fmt in 'msa psa msq html tex'.split():
         for s, u in product([True, False], [True, False]):
             msa.output(fileformat=fmt,
                        filename=fname,
                        sorted_seqs=s,
                        unique_seqs=u)
Example #50
    def test_from_cldf(self):
        wl = from_cldf(test_data('cldf/test-metadata.json'),
                       language='Name',
                       concept='Name',
                       concepticon="Concepticon_ID",
                       glottocode='glottocode')

        assert wl.width == 29
        assert wl.height == 1
        assert wl.entries[0] == 'alignment'
        assert wl.cols[0] == 'Anuta'
        assert wl.cols[28] == 'Vaeakau-Taumako'
Example #51
    def test_read_scorer(self):
        scorer = read_scorer(test_data('dolgo.matrix'))

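        # consonant classes matched against the vowel class 'V' receive a
        # strong penalty in this scoring matrix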
        assert sorted(scorer.chars2int)[0] == '+'
        for letter in 'PTKRSM':
            assert scorer[letter, 'V'] == -10

        assert max(scorer.chars2int.values()) == 15

        # add scorer from string
        scorer2 = read_scorer(DOLGO)
        assert sorted(scorer.chars2int) == sorted(scorer2.chars2int)
Example #52
    def test_multiple(self):
        # first test, align string, no output, no input
        output = self.run_cli('multiple -s woldemort waldemar walter')
        self.assertIn('w\ta\tl\tt\te\t-\t-\tr\t-', output)

        # second test, test output as file, no input, vary method as sca
        mlt = main('multiple', '-s', 'woldemort', 'waldemar', 'walter',
                   '--method', 'sca', '--output-file',
                   self.tmp_path('out.msa').as_posix())

        # third test, test output and input
        mlt = main('multiple', '-i', test_data('harryp.msa'), '--method',
                   'sca', '--output-file',
                   self.tmp_path('out2.msa').as_posix(), '--align-method',
                   'library')
        assert len(mlt[0]) == 7

        # fourth test, test output and input with method=basic
        mlt = main('multiple', '-i', test_data('harryp.msa'), '--method',
                   'basic', '--output-file',
                   self.tmp_path('out2.msa').as_posix())
        assert len(mlt[0]) == 7
        assert len([x for x in mlt[1][-1] if x != '-']) == 4
Example #53
def test_read_asjp():
    lex = LexStat(
        read_asjp(test_data('asjp_test_list.csv'),
                  family="CELTIC",
                  classification="wls_gen"))
    assert len(lex) == 249

    evaluate = lambda x, y, z: x[y[1]].startswith(z)

    lex = LexStat(
        read_asjp(test_data('asjp_test_list.csv'),
                  family='GERMANIC',
                  classification='wls_fam,wls_gen',
                  evaluate=evaluate))

    assert len(lex) == 1429

    # check if loans have been traced and if at least one word is represented
    # as expected
    entry = lex.get_dict(doculect="YIDDISH_EASTERN")
    idx = entry['person'][0]
    assert lex[idx, 'known_borrowings'] == 1
    assert lex[idx, 'counterpart'] == "pErzon"
Example #54
    def test_plots(self):
        plot_gls(self.gls,
                 self.tree,
                 filename=text_type(self.tmp_path('test')))
        plot_tree(self.tree, filename=text_type(self.tmp_path('test')))
        plot_concept_evolution(self.scenarios,
                               self.tree,
                               filename=text_type(self.tmp_path('test')))

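        # plot_heatmap needs a reference tree, so have the wordlist
        # calculate one first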
        wl = Wordlist(test_data('KSL.qlc'))
        wl.calculate('tree')
        plot_heatmap(wl,
                     filename=text_type(self.tmp_path('test')),
                     ref="cogid",
                     refB="cogid",
                     steps=1)
Example #55
 def test_init(self):
     p = QLCParser({0: ['a']})
     QLCParser(p)
     self.assertRaises(IOError, QLCParser, 'not-existing-file')
     self.assertRaises(TypeError, QLCParser, None)
     self.assertRaises(ValueError, QLCParserWithRowsAndCols, {0: ['a']}, 'x', 'y', {})
     
     self.assertRaises(ValueError, QLCParserWithRowsAndCols,
                       {0: ['concept', 'language', 'bla'],
                        1: ['bla', 'blu']}, 'concept', 'language', '')
     
     p2 = QLCParserWithRowsAndCols(test_data('bad_file2.tsv'), 'concept',
                                   'language', data_path('conf', 'wordlist.rc'))
     assert p2.get_entries('cogid')[0][-1] == 'ff'
     self.assertRaises(KeyError, p2.__getitem__, tuple([2000, 'bla']))
     assert p2[3, 'language'] == 'l3'
     assert p2[3, 'nothing'] is None
Example #56
    def test_output(self):
        fpsa = self.tmp_path('test.psa')
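        # a file containing just a newline is enough to instantiate a PSA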
        write_text_file(fpsa, '\n')
        psa = PSA(text_type(fpsa))
        fname = text_type(self.tmp_path('test'))
        psa.output(fileformat='psa', filename=fname)

        psq = self.tmp_path('test.psq')
        write_text_file(psq, '\n')
        psa = PSA(text_type(psq))
        fname = text_type(self.tmp_path('test'))
        psa.output(fileformat='psq', filename=fname)

        psa = PSA(text_type(test_data('harry_potter.psa')))
        psa.align()
        psa.output(fileformat="psa", filename=fname, scores=True)
        psa.output(fileformat="psq", filename=fname)
Example #57
    def test_wl2qlc(self):
        stamp = 'test-stamp'
        out = self.tmp_path('test')

        wl2qlc(self.wordlist.header,
               self.wordlist._data,
               filename=out.as_posix(),
               stamp=stamp)
        out = self.tmp_path('test.qlc')

        with out.open(encoding='utf8') as fp:
            self.assertTrue(fp.read().endswith(stamp))

        # load a wordlist with alignments and output it as a string with
        # msapairs
        tmp = Alignments(test_data('good_file.tsv'), ref='cogid')
        tmp.align(ref="cogid")

        wl2qlc(tmp.header,
               tmp._data,
               meta=tmp._meta,
               filename=out.as_posix(),
               stamp='stampo',
               ignore=[])
        tmp.get_consensus(ref="cogid")

        header = [h.upper()
                  for h in sorted(tmp.header, key=lambda x: tmp.header[x])]
        wl2qlc(header, tmp._data, meta=tmp._meta, filename=out.as_posix(),
               stamp='stampo', ignore=[], formatter="doculect,concept")
        wl2qlc(header, tmp._data, meta=tmp._meta, filename=out.as_posix(),
               stamp='stampo', ignore=[], formatter="doculect")
Example #58
    def test_init2(self):
        freqs = self.lex.freqs['Hawaiian']
        seq = {'5.W.C': 19, '5.I.V': 87, '5.Y.V': 75, '5.U.V': 87}

        for char, n in seq.items():
            self.assertEqual(freqs[char], n)

        self.assertEqual(len(self.lex.chars), 187)
        self.assertEqual(len(self.lex.rchars), 35)
        self.maxDiff = None

        for name in 'bscorer rscorer pairs'.split():
            obj = jsonlib.load(test_data('KSL.%s.json' % name))
            if name != 'pairs':
                self.assertEqual(getattr(self.lex, name).matrix, obj)
            else:
                for key, values in self.lex.pairs.items():
                    values = set(values)
                    ovalues = set(tuple(v) for v in obj['---'.join(key)])
                    self.assertEqual(values, ovalues)