コード例 #1
0
ファイル: tokenizer.py プロジェクト: RichardLitt/lingpy
    def matrix_output(self):
        """Placeholder for the "Jelena style" matrix output format.

        Not implemented yet: the method has no effect and returns None.
        """
        return None

    def qlc_output_format(self):
        """Print the word list as tab-separated QLC rows on standard output.

        A header row (COUNTERPART, ORTHO_PARSE, CONCEPT, LANGUAGE) is printed
        first, followed by one row per (counterpart, concept, language) triple
        taken from ``self._wordlist_iterator``.

        * A counterpart equal to "?" or "NONE" is not parsed; it is echoed
          unchanged in the ORTHO_PARSE column.
        * Every other counterpart is handed to
          ``self.o.parse_string_to_graphemes_string``. The first element of
          the result is treated as a success flag and the second as the
          parsed string; rows whose flag equals False are dropped silently.
        """
        tab = "\t"
        unparsed_markers = ("?", "NONE")

        print(tab.join(("COUNTERPART", "ORTHO_PARSE", "CONCEPT", "LANGUAGE")))

        for counterpart, concept, language in self._wordlist_iterator:
            if counterpart in unparsed_markers:
                # Placeholder entries are reported as-is in both columns.
                ortho_column = counterpart
            else:
                outcome = self.o.parse_string_to_graphemes_string(counterpart)
                # Compared with == on purpose: only a flag equal to False
                # causes the row to be skipped.
                if outcome[0] == False:  # noqa: E712
                    continue
                ortho_column = outcome[1]

            print(tab.join((counterpart, ortho_column, concept, language)))




if __name__=="__main__":
    # Script entry point: this block runs only when the file is executed
    # directly, not when it is imported.
    from qlc.tokenizer import Tokenizer
    # ngram is referenced only by the commented-out lines at the end of
    # this block.
    from qlc import ngram
    # Create a Tokenizer with no arguments and call its lingpy_output().
    t = Tokenizer()
    t.lingpy_output()
#    words = t.get_qlc_tokenized_words()
#    ngram.unigram_model(words)
コード例 #2
0
ファイル: tokenizer.py プロジェクト: pombredanne/qlc
            ortho_parse = grapheme_parsed_counterpart_tuple[1]
            print(language+"\t"+concept+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1])

    def matrix_output(self):
        """Placeholder for the "Jelena style" matrix output format.

        Not implemented yet: the method has no effect and returns None.
        """
        return None

    def qlc_output_format(self):
        """Print the word list as tab-separated QLC rows on standard output.

        A header row (COUNTERPART, ORTHO_PARSE, CONCEPT, LANGUAGE) is printed
        first, followed by one row per (counterpart, concept, language) triple
        taken from ``self._wordlist_iterator``.

        * A counterpart equal to "?" or "NONE" is not parsed; it is echoed
          unchanged in the ORTHO_PARSE column.
        * Every other counterpart is handed to
          ``self.o.parse_string_to_graphemes_string``. The first element of
          the result is treated as a success flag and the second as the
          parsed string; rows whose flag equals False are dropped silently.
        """
        tab = "\t"
        unparsed_markers = ("?", "NONE")

        print(tab.join(("COUNTERPART", "ORTHO_PARSE", "CONCEPT", "LANGUAGE")))

        for counterpart, concept, language in self._wordlist_iterator:
            if counterpart in unparsed_markers:
                # Placeholder entries are reported as-is in both columns.
                ortho_column = counterpart
            else:
                outcome = self.o.parse_string_to_graphemes_string(counterpart)
                # Compared with == on purpose: only a flag equal to False
                # causes the row to be skipped.
                if outcome[0] == False:  # noqa: E712
                    continue
                ortho_column = outcome[1]

            print(tab.join((counterpart, ortho_column, concept, language)))

if __name__=="__main__":
    # Script entry point: this block runs only when the file is executed
    # directly, not when it is imported.
    from qlc.tokenizer import Tokenizer
    # ngram is referenced only by the commented-out lines at the end of
    # this block.
    from qlc import ngram
    # Create a Tokenizer with no arguments and print its word list through
    # qlc_output_format().
    t = Tokenizer()
    t.qlc_output_format()
#    words = t.get_qlc_tokenized_words()
#    ngram.unigram_model(words)