# NOTE(review): both functions below take `self`; presumably they are methods
# of a class whose header is not visible here -- confirm and re-indent under
# that class when merging.
def matrix_output(self):
    """Placeholder for the "Jelena style" matrix output format.

    Not implemented: the body does nothing and the call returns None.
    """
    # produce Jelena style output format with matrix
    pass


def qlc_output_format(self):
    """Print the word list as tab-separated QLC rows on standard output.

    A header row (COUNTERPART, ORTHO_PARSE, CONCEPT, LANGUAGE) is printed
    first, followed by one row per ``(counterpart, concept, language)``
    triple taken from ``self._wordlist_iterator``.

    * Counterparts equal to ``"?"`` or ``"NONE"`` are not parsed; the
      placeholder itself is repeated in the ORTHO_PARSE column.
    * Every other counterpart is handed to
      ``self.o.parse_string_to_graphemes_string``. The result is indexed as
      ``(status, parsed_string)``; rows whose status is falsy are skipped
      without any output.
    """
    print("\t".join(("COUNTERPART", "ORTHO_PARSE", "CONCEPT", "LANGUAGE")))

    for counterpart, concept, language in self._wordlist_iterator:
        # Placeholder entries carry no orthography to parse; echo them as-is.
        if counterpart in ("?", "NONE"):
            print("\t".join((counterpart, counterpart, concept, language)))
            continue

        parse_result = self.o.parse_string_to_graphemes_string(counterpart)

        # Skip entries the orthography parser could not handle. The status is
        # checked before the parsed string is read, as in the original code.
        if not parse_result[0]:
            continue

        ortho_parse = parse_result[1]
        print("\t".join((counterpart, ortho_parse, concept, language)))


if __name__ == "__main__":
    from qlc.tokenizer import Tokenizer
    from qlc import ngram  # only used by the disabled lines below

    t = Tokenizer()
    t.lingpy_output()

    # words = t.get_qlc_tokenized_words()
    # ngram.unigram_model(words)
# NOTE(review): the physical line below holds several statements run together
# with their original line breaks and indentation lost:
#   1. the last two statements of a definition that begins before this line
#      (an `ortho_parse` assignment and a print of language, concept,
#      counterpart and the parsed form, tab-separated);
#   2. matrix_output(self) -- a stub whose body is only `pass`;
#   3. qlc_output_format(self) -- prints a tab-separated header, then one row
#      per (counterpart, concept, language) triple from
#      self._wordlist_iterator; "?" / "NONE" counterparts are echoed unparsed
#      and rows whose parse status equals False are skipped;
#   4. an `if __name__=="__main__":` section that builds a Tokenizer and calls
#      t.qlc_output_format().
# As written, everything after the first `#` on the line is comment text to
# Python, so the line breaks must be restored before this can run. The code is
# left untouched here because item 1 starts outside the visible source and its
# nesting depth cannot be determined -- TODO restore the original layout.
# The same two function definitions and an entry point also appear just above
# in this file (there the entry point calls t.lingpy_output()); presumably one
# copy is stale -- confirm which to keep.
ortho_parse = grapheme_parsed_counterpart_tuple[1] print(language+"\t"+concept+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1]) def matrix_output(self): # produce Jelena style output format with matrix pass def qlc_output_format(self): # produce counterpart \t concept \t language QLC output format print("COUNTERPART"+"\t"+"ORTHO_PARSE"+"\t"+"CONCEPT"+"\t"+"LANGUAGE") for counterpart, concept, language in self._wordlist_iterator: if counterpart == "?" or counterpart == "NONE": print(counterpart+"\t"+counterpart+"\t"+concept+"\t"+language) continue grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart) # skip shit that doesn't parse if grapheme_parsed_counterpart_tuple[0] == False: continue ortho_parse = grapheme_parsed_counterpart_tuple[1] print(counterpart+"\t"+ortho_parse+"\t"+concept+"\t"+language) if __name__=="__main__": from qlc.tokenizer import Tokenizer from qlc import ngram t = Tokenizer() t.qlc_output_format() # words = t.get_qlc_tokenized_words() # ngram.unigram_model(words)