# counterparts_huber1992.py: build a languages-by-bigrams count matrix for the huber1992 source.
import collections
import os
import sys

import numpy

import qlc.ngram
import qlc.matrix
# import paths below assume the qlc package layout; adjust as needed
from qlc.corpusreader import CorpusReaderWordlist
from qlc.orthography import OrthographyParser


def main(argv):
    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        sys.exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))

    ngrams_by_language_count = list()
    ngrams_set = set()

    for i, wordlistdata_id in enumerate(cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        # counterparts = cr.counterpartsForWordlistdataId(wordlistdata_id)
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

        counterpart_graphemes = (o.parse_string_to_graphemes(counterpart)
            for counterpart in cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(counterpart_graphemes, 2)

        # column sums, i.e. the total count of each bigram over all counterparts
        column_sums = numpy.sum(matrix.matrix, 0)

        # sanity check: every bigram column should have at least one nonzero entry
        if len(column_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            print("{0} != {1}".format(len(column_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)

        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = column_sums[j]

    ngrams_list = sorted(ngrams_set)

    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    # matrix = numpy.zeros( ( len(ngrams_by_language_count), len(ngrams_list) ) )

    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]

    print(matrix.matrix)


if __name__ == "__main__":
    main(sys.argv)
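# Below is a minimal, dependency-light sketch of what the accumulation above
# computes: per-language bigram counts aligned on one shared, sorted bigram
# vocabulary. It uses only the standard library and numpy; the function name
# and toy data are hypothetical, and it ignores any word-boundary handling the
# real words_ngrams_matrix_for_graphemes_list may do.
import collections
import numpy

def bigram_counts_per_language(words_by_language):
    counts = []
    vocabulary = set()
    for words in words_by_language:
        c = collections.defaultdict(int)
        for word in words:
            for a, b in zip(word, word[1:]):
                c[a + b] += 1  # count each adjacent symbol pair
        vocabulary.update(c)
        counts.append(c)
    ngrams = sorted(vocabulary)
    m = numpy.zeros((len(counts), len(ngrams)))
    for i, c in enumerate(counts):
        for j, ngram in enumerate(ngrams):
            m[i][j] = c[ngram]
    return ngrams, m

# Example: two toy "languages" with a couple of words each.
ngrams, m = bigram_counts_per_language([["aba", "ab"], ["ba", "bab"]])
# ngrams == ['ab', 'ba'];  m == [[2., 1.], [1., 2.]]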
import collections
import unicodedata
# SafeConfigParser was removed in newer Python 3 releases; ConfigParser is the replacement
from configparser import ConfigParser
# import path assumes the qlc package layout; adjust as needed
from qlc.orthography import OrthographyParser


class Tokenizer:
    """
    Takes as input a file in the QLC format:

        counterpart \t concept \t language

    and does things like:

    - tokenizes the file into LINGPY format
    - tokenizes the data into orthographically parsed QLC format
    - locates unicorns
    """

    def __init__(self):
        # deal with the configuration file
        cfg = ConfigParser()
        cfg.read("default.cfg")
        data = cfg.get("Paths", "data")
        orthography_profile = cfg.get("Paths", "orthography_profile")

        # set variables, e.g. source, orthography parser, etc.
        self.data = open(data, "r")
        self.o = OrthographyParser(orthography_profile)
        # self.o = GraphemeParser()

        self._languages = collections.defaultdict(int)     # unique ID for each unique language name
        self._concepts = collections.defaultdict(int)      # unique ID for each unique concept
        self._counterparts = collections.defaultdict(int)  # unique ID for each unique counterpart

        self._wordlist_iterator = self._process_input(self.data)

        # words = self.get_qlc_tokenized_words()
        """
        count = 0
        for line in words:
            if line != "":
                print(line)
                count += 1
        print(count)
        """

        """
        self.cr = CorpusReaderWordlist("data/csv")
        self.wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
            for wordlistdata_id in self.cr.wordlistdata_ids_for_bibtex_key(source)
            for concept, counterpart in self.cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
        )
        """

    def _process_input(self, file):
        languages_id = 1
        concepts_id = 1
        counterparts_id = 1

        header = file.readline()  # skip the header row
        lines = []

        for line in file:
            line = line.strip()
            line = line.replace("  ", " ")  # collapse doubled spaces (assumed intent; the original replaced a space with a space)
            counterpart, concept, language = line.split("\t")
            lines.append((counterpart, concept, language))

            if language not in self._languages:
                self._languages[language] = languages_id
                languages_id += 1
            if concept not in self._concepts:
                self._concepts[concept] = concepts_id
                concepts_id += 1
            if counterpart not in self._counterparts:
                self._counterparts[counterpart] = counterparts_id
                counterparts_id += 1

        # single-pass iterator over (counterpart, concept, language) triples
        return iter(lines)

    def get_qlc_tokenized_words(self):
        unparsables = open("unparsables.txt", "w")
        tokenized_words = []
        for counterpart, concept, language in self._wordlist_iterator:
            counterpart = unicodedata.normalize("NFD", counterpart)
            # the parser returns a (success_flag, parse_or_error_string) tuple
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            if not grapheme_parsed_counterpart_tuple[0]:
                unparsables.write(grapheme_parsed_counterpart_tuple[1])
                continue
            grapheme_parse = grapheme_parsed_counterpart_tuple[1]
            tokenized_words.append(grapheme_parse)
        return tokenized_words

    def get_ipa_tokenized_words(self):
        # incomplete: parses each tokenized word but does not yet map it to IPA
        tokenized_words = []
        words = self.get_qlc_tokenized_words()
        for word in words:
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(word)
            # TODO: convert the graphemic parse to IPA and append it
        return tokenized_words

    def lingpy_output(self):
        # given some data set from the corpusreader or somewhere else, output a lingpy format
        row_id = 1
        print("\t".join(["ID", "Taxa", "TaxonID", "Gloss", "GlossID", "IPA", "Orthography"]))
        # print("# LANGUAGE"+"\t"+"CONCEPT"+"\t"+"COUNTERPART"+"\t"+"ORTHO_PARSE")

        for counterpart, concept, language in self._wordlist_iterator:
            # skip for Mattis
            if counterpart == "?" or counterpart == "NONE":
                continue

            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            if not grapheme_parsed_counterpart_tuple[0]:
                continue
            ortho_parse = grapheme_parsed_counterpart_tuple[1]

            print("\t".join([str(row_id), language, str(self._languages[language]),
                             concept, str(self._concepts[concept]),
                             counterpart, ortho_parse]))
            # print(language+"\t"+concept+"\t"+counterpart+"\t"+ortho_parse)
            row_id += 1

    def matrix_output(self):
        # produce Jelena-style output format with matrix
        pass

    def qlc_output_format(self):
        # produce counterpart \t ortho_parse \t concept \t language QLC output format
        print("\t".join(["COUNTERPART", "ORTHO_PARSE", "CONCEPT", "LANGUAGE"]))
        for counterpart, concept, language in self._wordlist_iterator:
            if counterpart == "?" or counterpart == "NONE":
                print("\t".join([counterpart, counterpart, concept, language]))
                continue
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            # skip entries that do not parse
            if not grapheme_parsed_counterpart_tuple[0]:
                continue
            ortho_parse = grapheme_parsed_counterpart_tuple[1]
            print("\t".join([counterpart, ortho_parse, concept, language]))
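# A minimal usage sketch for the Tokenizer above. The "default.cfg" layout is
# inferred from the cfg.get() calls in __init__; the file names below are
# hypothetical examples.
#
#   # default.cfg
#   [Paths]
#   data = data/huber1992.tsv
#   orthography_profile = data/orthography_profiles/huber1992.txt
#
# With such a config in the working directory:
#
#   t = Tokenizer()
#   t.lingpy_output()        # LINGPY-format rows on stdout
#   t.qlc_output_format()    # orthographically parsed QLC rows on stdout
#
# Note that _process_input() yields a single-pass iterator, so each Tokenizer
# instance supports only one full traversal of the wordlist.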
# parse_counterparts.py: orthographically parse all counterparts for a wordlist source.
import sys

# import paths assume the qlc package layout; adjust as needed
from qlc.corpusreader import CorpusReaderWordlist
from qlc.orthography import OrthographyParser, OrthographyRulesParser

# sink for parse failures (assumed; opened here so report_unparsables can write)
unparsables = open("unparsables.txt", "w")

def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string
    unparsables.write(error)

if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
cr = CorpusReaderWordlist("data/csv")
o = OrthographyParser("data/orthography_profiles/"+source+".txt")
rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")

# create a generator of corpus reader data
wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
)

# print header
print("\t".join(["wordlist_id", "language_book_name", "concept", "counterpart",
                 "graphemic_parse", "ipa_parse", "orthographic_rules_parse"]))

err_count = 0
errors = ""
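# Usage sketch for the script above, with "huber1992" as the bibtex key (the
# source used in counterparts_huber1992.py):
#
#   python parse_counterparts.py huber1992 > parsed_counterparts.tsv
#
# The parser methods return a (success_flag, parse_or_error) tuple, so the
# main loop that fills the columns above would look roughly like this
# (hypothetical body; the loop itself is not shown in this file):
#
#   for wordlistdata_id, concept, counterpart in wordlist_iterator:
#       parse_tuple = o.parse_string_to_graphemes_string(counterpart)
#       if not parse_tuple[0]:
#           report_unparsables(wordlistdata_id, concept, counterpart, parse_tuple)
#           err_count += 1
#           continue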
# Dictionary-data variant of parse_counterparts.py: same reporting helper, but
# reads heads and translations via CorpusReaderDict instead of wordlist data.
import os
import sys

# import paths assume the qlc package layout; adjust as needed
from qlc.corpusreader import CorpusReaderDict
from qlc.orthography import OrthographyParser, OrthographyRulesParser

# sink for parse failures (assumed, as above)
unparsables = open("unparsables.txt", "w")

def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string
    unparsables.write(error)

if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
# cr = CorpusReaderDict("data/testcorpus")
# cr = CorpusReaderWordlist("data/csv")
cr = CorpusReaderDict("data/csv")
o = OrthographyParser("data/orthography_profiles/"+source+".txt")

# the orthographic rules file is optional for this source type
rules_file_flag = 0
if os.path.isfile("data/orthography_profiles/"+"rules_"+source+".txt"):
    rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")
    rules_file_flag = 1

# create a generator of corpus reader data
wordlist_iterator = ( (wordlistdata_id, head, translation)
    for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source)
    for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id)
)

# print header
if rules_file_flag: