def main(argv): if len(argv) < 3: print("call: translations_spanish_1.py data_path bibtex_key") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = [] dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key.") sys.exit(1) for dictdata_id in dictdata_ids: translations = collections.defaultdict(int) heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) output = codecs.open("translations_subentries_for_%s.txt" % dictdata_string, "w", "utf-8") for entry_id in heads_with_translations: if heads_with_translations[entry_id]['is_subentry'] == 't': for t in heads_with_translations[entry_id]['translations']: translations[t] += 1 for w in sorted(translations.iteritems(), key=itemgetter(1), reverse=True): output.write("{0}\t{1}\n".format(w[0], w[1]))
def main(argv): if len(argv) < 2: print("call: heads_with_translations.py data_path [(bibtex_key|component)]") exit(1) cr = CorpusReaderDict(argv[1]) print("Data loaded", file=sys.stderr) dictdata_ids = [] if len(argv) == 3: dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) else: dictdata_ids = cr.dictdata_string_ids for dictdata_id in dictdata_ids: #heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr) output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8") for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): output.write("%s\t%s\n" % (head, translation)) output.close()
def main(argv): if len(argv) < 2: print("call: phonology.py data_path [bibtex_key]") exit(1) cr = CorpusReaderDict(argv[1]) bibtex_key = None dictdata_ids = [] if len(argv) > 2: bibtex_key = argv[2] dictdata_ids = cr.dictdataIdsForBibtexKey(bibtex_key) else: dictdata_ids = cr.dictdataStringIds() for dictdata_id in dictdata_ids: phonology = cr.phonologyForDictdataId(dictdata_id) dictdata_string = cr.dictdataStringIdForDictataId(dictdata_id) bibtex_key = dictdata_string.split("_")[0] output = codecs.open("phonology_%s.txt" % dictdata_string, "w", "utf-8") for entry_id in phonology: for p in phonology[entry_id]["phonology"]: output.write("%s\thttp://cidles.eu/quanthistling/source/%s/%s/%s/index.html\n" % (p, bibtex_key, phonology[entry_id]["startpage"], phonology[entry_id]["pos_on_page"])) output.close()
def main(argv): if len(argv) < 3: print("call: translations_spanish_graph.py data_path (bibtex_key|component)") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2]) if len(dictdata_ids) == 0: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2])) sys.exit(1) for dictdata_id in dictdata_ids: gr = Graph() src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id) tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id) if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']): continue if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1): continue language_iso = None if tgt_language_iso == [ 'spa' ]: language_iso = src_language_iso[0] else: language_iso = tgt_language_iso[0] dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) bibtex_key = dictdata_string.split("_")[0] for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id): if src_language_iso == [ 'spa' ]: (head, translation) = (translation, head) head_with_source = escape_string("{0}|{1}".format(head, bibtex_key)) translation = escape_string(translation) #translation_with_language = "{0}|{1}".format(translation, language_iso) #if head_with_source not in gr: gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key }) #if translation not in gr: gr.add_node(translation, attr_dict={ "lang": "spa" }) #if not gr.has_edge((head_with_source, translation)): gr.add_edge(head_with_source, translation) output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8") output.write(write(gr)) output.close()
# imports assumed from the omitted file header
import sys
import collections
import qlc.utils
from qlc.corpusreader import CorpusReaderDict

def main(argv):
    # check for the right number of command line arguments
    if len(argv) < 3:
        print()
        print("Call: create_initial_orthography_profile.py data_path data_source")
        print()
        print("python create_initial_orthography_profile.py data/csv/ thiesen1998")
        sys.exit(1)

    # use the argv passed into main(), not sys.argv, as checked above
    data_path = argv[1]
    data_source = argv[2]
    orthography_profile = open(data_source + "_initial_profile.txt", "w")  # output file

    cr = CorpusReaderDict(data_path)
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(data_source)

    # make sure the resource is in the data
    if len(dictdata_ids) == 0:
        print("There is no dictionary source for the data source you provided: " + data_source)
        sys.exit(1)

    # count grapheme frequencies over all heads
    grapheme_frequency_dict = collections.defaultdict(int)
    grapheme_count = 0.0
    for dictdata_id in dictdata_ids:
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            graphemes = qlc.utils.parseGraphemes(head)
            for grapheme in graphemes:
                grapheme_count += 1
                grapheme_frequency_dict[grapheme] += 1

    header = "grapheme" + "\t" + "count" + "\t" + "total frequency"
    print(header)
    orthography_profile.write(header + "\n")
    for k, v in grapheme_frequency_dict.items():
        if k == " ":  # skip space between words
            continue
        result = k + "\t" + str(v) + "\t" + str(v / grapheme_count * 100)
        print(result)
        orthography_profile.write(result + "\n")
    orthography_profile.close()
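# qlc.utils.parseGraphemes is used above but its implementation is not shown.
# A minimal sketch of what such a grapheme segmenter might do (an assumption,
# not the library's actual implementation): group each base character with any
# Unicode combining marks that follow it.
import unicodedata

def parse_graphemes_sketch(word):
    graphemes = []
    for char in word:
        if graphemes and unicodedata.combining(char):
            graphemes[-1] += char  # attach combining diacritic to its base character
        else:
            graphemes.append(char)
    return graphemes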
# -*- coding: utf-8 -*-
import codecs, unicodedata
import collections
import re
import copy

from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import read, write
import qlc.utils

from nltk.stem.snowball import SpanishStemmer
import networkx

cr = CorpusReaderDict("c:/data/qlc")
dictdata_ids = cr.dictdata_ids_for_component("Witotoan")
re_quotes = re.compile('"')
print(dictdata_ids)

graphs = list()
for dictdata_id in dictdata_ids:
    gr = networkx.Graph()
    src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
    tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
    # only process dictionaries that have Spanish on one side
    if src_language_iso != 'spa' and tgt_language_iso != 'spa':
        continue
def main(argv): if len(argv) < 2: print("call: translations_spanish_1.py data_path [component]") sys.exit(1) cr = CorpusReaderDict(argv[1]) dictdata_ids = [] if len(argv) == 3: dictdata_ids = cr.dictdata_ids_for_component(argv[2]) if len(dictdata_ids) == 0: print("did not find any dictionary data for the bibtex_key.") sys.exit(1) else: dictdata_ids = cr.dictdata_string_ids spanish_singleword_dict = {} languages_iso = [] spanish_len2_dict = collections.defaultdict(int) spanish_len3_dict = collections.defaultdict(int) spanish_lengreater3_dict = collections.defaultdict(int) stemmer = Stemmer.Stemmer('spanish') stopwords = spanish_stopwords() re_stopwords = re.compile(r"\b(?:{0})\b".format( "|".join(stopwords).encode("utf-8") )) for dictdata_id in dictdata_ids: src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id) tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id) if src_language_iso != 'spa' and tgt_language_iso != 'spa': continue heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id) dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id) bibtex_key = dictdata_string.split("_")[0] language_iso = bibtex_key if bibtex_key not in languages_iso: languages_iso.append(bibtex_key) for entry_id in heads_with_translations: if tgt_language_iso == 'spa': heads = heads_with_translations[entry_id]['heads'] translations = heads_with_translations[entry_id]['translations'] else: heads = heads_with_translations[entry_id]['translations'] translations = heads_with_translations[entry_id]['heads'] for translation in translations: len_translation = len(translation.split(' ')) if len_translation > 1: translation_without_stopwords = re_stopwords.sub("", translation) translation_without_stopwords = translation_without_stopwords.strip(" ") translation_without_stopwords = re.sub(" +", " ", translation_without_stopwords) if translation_without_stopwords == " " or translation_without_stopwords == "": translation_without_stopwords = translation len_translation_without_stopwords = len_translation else: len_translation_without_stopwords = len(translation_without_stopwords.split(' ')) else: translation_without_stopwords = translation len_translation_without_stopwords = len_translation if len_translation_without_stopwords == 1: #print translation.encode("utf-8") translation_stem = stemmer.stemWord(translation_without_stopwords) if not translation_stem in spanish_singleword_dict: spanish_singleword_dict[translation_stem] = collections.defaultdict(set) for head in heads_with_translations[entry_id]['heads']: spanish_singleword_dict[translation_stem][language_iso].add(head) spanish_singleword_dict[translation_stem]["spa"].add(translation) elif len_translation == 2: #output2.write("%s\n" % (translation)) spanish_len2_dict[translation] += 1 elif len_translation == 3: #output3.write("%s\n" % (translation)) spanish_len3_dict[translation] += 1 else: #output4.write("%s\n" % (translation)) spanish_lengreater3_dict[translation] += 1 #output1 = codecs.open("spanish_len1.txt", "w", "utf-8") #for w in sorted(spanish_singleword_dict.iteritems(), key=itemgetter(0), reverse=True): # output1.write(u"{0}\n".format(w[0])) output2 = codecs.open("spanish_len2.txt", "w", "utf-8") for w in sorted(spanish_len2_dict.items(), key=itemgetter(1), reverse=True): output2.write("{0}\t{1}\n".format(w[0], w[1])) output3 = codecs.open("spanish_len3.txt", "w", "utf-8") for w in sorted(spanish_len3_dict.items(), key=itemgetter(1), reverse=True): output3.write("{0}\t{1}\n".format(w[0], w[1])) 
    output4 = codecs.open("spanish_len_greater3.txt", "w", "utf-8")
    for w in sorted(spanish_lengreater3_dict.items(), key=itemgetter(1), reverse=True):
        output4.write("{0}\t{1}\n".format(w[0], w[1]))
    output4.close()

    # write a matrix: one row per Spanish stem, one column per source language
    output = codecs.open("spanish_singlewords_matrix.txt", "w", "utf-8")
    output1 = codecs.open("spanish_len1.txt", "w", "utf-8")
    total_count = 0
    more_than_one_lang_count = 0
    output.write("%s\t%s\n" % ("spa", "\t".join(languages_iso[1:])))
    for sp in sorted(spanish_singleword_dict):
        output.write("%s" % ('|'.join(sorted(spanish_singleword_dict[sp]["spa"]))))
        count_languages = 0
        for lang in languages_iso[1:]:
            if len(spanish_singleword_dict[sp][lang]) > 0:
                count_languages += 1
            # write the cell unconditionally (it may be empty) to keep columns aligned
            output.write("\t%s" % ('|'.join(sorted(spanish_singleword_dict[sp][lang]))))
        output.write("\n")
        output1.write("{0}\n".format(sp))
        if count_languages > 1:
            more_than_one_lang_count += 1
        total_count += 1
    output.close()
    output1.close()

    print("total number of entries in single word matrix: {0}".format(total_count))
    print("number of entries with more than one language: {0}".format(more_than_one_lang_count))
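# spanish_stopwords() is called above but not defined in this excerpt. A
# minimal sketch, assuming it simply loads one stop word per line from a
# UTF-8 text file (the file name here is hypothetical):
def spanish_stopwords():
    with codecs.open("spanish_stopwords.txt", "r", "utf-8") as f:
        return [line.strip() for line in f if line.strip()]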
# imports assumed from the omitted file header
import os
import sys
from qlc.corpusreader import CorpusReaderDict
from qlc.orthography import OrthographyParser, OrthographyRulesParser  # assumed module path

def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    # 'unparsables' is a global log file handle opened elsewhere in the script
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id + "\t" + concept + "\t" + counterpart + "\t" + invalid_parse_string
    unparsables.write(error + "\n")

if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
# cr = CorpusReaderDict("data/testcorpus")
# cr = CorpusReaderWordlist("data/csv")
cr = CorpusReaderDict("data/csv")

o = OrthographyParser("data/orthography_profiles/" + source + ".txt")

rules_file_flag = 0
if os.path.isfile("data/orthography_profiles/" + "rules_" + source + ".txt"):
    rules = OrthographyRulesParser("data/orthography_profiles/" + "rules_" + source + ".txt")
    rules_file_flag = 1

# create a generator of corpus reader data
wordlist_iterator = (
    (wordlistdata_id, head, translation)
    for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source)
    for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id)
)

# print header
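# The loop that consumes wordlist_iterator is cut off in this excerpt. A
# sketch of how it presumably continues, inferred from the
# report_unparsables() signature above; parse_string_to_graphemes() is a
# hypothetical method name assumed to return a (success_flag, parsed_string)
# tuple, not a confirmed part of the qlc API.
for wordlistdata_id, head, translation in wordlist_iterator:
    parsed_tuple = o.parse_string_to_graphemes(head)  # hypothetical API call
    if not parsed_tuple[0]:
        report_unparsables(wordlistdata_id, translation, head, parsed_tuple)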