import sys
import codecs
import collections
from operator import itemgetter

from qlc.corpusreader import CorpusReaderDict


def main(argv):

    if len(argv) < 3:
        print("call: translations_spanish_1.py data_path bibtex_key")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        print("did not find any dictionary data for the bibtex_key.")
        sys.exit(1)

    
    for dictdata_id in dictdata_ids:
        translations = collections.defaultdict(int)
        heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id)
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        output = codecs.open("translations_subentries_for_%s.txt" % dictdata_string, "w", "utf-8")
        
        for entry_id in heads_with_translations:
            if heads_with_translations[entry_id]['is_subentry'] == 't':
                for t in heads_with_translations[entry_id]['translations']:
                    translations[t] += 1

        for translation, count in sorted(translations.items(), key=itemgetter(1), reverse=True):
            output.write("{0}\t{1}\n".format(translation, count))

        output.close()

import sys
import codecs

from qlc.corpusreader import CorpusReaderDict


def main(argv):

    if len(argv) < 2:
        print("call: heads_with_translations.py data_path [(bibtex_key|component)]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])
    print("Data loaded", file=sys.stderr)
    
    if len(argv) == 3:
        dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
        if len(dictdata_ids) == 0:
            dictdata_ids = cr.dictdata_ids_for_component(argv[2])
            if len(dictdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
                sys.exit(1)
    else:
        dictdata_ids = cr.dictdata_string_ids
        
    
    for dictdata_id in dictdata_ids:
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        print("Writing data for dictdata string ID {0}".format(dictdata_string), file=sys.stderr)

        output = codecs.open("heads_with_translations_%s.txt" % dictdata_string, "w", "utf-8")
        
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            output.write("%s\t%s\n" % (head, translation))
        
        output.close()

import sys
import codecs

from qlc.corpusreader import CorpusReaderDict


def main(argv):

    if len(argv) < 2:
        print("call: phonology.py data_path [bibtex_key]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])
    
    if len(argv) > 2:
        bibtex_key = argv[2]
        dictdata_ids = cr.dictdataIdsForBibtexKey(bibtex_key)
    else:
        dictdata_ids = cr.dictdataStringIds()
        
    for dictdata_id in dictdata_ids:
        phonology = cr.phonologyForDictdataId(dictdata_id)
        dictdata_string = cr.dictdataStringIdForDictataId(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]
        output = codecs.open("phonology_%s.txt" % dictdata_string, "w", "utf-8")
        
        for entry_id in phonology:
            for p in phonology[entry_id]["phonology"]:
                output.write("%s\thttp://cidles.eu/quanthistling/source/%s/%s/%s/index.html\n" % (p, bibtex_key, phonology[entry_id]["startpage"], phonology[entry_id]["pos_on_page"]))
        
        output.close()
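
# A hypothetical output line (the page and position values are invented for
# illustration): each phoneme string is followed by a URL pointing back to
# the page of the scanned source it came from:
#
#     aba	http://cidles.eu/quanthistling/source/thiesen1998/120/3/index.html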

import sys
import codecs

from networkx import Graph  # assumption: Graph comes from networkx, as in the module-level example below
from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import write

# escape_string is assumed to be a helper defined elsewhere in the original module.


def main(argv):
    
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
    if len(dictdata_ids) == 0:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
            sys.exit(1)
        

    for dictdata_id in dictdata_ids:
        gr = Graph()
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue
        
        if (len(src_language_iso) > 1) or (len(tgt_language_iso) > 1):
            continue
        
        language_iso = None
        if tgt_language_iso == [ 'spa' ]:
            language_iso = src_language_iso[0]
        else:
            language_iso = tgt_language_iso[0]
                        
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            if src_language_iso == [ 'spa' ]:
                (head, translation) = (translation, head)
                
            head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
            translation = escape_string(translation)
            
            # add_node and add_edge are idempotent, so no explicit membership
            # checks are needed before inserting.
            gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key })
            gr.add_node(translation, attr_dict={ "lang": "spa" })
            gr.add_edge(head_with_source, translation)

        output = codecs.open("{0}.dot".format(dictdata_string), "w", "utf-8")
        output.write(write(gr))
        output.close()
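
# A minimal, self-contained demonstration (assuming networkx; the node names
# and the "boa" language code are invented for illustration) of why the
# membership checks above are unnecessary: add_node and add_edge are
# idempotent, so repeated inserts never create duplicates.
import networkx

g = networkx.Graph()
g.add_node("casa|thiesen1998", lang="boa")
g.add_node("casa|thiesen1998", lang="boa")  # no duplicate node is created
g.add_edge("casa|thiesen1998", "casa")
g.add_edge("casa|thiesen1998", "casa")      # and no duplicate edge
assert len(g) == 2 and g.number_of_edges() == 1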

import sys
import codecs
import collections

import qlc.utils
from qlc.corpusreader import CorpusReaderDict


def main(argv):
    # check for the right number of command line arguments
    if len(argv) < 3:
        print()
        print("Call: create_initial_orthography_profile.py data_path data_source")
        print()
        print("python create_initial_orthography_profile.py data/csv/ thiesen1998")
        sys.exit(1)

    data_path = argv[1]
    data_source = argv[2]

    orthography_profile = codecs.open(data_source + "_initial_profile.txt", "w", "utf-8") # output file
    cr = CorpusReaderDict(data_path) 
    dictdata_ids = cr.dictdata_ids_for_bibtex_key(data_source)

    # make sure the resource is in the data
    if len(dictdata_ids) == 0:
        print("There is no dictionary source for the data source you provided: "+data_source)
        sys.exit(1)


    grapheme_frequency_dict = collections.defaultdict(int)
    grapheme_count = 0.0

    for dictdata_id in dictdata_ids:
        for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
            graphemes = qlc.utils.parseGraphemes(head)
            for grapheme in graphemes:
                grapheme_count += 1
                grapheme_frequency_dict[grapheme] += 1

    header = "grapheme"+"\t"+"count"+"\t"+"total frequency"
    print(header)
    orthography_profile.write(header+"\n")
    for k, v in grapheme_frequency_dict.items():
        if k == " ": # skip space between words
            continue
        result = "{0}\t{1}\t{2}".format(k, v, v / grapheme_count * 100)
        print(result)
        orthography_profile.write(result + "\n")

    orthography_profile.close()
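
# qlc.utils.parseGraphemes is not shown here; below is a minimal sketch of
# what grapheme parsing *might* look like (an assumption for illustration,
# not qlc's actual implementation): a grapheme is taken to be a base
# character plus any following Unicode combining marks.
import unicodedata

def parse_graphemes_sketch(word):
    graphemes = []
    for ch in unicodedata.normalize("NFD", word):
        if unicodedata.combining(ch) and graphemes:
            graphemes[-1] += ch  # attach combining marks to their base
        else:
            graphemes.append(ch)
    return graphemes

print(parse_graphemes_sketch("ári"))  # ['á', 'r', 'i'] (base 'a' + combining acute)
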
# -*- coding: utf-8 -*-

import codecs, unicodedata
import collections
import re
import copy

from qlc.corpusreader import CorpusReaderDict
from qlc.translationgraph import read, write
import qlc.utils

from nltk.stem.snowball import SpanishStemmer

import networkx

cr = CorpusReaderDict("c:/data/qlc")

dictdata_ids = cr.dictdata_ids_for_component("Witotoan")
re_quotes = re.compile('"')

print(dictdata_ids)

graphs = list()
for dictdata_id in dictdata_ids:
    gr = networkx.Graph()

    src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
    tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
    if src_language_iso != 'spa' and tgt_language_iso != 'spa':
        continue
    

import sys
import re
import codecs
import collections
from operator import itemgetter

import Stemmer  # PyStemmer
from qlc.corpusreader import CorpusReaderDict

# spanish_stopwords is assumed to be a helper defined elsewhere in the original module.


def main(argv):

    if len(argv) < 2:
        print("call: translations_spanish_1.py data_path [component]")
        sys.exit(1)

    cr = CorpusReaderDict(argv[1])

    if len(argv) == 3:
        dictdata_ids = cr.dictdata_ids_for_component(argv[2])
        if len(dictdata_ids) == 0:
            print("did not find any dictionary data for the bibtex_key.")
            sys.exit(1)
    else:
        dictdata_ids = cr.dictdata_string_ids
        
    spanish_singleword_dict = {}
    languages_iso = []
    spanish_len2_dict = collections.defaultdict(int)
    spanish_len3_dict = collections.defaultdict(int)
    spanish_lengreater3_dict = collections.defaultdict(int)
    
    stemmer = Stemmer.Stemmer('spanish')
    stopwords = spanish_stopwords()
    re_stopwords = re.compile(r"\b(?:{0})\b".format("|".join(stopwords)))

    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
        if src_language_iso != 'spa' and tgt_language_iso != 'spa':
            continue

        heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id)
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        bibtex_key = dictdata_string.split("_")[0]

        language_iso = bibtex_key
        if bibtex_key not in languages_iso:
            languages_iso.append(bibtex_key)

        for entry_id in heads_with_translations:
            if tgt_language_iso == 'spa':
                heads = heads_with_translations[entry_id]['heads']
                translations = heads_with_translations[entry_id]['translations']
            else:
                heads = heads_with_translations[entry_id]['translations']
                translations = heads_with_translations[entry_id]['heads']
                
            for translation in translations:
                len_translation = len(translation.split(' '))
                if len_translation > 1:
                    translation_without_stopwords = re_stopwords.sub("", translation)
                    translation_without_stopwords = translation_without_stopwords.strip(" ")
                    translation_without_stopwords = re.sub(" +", " ", translation_without_stopwords)
                    if translation_without_stopwords == " " or translation_without_stopwords == "":
                        translation_without_stopwords = translation
                        len_translation_without_stopwords = len_translation
                    else:
                        len_translation_without_stopwords = len(translation_without_stopwords.split(' '))

                else:
                    translation_without_stopwords = translation
                    len_translation_without_stopwords = len_translation

                
                if len_translation_without_stopwords == 1:
                    translation_stem = stemmer.stemWord(translation_without_stopwords)
                    if translation_stem not in spanish_singleword_dict:
                        spanish_singleword_dict[translation_stem] = collections.defaultdict(set)
                    # "heads" already holds the non-Spanish side here: it was
                    # swapped above when the source language is Spanish.
                    for head in heads:
                        spanish_singleword_dict[translation_stem][language_iso].add(head)
                    spanish_singleword_dict[translation_stem]["spa"].add(translation)

                elif len_translation == 2:
                    spanish_len2_dict[translation] += 1

                elif len_translation == 3:
                    spanish_len3_dict[translation] += 1

                else:
                    spanish_lengreater3_dict[translation] += 1

    output2 = codecs.open("spanish_len2.txt", "w", "utf-8")
    for translation, count in sorted(spanish_len2_dict.items(), key=itemgetter(1), reverse=True):
        output2.write("{0}\t{1}\n".format(translation, count))
    output2.close()

    output3 = codecs.open("spanish_len3.txt", "w", "utf-8")
    for translation, count in sorted(spanish_len3_dict.items(), key=itemgetter(1), reverse=True):
        output3.write("{0}\t{1}\n".format(translation, count))
    output3.close()

    output4 = codecs.open("spanish_len_greater3.txt", "w", "utf-8")
    for translation, count in sorted(spanish_lengreater3_dict.items(), key=itemgetter(1), reverse=True):
        output4.write("{0}\t{1}\n".format(translation, count))
    output4.close()

    output = codecs.open("spanish_singlewords_matrix.txt", "w", "utf-8")
    output1 = codecs.open("spanish_len1.txt", "w", "utf-8")
    total_count = 0
    more_than_one_lang_count = 0
    output.write("%s\t%s\n" % ("spa", "\t".join(languages_iso[1:])))
    for sp in sorted(spanish_singleword_dict):
        output.write("%s" % ('|'.join(sorted(spanish_singleword_dict[sp]["spa"]))))
        count_languages = 0
        for lang in languages_iso[1:]:
            if len(spanish_singleword_dict[sp][lang]) > 0:
                count_languages += 1
            output.write("\t%s" % ('|'.join(sorted(spanish_singleword_dict[sp][lang]))))
        output.write("\n")
        output1.write("{0}\n".format(sp))
        if count_languages > 1:
            more_than_one_lang_count += 1
        total_count += 1

    output.close()
    output1.close()

    print("total number of entries in single word matrix: {0}".format(total_count))
    print("number of entries with more than one language: {0}".format(more_than_one_lang_count))

import sys
import os

from qlc.corpusreader import CorpusReaderDict
# OrthographyParser and OrthographyRulesParser are assumed to come from a qlc
# orthography module, and "unparsables" to be a file handle opened elsewhere;
# neither is shown in this excerpt.


def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id + "\t" + concept + "\t" + counterpart + "\t" + invalid_parse_string
    unparsables.write(error + "\n")


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
# cr = CorpusReaderDict("data/testcorpus")
# cr = CorpusReaderWordlist("data/csv")
cr = CorpusReaderDict("data/csv")

o = OrthographyParser("data/orthography_profiles/"+source+".txt")

rules_file_flag = 0
if os.path.isfile("data/orthography_profiles/"+"rules_"+source+".txt"):
    rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")
    rules_file_flag = 1

# create a generator of corpus reader data
wordlist_iterator = ( (wordlistdata_id, head, translation)
                      for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source)
                      for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id) )
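
# The same nested-generator shape on plain lists, to show the tuples the
# iterator yields lazily (illustrative data only):
demo = ( (d, h, t)
         for d in ["dictdata_1", "dictdata_2"]
         for h, t in [("head", "translation")] )
print(list(demo))
# -> [('dictdata_1', 'head', 'translation'), ('dictdata_2', 'head', 'translation')]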

# print header