Example #1
import collections
import os
import sys

import numpy

import qlc.matrix
import qlc.ngram
from qlc.corpusreader import CorpusReaderWordlist
from qlc.orthography import OrthographyParser


def main(argv):

    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        sys.exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))
    
    ngrams_by_language_count = list()
    ngrams_set = set()
    
    for i, wordlistdata_id in enumerate(cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        #counterparts = cr.counterpartsForWordlistdataId(wordlistdata_id)
        #print wordlistdata_id
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

        counterpart_graphemes = (o.parse_string_to_graphemes(counterpart)
                for counterpart in cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(counterpart_graphemes, 2)
        
        # column sums over all counterparts: total count of each bigram in this language
        column_sums = numpy.sum(matrix.matrix, 0)

        # every bigram column should have at least one occurrence
        if len(column_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            print("{0} != {1}".format(len(column_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)
        
        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = column_sums[j]

    ngrams_list = sorted(list(ngrams_set))
    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    # matrix = numpy.zeros( ( len(ngrams_by_language_count), len(ngrams_list) ) )
    
    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]
            
    print(matrix.matrix)
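
# Entry point for running this example as a script (a minimal sketch, assuming
# the function above lives in counterparts_huber1992.py):
if __name__ == "__main__":
    main(sys.argv)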
Example #2
import codecs
import sys

from qlc.corpusreader import CorpusReaderWordlist


def main(argv):

    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        sys.exit(1)

    cr = CorpusReaderWordlist(argv[1])
        
    output = codecs.open("counterparts_huber1992.txt", "w", "utf-8")
    output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tFAMILY\tBIBTEX_KEY\n")
    
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992'):
        #counterparts = cr.counterpartsForWordlistdataId(wordlistdata_id)
        #print wordlistdata_id
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)
        # `families` maps a language bookname to its family name; it is defined
        # elsewhere in the original script (see the sketch after this example)
        family = families[language_bookname]
        
        for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id):
            output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (counterpart, concept, language_bookname, language_code, family, 'huber1992'))
        
    output.close()
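
# A hypothetical sketch of the `families` mapping the function above relies on;
# the real mapping lives elsewhere in the original script, and the entries here
# are illustrative only:
families = {
    "bora": "Boran",
    "muinane": "Boran",
}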
Example #3
import codecs
import collections
import os
import sys

from qlc.corpusreader import CorpusReaderWordlist


def main(argv):

    if len(argv) < 2:
        print("call: concepts_with_counterparts.py data_path [(bibtex_key|component)]")
        sys.exit(1)

    cr = CorpusReaderWordlist(argv[1])
    print("Data loaded", file=sys.stderr)
    
    if len(argv) == 3:
        wordlistdata_ids = cr.wordlistdata_ids_for_bibtex_key(argv[2])
        if len(wordlistdata_ids) == 0:
            wordlistdata_ids = cr.wordlistdata_ids_for_component(argv[2])
            if len(wordlistdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]), file=sys.stderr)
                sys.exit(1)
    else:
        wordlistdata_ids = cr.wordlistdata_string_ids
        
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)
        
    for bibtex_key in bibtex_keys:
    
        print("Writing data for wordlistdata bibtex key {0}".format(bibtex_key), file=sys.stderr)

        output = codecs.open("concepts_with_counterparts_%s.txt" % bibtex_key, "w", "utf-8")
        output.write("COUNTERPART\tCONCEPT\tLANGUAGE_BOOKNAME\tLANGUAGE_CODE\tBIBTEX_KEY\n")

        for wordlistdata_id in bibtex_keys[bibtex_key]:
            #heads_with_translations = cr.heads_with_translations_for_dictdata_id(dictdata_id)
            language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
            language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)
            
            for concept, counterpart in cr.data(wordlistdata_id):
                output.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(counterpart, concept, language_bookname, language_code, bibtex_key))
            
        output.close()

        if os.path.getsize("concepts_with_counterparts_%s.txt" % bibtex_key) == 0:
            os.remove("concepts_with_counterparts_%s.txt" % bibtex_key)
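
# A quick sanity check on one of the generated files (a minimal sketch; the
# filename follows the pattern used above, shown here for the huber1992 key):
import csv

with open("concepts_with_counterparts_huber1992.txt", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    counts = collections.defaultdict(int)
    for row in reader:
        counts[row["LANGUAGE_BOOKNAME"]] += 1
    print(counts)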
Example #4
import sys

import numpy
import scipy.special

import qlc
from qlc.corpusreader import CorpusReaderWordlist
from qlc.matrix import WordlistStoreWithNgrams
from qlc.orthography import OrthographyParser


def main(argv):

    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        sys.exit(1)

    cr = CorpusReaderWordlist(argv[1])
    o = OrthographyParser(qlc.get_data("orthography_profiles/huber1992.txt"))
    
    wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key('huber1992')
        for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
    )
    
    wordlist = WordlistStoreWithNgrams(wordlist_iterator, o)
    
    matrix_dict = dict()

    for wordlistdata_id in wordlist.languages:

        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        #language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

        if language_bookname != "bora" and language_bookname != "muinane":
            continue

        print("Creating matrix for language {0}...".format(language_bookname))
                
        # binary concept-by-ngram matrix: cell (i, j) is 1 when ngram j occurs
        # in this language's counterpart for concept i
        matrix = numpy.zeros( (len(wordlist.concepts), len(wordlist.unique_ngrams)) )
        
        for i, concept in enumerate(wordlist.concepts):
            for j, n in enumerate(wordlist.unique_ngrams):
                if n in wordlist.counterpart_for_language_and_concept(wordlistdata_id, concept):
                    matrix[i][j] = 1
        
        matrix_dict[language_bookname] = matrix
    
    # sum up over all languages
    #languages = matrix_dict.keys()
    #matrix_languages = numpy.zeros( (len(languages), len(master_ngrams)) )
    #for i, l in enumerate(languages):
    #    matrix_languages[i] = numpy.sum(matrix_dict[l], 0)[0]
            
    #numpy.savetxt("matrix_languages.txt", matrix_languages)
    
    print('Begin comparison of two languages... Bora and Muinane!')
    print()
    
    languages_tuples = [ ("bora", "muinane") ]
    
    # for each language to get a matrix of bigrams by meanings
    
    for language1, language2 in languages_tuples:
        matrix1 = matrix_dict[language1]
        matrix2 = matrix_dict[language2]
        
        # indices of the word-final bigrams ('e', '#') and ('o', '#'),
        # used below as a spot check on the matrices
        n1 = wordlist.unique_ngrams.index(('e', '#'))
        n2 = wordlist.unique_ngrams.index(('o', '#'))
        
        # ngram-by-ngram co-occurrence counts across the shared concepts
        matrix_cooccurrences = numpy.dot(numpy.transpose(matrix1), matrix2)
        
        vector1 = numpy.sum(matrix1, 0)
        vector2 = numpy.sum(matrix2, 0)
        
        print(vector1[n1])
        print(vector2[n2])
        
        print(matrix_cooccurrences[n1][n2])
        
        # expected co-occurrence counts if the two languages were independent
        matrix_expectations = numpy.outer(vector1, vector2) / len(wordlist.concepts)

        print(matrix_expectations[n1][n2])

        # -log of the Poisson probability of the observed co-occurrence count k
        # with mean lambda: -log P(k; lambda) = lambda + log(k!) - k * log(lambda).
        # scipy.misc.factorial was removed in newer SciPy; scipy.special.factorial
        # is its current home.
        matrix_significance = matrix_expectations + \
                              numpy.log(scipy.special.factorial(matrix_cooccurrences)) - \
                              matrix_cooccurrences * numpy.log(matrix_expectations)
        
        numpy.savetxt("matrix_significance.txt", matrix_significance)
        
        print(matrix_significance[n1][n2])
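
# For larger counts, factorial() overflows long before log() is applied; a
# numerically safer sketch of the same quantity uses scipy.special.gammaln,
# where gammaln(k + 1) == log(k!):
import numpy
from scipy.special import gammaln

def neg_log_poisson(observed, expected):
    # -log P(k; lambda) for a Poisson model, computed via log-gamma
    return expected + gammaln(observed + 1) - observed * numpy.log(expected)

print(neg_log_poisson(numpy.array([3.0, 10.0]), numpy.array([2.5, 8.0])))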
Example #5
if __name__=="__main__":
    import sys
    from qlc.corpusreader import CorpusReaderWordlist
    from qlc.orthography import OrthographyParser, GraphemeParser
    from scipy.io import mmread, mmwrite # read/write sparse matrices

    if len(sys.argv) != 2:
        print("call: python matrix.py source\n")
        print("python matrix.py huber1992\n")
        sys.exit(1)

    source = sys.argv[1] # dictionary/wordlist source key
    output_dir = "output/"+source+"/"

    # get data from corpus reader
    cr = CorpusReaderWordlist("data/csv")          # real data
    # cr = CorpusReaderWordlist("data/testcorpus") # test data

    # initialize orthography parser for source
    o = OrthographyParser("data/orthography_profiles/"+source+".txt")
    # o = GraphemeParser() # or use the grapheme parser

    # create a generator of corpus reader data
    wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
        for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
        for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
    )

    # write the data to disk -- note that this exhausts the generator, so the
    # generator must either be recreated afterwards (see the sketch below) or
    # the following lines must run without the rest of the code below
    # TODO: move this into a method in the class
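
    # Since a generator can only be consumed once, one way around the
    # exhaustion noted above is to build it in a function and call that
    # whenever a fresh pass is needed (a sketch using the names defined above):
    def make_wordlist_iterator():
        return ( (wordlistdata_id, concept, counterpart)
            for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
            for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
        )

    first_pass = make_wordlist_iterator()   # e.g. for writing to disk
    second_pass = make_wordlist_iterator()  # e.g. for building matrices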
Example #6
import sys

from qlc.corpusreader import CorpusReaderWordlist
# OrthographyRulesParser is assumed to live in qlc.orthography alongside
# OrthographyParser
from qlc.orthography import OrthographyParser, OrthographyRulesParser

unparsables = open("unparsables.txt", "w", encoding="utf-8")

def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    # log counterparts that the orthography parser could not segment
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string
    unparsables.write(error + "\n")


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
cr = CorpusReaderWordlist("data/csv")

o = OrthographyParser("data/orthography_profiles/"+source+".txt")
rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")


# create a generator of corpus reader data
wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
)


# print header
print("wordlist_id"+"\t"+"language_book_name"+"\t"+"concept"+"\t"+"counterpart"+"\t"+"graphemic_parse"+"\t"+"ipa_parse"+"\t"+"orthographic_rules_parse")
Example #7
import sys
from qlc.corpusreader import CorpusReaderWordlist
from qlc.orthography import OrthographyParser, GraphemeParser
from qlc.matrix import WordlistStoreWithNgrams
from scipy.io import mmread, mmwrite # read/write sparse matrices
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix

if len(sys.argv) != 2:
    print("call: python matrix.py source\n")
    print("python matrix.py huber1992\n")
    sys.exit(1)

source = sys.argv[1] # dictionary/wordlist source key
output_dir = source+"/"

# get data from corpus reader
cr = CorpusReaderWordlist("data/csv")          # real data
# cr = CorpusReaderWordlist("data/testcorpus") # test data

# initialize orthography parser for source
o = OrthographyParser("data/orthography_profiles/"+source+".txt")
# o = GraphemeParser()

# create a generator of corpus reader data
wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
)

"""
# print all the things!
for wordlistdata_id, concept, counterpart in wordlist_iterator: