Code Example #1
File: tokenizer.py  Project: RichardLitt/lingpy
    def __init__(self):
        # deal with configuration file
        # configparser.read(default.cfg)
        cfg = SafeConfigParser()
        cfg.read("default.cfg")

        data = cfg.get("Paths", "data")
        orthography_profile = cfg.get("Paths", "orthography_profile")

        # set variables, e.g. source, orthography parser, etc.
        self.data = open(data, "r")

        self.o = OrthographyParser(orthography_profile)        
        # self.o = GraphemeParser()        

        self._languages = collections.defaultdict(int)     # gives a unique ID to each unique language name
        self._concepts = collections.defaultdict(int)      # gives a unique ID to each unique concept
        self._counterparts = collections.defaultdict(int)  # gives a unique ID to each unique counterpart
        self._wordlist_iterator = self._process_input(self.data)

        # print(type(self.iterator))
        # print(len(self.counterparts))
        # words = self.get_qlc_tokenized_words()

        """
        count = 0
        for line in words:
            if line != "":
                print(line)
                count += 1
        print(count)
        """

        """
Code Example #2
File: ngram.py  Project: pombredanne/qlc
import collections
import os
import sys

import numpy

import qlc.ngram
import qlc.matrix
# CorpusReaderWordlist and OrthographyParser are assumed to be importable from the qlc package.


def main(argv):

    if len(argv) < 2:
        print("call: counterparts_huber1992.py data_path")
        exit(1)

    cr = CorpusReaderWordlist(os.path.join(argv[1], "csv"))
    o = OrthographyParser(os.path.join(argv[1], "orthography_profiles", "huber1992.txt"))
    
    ngrams_by_language_count = list()
    ngrams_set = set()
    
    for i, wordlistdata_id in enumerate(cr.wordlistdata_ids_for_bibtex_key('huber1992')):
        #counterparts = cr.counterpartsForWordlistdataId(wordlistdata_id)
        #print wordlistdata_id
        language_bookname = cr.get_language_bookname_for_wordlistdata_id(wordlistdata_id)
        language_code = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)

        counterpart_graphemes = (o.parse_string_to_graphemes(counterpart) \
               for counterpart in cr.counterparts_for_wordlistdata_id(wordlistdata_id))

        matrix = qlc.ngram.words_ngrams_matrix_for_graphemes_list(counterpart_graphemes, 2)
        
        # column-wise sums: total count of each bigram across this language's counterparts
        column_sums = numpy.sum(matrix.matrix, 0)
        #print("Sum length: {0}".format(len(column_sums)))

        if len(column_sums.nonzero()[0]) != matrix.number_of_columns:
            print("Error: ")
            print("{0} != {1}".format(len(column_sums.nonzero()[0]), matrix.number_of_columns))
            print(language_bookname)

        ngrams_by_language_count.append(collections.defaultdict(int))
        for j, c in enumerate(matrix.column_names):
            ngrams_set.add(c)
            ngrams_by_language_count[i][c] = column_sums[j]

    ngrams_list = sorted(list(ngrams_set))
    matrix = qlc.matrix.Matrix(ngrams_by_language_count, ngrams_list)
    # matrix = numpy.zeros( ( len(ngrams_by_language_count), len(ngrams_list) ) )
    
    for i in range(matrix.number_of_rows):
        for j, ngram in enumerate(ngrams_list):
            matrix.matrix[i][j] = ngrams_by_language_count[i][ngram]
            
    print(matrix.matrix)
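The excerpt does not show how main() is invoked; a minimal entry point, assuming the script is run directly with the data path as its single argument, would be:

if __name__ == "__main__":
    main(sys.argv)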
Code Example #3
File: tokenizer.py  Project: RichardLitt/lingpy
import collections
import unicodedata
from configparser import SafeConfigParser
# OrthographyParser (and the commented-out GraphemeParser) are assumed to be importable from the qlc package.


class Tokenizer:

    """ takes as input a file with the QLC format:
    counterpart \t concept \t language

    and does things like 

    - tokenizes the file into LINGPY format
    - tokenizes the data into ortographically parsed QLC format
    - locates unicorns

    """


    def __init__(self):
        # deal with configuration file
        # configparser.read(default.cfg)
        cfg = SafeConfigParser()
        cfg.read("default.cfg")

        data = cfg.get("Paths", "data")
        orthography_profile = cfg.get("Paths", "orthography_profile")

        # set variables, e.g. source, orthography parser, etc.
        self.data = open(data, "r")

        self.o = OrthographyParser(orthography_profile)        
        # self.o = GraphemeParser()        

        self._languages = collections.defaultdict(int)     # gives a unique ID to each unique language name
        self._concepts = collections.defaultdict(int)      # gives a unique ID to each unique concept
        self._counterparts = collections.defaultdict(int)  # gives a unique ID to each unique counterpart
        self._wordlist_iterator = self._process_input(self.data)

        # print(type(self.iterator))
        # print(len(self.counterparts))
        # words = self.get_qlc_tokenized_words()

        """
        count = 0
        for line in words:
            if line != "":
                print(line)
                count += 1
        print(count)
        """

        """
        self.cr = CorpusReaderWordlist("data/csv")
        self.wordlist_iterator = ( (wordlistdata_id, concept, counterpart)
            for wordlistdata_id in self.cr.wordlistdata_ids_for_bibtex_key(source)
            for concept, counterpart in self.cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
        )
        """

    def _process_input(self, file):
        languages_id = 1
        concepts_id = 1
        counterparts_id = 1
        header = file.readline()

        lines = []
        for line in file:
            line = line.strip()
            line = line.replace("  ", " ")
            counterpart, concept, language = line.split("\t")
            result = (counterpart, concept, language)
            lines.append(result)

            if language not in self._languages:
                self._languages[language] = languages_id
                languages_id += 1
            if concept not in self._concepts:
                self._concepts[concept] = concepts_id
                concepts_id += 1
            if counterpart not in self._counterparts:
                self._counterparts[counterpart] = counterparts_id
                counterparts_id += 1

        return ((counterpart, concept, language) for counterpart, concept, language in lines)


    def get_qlc_tokenized_words(self):
        unparsables = open("unparsables.txt", "w")
        tokenized_words = []
        for counterpart, concept, language in self._wordlist_iterator:
            counterpart = unicodedata.normalize("NFD", counterpart)
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            if grapheme_parsed_counterpart_tuple[0] == False:
                unparsables.write(grapheme_parsed_counterpart_tuple[1])
                continue
        
            grapheme_parse = grapheme_parsed_counterpart_tuple[1]
            tokenized_words.append(grapheme_parse)
        return tokenized_words

    def get_ipa_tokenized_words(self):
        # NOTE: unfinished in the source; completed here by analogy with get_qlc_tokenized_words()
        tokenized_words = []
        words = self.get_qlc_tokenized_words()
        for word in words:
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(word)
            if grapheme_parsed_counterpart_tuple[0] == False:
                continue
            tokenized_words.append(grapheme_parsed_counterpart_tuple[1])
        return tokenized_words
            
    def lingpy_output(self):
        row_id = 1
        # given some data set from the corpusreader or somewhere else, output a lingpy format
        print("ID"+"\t"+"Taxa"+"\t"+"TaxonID"+"\t"+"Gloss"+"\t"+"GlossID"+"\t"+"IPA"+"\t"+"Orthography")
        # print("# LANGUAGE"+"\t"+"CONCEPT"+"\t"+"COUNTERPART"+"\t"+"ORTHO_PARSE")

        for counterpart, concept, language in self._wordlist_iterator:
            # counterpart, concept, language in self._wordlist_iterator:
            # skip for Mattis
            if counterpart == "?" or counterpart == "NONE":
                continue

            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            if grapheme_parsed_counterpart_tuple[0] == False:
                continue

            ortho_parse = grapheme_parsed_counterpart_tuple[1]

            print(str(row_id)+"\t"+language+"\t"+str(self._languages[language])+"\t"+concept+"\t"+str(self._concepts[concept])+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1])
            # print(language+"\t"+concept+"\t"+counterpart+"\t"+grapheme_parsed_counterpart_tuple[1])

            row_id += 1

    def matrix_output(self):
        # produce Jelena style output format with matrix
        pass

    def qlc_output_format(self):
        # produce counterpart \t concept \t language QLC output format
        print("COUNTERPART"+"\t"+"ORTHO_PARSE"+"\t"+"CONCEPT"+"\t"+"LANGUAGE")
        for counterpart, concept, language in self._wordlist_iterator:
            if counterpart == "?" or counterpart == "NONE":
                print(counterpart+"\t"+counterpart+"\t"+concept+"\t"+language)                
                continue
            grapheme_parsed_counterpart_tuple = self.o.parse_string_to_graphemes_string(counterpart)
            
            # skip entries that do not parse
            if grapheme_parsed_counterpart_tuple[0] == False:
                continue

            ortho_parse = grapheme_parsed_counterpart_tuple[1]
            print(counterpart+"\t"+ortho_parse+"\t"+concept+"\t"+language)
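A minimal usage sketch, assuming a default.cfg with a [Paths] section (as read in __init__) exists in the working directory:

    tokenizer = Tokenizer()
    tokenizer.lingpy_output()        # print LINGPY-style rows to stdout
    # or: tokenizer.qlc_output_format()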
Code Example #4
File: parse_counterparts.py  Project: pombredanne/qlc
import sys
# CorpusReaderWordlist, OrthographyParser and OrthographyRulesParser are assumed to be importable
# from the qlc package; `unparsables` is assumed to be a file object opened elsewhere in the script.

def report_unparsables(wordlistdata_id, concept, counterpart, parsed_counterpart_tuple):
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string
    unparsables.write(error)


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
cr = CorpusReaderWordlist("data/csv")

o = OrthographyParser("data/orthography_profiles/"+source+".txt")
rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")


# create a generator of corpus reader data
wordlist_iterator = (
    (wordlistdata_id, concept, counterpart)
    for wordlistdata_id in cr.wordlistdata_ids_for_bibtex_key(source)
    for concept, counterpart in cr.concepts_with_counterparts_for_wordlistdata_id(wordlistdata_id)
)


# print header
print("wordlist_id"+"\t"+"language_book_name"+"\t"+"concept"+"\t"+"counterpart"+"\t"+"graphemic_parse"+"\t"+"ipa_parse"+"\t"+"orthographic_rules_parse")

err_count = 0
errors = ""
Code Example #5
    invalid_parse_string = parsed_counterpart_tuple[1]
    error = wordlistdata_id+"\t"+concept+"\t"+counterpart+"\t"+invalid_parse_string
    unparsables.write(error)


if len(sys.argv) != 2:
    print("call: python parse_counterparts.py bibtex_key_source\n")
    sys.exit(1)

source = sys.argv[1]

# cr = CorpusReaderWordlist("data/testcorpus")
# cr = CorpusReaderDict("data/testcorpus")
# cr = CorpusReaderWordlist("data/csv")
cr = CorpusReaderDict("data/csv")

o = OrthographyParser("data/orthography_profiles/"+source+".txt")

rules_file_flag = 0
if os.path.isfile("data/orthography_profiles/"+"rules_"+source+".txt"):
    rules = OrthographyRulesParser("data/orthography_profiles/"+"rules_"+source+".txt")
    rules_file_flag = 1

# create a generator of corpus reader data
wordlist_iterator = (
    (wordlistdata_id, head, translation)
    for wordlistdata_id in cr.dictdata_ids_for_bibtex_key(source)
    for head, translation in cr.heads_with_translations_for_dictdata_id(wordlistdata_id)
)

# print header

if rules_file_flag: