예제 #1
0
def main():
    i = 0
    good = defaultdict(set)
    bad = defaultdict(set)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                syl_list = wsl_to_kaulo.check_entry(entry['raw']) 
                for sino, readings in syl_list:
                    for r in readings:
                        syl = wsl_to_kaulo.convert(r)
                        if syl is not None:
                            good[sino].add(syl)
                        else:
                            bad[sino].add(r)
        except UnicodeDecodeError:
            print "encoding error on line", i

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in bad.iteritems():
        for r in readings:
            readings_dic[r].append(sino)
    for k,v in good.items():
        good[k] = list(v)
    for k,v in bad.items():
        bad[k] = list(v)
    print json.dumps((readings_dic, good, bad))
예제 #2
0
def main():
    i = 0
    good = defaultdict(set)
    bad = defaultdict(set)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                syl_list = wsl_to_kaulo.check_entry(entry['raw'])
                for sino, readings in syl_list:
                    for r in readings:
                        syl = wsl_to_kaulo.convert(r)
                        if syl is not None:
                            good[sino].add(syl)
                        else:
                            bad[sino].add(r)
        except UnicodeDecodeError:
            print "encoding error on line", i

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in bad.iteritems():
        for r in readings:
            readings_dic[r].append(sino)
    for k, v in good.items():
        good[k] = list(v)
    for k, v in bad.items():
        bad[k] = list(v)
    print json.dumps((readings_dic, good, bad))
예제 #3
0
def process_buffer(buf, list_of_results):
    entry = analyse_word_entry.parse_one("".join(buf))
    if entry:
        if len(list_of_results) > 0 and list_of_results[-1]["entry"] == entry["entry"]:
            list_of_results[-1]["heteronyms"].append(entry)
        else:
            list_of_results.append({"entry": entry["entry"], "heteronyms": [entry]})
    else:
        print "unanalyzed", "".join(buf).encode("utf8")
예제 #4
0
def process_buffer(buf, list_of_results):
    entry = analyse_word_entry.parse_one("".join(buf))
    if entry:
        if len(list_of_results
               ) > 0 and list_of_results[-1]['entry'] == entry['entry']:
            list_of_results[-1]['heteronyms'].append(entry)
        else:
            list_of_results.append({
                'entry': entry['entry'],
                'heteronyms': [entry]
            })
    else:
        print "unanalyzed", "".join(buf).encode("utf8")
예제 #5
0
def main():
    i = 0
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        # Perl actually does a better job on this, original encoding is CP950
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                print(analyse_word_entry.html_of_entry(entry)).encode('utf8')
        except UnicodeDecodeError:
            print "encoding error on line", i
예제 #6
0
def main():
    i = 0
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        # Perl actually does a better job on this, original encoding is CP950
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                print (analyse_word_entry.html_of_entry(entry)).encode('utf8')
        except UnicodeDecodeError:
            print "encoding error on line", i
예제 #7
0
def main():
    i = 0
    tx = g.cypher.begin()
    #tx = DummyTx(g)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        # Perl actually does a better job on this, original encoding is CP950
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                if entry is None:
                    continue
                merge_in_graph(tx, entry)
                if (i % 50) == 0:
                    #print i
                    tx.commit()
                    tx = g.cypher.begin()
                    
        except UnicodeDecodeError:
            print "encoding error on line", i
    tx.commit()
예제 #8
0
def main():
    i = 0
    tx = g.cypher.begin()
    #tx = DummyTx(g)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        # Perl actually does a better job on this, original encoding is CP950
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                if entry is None:
                    continue
                merge_in_graph(tx, entry)
                if (i % 50) == 0:
                    #print i
                    tx.commit()
                    tx = g.cypher.begin()

        except UnicodeDecodeError:
            print "encoding error on line", i
    tx.commit()