def main(): i = 0 good = defaultdict(set) bad = defaultdict(set) for line in fileinput.input(): i += 1 # not sure about the proper encoding to use try: line = line.decode('utf8') if line.startswith('~t96;'): # should be a word ? entry = analyse_word_entry.parse_one(line) syl_list = wsl_to_kaulo.check_entry(entry['raw']) for sino, readings in syl_list: for r in readings: syl = wsl_to_kaulo.convert(r) if syl is not None: good[sino].add(syl) else: bad[sino].add(r) except UnicodeDecodeError: print "encoding error on line", i # let's build a reverse index of problems readings_dic = defaultdict(list) for sino, readings in bad.iteritems(): for r in readings: readings_dic[r].append(sino) for k,v in good.items(): good[k] = list(v) for k,v in bad.items(): bad[k] = list(v) print json.dumps((readings_dic, good, bad))
def main(): i = 0 good = defaultdict(set) bad = defaultdict(set) for line in fileinput.input(): i += 1 # not sure about the proper encoding to use try: line = line.decode('utf8') if line.startswith('~t96;'): # should be a word ? entry = analyse_word_entry.parse_one(line) syl_list = wsl_to_kaulo.check_entry(entry['raw']) for sino, readings in syl_list: for r in readings: syl = wsl_to_kaulo.convert(r) if syl is not None: good[sino].add(syl) else: bad[sino].add(r) except UnicodeDecodeError: print "encoding error on line", i # let's build a reverse index of problems readings_dic = defaultdict(list) for sino, readings in bad.iteritems(): for r in readings: readings_dic[r].append(sino) for k, v in good.items(): good[k] = list(v) for k, v in bad.items(): bad[k] = list(v) print json.dumps((readings_dic, good, bad))
def process_buffer(buf, list_of_results): entry = analyse_word_entry.parse_one("".join(buf)) if entry: if len(list_of_results) > 0 and list_of_results[-1]["entry"] == entry["entry"]: list_of_results[-1]["heteronyms"].append(entry) else: list_of_results.append({"entry": entry["entry"], "heteronyms": [entry]}) else: print "unanalyzed", "".join(buf).encode("utf8")
def process_buffer(buf, list_of_results): entry = analyse_word_entry.parse_one("".join(buf)) if entry: if len(list_of_results ) > 0 and list_of_results[-1]['entry'] == entry['entry']: list_of_results[-1]['heteronyms'].append(entry) else: list_of_results.append({ 'entry': entry['entry'], 'heteronyms': [entry] }) else: print "unanalyzed", "".join(buf).encode("utf8")
def main(): i = 0 for line in fileinput.input(): i += 1 # not sure about the proper encoding to use # Perl actually does a better job on this, original encoding is CP950 try: line = line.decode('utf8') if line.startswith('~t96;'): # should be a word ? entry = analyse_word_entry.parse_one(line) print(analyse_word_entry.html_of_entry(entry)).encode('utf8') except UnicodeDecodeError: print "encoding error on line", i
def main(): i = 0 for line in fileinput.input(): i += 1 # not sure about the proper encoding to use # Perl actually does a better job on this, original encoding is CP950 try: line = line.decode('utf8') if line.startswith('~t96;'): # should be a word ? entry = analyse_word_entry.parse_one(line) print (analyse_word_entry.html_of_entry(entry)).encode('utf8') except UnicodeDecodeError: print "encoding error on line", i
def main():
    """Load word entries from the input into the graph DB, committing
    a transaction every 50 input lines."""
    i = 0  # input line counter; also drives the batch commits below
    tx = g.cypher.begin()
    #tx = DummyTx(g)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        # Perl actually does a better job on this, original encoding is CP950
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                if entry is None:
                    continue  # unparseable entry: skip it
                merge_in_graph(tx, entry)
                # Commit every 50 lines to keep transactions small,
                # then open a fresh one for the next batch.
                if (i % 50) == 0:
                    #print i
                    tx.commit()
                    tx = g.cypher.begin()
        except UnicodeDecodeError:
            print "encoding error on line", i
    # Flush whatever remains in the final, partial batch.
    tx.commit()