def main():
    """Collect converted and unconvertible readings and print them as JSON.

    Every input line (from ``fileinput.input()``) that starts with ``~t96;``
    is parsed with ``analyse_word_entry.parse_one``; the entry's ``'raw'``
    value is handed to ``wsl_to_kaulo.check_entry``, which is iterated as
    ``(sino, readings)`` pairs.  Each reading goes through
    ``wsl_to_kaulo.convert``: non-``None`` results are gathered per ``sino``
    in one table, readings for which ``convert`` returned ``None`` in
    another.

    A ``UnicodeDecodeError`` raised anywhere while handling a line is
    reported together with the 1-based line number and the line is skipped.

    The printed JSON is a three-element array: the reverse index
    (failed reading -> list of sinos), the converted table and the failed
    table, with every set turned into a list.
    """
    converted = defaultdict(set)
    rejected = defaultdict(set)

    for lineno, raw_line in enumerate(fileinput.input(), 1):
        # not sure about the proper encoding to use
        try:
            text = raw_line.decode('utf8')
            if not text.startswith('~t96;'):
                continue
            # should be a word ?
            entry = analyse_word_entry.parse_one(text)
            for sino, readings in wsl_to_kaulo.check_entry(entry['raw']):
                for reading in readings:
                    syllable = wsl_to_kaulo.convert(reading)
                    if syllable is None:
                        rejected[sino].add(reading)
                    else:
                        converted[sino].add(syllable)
        except UnicodeDecodeError:
            # Single-argument form prints the same text as the two-item
            # print statement: message, one space, line number.
            print("%s %d" % ("encoding error on line", lineno))

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in rejected.items():
        for reading in readings:
            readings_dic[reading].append(sino)

    # json cannot serialise sets: swap each one for a list, in place so the
    # tables keep their existing key order.
    for table in (converted, rejected):
        for key in list(table):
            table[key] = list(table[key])

    print(json.dumps((readings_dic, converted, rejected)))
def main():
    """Collect converted and unconvertible readings and print them as JSON.

    Every input line (from ``fileinput.input()``) that starts with ``~t96;``
    is parsed with ``analyse_word_entry.parse_one``; the entry's ``'raw'``
    value is handed to ``wsl_to_kaulo.check_entry``, which is iterated as
    ``(sino, readings)`` pairs.  Each reading goes through
    ``wsl_to_kaulo.convert``: non-``None`` results are gathered per ``sino``
    in one table, readings for which ``convert`` returned ``None`` in
    another.

    A ``UnicodeDecodeError`` raised anywhere while handling a line is
    reported together with the 1-based line number and the line is skipped.

    The printed JSON is a three-element array: the reverse index
    (failed reading -> list of sinos), the converted table and the failed
    table, with every set turned into a list.
    """
    converted = defaultdict(set)
    rejected = defaultdict(set)

    for lineno, raw_line in enumerate(fileinput.input(), 1):
        # not sure about the proper encoding to use
        try:
            text = raw_line.decode('utf8')
            if not text.startswith('~t96;'):
                continue
            # should be a word ?
            entry = analyse_word_entry.parse_one(text)
            for sino, readings in wsl_to_kaulo.check_entry(entry['raw']):
                for reading in readings:
                    syllable = wsl_to_kaulo.convert(reading)
                    if syllable is None:
                        rejected[sino].add(reading)
                    else:
                        converted[sino].add(syllable)
        except UnicodeDecodeError:
            # Single-argument form prints the same text as the two-item
            # print statement: message, one space, line number.
            print("%s %d" % ("encoding error on line", lineno))

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in rejected.items():
        for reading in readings:
            readings_dic[reading].append(sino)

    # json cannot serialise sets: swap each one for a list, in place so the
    # tables keep their existing key order.
    for table in (converted, rejected):
        for key in list(table):
            table[key] = list(table[key])

    print(json.dumps((readings_dic, converted, rejected)))
def merge_in_graph(tx, e):
    """Queue on ``tx`` the Cypher statements that store one entry.

    ``tx`` only needs an ``append(statement, parameters)`` method; it is
    presumably a Cypher transaction object -- confirm against the caller.

    ``e`` is a dict providing at least ``'entry'``, ``'nh'`` and
    ``'sentences'`` (items with ``'lang'`` and ``'sentence'``).  It is
    modified in place: ``e['key']`` is set to ``e['entry']`` joined with
    ``e['nh']``, and the whole dict is then passed as the parameters of the
    word statement, which also names ``POS`` and ``body``.

    Statements are appended in this order: the form, the word, one per
    sentence, then for each ``(sino, readings)`` pair yielded by
    ``wsl_to_kaulo.check_entry(e['entry'])`` one syllable statement followed
    by one statement per reading -- a romanisation when
    ``wsl_to_kaulo.convert`` returns a value, an error node when it returns
    ``None``.
    """
    key = e['key'] = "".join((e['entry'], e['nh']))

    # Adjacent literals are concatenated by the compiler into one statement.
    stmt_form = " MERGE (:NgoForm {form:{form}}) "
    stmt_word = (
        " MATCH (f:NgoForm {form:{key}})"
        " MERGE (f) -[:entry]-> (:NgoWord {key: {key}, entry: {entry},"
        " nh: {nh}, POS: {POS}, body: {body}}) "
    )
    stmt_syl = (
        " MATCH (w:NgoForm {form: {key}})"
        " MERGE (s:NgoSyl {raw: {raw}})"
        " ON CREATE SET s.sino = {sino}, s.wsl = {wsl}"
        " MERGE (w) -[:contains {nth: {n}}]-> (s) "
    )
    stmt_rom = (
        " MATCH (s:NgoSyl {raw: {syl}})"
        " MERGE (r:NgoRom {wsl: {wsl}})"
        " ON CREATE SET r.rom = {rom}"
        " MERGE (s) -[:contains]-> (r) "
    )
    stmt_err = (
        " MATCH (s:NgoSyl {raw: {syl}})"
        " MERGE (e:NgoError {wsl: {wsl}})"
        " MERGE (s) -[:contains]-> (e) "
    )
    stmt_sentence = (
        " MATCH (w:NgoWord {key: {key}})"
        " CREATE (s:NgoSentence {lang: {lang}, text:{sentence}})"
        " CREATE (w) -[:NgoDef {n: {n}}]-> (s) "
    )

    tx.append(stmt_form, {'form': key})
    tx.append(stmt_word, e)

    for index, item in enumerate(e['sentences']):
        sentence_params = {
            'key': key,
            'lang': item['lang'],
            'sentence': item['sentence'],
            'n': index,
        }
        tx.append(stmt_sentence, sentence_params)

    for position, (sino, readings) in enumerate(
            wsl_to_kaulo.check_entry(e['entry'])):
        wsl = "/".join(readings)
        # The syllable node is identified by the character followed by all
        # of its readings.
        raw = sino + wsl
        syl_params = {
            'raw': raw,
            'sino': sino,
            'wsl': wsl,
            'key': key,
            'n': position,
        }
        tx.append(stmt_syl, syl_params)

        for reading in readings:
            rom = wsl_to_kaulo.convert(reading)
            if rom is None:
                tx.append(stmt_err, {'syl': raw, 'wsl': reading})
            else:
                tx.append(stmt_rom, {'syl': raw, 'rom': rom, 'wsl': reading})
def merge_in_graph(tx, e):
    """Queue on ``tx`` the Cypher statements that store one entry.

    ``tx`` only needs an ``append(statement, parameters)`` method; it is
    presumably a Cypher transaction object -- confirm against the caller.

    ``e`` is a dict providing at least ``'entry'``, ``'nh'`` and
    ``'sentences'`` (items with ``'lang'`` and ``'sentence'``).  It is
    modified in place: ``e['key']`` is set to ``e['entry']`` joined with
    ``e['nh']``, and the whole dict is then passed as the parameters of the
    word statement, which also names ``POS`` and ``body``.

    Statements are appended in this order: the form, the word, one per
    sentence, then for each ``(sino, readings)`` pair yielded by
    ``wsl_to_kaulo.check_entry(e['entry'])`` one syllable statement followed
    by one statement per reading -- a romanisation when
    ``wsl_to_kaulo.convert`` returns a value, an error node when it returns
    ``None``.
    """
    key = e['key'] = "".join((e['entry'], e['nh']))

    # Adjacent literals are concatenated by the compiler into one statement.
    stmt_form = " MERGE (:NgoForm {form:{form}}) "
    stmt_word = (
        " MATCH (f:NgoForm {form:{key}})"
        " MERGE (f) -[:entry]-> (:NgoWord {key: {key}, entry: {entry},"
        " nh: {nh}, POS: {POS}, body: {body}}) "
    )
    stmt_syl = (
        " MATCH (w:NgoForm {form: {key}})"
        " MERGE (s:NgoSyl {raw: {raw}})"
        " ON CREATE SET s.sino = {sino}, s.wsl = {wsl}"
        " MERGE (w) -[:contains {nth: {n}}]-> (s) "
    )
    stmt_rom = (
        " MATCH (s:NgoSyl {raw: {syl}})"
        " MERGE (r:NgoRom {wsl: {wsl}})"
        " ON CREATE SET r.rom = {rom}"
        " MERGE (s) -[:contains]-> (r) "
    )
    stmt_err = (
        " MATCH (s:NgoSyl {raw: {syl}})"
        " MERGE (e:NgoError {wsl: {wsl}})"
        " MERGE (s) -[:contains]-> (e) "
    )
    stmt_sentence = (
        " MATCH (w:NgoWord {key: {key}})"
        " CREATE (s:NgoSentence {lang: {lang}, text:{sentence}})"
        " CREATE (w) -[:NgoDef {n: {n}}]-> (s) "
    )

    tx.append(stmt_form, {'form': key})
    tx.append(stmt_word, e)

    for index, item in enumerate(e['sentences']):
        sentence_params = {
            'key': key,
            'lang': item['lang'],
            'sentence': item['sentence'],
            'n': index,
        }
        tx.append(stmt_sentence, sentence_params)

    for position, (sino, readings) in enumerate(
            wsl_to_kaulo.check_entry(e['entry'])):
        wsl = "/".join(readings)
        # The syllable node is identified by the character followed by all
        # of its readings.
        raw = sino + wsl
        syl_params = {
            'raw': raw,
            'sino': sino,
            'wsl': wsl,
            'key': key,
            'n': position,
        }
        tx.append(stmt_syl, syl_params)

        for reading in readings:
            rom = wsl_to_kaulo.convert(reading)
            if rom is None:
                tx.append(stmt_err, {'syl': raw, 'wsl': reading})
            else:
                tx.append(stmt_rom, {'syl': raw, 'rom': rom, 'wsl': reading})