コード例 #1
0
def main():
    i = 0
    good = defaultdict(set)
    bad = defaultdict(set)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                syl_list = wsl_to_kaulo.check_entry(entry['raw'])
                for sino, readings in syl_list:
                    for r in readings:
                        syl = wsl_to_kaulo.convert(r)
                        if syl is not None:
                            good[sino].add(syl)
                        else:
                            bad[sino].add(r)
        except UnicodeDecodeError:
            print "encoding error on line", i

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in bad.iteritems():
        for r in readings:
            readings_dic[r].append(sino)
    for k, v in good.items():
        good[k] = list(v)
    for k, v in bad.items():
        bad[k] = list(v)
    print json.dumps((readings_dic, good, bad))
コード例 #2
0
def main():
    i = 0
    good = defaultdict(set)
    bad = defaultdict(set)
    for line in fileinput.input():
        i += 1
        # not sure about the proper encoding to use
        try:
            line = line.decode('utf8')
            if line.startswith('~t96;'):
                # should be a word ?
                entry = analyse_word_entry.parse_one(line)
                syl_list = wsl_to_kaulo.check_entry(entry['raw']) 
                for sino, readings in syl_list:
                    for r in readings:
                        syl = wsl_to_kaulo.convert(r)
                        if syl is not None:
                            good[sino].add(syl)
                        else:
                            bad[sino].add(r)
        except UnicodeDecodeError:
            print "encoding error on line", i

    # let's build a reverse index of problems
    readings_dic = defaultdict(list)
    for sino, readings in bad.iteritems():
        for r in readings:
            readings_dic[r].append(sino)
    for k,v in good.items():
        good[k] = list(v)
    for k,v in bad.items():
        bad[k] = list(v)
    print json.dumps((readings_dic, good, bad))
コード例 #3
0
ファイル: graph.py プロジェクト: Taiwanese-Corpus/koktai
def merge_in_graph(tx, e):
    """Queue on ``tx`` the Cypher statements that store one entry.

    ``tx`` only needs an ``append(statement, parameters)`` method. ``e`` is a
    mapping read here for ``entry``, ``nh`` and ``sentences`` (items with
    ``lang`` and ``sentence``); it is also passed whole as the parameter map
    of the word statement, whose template additionally names ``POS`` and
    ``body``. As a side effect ``e['key']`` is set to ``entry`` and ``nh``
    concatenated.

    Statements are appended in this order: the NgoForm merge, the NgoWord
    merge, one NgoSentence creation per sentence, then for every
    ``(sino, readings)`` pair from ``wsl_to_kaulo.check_entry(e['entry'])``
    an NgoSyl merge followed by one NgoRom merge (reading converted) or
    NgoError merge (conversion returned ``None``) per reading.
    """
    entry_key = "".join([e['entry'], e['nh']])
    e['key'] = entry_key

    form_query = """
    MERGE (:NgoForm {form:{form}})
    """
    word_query = """
    MATCH (f:NgoForm {form:{key}})
    MERGE (f) -[:entry]-> (:NgoWord {key: {key},
                                     entry: {entry},
                                     nh: {nh},
                                     POS: {POS},
                                     body: {body}})
    """
    sentence_query = """
    MATCH (w:NgoWord {key: {key}})
    CREATE (s:NgoSentence {lang: {lang}, text:{sentence}})
    CREATE (w) -[:NgoDef {n: {n}}]-> (s)
    """
    syllable_query = """
    MATCH (w:NgoForm {form: {key}})
    MERGE (s:NgoSyl {raw: {raw}})
    ON CREATE SET s.sino = {sino}, s.wsl = {wsl}
    MERGE (w) -[:contains {nth: {n}}]-> (s)
    """
    romanisation_query = """
    MATCH (s:NgoSyl {raw: {syl}})
    MERGE (r:NgoRom {wsl: {wsl}})
    ON CREATE SET r.rom = {rom}
    MERGE (s) -[:contains]->  (r)
    """
    error_query = """
    MATCH (s:NgoSyl {raw: {syl}})
    MERGE (e:NgoError {wsl: {wsl}})
    MERGE (s) -[:contains]-> (e)
    """

    # Form first, then the word hanging off it, then the word's sentences.
    tx.append(form_query, {'form': entry_key})
    tx.append(word_query, e)
    for position, item in enumerate(e['sentences']):
        sentence_params = {
            'key': entry_key,
            'lang': item['lang'],
            'sentence': item['sentence'],
            'n': position,
        }
        tx.append(sentence_query, sentence_params)

    pairs = wsl_to_kaulo.check_entry(e['entry'])
    for position, (sino, readings) in enumerate(pairs):
        joined_readings = "/".join(readings)
        # The syllable node is identified by sino followed by all readings.
        syllable_id = sino + joined_readings
        syllable_params = {
            'raw': syllable_id,
            'sino': sino,
            'wsl': joined_readings,
            'key': entry_key,
            'n': position,
        }
        tx.append(syllable_query, syllable_params)

        for reading in readings:
            rom = wsl_to_kaulo.convert(reading)
            if rom is None:
                # No conversion available: record the reading as an error.
                tx.append(error_query, {'syl': syllable_id, 'wsl': reading})
            else:
                tx.append(romanisation_query, {'syl': syllable_id,
                                               'rom': rom,
                                               'wsl': reading})
コード例 #4
0
def merge_in_graph(tx, e):
    """Append to ``tx`` every Cypher statement needed for the entry ``e``.

    ``e`` is a mapping. Its ``entry`` and ``nh`` values are concatenated
    into ``e['key']`` (the mapping is modified in place), its ``sentences``
    items are read for ``lang`` and ``sentence``, and the mapping itself is
    the parameter map of the NgoWord statement, whose template also names
    ``POS`` and ``body``. ``tx`` is used only through
    ``tx.append(statement, parameters)``.

    Append order: NgoForm, NgoWord, one NgoSentence per sentence, then per
    ``(sino, readings)`` pair returned by
    ``wsl_to_kaulo.check_entry(e['entry'])`` one NgoSyl statement followed,
    for each reading, by an NgoRom statement when ``wsl_to_kaulo.convert``
    returns a value or an NgoError statement when it returns ``None``.
    """
    key = e['key'] = "".join([e['entry'], e['nh']])

    # --- statements about the form, the word and its sentences ---
    merge_form = """
    MERGE (:NgoForm {form:{form}})
    """
    merge_word = """
    MATCH (f:NgoForm {form:{key}})
    MERGE (f) -[:entry]-> (:NgoWord {key: {key},
                                     entry: {entry},
                                     nh: {nh},
                                     POS: {POS},
                                     body: {body}})
    """
    create_sentence = """
    MATCH (w:NgoWord {key: {key}})
    CREATE (s:NgoSentence {lang: {lang}, text:{sentence}})
    CREATE (w) -[:NgoDef {n: {n}}]-> (s)
    """

    # --- statements about syllables and their readings ---
    merge_syllable = """
    MATCH (w:NgoForm {form: {key}})
    MERGE (s:NgoSyl {raw: {raw}})
    ON CREATE SET s.sino = {sino}, s.wsl = {wsl}
    MERGE (w) -[:contains {nth: {n}}]-> (s)
    """
    merge_rom = """
    MATCH (s:NgoSyl {raw: {syl}})
    MERGE (r:NgoRom {wsl: {wsl}})
    ON CREATE SET r.rom = {rom}
    MERGE (s) -[:contains]->  (r)
    """
    merge_error = """
    MATCH (s:NgoSyl {raw: {syl}})
    MERGE (e:NgoError {wsl: {wsl}})
    MERGE (s) -[:contains]-> (e)
    """

    tx.append(merge_form, {'form': key})
    tx.append(merge_word, e)

    for index, sentence in enumerate(e['sentences']):
        params = {'key': key}
        params['lang'] = sentence['lang']
        params['sentence'] = sentence['sentence']
        params['n'] = index
        tx.append(create_sentence, params)

    for index, (sino, readings) in enumerate(
            wsl_to_kaulo.check_entry(e['entry'])):
        all_readings = "/".join(readings)
        # sino plus the joined readings is the raw value naming the syllable.
        raw = sino + all_readings
        tx.append(merge_syllable, {'raw': raw,
                                   'sino': sino,
                                   'wsl': all_readings,
                                   'key': key,
                                   'n': index})
        for reading in readings:
            rom = wsl_to_kaulo.convert(reading)
            if rom is None:
                # convert() gave nothing: attach an error node instead.
                statement, params = merge_error, {'syl': raw, 'wsl': reading}
            else:
                statement = merge_rom
                params = {'syl': raw, 'rom': rom, 'wsl': reading}
            tx.append(statement, params)