def run(args):
    """Derive structure strings for the partial Chen wordlist and align it.

    Steps performed:

    1. Load ``workflow/D_Chen_partial.tsv`` (relative to ``Dataset().dir``)
       as an ``Alignments`` object keyed on ``cogids``.
    2. Add a ``structure`` column computed from ``tokens``.
    3. Keep only rows whose tokens and structure have matching lengths,
       printing every rejected row to stdout.
    4. Run ``template_alignment`` on the kept rows and write the result
       with ``filename`` set to ``workflow/D_Chen_aligned``.

    ``args`` is accepted but not referenced in this body.
    """
    dataset = Dataset()
    source_path = dataset.dir.joinpath(
        'workflow', 'D_Chen_partial.tsv').as_posix()
    wordlist = Alignments(source_path, ref='cogids')

    def _structure_from_tokens(x):
        # One space-joined group per item returned by get_structure,
        # with the groups separated by ' + '.
        groups = [' '.join(y) for y in segments.get_structure(x)]
        return basictypes.lists(' + '.join(groups))

    wordlist.add_entries('structure', 'tokens', _structure_from_tokens)
    print('[i] added segments')

    # Row 0 carries the column header; the remaining keys are row ids.
    kept = {0: list(wordlist.columns)}
    for row_id, toks, struc in wordlist.iter_rows('tokens', 'structure'):
        # `.n` lengths differ -> reported with the stronger '[!!!]' marker
        # (presumably a mismatch in the number of '+'-separated parts).
        if len(toks.n) != len(struc.n):
            print('[!!!]', toks, struc)
            continue
        # Same `.n` length but different overall length.
        if len(toks) != len(struc):
            print('[!]', toks, struc)
            continue
        kept[row_id] = wordlist[row_id]

    # Rebuild the alignments object from the filtered rows only.
    wordlist = Alignments(kept, ref='cogids')
    template_alignment(
        wordlist,
        ref='cogids',
        template='imnct+imnct+imnct+imnct+imnct+imnct',
        structure='structure',
        fuzzy=True,
        segments='tokens',
    )

    target_path = dataset.dir.joinpath(
        'workflow', 'D_Chen_aligned').as_posix()
    wordlist.output('tsv', filename=target_path, prettify=False)
def run(args):
    """Compute cross-semantic cognate ids for the aligned Chen wordlist.

    Loads ``workflow/D_Chen_aligned.tsv`` (relative to ``Dataset().dir``)
    keyed on ``cogids``, runs ``find_bad_internal_alignments`` and
    ``find_colexified_alignments`` on it, re-aligns on ``crossids`` with
    ``template_alignment`` and writes the result with ``filename`` set to
    ``workflow/D_Chen_crossids``.

    ``args`` is accepted but not referenced in this body.
    """
    dataset = Dataset()
    source_path = dataset.dir.joinpath(
        'workflow', 'D_Chen_aligned.tsv').as_posix()
    wordlist = Alignments(source_path, ref='cogids')

    find_bad_internal_alignments(wordlist)
    # Called with ref='crossids'; the alignment below uses that same ref,
    # so this call presumably populates that column -- confirm in lingrex.
    find_colexified_alignments(
        wordlist,
        cognates='cogids',
        segments='tokens',
        ref='crossids',
    )

    # re-align the data
    template = 'imnct+imnct+imnct+imnct+imnct+imnct'
    template_alignment(
        wordlist,
        ref='crossids',
        template=template,
        structure='structure',
        fuzzy=True,
        segments='tokens',
    )

    target_path = dataset.dir.joinpath(
        'workflow', 'D_Chen_crossids').as_posix()
    wordlist.output('tsv', filename=target_path, prettify=False)
from sys import argv

# Script: add structure strings to a Deepadung partial-cognate wordlist,
# drop rows whose tokens and structure disagree in length, template-align
# the rest and write the result next to the input.

# Passing 'all' on the command line selects the 'A_' files, otherwise 'D_'.
fname = '../output/A_Deepadung_' if 'all' in argv else '../output/D_Deepadung_'

alms = Alignments(fname + 'partial.tsv', ref='cogids')


def _structure_from_tokens(x):
    """Return the structure of *x* as a ``basictypes.lists`` value.

    Each item returned by ``segments.get_structure`` is space-joined and
    the resulting groups are joined with ' + '.
    """
    groups = [' '.join(y) for y in segments.get_structure(x)]
    return basictypes.lists(' + '.join(groups))


alms.add_entries('structure', 'tokens', _structure_from_tokens)
print('[i] added segments')

# Row 0 carries the column header; the remaining keys are row ids.
D = {0: list(alms.columns)}
for idx, tokens, structure in alms.iter_rows('tokens', 'structure'):
    # `.n` lengths differ -> reported with the stronger '[!!!]' marker
    # (presumably a mismatch in the number of '+'-separated parts).
    if len(tokens.n) != len(structure.n):
        print('[!!!]', tokens, structure)
        continue
    # Same `.n` length but different overall length.
    if len(tokens) != len(structure):
        print('[!]', tokens, structure)
        continue
    D[idx] = alms[idx]

# Rebuild the alignments object from the filtered rows only.
alms = Alignments(D, ref='cogids')
template_alignment(
    alms,
    ref='cogids',
    template='imnc+imnc+imnc+imnc+imnc+imnc',
    structure='structure',
    fuzzy=True,
    segments='tokens',
)
alms.output('tsv', filename=fname + 'aligned_structure', prettify=False)