Example #1
def write_instances(instance_list, out_path, type, overwrite=False):

    if os.path.exists(out_path) and not overwrite:
        SPLIT_LOG.error('File "{}" already exists and overwrite flag not set. Skipping!'.format(out_path))
        return

    # Create the output directory if need be
    if os.path.dirname(out_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

    num_sents = len(instance_list)
    if num_sents > 0:
        xc = XigtCorpus()
        for inst in instance_list:
            xc.append(inst)

        print("Writing {} instances to {}...".format(num_sents, out_path))
        sort_corpus(xc)
        with open(out_path, 'w', encoding='utf-8') as f:
            xigtxml.dump(f, xc)
    else:
        SPLIT_LOG.warning("No instances allocated for {}. Skipping file.".format(type))
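These snippets are excerpts from larger projects, so names such as SPLIT_LOG and sort_corpus are project code. A minimal sketch of the imports that the xigt calls themselves assume:

import os
import logging
from xigt import XigtCorpus, Igt, Tier, Item
from xigt.codecs import xigtxml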
Example #2
def separate_tiers(args):
    tiers = set(args.tiers)
    # assuming XML for now
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        sep_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            sep_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type in tiers]))
        with open(args.outfile, 'w') as outstream:
            xigtxml.dump(outstream, sep_xc)

    if not args.remainder: return
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        rem_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            rem_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type not in tiers]))
        with open(args.remainder, 'w') as outstream:
            xigtxml.dump(outstream, rem_xc)
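separate_tiers expects an argparse-style namespace; a hypothetical invocation (file names and tier types invented for illustration):

from types import SimpleNamespace

args = SimpleNamespace(infile='in.xml', outfile='words.xml',
                       remainder='rest.xml', tiers=['words', 'glosses'])
separate_tiers(args)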
Example #3
def filter_corpus(filelist, outpath, **kwargs):

    require_lang        = kwargs.get('require_lang', False)
    require_gloss       = kwargs.get('require_gloss', False)
    require_trans       = kwargs.get('require_trans', False)
    require_aln         = kwargs.get('require_aln', False)
    require_gloss_pos   = kwargs.get('require_gloss_pos', False)
    require_grammatical = kwargs.get('require_grammatical', False)
    max_instances       = kwargs.get('max_instances', 0)

    xc, examined, failures, successes = do_filter(filelist, require_lang, require_gloss, require_trans,
                                                  require_aln, require_gloss_pos, require_grammatical, max_instances)

    # Only create a file if there are some instances to write...
    if len(xc) > 0:

        # Make sure the directory exists that contains the output.
        if os.path.dirname(outpath):
            os.makedirs(os.path.dirname(outpath), exist_ok=True)

        with open(outpath, 'w', encoding='utf-8') as out_f:
            FILTER_LOG.log(1000, "{} instances processed, {} filtered out, {} remain.".format(examined, failures, successes))
            FILTER_LOG.log(1000, 'Writing remaining instances to file "{}"...'.format(os.path.basename(outpath)))
            xigtxml.dump(out_f, xc)
            FILTER_LOG.log(1000, "Success.")

    else:
        print("No instances remain after filtering. Skipping.")
Example #4
File: odin.py Project: xigt/xigt
def _xigt_import(infile, outfile, options):
    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        igts = odin_igts(in_fh, options)
        xc = XigtCorpus(
            igts=igts,
            nsmap=_nsmap,
            mode='transient'
        )
        xigtxml.dump(out_fh, xc)
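odin_igts presumably yields Igt objects lazily, and mode='transient' lets dump stream them out without holding the whole corpus in memory. The same pattern in isolation, with a stand-in generator and a hypothetical output path:

def igt_gen():  # stands in for odin_igts
    yield Igt(id='i1')

xc = XigtCorpus(igts=igt_gen(), mode='transient')
with open('tiny.xml', 'w') as out_f:
    xigtxml.dump(out_f, xc)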
Example #5
def write(out_fn, fn_idx):
    xc = XigtCorpus()
    for fn, igt_indices in fn_idx.items():
        # if possible, try to decode needed igts only and skip the rest
        in_xc = xigtxml.load(fn, mode='transient')
        # ignoring corpus-level metadata
        xc.extend(igt for i, igt in enumerate(in_xc) if i in igt_indices)
    # assume the nsmap of the first igt is the same for all
    if xc.igts: xc.nsmap = xc[0].nsmap
    xigtxml.dump(out_fn, xc)
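fn_idx maps each input file to the set of IGT positions wanted from it, so a hypothetical call (paths invented) looks like:

# take IGTs 0 and 2 from one corpus file and IGT 5 from another
write('subset.xml', {'corpus1.xml': {0, 2}, 'corpus2.xml': {5}})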
Example #6
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Normalizing {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            normalize_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        normalize_corpus(xc)
        print(xigtxml.dumps(xc))
Example #7
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Cleaning {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            clean_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        clean_corpus(xc)
        print(xigtxml.dumps(xc))
Example #8
def _xigt_import(infile, outfile, options):
    with open(infile, "r") as in_fh, open(outfile, "w") as out_fh:
        igts = odin_igts(in_fh, options)
        xc = XigtCorpus(
            igts=igts,
            attributes={
                "xmlns:olac": "http://www.language-archives.org/OLAC/1.1/",
                "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            },
            mode="transient",
        )
        xigtxml.dump(out_fh, xc)
Example #9
def xigt_import(infile, outfile, options=None):

    if options is None:
        options = {}
    options.setdefault("tier_types", default_tier_types)
    options.setdefault("alignments", default_alignments)
    options.setdefault("record_markers", default_record_markers)
    options.setdefault("attribute_map", default_attribute_map)

    with open(infile, "r") as in_fh, open(outfile, "w") as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode="transient")
        xigtxml.dump(out_fh, xc)
Example #10
def xigt_import(infile, outfile, options=None):

    if options is None:
        options = {}
    options.setdefault('tier_types', default_tier_types)
    options.setdefault('alignments', default_alignments)
    options.setdefault('record_markers', default_record_markers)
    options.setdefault('attribute_map', default_attribute_map)
    options.setdefault('error_recovery_method', default_error_recovery_method)

    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode='transient')
        xigtxml.dump(out_fh, xc)
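A hypothetical call; any option left unset falls back to the defaults above, since setdefault leaves caller-supplied entries alone:

# defaults only
xigt_import('corpus.tb', 'corpus.xigt')
# override a single default (my_tier_types is a hypothetical mapping)
xigt_import('corpus.tb', 'corpus.xigt', options={'tier_types': my_tier_types})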
Example #11
def eval_classifier(c, inst_list, context_feats=False, posdict=None):
    """

    :param c: The classifier
    :param inst_list: A list of Igt instances to test against. Must already have POS tags.
    """

    gold_sents = []
    eval_sents = []

    to_dump = XigtCorpus()

    for inst in inst_list:

        to_tag = inst.copy()
        strip_pos(to_tag)

        # Do the classification.
        to_tag.classify_gloss_pos(c, lowercase=True,
                                  feat_next_gram=context_feats,
                                  feat_prev_gram=context_feats,
                                  posdict=posdict)


        to_dump.append(to_tag)
        # Fix the tags...
        # fix_ctn_gloss_line(to_tag, tag_method=INTENT_POS_CLASS)

        # Now, retrieve eval/gold.
        eval_tags = [v.value() for v in to_tag.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_CLASS)]
        gold_tags = [v.value() for v in inst.get_pos_tags(GLOSS_WORD_ID, tag_method=INTENT_POS_MANUAL)]


        tag_tokens  = [POSToken('a', label=l) for l in eval_tags]
        gold_tokens = [POSToken('a', label=l) for l in gold_tags]

        if len(tag_tokens) != len(gold_tokens):
            print("SEQUENCE LENGTHS ARE MISMATCHED")
            continue

        gold_sents.append(gold_tokens)
        eval_sents.append(tag_tokens)


    with open('./enriched_ctn_dev.xml', 'w') as out_f:
        xigtxml.dump(out_f, to_dump)
    return poseval(eval_sents, gold_sents, details=True, csv=True, matrix=True)
Example #12
def divide_corpus(args):
    infile = args.infile
    outdir = args.outdir
    igt_index = [0]  # just a list so I don't have to nonlocal it later
    indices = set()

    def make_filename(fn):
        return os.path.join(outdir, fn + '.xml')

    # this should make reading the corpus faster
    def selective_decode_igt(elem):
        idx = igt_index.pop()
        if idx not in indices:
            igt = None
        else:
            igt = xigtxml.default_decode_igt(elem)
            indices.remove(idx)
        igt_index.append(idx + 1)
        return igt

    # default key function; overridden just below when --meta is given
    get_key = lambda igt: None

    if args.meta is not None:
        metatype, func = args.meta
        func = eval('lambda m:{}'.format(func))
        get_key = lambda igt: next(
            (func(m) for m in igt.get_meta(metatype, default=[]) if m is not None),
            None
        )

    # get a mapping of code to the indexed position of each IGT
    keymap = defaultdict(set)
    with open(infile, 'r') as in_f:
        xc = xigtxml.load(in_f, mode='transient')
        for i, igt in enumerate(xc):
            key = get_key(igt)
            keymap[key].add(i)

    xigtxml.decode_igt = selective_decode_igt

    # now group IGTs with similar languages into a file
    for key, indices in keymap.items():  # rebinds `indices`, which selective_decode_igt reads
        if key is None:
            key = '-others-'  # FIXME not guaranteed to be unique
        igt_index = [0]
        with open(infile, 'r') as in_f, open(make_filename(key), 'w') as out_f:
            xc = xigtxml.load(in_f, mode='transient')
            xigtxml.dump(out_f, xc)
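Reassigning xigtxml.decode_igt patches the codec module globally, so any later load in the same process is affected too. A sketch of a safer variant of the same trick:

original_decode_igt = xigtxml.decode_igt
try:
    xigtxml.decode_igt = selective_decode_igt
    # ... perform the selective loads and dumps ...
finally:
    xigtxml.decode_igt = original_decode_igt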
Example #13
def xigt_import(infile, outfile, options=None):

    if options is None:
        options = {}
    options.setdefault('record_markers', default_record_markers)
    options.setdefault('igt_attribute_map', default_igt_attribute_map)
    options.setdefault('tier_map', default_tier_map)
    options.setdefault('make_phrase_tier', default_make_phrase_tier)
    options.setdefault('tier_types', default_tier_types)
    options.setdefault('alignments', default_alignments)
    options.setdefault('error_recovery_method', default_error_recovery_method)

    # just use existing info to create marker-based alignment info
    options['tb_alignments'] = _make_tb_alignments(options)

    with open(infile, 'r') as in_fh, open(outfile, 'w') as out_fh:
        tb = toolbox.read_toolbox_file(in_fh)
        igts = toolbox_igts(tb, options)
        xc = XigtCorpus(igts=igts, mode='transient')
        xigtxml.dump(out_fh, xc)
Example #14
def run(args):
    xc = xigtxml.load(args.infile)
    if args.igt_key:
        logging.info('Sorting %s IGTs' % args.infile)
        xc.sort(key=make_sortkey(args.igt_key))
    if args.tier_key:
        logging.info('Sorting %s tiers by key' % args.infile)
        for igt in xc:
            igt.sort(key=make_sortkey(args.tier_key))
    elif args.tier_deps:
        logging.info('Sorting %s tiers by ref-dependencies' % args.infile)
        refattrs = [ra.strip() for ra in args.tier_deps.split(',')]
        for igt in xc:
            igt.sort_tiers(refattrs=refattrs)
    if args.item_key:
        logging.info('Sorting %s items by key' % args.infile)
        for igt in xc:
            for tier in igt:
                tier.sort(key=make_sortkey(args.item_key))
    if args.in_place:
        xigtxml.dump(args.infile, xc)
    else:
        print(xigtxml.dumps(xc))
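make_sortkey is project code that turns a key spec into a key function for sort(); a hypothetical stand-in, just so the example is self-contained (the real version likely parses richer specs):

def make_sortkey(key_spec):
    # sort by the named attributes, in the order given
    attrs = [a.strip() for a in key_spec.split(',')]
    return lambda obj: [getattr(obj, a, '') or '' for a in attrs]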
Example #15
    else:
        return xigtxml.default_decode_meta(elem)

### Encoding ###

def matrix_encode_meta(meta):
    metatype = meta.type.lower()
    if metatype in ('judgment', 'vetted', 'phenomena'):
        attributes = dict(type=meta.type, **meta.attributes)
        e = etree.Element('meta', attrib=attributes)
        if metatype == 'phenomena':
            for phenomenon in meta.content:
                p = etree.Element('phenomenon')
                p.text = phenomenon
                e.append(p)
        return e
    else:
        return xigtxml.default_encode_meta(meta)

### Function maps ###

xigtxml.decode_meta = matrix_decode_meta
xigtxml.encode_meta = matrix_encode_meta

if __name__ == '__main__':
    import sys
    f = sys.argv[1]
    with open(f, 'r') as in_f:
        xc = xigtxml.load(in_f)
    print(xigtxml.dumps(xc, pretty_print=True))
    with open('abkhaz-out.xigt', 'w') as out_f:
        xigtxml.dump(out_f, xc, pretty_print=True)
Example #16
def do_projection(**kwargs):
    """
    (Re)project the
    :param aln_method: The alignment method
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures  = 0

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:
        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)
        for inst in xc:
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"'+inst.id+'"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1
            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # Query whether we want to require to use only trees
            # where the alignment is 100%.
            completeness_requirement = kwargs.get('completeness', default=0, t=float)

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst, aln_method=aln_method, completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst, proj_aln_method=aln_method, completeness_requirement=completeness_requirement)
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)
                inst.sort_tiers()

        out_path = kwargs.get(ARG_OUTFILE)
        # Try to make the folder if it doesn't already exist.
        if os.path.dirname(out_path):
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

        PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
        with open(out_path, 'w', encoding='utf-8') as out_f:
            xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(len(xc), successes, failures))
Example #17
    p.add_argument('-d', '--dest', required=True, help='Output directory for modified files.')
    p.add_argument('-f', '--force', action='store_true', help='Force overwrite existing files.')

    args = p.parse_args()

    for path in args.FILE:
        with open(path, 'r', encoding='utf-8') as f:
            xc = xigtxml.load(f, mode=INCREMENTAL)

            for inst in xc:
                JUDG_LOG.info('Processing instance "{}"'.format(inst.id))
                for item in xigtpath.findall(inst, 'tier[@type=' + ODIN_TIER_TYPE + ']/item'):

                    # Skip blank lines
                    if item.value() is None:
                        continue

                    # Get the judgment and add it if it is non-null.
                    j = get_judgment(item.value())
                    if j is not None:
                        item.attributes[ODIN_JUDGMENT_ATTRIBUTE] = j
                        JUDG_LOG.debug('Judgment found on item "{}"'.format(item.id))

            # Make the output directory if it doesn't exist.
            makedirs(args.dest, exist_ok=True)
            outpath = os.path.join(args.dest, os.path.basename(path))

            if not os.path.exists(outpath) or args.force:
                with open(outpath, 'w', encoding='utf-8') as out_f:
                    xigtxml.dump(out_f, xc)
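get_judgment is project code that extracts a grammaticality judgment from a line's text. A hypothetical stand-in (ODIN-style lines conventionally mark judgments with a leading '*' or '?'):

def get_judgment(line):
    line = line.strip()
    return line[0] if line and line[0] in ('*', '?') else None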
Example #18
def enrich(**kwargs):

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================

    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment not already present in file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #    A) "trans" option is given for parse
        #    B) "proj" option is given for parse.
        # -------------------------------------------
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Initializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #    A) "class" option is given for pos
        #    B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)


        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                gl = logging.getLogger('giza')
                gl.critical(str(gae))
                raise gae

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            # -------------------------------------------
            # Define the reasons for failure
            # -------------------------------------------
            F_GLOSS_LINE   = "NOGLOSS"
            F_LANG_LINE    = "NOLANG"
            F_TRANS_LINE   = "NOTRANS"
            F_BAD_LINES    = "BADLINES"
            F_L_G_ALN      = "L_G_ALIGN"
            F_T_G_ALN      = "G_T_ALIGN"
            F_NO_TRANS_POS = "NO_POS_TRANS"
            F_PROJECTION   = "PROJECTION"
            F_UNKNOWN      = "UNKNOWN"
            F_PARSELEN     = "OVER_MAX_LENGTH"


            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    try:
                        return func(inst)
                    except NoNormLineException:
                        return None

                gl  = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)


                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:

                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException as glae:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException as glae:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException as pspe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException as pe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)



                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # ENRICH_LOG.warn("Unknown Error occurred processing instance {}".format(inst.id))
                ENRICH_LOG.debug(e)
                # raise(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()


            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
            xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
        else:
            xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
Example #19
elif args.subcommand == CMD_FILTER:
    filter_corpus(flatten_list(getattr(args, ARG_INFILE)), getattr(args, ARG_OUTFILE), **vars(args))

# EXTRACT
elif args.subcommand == CMD_EXTRACT:
    extract_from_xigt(input_filelist=flatten_list(args.FILE), **vars(args))

# EVAL
elif args.subcommand == CMD_EVAL:
    evaluate_intent(flatten_list(args.FILE),
                    eval_alignment=args.alignment,
                    eval_ds=args.ds_projection,
                    eval_posproj=args.pos_projection,
                    classifier_path=args.classifier,
                    classifier_feats=args.classifier_feats,
                    eval_tagger=args.pos_tagger,
                    gold_tagmap=args.tagmap_gold,
                    trans_tagmap=args.tagmap_trans,
                    outpath=args.output)

# TEXT CONVERT
elif args.subcommand == CMD_TEXT:
    xc = text_to_xigtxml(args.FILE)
    dump(args.OUT_FILE, xc)

# PROJECT
elif args.subcommand == CMD_PROJECT:
    do_projection(**vars(args))

# REPRO
elif args.subcommand == CMD_REPRO:
    reproduce(args.action)
Example #20
            if head_i == -1:
                head_w = 'ROOT'
                head_i = 0
            else:
                head_w = w_tier[int(head_i)-1].value()

            child_t = Terminal(dep_w, index=int(dep))
            head_t = Terminal(head_w, index=head_i)
            edges.append(DepEdge(head=head_t, dep=child_t))


    dt = build_dep_edges(edges)
    return dt

if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('IN_FILE', type=existsfile)
    p.add_argument('OUT_FILE')

    args = p.parse_args()

    xc = naacl_to_xigt(args.IN_FILE)
    with open(args.OUT_FILE, 'w') as out_f:
        dump(out_f, xc)

class test_naacl(TestCase):

    def test_parse(self):
        p = os.path.join(testfile_dir, 'naacl/ger.naacl')
        o = os.path.join(testfile_dir, 'naacl/ger.xml')
        xc = naacl_to_xigt(p)
        with open(o, 'w') as out_f:
            dump(out_f, xc)
Example #21
def convert_pml(aln_path, out_path, hindi=True):

    if hindi:
        igt_data = retrieve_hindi()
    else:
        igt_data = retrieve_naacl()

    a_root = load_xml(aln_path)
    doc_a  = a_root.find(".//reffile[@name='document_a']").get('href')
    doc_b  = a_root.find(".//reffile[@name='document_b']").get('href')

    doc_a = os.path.join(os.path.dirname(aln_path), doc_a)
    doc_b = os.path.join(os.path.dirname(aln_path), doc_b)

    # Load the sentences for each document.
    a_sents, a_glossed = load_sents(doc_a)
    b_sents, b_glossed = load_sents(doc_b)

    sent_alignments = a_root.findall(".//body/LM")

    assert (a_glossed and not b_glossed) or (b_glossed and not a_glossed), "Only one file should have glosses"

    xc = XigtCorpus()

    for sent_alignment in sent_alignments:

        # Get the sentence id...
        aln_id = sent_alignment.attrib.get('id')
        a_snt_id = re.search('^.+?-(.*)$', aln_id).group(1)
        if a_snt_id not in igt_data:
            continue

        # Get the text and tokens from the naacl data.
        pre_txt, lang_txt, gloss_txt, trans_txt = igt_data[a_snt_id]
        lang_tokens = lang_txt.split()
        gloss_tokens = gloss_txt.split()
        trans_tokens = trans_txt.split()

        a_snt_ref = sent_alignment.find('./tree_a.rf').text.split('#')[1]
        b_snt_ref = sent_alignment.find('./tree_b.rf').text.split('#')[1]

        word_alignments = sent_alignment.findall('./node_alignments/LM')

        a_snt, a_edges = a_sents[a_snt_ref]
        b_snt, b_edges = b_sents[b_snt_ref]

        # -------------------------------------------
        # Skip sentences if they are not found for whatever reason
        # -------------------------------------------
        if not a_snt or not b_snt:
            continue

        assert isinstance(a_snt, Sentence)
        assert isinstance(b_snt, Sentence)

        # -------------------------------------------
        # Start constructing the IGT Instance.
        # -------------------------------------------

        trans_snt, trans_indices = a_snt, a_edges
        gloss_snt, gloss_indices = b_snt, b_edges
        if a_glossed:
            trans_snt, trans_indices = b_snt, b_edges
            gloss_snt, gloss_indices = a_snt, a_edges

        # Hindi data carries its own tokens and POS tags on the sentence objects.
        if hindi:
            lang_tokens  = [w.text for w in gloss_snt]
            lang_postags = [w.pos for w in gloss_snt]
            lang_txt     = ' '.join(lang_tokens)

            trans_tokens  = [w.text for w in trans_snt]
            trans_postags = [w.pos for w in trans_snt]
            trans_txt     = ' '.join(trans_tokens)

            gloss_tokens  = [w.gloss if w.gloss else 'NULL' for w in gloss_snt]
            gloss_postags = lang_postags
            gloss_txt     = ' '.join(gloss_tokens)

        inst = Igt(id=re.sub('s-', 'igt', a_snt_ref))
        nt   = Tier(type=ODIN_TIER_TYPE, id=NORM_ID, attributes={STATE_ATTRIBUTE: NORM_STATE})
        ll   = Item(id='n1', attributes={ODIN_TAG_ATTRIBUTE: ODIN_LANG_TAG}, text=lang_txt)
        gl   = Item(id='n2', attributes={ODIN_TAG_ATTRIBUTE: ODIN_GLOSS_TAG}, text=gloss_txt)
        tl   = Item(id='n3', attributes={ODIN_TAG_ATTRIBUTE: ODIN_TRANS_TAG}, text=trans_txt)
        nt.extend([ll, gl, tl])
        inst.append(nt)


        # -------------------------------------------
        # Handle the phrase tiers
        # -------------------------------------------
        generate_lang_phrase_tier(inst)
        generate_trans_phrase_tier(inst)

        def process_postags(sent, tokens):
            postags = []
            for i, token in enumerate(tokens):
                word = sent.getorder(i+1)
                if word is None:
                    postags.append(None)
                else:
                    postags.append(word.pos)
            return postags

        # -------------------------------------------
        # Now, handle the translation words.
        # -------------------------------------------
        tt = create_word_tier(ODIN_TRANS_TAG, trans_tokens, trans_phrase(inst)[0])
        inst.append(tt)

        if not hindi:
            trans_postags = process_postags(trans_snt, trans_tokens)

        add_pos_tags(inst, tt.id, trans_postags, tag_method=INTENT_POS_MANUAL)


        # -------------------------------------------
        # Handle the words tiers...
        # -------------------------------------------
        wt = create_word_tier(ODIN_LANG_TAG, lang_tokens, lang_phrase(inst)[0])
        gwt = create_word_tier(ODIN_GLOSS_TAG, gloss_tokens, gl)
        inst.extend([wt, gwt])
        # Quickly set the alignment for the gloss words.
        for w, gw in zip(wt, gwt):
            gw.alignment = w.id


        if not hindi:
            lang_postags = process_postags(gloss_snt, gloss_tokens)
            gloss_postags = lang_postags

        add_pos_tags(inst, wt.id, lang_postags, tag_method=INTENT_POS_MANUAL)
        add_pos_tags(inst, gwt.id, gloss_postags, tag_method=INTENT_POS_MANUAL)

        create_dt_tier(inst, assemble_ds(gloss_snt, gloss_indices), wt, INTENT_DS_MANUAL)
        create_dt_tier(inst, assemble_ds(trans_snt, trans_indices), tt, INTENT_DS_MANUAL)

        # -------------------------------------------
        # Now, the word alignments.
        # -------------------------------------------
        a = Alignment()
        for word_alignment in word_alignments:
            a_ref = word_alignment.find('./a.rf').text.split('#')[1]
            b_ref = word_alignment.find('./b.rf').text.split('#')[1]

            a_word = a_snt.getid(a_ref)
            b_word = b_snt.getid(b_ref)

            if a_word is None or b_word is None:
                continue

            if not hindi:
                a_idx  = a_word.order
                b_idx  = b_word.order
            else:
                a_idx  = a_snt.index(a_word)+1
                b_idx  = b_snt.index(b_word)+1

            # Orient the pair as (trans_idx, lang_idx), regardless of which
            # document carries the glosses.
            if a_glossed:
                trans_idx = b_idx
                lang_idx  = a_idx
            else:
                trans_idx = a_idx
                lang_idx  = b_idx

            a.add((trans_idx, lang_idx))


        set_bilingual_alignment(inst, trans(inst), lang(inst), a, INTENT_ALN_MANUAL)
        set_bilingual_alignment(inst, trans(inst), gloss(inst), a, INTENT_ALN_MANUAL)

        xc.append(inst)

    with open(out_path, 'w', encoding='utf-8') as f:
        xigtxml.dump(f, xc)
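A hypothetical top-level call (paths invented), converting a PML alignment file and the two documents it references into a single Xigt corpus:

convert_pml('data/align.pml', 'out/corpus.xigt', hindi=True)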