Example #1
import glob
import os

# ArgPasser and XamlParser are assumed to be importable from the surrounding
# intent project; they are not defined in this snippet.

def produce_files(**c):

    # Set up the output files
    outdir = c.get('outdir')
    c['tag_out'] = os.path.join(outdir, 'ablation_tags.txt')
    c['class_out'] = os.path.join(outdir, 'ablation_class.txt')

    c['maxent_path'] = os.path.join(outdir, 'ablation-model.maxent')

    c['tag_f'] = open(c.get('tag_out'), 'w', encoding='utf-8')
    c['class_f'] = open(c.get('class_out'), 'w', encoding='utf-8')

    c = ArgPasser(c)

    xp = XamlParser(**c)

    xml_files = glob.glob(os.path.join(c.get('input_dir'), c.get('pattern', default='*.xml')))

    for x_f in xml_files:
        xp.parse(x_f, **c)
    return c
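
Example #1 re-casts its keyword arguments as an ArgPasser, a dict-like helper from the intent project whose get() accepts an optional default (positionally or as default=) and, in the later examples, an optional type cast. Below is a minimal sketch of the behavior these snippets rely on; MiniArgPasser and the sample calls are purely illustrative, not the real class.

class MiniArgPasser(dict):
    # Hypothetical stand-in for ArgPasser: dict.get() extended with an
    # optional default and an optional type cast, accepted either
    # positionally (key, default, type) or as keywords (default=, t=).
    def get(self, key, *args, default=None, t=None):
        if len(args) >= 1:
            default = args[0]
        if len(args) >= 2:
            t = args[1]
        val = super().get(key, default)
        return t(val) if (t is not None and val is not None) else val

c = MiniArgPasser({'pattern': '*.xml', 'completeness': '0.5'})
print(c.get('pattern', default='*.txt'))          # '*.xml'
print(c.get('completeness', default=0, t=float))  # 0.5
print(c.get('lowercase', True, bool))             # True

# Hypothetical invocation of produce_files (paths are placeholders):
# produce_files(outdir='out/ablation', input_dir='data/xml', pattern='*.xml')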
Example #2
import re
import sys

import intent.utils.token

# ArgPasser, env.load_posdict(), and fix_gram() are assumed to come from the
# surrounding intent project; they are not defined in this snippet.

def write_gram(token, **kwargs):

    # Re-cast the kwargs as an argpasser.
    kwargs = ArgPasser(kwargs)

    output_type = kwargs.get('type', 'classifier')
    output = kwargs.get('output', sys.stdout)

    posdict = kwargs.get('posdict', None)


    if posdict is None:
        posdict = env.load_posdict()

    # Previous tag info
    prev_gram = kwargs.get('prev_gram')
    next_gram = kwargs.get('next_gram')


    # Get heuristic alignment
    aln_labels = kwargs.get('aln_labels', [])

    # ===========================================================================
    # Break apart the token...
    # ===========================================================================
    gram = token.seq

    pos = token.goldlabel

    # Lowercase if asked for
    lower = kwargs.get('lowercase', True, bool)
    gram = gram.lower() if (gram and lower) else gram

    # Fix the various issues with the grams.
    gram = fix_gram(gram)

    # ===========================================================================
    # Do some cleaning on the gram....
    # ===========================================================================

    # Only take the first of two slashed grams
    gram = re.sub('(.*)?/(.*)', r'\1', gram)

    # Remove leading and trailing stuff
    gram = re.sub(r'^(\S+)[\-=:\[\(\]\)/\*]$', r'\1', gram)
    gram = re.sub(r'^[\-=:\[\(\]\)/\*](\S+)$', r'\1', gram)

    # ===========================================================================

    # Output the grams for a classifier
    #
    # NOTE! Only tokens that have an ASSIGNED pos tag will be written out this way!
    if output_type == 'classifier' and pos:
        output.write(pos)

        # =======================================================================
        # Get the morphemes
        # =======================================================================

        # Materialize the morphemes so they can be both counted and iterated below.
        morphs = list(intent.utils.token.tokenize_string(gram, intent.utils.token.morpheme_tokenizer))

        # =============================================================================
        # Gram cleaning....
        # =============================================================================

        # Replace the characters that cause the svmlight format issues.
        gram = gram.replace(':', '-')
        gram = gram.replace('#', '-')

        # =======================================================================
        # Is there a number
        # =======================================================================
        if re.search('[0-9]', gram) and kwargs.get('feat_has_number', False, bool):
            output.write('\thas-number:1')

        # =======================================================================
        # What labels is it aligned with
        # =======================================================================
        if kwargs.get('feat_align', False, bool):
            for aln_label in aln_labels:
                output.write('\taln-label-%s:1' % aln_label)

        # =======================================================================
        # Suffix
        # =======================================================================
        if kwargs.get('feat_suffix', True, bool):
            output.write('\tgram-suffix-3-%s:1' % gram[-3:])
            output.write('\tgram-suffix-2-%s:1' % gram[-2:])
            output.write('\tgram-suffix-1-%s:1' % gram[-1:])

        # =======================================================================
        # Prefix
        # =======================================================================
        if kwargs.get('feat_prefix', True, bool):
            output.write('\tgram-prefix-3-%s:1' % gram[:3])
            output.write('\tgram-prefix-2-%s:1' % gram[:2])
            output.write('\tgram-prefix-1-%s:1' % gram[:1])

        # =======================================================================
        # Number of morphs
        # =======================================================================
        if kwargs.get('feat_morph_num', False, bool):
            output.write('\t%d-morphs:1' % len(list(morphs)))

        # ===================================================================
        # Previous gram
        # ===================================================================
        if prev_gram:
            prev_gram = prev_gram.lower() if lower else prev_gram

            # And then tokenize...
            for token in intent.utils.token.tokenize_string(prev_gram, intent.utils.token.morpheme_tokenizer):

                if kwargs.get('feat_prev_gram', True, bool):
                    output.write('\tprev-gram-%s:1' % fix_gram(token.seq))

                # Add prev dictionary tag
                if posdict and kwargs.get('feat_prev_gram_dict', True, bool) and token.seq in posdict:
                    prev_tags = posdict.top_n(token.seq)
                    output.write('\tprev-gram-dict-tag-%s:1' % prev_tags[0][0])

        # Write a "**NONE**" for prev or next...
        elif kwargs.get('feat_prev_gram', True, bool):
            output.write('\tprev-gram-**NONE**:1')

        # ===================================================================
        # Next gram
        # ===================================================================
        if next_gram:
            next_gram = next_gram.lower() if lower else next_gram

            for token in intent.utils.token.tokenize_string(next_gram, intent.utils.token.morpheme_tokenizer):

                # ===================================================================
                # Gram itself
                # ===================================================================

                if kwargs.get('feat_next_gram', True, bool):
                    output.write('\tnext-gram-%s:1' % fix_gram(token.seq))

                if posdict and kwargs.get('feat_next_gram_dict', True, bool) and token.seq in posdict:
                    next_tags = posdict.top_n(token.seq)
                    output.write('\tnext-gram-dict-tag-%s:1' % next_tags[0][0])

        elif kwargs.get('feat_next_gram', True, bool):
            output.write('\tnext-gram-**NONE**:1')

        # =======================================================================
        # Iterate through the morphs
        # =======================================================================

        for token in morphs:
            # ===================================================================
            # Just write the morph
            # ===================================================================
            if kwargs.get('feat_basic', True, bool):
                output.write('\t%s:1' % token.seq)

            # ===================================================================
            # If the morph resembles a word in our dictionary, give it
            # a predicted tag
            # ===================================================================

            if posdict and token.seq in posdict and kwargs.get('feat_dict', True, bool):

                top_tags = posdict.top_n(token.seq)
                # best = top_tags[0][0]
                # if best != pos:
                # 	MODULE_LOGGER.debug('%s tagged as %s not %s' % (gram, pos, best))

                output.write('\ttop-dict-word-%s:1' % top_tags[0][0])
                if len(top_tags) > 1:
                    output.write('\tnext-dict-word-%s:1' % top_tags[1][0])

        output.write('\n')

    # ===========================================================================
    # If writing the gram out for the tagger...
    # ===========================================================================

    if output_type == 'tagger' and kwargs.get('tag_f'):
        output.write('%s/%s ' % (gram, pos))
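
For the 'classifier' output type, write_gram emits one line per token: the gold POS label followed by tab-separated feature:1 pairs, which is why ':' and '#' are stripped from the gram (they would collide with the feature:value separator). Here is a small, self-contained sketch of just the suffix/prefix features, mirroring the format strings above; the gram and label are made up.

import io

def sketch_classifier_line(gram, pos):
    # Mirror the classifier line layout used by write_gram above:
    # the label, then tab-separated "feature:1" pairs, then a newline.
    out = io.StringIO()
    out.write(pos)
    gram = gram.replace(':', '-').replace('#', '-')
    for n in (3, 2, 1):
        out.write('\tgram-suffix-%d-%s:1' % (n, gram[-n:]))
    for n in (3, 2, 1):
        out.write('\tgram-prefix-%d-%s:1' % (n, gram[:n]))
    out.write('\n')
    return out.getvalue()

print(sketch_classifier_line('perros', 'NOUN'))
# -> 'NOUN\tgram-suffix-3-ros:1\tgram-suffix-2-os:1\tgram-suffix-1-s:1\tgram-prefix-3-per:1\t...'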
Example #3
import os

# ArgPasser, xigtxml, the project_* functions, the exception classes, and the
# ARG_*/ALN_ARG_MAP/INTENT_POS_PROJ/INCREMENTAL constants are assumed to come
# from the surrounding intent and xigt packages; they are not defined here.

def do_projection(**kwargs):
    """
    (Re)project POS tags and phrase/dependency structure from the translation
    line onto the gloss and language lines of each instance in the input corpus.

    :param aln_method: The alignment method to use for projection.
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures  = 0

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:
        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)
        for inst in xc:
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"'+inst.id+'"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1
            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # Query whether we want to require to use only trees
            # where the alignment is 100%.
            completeness_requirement = kwargs.get('completeness', default=0, t=float)

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst, aln_method=aln_method, completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst, proj_aln_method=aln_method, completeness_requirement=completeness_requirement)
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)
                inst.sort_tiers()

        out_path = kwargs.get(ARG_OUTFILE)
        # Try to make the folder if it doesn't already exist.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
        with open(out_path, 'w', encoding='utf-8') as out_f:
            xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(len(xc), successes, failures))
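
do_projection is driven entirely by keyword arguments. Below is a hedged sketch of how it might be invoked, assuming the ARG_INFILE and ARG_OUTFILE argument-name constants used inside the function are in scope; the paths and option values are placeholders.

# Hypothetical call; ARG_INFILE/ARG_OUTFILE are the same constants the function
# reads its input/output paths from, and the paths here are placeholders.
config = {
    ARG_INFILE: 'corpus/input.xml',       # Xigt-XML corpus to (re)project
    ARG_OUTFILE: 'corpus/projected.xml',  # where the enriched corpus is written
    'completeness': 1.0,                  # only project over fully aligned trees
    'pos': True,                          # project POS tags
    'ds': False,                          # skip phrase/dependency structure
}
do_projection(**config)

# Omitting 'aln_method' falls back to ARG_ALN_ANY, the default key into ALN_ARG_MAP.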