Code example #1
File: evaluation.py  Project: rgeorgi/intent
def evaluate_classifier_on_instances(inst_list, classifier, feat_list, pos_class_matrix, gold_tagmap=None):
    """
    Given a list of instances, do the evaluation on them.

    :param inst_list:
    :param classifier:
    :param tagger:
    :return:
    """

    pd = load_posdict() if (CLASS_FEATS_DICT in feat_list) or (CLASS_FEATS_PDICT in feat_list) or (CLASS_FEATS_NDICT in feat_list) else False

    matches = 0
    compares = 0

    for inst in inst_list:
        sup_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_MANUAL)
        if sup_postier is None:
            continue
        gw_tier = gloss(inst)
        classify_gloss_pos(inst, classifier,
                           posdict=pd,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list)


        cls_postier = gloss_tag_tier(inst, tag_method=INTENT_POS_CLASS)


        for cls_tag in cls_postier:
            word = xigt_find(gw_tier, id=cls_tag.alignment)
            sup_tag = xigt_find(sup_postier, alignment=cls_tag.alignment)

            if sup_tag is None:
                continue
            else:
                sup_tag_v = sup_tag.value()
                if gold_tagmap is not None:
                    sup_tag_v = gold_tagmap.get(sup_tag_v)

            pos_class_matrix.add(sup_tag_v, cls_tag.value())
            if cls_tag.value() == sup_tag_v:
                matches += 1
            compares += 1

    # Guard against division by zero when no comparable tags were found.
    accuracy = (matches / compares * 100) if compares else 0.0
    return matches, compares, accuracy
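A minimal usage sketch, not taken from the project itself: instances is an assumed list of IGT instances, and POSEvalMatrix is a hypothetical stand-in for whatever confusion-matrix class the project actually uses (it only needs the add(gold, predicted) method the function calls above).

# Hypothetical call site for evaluate_classifier_on_instances().
matrix = POSEvalMatrix()        # stand-in confusion-matrix object with add(gold, predicted)
classifier = MalletMaxent()     # a gloss-line classifier, as in the test in code example #5
feats = [CLASS_FEATS_SW, CLASS_FEATS_SUF, CLASS_FEATS_PRE]
matches, compares, acc = evaluate_classifier_on_instances(instances, classifier, feats, matrix)
print('Accuracy: {:.2f}% ({}/{})'.format(acc, matches, compares))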
Code example #2
File: extraction.py  Project: rgeorgi/intent
def write_out_gram_dict(subword_dict, feat_path, feat_list, threshold=1):
    """
    Given the gram+tag dict, write out grams for those that have been seen enough to
    meet our threshold.

    :param subword_dict: Mapping from gram to tag to the contexts in which that pair was observed.
    :type subword_dict: TwoLevelCountDict
    :param feat_path: Path to which the svm-light-style feature file is written.
    :param feat_list: List of feature flags to enable when writing each gram.
    :param threshold: Minimum number of observations required for a gram to be written out.
    """

    EXTRACT_LOG.log(NORM_LEVEL, 'Writing out svm-light style features to "{}"...'.format(feat_path))
    feat_file = open(feat_path, 'w', encoding='utf-8')

    # Load the posdict if needed...
    pd = load_posdict() if (CLASS_FEATS_DICT in feat_list) or (CLASS_FEATS_PDICT in feat_list) or (CLASS_FEATS_NDICT in feat_list) else False

    for subword in subword_dict.keys():
        for tag in subword_dict[subword].keys():
            # Write out the gram with this tag as many times as it appears...
            for prev_word, next_word in subword_dict[subword][tag]['contexts']:
                gt = GoldTagPOSToken(subword, goldlabel=tag)

                # -------------------------------------------
                # Now, vary the features depending on what's in the list
                # -------------------------------------------

                write_gram(gt, lowercase=True,
                           feat_next_gram=CLASS_FEATS_NEXSW in feat_list,
                           feat_prev_gram=CLASS_FEATS_PRESW in feat_list,
                           feat_suffix=CLASS_FEATS_SUF in feat_list,
                           feat_prefix=CLASS_FEATS_PRE in feat_list,
                           feat_has_number=CLASS_FEATS_NUM in feat_list,
                           feat_morph_num=CLASS_FEATS_NUMSW in feat_list,
                           feat_prev_gram_dict=CLASS_FEATS_PDICT in feat_list,
                           feat_next_gram_dict=CLASS_FEATS_NDICT in feat_list,
                           feat_basic=CLASS_FEATS_SW in feat_list,
                           feat_dict=CLASS_FEATS_DICT in feat_list,
                           posdict=pd,
                           next_gram=next_word,
                           prev_gram=prev_word,
                           output=feat_file)

    feat_file.close()
    EXTRACT_LOG.log(NORM_LEVEL, 'Written')
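The nested shape this function walks can be read directly off the loops above. A hypothetical stand-in (a plain dict shaped like a TwoLevelCountDict entry, with made-up grams, tags, and output path) might look like:

# Hypothetical data shaped the way the loops above read it:
# gram -> tag -> {'contexts': [(prev_word, next_word), ...]}
subword_dict = {
    'dog': {'NOUN': {'contexts': [('the', 'ran'), ('a', 'barked')]}},
    'run': {'VERB': {'contexts': [('to', 'home')]}},
}
write_out_gram_dict(subword_dict, 'train_feats.txt', [CLASS_FEATS_SW, CLASS_FEATS_SUF])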
Code example #3
File: grams.py  Project: rgeorgi/intent
def write_gram(token, **kwargs):

    # Re-cast the kwargs as an argpasser.
    kwargs = ArgPasser(kwargs)

    output_type = kwargs.get('type', 'classifier')
    output = kwargs.get('output', sys.stdout)

    posdict = kwargs.get('posdict', None)


    if posdict is None:
        posdict = env.load_posdict()

    # Previous tag info
    prev_gram = kwargs.get('prev_gram')
    next_gram = kwargs.get('next_gram')


    # Get heuristic alignment
    aln_labels = kwargs.get('aln_labels', [])

    # ===========================================================================
    # Break apart the token...
    # ===========================================================================
    gram = token.seq

    pos = token.goldlabel

    # Lowercase if asked for
    lower = kwargs.get('lowercase', True, bool)
    gram = gram.lower() if (gram and lower) else gram

    # Fix the various issues with the grams.
    gram = fix_gram(gram)

    # ===========================================================================
    # Do some cleaning on the gram....
    # ===========================================================================

    # Only take the first of two slashed grams
    gram = re.sub(r'(.*)?/(.*)', r'\1', gram)

    # Strip a single leading or trailing delimiter character (-, =, :, brackets, parens, /, *)
    gram = re.sub(r'^(\S+)[\-=:\[\(\]\)/\*]$', r'\1', gram)
    gram = re.sub(r'^[\-=:\[\(\]\)/\*](\S+)$', r'\1', gram)

    # ===========================================================================

    # Output the grams for a classifier
    #
    # NOTE! Only tokens that have an ASSIGNED pos tag will be written out this way!
    if output_type == 'classifier' and pos:
        output.write(pos)

        # =======================================================================
        # Get the morphemes
        # =======================================================================

        # Materialized as a list so the morphemes can be both counted and iterated below.
        morphs = list(intent.utils.token.tokenize_string(gram, intent.utils.token.morpheme_tokenizer))

        # =============================================================================
        # Gram cleaning....
        # =============================================================================

        # Replace the characters that cause the svmlight format issues.
        gram = gram.replace(':', '-')
        gram = gram.replace('#', '-')

        # =======================================================================
        # Is there a number
        # =======================================================================
        if re.search('[0-9]', gram) and kwargs.get('feat_has_number', False, bool):
            output.write('\thas-number:1')

        # =======================================================================
        # What labels is it aligned with
        # =======================================================================
        if kwargs.get('feat_align', False, bool):
            for aln_label in aln_labels:
                output.write('\taln-label-%s:1' % aln_label)

        # =======================================================================
        # Suffix
        # =======================================================================
        if kwargs.get('feat_suffix', True, bool):
            output.write('\tgram-suffix-3-%s:1' % gram[-3:])
            output.write('\tgram-suffix-2-%s:1' % gram[-2:])
            output.write('\tgram-suffix-1-%s:1' % gram[-1:])

        # =======================================================================
        # Prefix
        # =======================================================================
        if kwargs.get('feat_prefix', True, bool):
            output.write('\tgram-prefix-3-%s:1' % gram[:3])
            output.write('\tgram-prefix-2-%s:1' % gram[:2])
            output.write('\tgram-prefix-1-%s:1' % gram[:1])

        # =======================================================================
        # Number of morphs
        # =======================================================================
        if kwargs.get('feat_morph_num', False, bool):
            output.write('\t%d-morphs:1' % len(morphs))

        # ===================================================================
        # Previous gram
        # ===================================================================
        if prev_gram:
            prev_gram = prev_gram.lower() if lower else prev_gram

            # And then tokenize...
            for token in intent.utils.token.tokenize_string(prev_gram, intent.utils.token.morpheme_tokenizer):

                if kwargs.get('feat_prev_gram', True, bool):
                    output.write('\tprev-gram-%s:1' % fix_gram(token.seq))

                # Add prev dictionary tag
                if posdict and kwargs.get('feat_prev_gram_dict', True, bool) and token.seq in posdict:
                    prev_tags = posdict.top_n(token.seq)
                    output.write('\tprev-gram-dict-tag-%s:1' % prev_tags[0][0])

        # Write a "**NONE**" for prev or next...
        elif kwargs.get('feat_prev_gram', True, bool):
            output.write('\tprev-gram-**NONE**:1')

        # ===================================================================
        # Next gram
        # ===================================================================
        if next_gram:
            next_gram = next_gram.lower() if lower else next_gram

            for token in intent.utils.token.tokenize_string(next_gram, intent.utils.token.morpheme_tokenizer):

                # ===================================================================
                # Gram itself
                # ===================================================================

                if kwargs.get('feat_next_gram', True, bool):
                    output.write('\tnext-gram-%s:1' % fix_gram(token.seq))

                if posdict and kwargs.get('feat_next_gram_dict', True, bool) and token.seq in posdict:
                    next_tags = posdict.top_n(token.seq)
                    output.write('\tnext-gram-dict-tag-%s:1' % next_tags[0][0])

        elif kwargs.get('feat_next_gram', True, bool):
            output.write('\tnext-gram-**NONE**:1')

        # =======================================================================
        # Iterate through the morphs
        # =======================================================================

        for token in morphs:
            # ===================================================================
            # Just write the morph
            # ===================================================================
            if kwargs.get('feat_basic', True, bool):
                output.write('\t%s:1' % token.seq)

            # ===================================================================
            # If the morph resembles a word in our dictionary, give it
            # a predicted tag
            # ===================================================================

            if posdict and token.seq in posdict and kwargs.get('feat_dict', True, bool):

                top_tags = posdict.top_n(token.seq)
                # best = top_tags[0][0]
                # if best != pos:
                # 	MODULE_LOGGER.debug('%s tagged as %s not %s' % (gram, pos, best))

                output.write('\ttop-dict-word-%s:1' % top_tags[0][0])
                if len(top_tags) > 1:
                    output.write('\tnext-dict-word-%s:1' % top_tags[1][0])

        output.write('\n')

    # ===========================================================================
    # If writing the gram out for the tagger...
    # ===========================================================================

    if output_type == 'tagger' and kwargs.get('tag_f'):
        output.write('%s/%s ' % (gram, pos))
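For reference, the classifier-mode branch above writes one line per token: the gold POS label first, then tab-separated binary features. A hypothetical line for a single-morpheme gram "dogs" tagged NOUN, with only the suffix, prefix, prev/next-gram, and basic features enabled (and no prev_gram or next_gram supplied), would look roughly like:

NOUN	gram-suffix-3-ogs:1	gram-suffix-2-gs:1	gram-suffix-1-s:1	gram-prefix-3-dog:1	gram-prefix-2-do:1	gram-prefix-1-d:1	prev-gram-**NONE**:1	next-gram-**NONE**:1	dogs:1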
Code example #4
File: enrich.py  Project: rgeorgi/intent
def enrich(**kwargs):

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit(1)

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================

    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment is not already present in the file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #    A) "trans" option is given for parse
        #    B) "proj" option is given for parse.
        # -------------------------------------------
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #    A) "class" option is given for pos
        #    B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)


        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                gl = logging.getLogger('giza')
                gl.critical(str(gae))
                raise gae

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            # -------------------------------------------
            # Define the reasons for failure
            # -------------------------------------------
            F_GLOSS_LINE = "NOGLOSS"
            F_LANG_LINE  = "NOLANG"
            F_TRANS_LINE = "NOTRANS"
            F_BAD_LINES  = "BADLINES"
            F_L_G_ALN    = "L_G_ALIGN"
            F_T_G_ALN    = "G_T_ALIGN"
            F_NO_TRANS_POS = "NO_POS_TRANS"
            F_PROJECTION = "PROJECTION"
            F_UNKNOWN    = "UNKNOWN"
            F_PARSELEN   = "OVER_MAX_LENGTH"


            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    nonlocal inst
                    try:
                        return func(inst)
                    except NoNormLineException as nnle:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls  = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)


                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:

                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException as glae:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException as glae:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException as pspe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException as pe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)



                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # ENRICH_LOG.warn("Unknown Error occurred processing instance {}".format(inst.id))
                ENRICH_LOG.debug(e)
                # raise(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()


            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
            xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
        else:
            xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
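A hypothetical invocation, building the keyword dict from the same argument constants the function reads above (the file paths are placeholders, and the constants' actual string values are defined elsewhere in the project):

# Hypothetical call; assumes the ARG_*/POS_VAR/ALN_VAR constants are plain strings,
# since they are used as kwargs keys above.
enrich(**{
    ARG_INFILE: 'corpus.xml',             # placeholder input path
    ARG_OUTFILE: 'corpus-enriched.xml',   # placeholder output path
    POS_VAR: [ARG_POS_CLASS, ARG_POS_PROJ],
    ALN_VAR: [ARG_ALN_HEURPOS],
    PARSE_VAR: [],
})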
Code example #5
File: tests.py  Project: rgeorgi/intent
    def test_classify_pos_tags(self):
        tags = classify_gloss_pos(self.igt, MalletMaxent(), posdict=load_posdict())
        self.assertEqual(tags, self.tags)