# Example 1
def process_dicts(class_path):
    """Write an svm-light feature file from pickled count dicts, then train a classifier.

    Loads three pickled dictionaries from the module-level paths
    ``c_path``/``d_path``/``m_path``, emits one feature line per (word, tag)
    pair for every word in ``d`` seen at least ``thresh`` times, and finally
    trains a classifier written to *class_path*.

    :param class_path: path the trained classifier model is written to.
    """
    # NOTE(review): pickle.load on files whose provenance isn't visible here —
    # make sure these are trusted, locally-produced pickles.
    # Context managers close the handles promptly; the original
    # pickle.load(open(...)) pattern leaked file descriptors.
    with open(c_path, "rb") as c_f:
        c = pickle.load(c_f)
    with open(d_path, "rb") as d_f:
        d = pickle.load(d_f)
    with open(m_path, "rb") as m_f:
        m = pickle.load(m_f)

    print(len(c), len(d), len(m))

    # Minimum number of occurrences a word needs before its tags are written out.
    thresh = 30

    out_path = os.path.join(proj_root, "odin_feats.txt")
    # "with" guarantees the feature file is closed even if write_gram raises.
    with open(out_path, "w", encoding="utf-8") as out_f:
        for w in d.keys():
            if d[w].total() < thresh:
                LOG.debug("Skipping {}".format(w))
            else:
                LOG.debug("Testing {}".format(w))
                for tag in d[w].keys():
                    LOG.debug("Writing out tag for {}-{}".format(w, tag))
                    t = GoldTagPOSToken(w, goldlabel=tag)
                    write_gram(t, output=out_f, feat_next_gram=False, feat_prev_gram=False, lowercase=True)
                out_f.flush()

    train_txt(out_path, class_path)
# Example 2
def instances_to_classifier(
    instances, class_out_path, tag_method=None, posdict=None, feat_path=None, context_feats=False
):
    """
    Given a list of IGT instances, create a gloss-line classifier from them.

    :param instances:
    :type instances: list[RGIgt]
    :param class_out_path: Path the trained classifier is written to.
    :type class_out_path: str
    :param tag_method: Passed through to chunk_to_features.
    :param posdict: Passed through to chunk_to_features.
    :param feat_path: Path to specify where to write out the svmlight-format feature file. If it is none, use a temp file.
    :param context_feats: Passed through to chunk_to_features.
    :raises ClassifierException: if no instance produced any gloss POS tags.
    """

    # Feature file: either a caller-specified path or a named temp file.
    if feat_path is None:
        ntf = NamedTemporaryFile("w", delete=True, encoding="utf-8")
    else:
        ntf = open(feat_path, "w", encoding="utf-8")

    counts = CountDict()

    def callback(result):
        # Accumulate feature strings and count how many chunks produced output.
        out_string, cur_instances = result
        ntf.write(out_string)
        counts.add("instances", 1)

    # Feature extraction is done synchronously, one chunk at a time.
    # (A multiprocessing Pool used to be created here but was never actually
    # used — the apply_async call was commented out — so it only spawned
    # cpu_count() idle worker processes. It has been removed.)
    for chunk in chunkIt(list(instances), cpu_count()):
        callback(chunk_to_features(chunk, tag_method=tag_method, posdict=posdict, context_feats=context_feats))

    if counts["instances"] == 0:
        # Close the handle before raising so it doesn't leak.
        ntf.close()
        raise ClassifierException("No gloss POS tags found!")

    # Flush — but do NOT close yet — so train_txt() sees the full contents.
    # Closing first (as the original code did) deletes a
    # NamedTemporaryFile(delete=True) before training can read it.
    # NOTE(review): re-opening an open NamedTemporaryFile by name is
    # POSIX-only behavior — confirm this code never runs on Windows.
    ntf.flush()
    try:
        return train_txt(ntf.name, class_out_path)
    finally:
        ntf.close()
# Example 3
def extract_from_xigt(input_filelist=(), classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G, **kwargs):
    """
    Iterate over a list of XIGT files, extract the requested kinds of training
    data from each instance, then train the requested models.

    Depending on which prefixes/paths are provided, this extracts and trains:
    a POS tagger (*tagger_prefix*), a dependency parser (*dep_prefix*), a
    gloss-line classifier (*classifier_prefix*), parallel sentence files
    (*sent_prefix*), and/or CFG rules (*cfg_path*).

    :param input_filelist: Iterable of paths to XIGT files to process.
        (The previous default bound the builtin ``list`` *type* itself, which
        is not iterable; an empty tuple is a usable no-op default.)
    :param classifier_prefix: If given, write ``<prefix>.feats.txt`` and train
        ``<prefix>.classifier``.
    :param classifier_feats: Feature configuration for classifier extraction.
    :param cfg_path: If given, write extracted CFG rules to this path.
    :param tagger_prefix: If given, write ``<prefix>_tagger_train.txt`` and
        train ``<prefix>.tagger``.
    :param dep_prefix: If given, write ``<prefix>_dep_train.txt`` and train
        ``<prefix>.depparser``.
    :param pos_method: Key into ARG_POS_MAP selecting which POS tags to use.
    :param aln_method: Key into ALN_ARG_MAP selecting which alignment to use.
    :param sent_prefix: If given, write parallel sentence files
        ``<prefix>_e.txt`` and ``<prefix>_f.txt``.
    :param no_alignment_heur: Disable the alignment heuristic during sentence
        extraction.
    :param sent_type: Which sentence pairing to extract (e.g. trans/gloss).
    :param kwargs: May contain ``tagmap`` — a path to a tagset mapping file.
    """

    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------

    # This dictionary will first, be a list of "words" (full word-level)
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the argument provided for "dep_pos" to
    # the alignment type that will be searched
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================

    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0

    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))

    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix+'_tagger_train.txt'
        tagger_model_path = tagger_prefix+'.tagger'

        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix+'_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))

        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    # Instances without a normalized line simply contribute no
                    # sentence pair; this is expected, not an error.
                    pass

            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------

    # Close the sentence-alignment files now that all instances are written
    # (these handles were previously leaked).
    if sent_prefix is not None:
        e_f.close()
        f_f.close()

    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # Add punctuation marks to the tagger.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}{}{}\n'.format(t,'/','PUNC'))
            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")

    # =============================================================================
    # Classifier output...
    # =============================================================================

    if classifier_prefix is not None:

        # The path for the svm-light-based features.
        class_dir = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path = classifier_prefix+'.feats.txt'
        class_path = classifier_prefix+'.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)