Beispiel #1
0
def train_tagger(prefix, slashtags=[], conll=[], tagmap = None, lowercase=False):



    trainsents = []

    for c in conll:
        cc = ConllCorpus.read(c, lowercase=lowercase, tagmap=tagmap)
        for sent in cc:
            trainsents.append(sent.slashtags())

    for st in slashtags:
        raise NotImplementedError

    alldatatrain = NamedTemporaryFile('w', delete=False)

    # -------------------------------------------
    # Now write all the training sentences out to the temporary file.
    # -------------------------------------------
    for trainsent in trainsents:
        alldatatrain.write(trainsent+'\n')
    alldatatrain.close()

    # -------------------------------------------
    # And train the tagger.
    # -------------------------------------------
    r = stanford_tagger.train_postagger(alldatatrain.name, prefix+'.tagger')
    unlink(alldatatrain.name)
Beispiel #2
0
def extract_from_xigt(input_filelist = list, classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G, **kwargs):

    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------

    # This dictionary will first, be a list of "words" (full word-level)
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the argument provided for "dep_pos" to
    # the alignment type that will be searched
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================

    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0


    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))


    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix+'_tagger_train.txt'
        tagger_model_path = tagger_prefix+'.tagger'


        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix+'_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))

        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1
            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    pass

            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------

    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # Add punctuation marks to the tagger.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}{}{}\n'.format(t,'/','PUNC'))
            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")



    # =============================================================================
    # Classifier output...
    # =============================================================================

    if classifier_prefix is not None:

        # The path for the svm-light-based features.
        class_dir  = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path  =  classifier_prefix+'.feats.txt'
        class_path  = classifier_prefix+'.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
Beispiel #3
0
    if not (args.train or args.tagger):
        sys.stderr.write("Either a training file or a pre-trained tagger is required.")
        p.print_help()
        sys.exit(11)

    if args.train and args.tagger:
        sys.stderr.write("WARNING: Both a training file and a tagger were specified. The tagger will take precedence.")

    # =============================================================================
    # First, train the tagger.
    # =============================================================================

    if args.train and not args.tagger:
        print('Training tagger from "{}"'.format(args.train))
        tagger_file = NamedTemporaryFile('w')
        tagger = train_postagger(args.train, tagger_file.name)
        print("Tagger training complete.")
        tagger_path = tagger_file.name
    else:
        print('Loading tagger from "{}"'.format(args.tagger))
        tagger_path = args.tagger

    # =============================================================================
    # Next, strip the tags from the test file into a temporary file.
    # =============================================================================
    raw_tmp = NamedTemporaryFile()

    remove_tags(args.test, raw_tmp.name)
    # =============================================================================
    # Figure out if we want to save the output path
    # =============================================================================