def train_tagger(prefix, slashtags=None, conll=None, tagmap=None, lowercase=False):
    """
    Train a POS tagger from CoNLL corpora and write it to ``prefix + '.tagger'``.

    Each entry of *conll* is read with ``ConllCorpus.read``; every sentence is
    converted with ``sent.slashtags()`` and written, one per line, to a
    temporary file which is then handed to ``stanford_tagger.train_postagger``.
    The temporary file is always removed afterwards, even if training fails.

    :param prefix: Prefix of the model path; ``'.tagger'`` is appended to it.
    :param slashtags: Slashtag inputs. Not supported yet: any element causes
                      ``NotImplementedError``. ``None`` means no inputs.
    :param conll: Iterable of corpus sources passed to ``ConllCorpus.read``
                  (presumably file paths -- confirm against callers).
                  ``None`` means no inputs.
    :param tagmap: Forwarded unchanged to ``ConllCorpus.read``.
    :param lowercase: Forwarded unchanged to ``ConllCorpus.read``.
    :raises NotImplementedError: If *slashtags* contains any element.
    """
    # None stands in for "no inputs" so no mutable default object is shared
    # between calls.
    slashtags = [] if slashtags is None else slashtags
    conll = [] if conll is None else conll

    trainsents = []
    for c in conll:
        cc = ConllCorpus.read(c, lowercase=lowercase, tagmap=tagmap)
        trainsents.extend(sent.slashtags() for sent in cc)

    # Slashtag input is a placeholder: the first element aborts the call.
    for _ in slashtags:
        raise NotImplementedError('Training from slashtag input is not implemented.')

    # delete=False so the file survives being closed and can be reopened by
    # the trainer via its name; we remove it ourselves in the finally below.
    alldatatrain = NamedTemporaryFile('w', delete=False, encoding='utf-8')
    try:
        # -------------------------------------------
        # Write all the training sentences out to the temporary file.
        # -------------------------------------------
        with alldatatrain:
            alldatatrain.writelines(trainsent + '\n' for trainsent in trainsents)

        # -------------------------------------------
        # And train the tagger.
        # -------------------------------------------
        stanford_tagger.train_postagger(alldatatrain.name, prefix + '.tagger')
    finally:
        unlink(alldatatrain.name)
def extract_from_xigt(input_filelist=None, classifier_prefix=None,
                      classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False,
                      sent_type=SENT_TYPE_T_G, **kwargs):
    """
    Walk the given Xigt files and extract training data, then train the
    requested models.

    Each output is produced only if its corresponding argument is given:

    :param input_filelist: Iterable of paths, each loaded with
                           ``xc_load(path, mode=INCREMENTAL)``. ``None`` is
                           treated as "no files".
    :param classifier_prefix: If not None, gloss POS statistics are gathered,
                              written to ``<prefix>.feats.txt`` and a
                              classifier is trained to ``<prefix>.classifier``.
    :param classifier_feats: Passed to ``gather_gloss_pos_stats`` and
                             ``write_out_gram_dict``.
    :param cfg_path: If given, the file that ``extract_cfg_rules_from_inst``
                     writes to.
    :param tagger_prefix: If not None, tagger training data is written to
                          ``<prefix>_tagger_train.txt`` and a tagger is
                          trained to ``<prefix>.tagger``.
    :param dep_prefix: If not None, parser training data is written to
                       ``<prefix>_dep_train.txt``; if any parses were found an
                       ``MSTParser`` is trained to ``<prefix>.depparser``.
    :param pos_method: Key looked up in ``ARG_POS_MAP``.
    :param aln_method: Key looked up in ``ALN_ARG_MAP``.
    :param sent_prefix: If not None, sentences are written to
                        ``<prefix>_e.txt`` and ``<prefix>_f.txt``.
    :param no_alignment_heur: Forwarded to ``extract_sents_from_inst``.
    :param sent_type: Forwarded to ``extract_sents_from_inst``.
    :param kwargs: Only ``tagmap`` is read; if present it is passed to
                   ``TagMap``.
    """
    # Function-scope import so the block brings its own dependency with it.
    from contextlib import ExitStack

    if input_filelist is None:
        input_filelist = []

    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the arguments provided for the POS and alignment
    # methods to the types that will be searched for.
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================
    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0

    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))

    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    tagger_train_path = None
    tagger_model_path = None
    dep_train_path = None

    # Every output file is registered with the ExitStack so that all of them
    # are closed when extraction finishes *or* raises. Training below needs
    # the files flushed and closed first.
    with ExitStack() as outputs:

        # Set up the tagger training file...
        tagger_train_f = None
        if tagger_prefix is not None:
            tagger_train_path = tagger_prefix+'_tagger_train.txt'
            tagger_model_path = tagger_prefix+'.tagger'

            EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
            fileutils.makedirs(os.path.dirname(tagger_train_path))
            tagger_train_f = outputs.enter_context(open(tagger_train_path, 'w', encoding='utf-8'))

        # Set up the dependency parser output if it's specified...
        dep_train_f = None
        if dep_prefix is not None:
            dep_train_path = dep_prefix+'_dep_train.txt'
            EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

            # Make the containing directory if it does not exist.
            fileutils.makedirs(os.path.dirname(dep_prefix))

            # Open the training file.
            dep_train_f = outputs.enter_context(open(dep_train_path, 'w', encoding='utf-8'))

        # Set up the files for writing out alignment.
        e_f = None
        f_f = None
        if sent_prefix is not None:
            fileutils.makedirs(os.path.dirname(sent_prefix))
            e_f = outputs.enter_context(open(sent_prefix + '_e.txt', 'w', encoding='utf-8'))
            f_f = outputs.enter_context(open(sent_prefix + '_f.txt', 'w', encoding='utf-8'))

        # Set up the CFG path for writing.
        cfg_f = None
        if cfg_path is not None:
            fileutils.makedirs(os.path.dirname(cfg_path))
            cfg_f = outputs.enter_context(open(cfg_path, 'w', encoding='utf-8'))

        # -------------------------------------------
        # Iterate over the provided files.
        # -------------------------------------------
        for path in input_filelist:
            xc = xc_load(path, mode=INCREMENTAL)

            # -------------------------------------------
            # Do the appropriate extraction for each
            # -------------------------------------------
            for inst in xc:
                inst_count += 1
                if tagger_prefix is not None:
                    extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

                if dep_prefix is not None:
                    extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

                if classifier_prefix is not None:
                    gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

                if sent_prefix is not None:
                    # Instances that raise NoNormLineException contribute no
                    # sentences; this is a deliberate skip, not an error.
                    try:
                        extract_sents_from_inst(inst, e_f, f_f,
                                                no_alignment_heur=no_alignment_heur,
                                                sent_type=sent_type, aln_method=use_aln)
                    except NoNormLineException:
                        pass

                if cfg_path:
                    extract_cfg_rules_from_inst(inst, cfg_f)

        # -------------------------------------------
        # After looping
        # -------------------------------------------
        EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

        # Add punctuation marks to the tagger data while its file is still open.
        if tagger_prefix is not None and extracted_tagged_snts != 0:
            for t in ['?','“','"',"''","'",',','…','/','--','-','``','`',':',';','«','»']:
                tagger_train_f.write('{}{}{}\n'.format(t,'/','PUNC'))

    # All output files are closed from here on.

    # =============================================================================
    # Tagger training...
    # =============================================================================
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            unlink(tagger_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))
            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")

    # =============================================================================
    # Classifier output...
    # =============================================================================
    if classifier_prefix is not None:
        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when the prefix actually has a directory component.
        class_dir = os.path.dirname(classifier_prefix)
        if class_dir:
            os.makedirs(class_dir, exist_ok=True)

        # The path for the svm-light-based features.
        feat_path = classifier_prefix+'.feats.txt'
        class_path = classifier_prefix+'.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    # -------------------------------------------
    # Train the dependency parser
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_parser_path = dep_prefix+'.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
if not (args.train or args.tagger): sys.stderr.write("Either a training file or a pre-trained tagger is required.") p.print_help() sys.exit(11) if args.train and args.tagger: sys.stderr.write("WARNING: Both a training file and a tagger were specified. The tagger will take precedence.") # ============================================================================= # First, train the tagger. # ============================================================================= if args.train and not args.tagger: print('Training tagger from "{}"'.format(args.train)) tagger_file = NamedTemporaryFile('w') tagger = train_postagger(args.train, tagger_file.name) print("Tagger training complete.") tagger_path = tagger_file.name else: print('Loading tagger from "{}"'.format(args.tagger)) tagger_path = args.tagger # ============================================================================= # Next, strip the tags from the test file into a temporary file. # ============================================================================= raw_tmp = NamedTemporaryFile() remove_tags(args.test, raw_tmp.name) # ============================================================================= # Figure out if we want to save the output path # =============================================================================