def _process_file(f):
    c = TwoLevelCountDict()
    d = TwoLevelCountDict()
    m = TwoLevelCountDict()

    print("Processing file {}".format(f))
    xc = xc_load(f)

    for inst in xc:
        LOG.info("Now on instance {}".format(inst.id))

        # Search for the gloss POS tier, if it exists.
        gpos = inst.find(alignment=GLOSS_WORD_ID, type=POS_TIER_TYPE)

        # If a gloss POS tier was found...
        if gpos:

            # Iterate through the projected tags.
            for gp in gpos:
                word = gp.igt.find(id=gp.attributes[ALIGNMENT])
                grams = tokenize_item(word, morpheme_tokenizer)

                # Add the (gram, POSTag) pair as something that was encountered.
                for gram in grams:
                    m.add(gram.content.lower(), gp.value())

                c.add(gp.value(), word.value().lower())
                d.add(word.value().lower(), gp.value())

    return (c, d, m)
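# A minimal driver sketch (hypothetical paths): each call to _process_file
# returns the (POS -> word), (word -> POS), and (gram -> POS) count
# dictionaries for one file, which can then be inspected or merged.
if __name__ == '__main__':
    for path in ['odin_sample_1.xml', 'odin_sample_2.xml']:  # hypothetical inputs
        pos_to_word, word_to_pos, gram_to_pos = _process_file(path)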
def test_ds_cycle(self):
    """
    The tree in the ds_cycle file has "woman" depend both on
    "arriving" and "browse."
    """
    xc = xc_load(ds_cycle)
    inst = xc[0]

    #  1    2      4       5       7    8    9
    # The woman, (after) arriving, began to browse.
    # (The commas count as words, hence the skipping)
    tgt_t = DepTree.fromstring("""
    (ROOT[0]
        (began[7]
            (woman[2] (The[1]) (\(after\)[4] (arriving[5])))
            (browse[9] (woman[2]) (to[8]))
        ))
    """, stype=DEPSTR_PTB)

    ds = get_ds(inst, trans(inst))

    self.assertTrue(tgt_t.structurally_eq(ds))
    self.assertIsNone(project_ds_tier(inst))
def test_nogloss(self):
    xp = xigt_testfile('missing_line_tests.xml')
    xc = xc_load(xp)
    no_gloss = xc[0]

    self.assertRaises(NoGlossLineException, gloss_line, no_gloss)
    self.assertRaises(NoNormLineException, gloss_line, no_gloss)
def test_ds_project(self):
    xc = xc_load(os.path.join(testfile_dir, 'xigt/index_error.xml'), do_basic_processing=True)
    inst = xc[0]

    heur_align_inst(inst)
    parse_translation_line(inst, dt=True)
    project_ds_tier(inst)

    proj_t = get_lang_ds(inst)
    tgt_t = DepTree.fromstring("""
    (ROOT[0]
        (salli-i[2]
            (Jumala[1])
            (sata-a[4] ([[3]))
            (rake-i-ta[5])
            (ja[6])
            (tuhka-a[7] (].[8]))))
    """, stype=DEPSTR_PTB)

    self.assertTrue(tgt_t.similar(proj_t))

    inst2 = xc[1]
    heur_align_inst(inst2)
    parse_translation_line(inst2, dt=True)
    project_ds_tier(inst2)

    print(inst2)

    tgt2_t = DepTree.fromstring("""
    (ROOT[0]
        (unohta-a[2]
            (*Minua[1])
            (unohda-n[4]
                (/Minä[3])
                (/laula-tta-a[6] (pelo-tta-a[5]))
            )
        ))
    """, stype=DEPSTR_PTB)

    # The original assertion here was assertTrue(get_lang_ds(inst2), tgt2_t),
    # which always passes when a tree is returned (the second argument is
    # treated as a message); compare against the target tree instead.
    self.assertTrue(tgt2_t.similar(get_lang_ds(inst2)))
def setUp(self):
    self.xc1 = xc_load(os.path.join(testfile_dir, 'xigt/kor-ex.xml'))
    self.inst = self.xc1[0]
    self.g1_2 = xigt_find(self.inst, id='g1.2')
    self.m2_1 = xigt_find(self.inst, id="m2.1")
    self.p1 = xigt_find(self.inst, id='p1')
    self.w1 = xigt_find(self.inst, id="w1")
    self.m_tier = xigt_find(self.inst, id="m")
    self.new_m = Item(id="m5", tier=self.m_tier, segmentation="w1[0:2]+w4[2:3]")
def nfold_xaml():
    xaml_paths = glob("/Users/rgeorgi/Documents/code/dissertation/data/annotation/filtered/*.xml")

    lang_test = {}
    lang_train = {}
    lang_all = {}

    tagger = StanfordPOSTagger(tagger_model)

    for xaml_path in xaml_paths:
        lang = os.path.basename(xaml_path)[:3]
        xc = xc_load(xaml_path)

        train, dev, test = split_instances(xc, train=0.5, test=0.5, dev=0.0)

        lang_train[lang] = train
        lang_all[lang] = train + test
        lang_test[lang] = test

    # Now, build our classifiers...
    all_other = POSEvalDict()
    all_all = POSEvalDict()
    all_odin = POSEvalDict()
    all_proj = POSEvalDict()

    for lang in lang_all.keys():
        other_lang_instances = []
        all_lang_instances = lang_train[lang]

        for other_lang in lang_all.keys():
            if other_lang != lang:
                other_lang_instances.extend(lang_all[other_lang])
                all_lang_instances.extend(lang_all[other_lang])

        other_lang_classifier = extract_from_instances(other_lang_instances, 'test.class', 'test.feats', '/dev/null')
        all_lang_classifier = extract_from_instances(all_lang_instances, 'all.class', 'all.feats', '/dev/null')

        test_instances = lang_test[lang]

        print(lang)
        prj_other_eval, cls_other_eval = evaluate_classifier_on_instances(test_instances, other_lang_classifier, tagger)
        prj_all_eval, cls_all_eval = evaluate_classifier_on_instances(test_instances, all_lang_classifier, tagger)
        prj_odin_eval, cls_odin_eval = evaluate_classifier_on_instances(
            test_instances, MalletMaxent('/Users/rgeorgi/Documents/code/dissertation/gc.classifier'), tagger)

        all_other += cls_other_eval
        all_all += cls_all_eval
        all_odin += cls_odin_eval
        all_proj += prj_all_eval

    print('ALL')
    print('{:.2f},{:.2f},{:.2f},{:.2f},{:.2f}'.format(all_proj.precision(),
                                                      all_proj.unaligned(),
                                                      all_other.accuracy(),
                                                      all_all.accuracy(),
                                                      all_odin.accuracy()))
    print(all_proj.error_matrix(csv=True))
def test_filter_gloss_not_present(self):
    xp = xigt_testfile('missing_line_tests.xml')
    xc = xc_load(xp)

    test_xc, ex, fail, succ = filter_xc(xc, require_gloss=True)
    self.assertEqual(len(test_xc), 2)

    test_xc, ex, fail, succ = filter_xc(xc, require_gloss=False)
    self.assertEqual(len(test_xc), 3)

    test_xc, ex, fail, succ = filter_xc(xc, require_gloss=True, require_trans=True)
    self.assertEqual(len(test_xc), 1)

    test_xc, ex, fail, succ = filter_xc(xc, require_aln=True)
    self.assertEqual(len(test_xc), 0)

    test_xc, ex, fail, succ = filter_xc(xc, require_trans=True)
    self.assertEqual(len(test_xc), 2)
def do_filter(filelist, require_lang=False, require_gloss=False, require_trans=False,
              require_aln=False, require_gloss_pos=False, require_grammatical=False,
              max_instances=0):
    new_corp = XigtCorpus()

    FILTER_LOG.log(NORM_LEVEL, "Beginning filtering...")

    successes = 0
    failures = 0
    examined = 0

    for path in filelist:
        FILTER_LOG.log(1000, 'Opening file "{}" for filtering.'.format(os.path.basename(path)))
        xc = xc_load(path, mode=INCREMENTAL)

        instances, iter_examined, iter_success, iter_failures = filter_xc(
            xc, require_lang, require_gloss, require_trans, require_aln,
            require_gloss_pos, require_grammatical, max_instances, successes)

        for instance in instances:
            new_corp.append(instance)

        successes += iter_success
        failures += iter_failures
        examined += iter_examined

    return new_corp, examined, failures, successes
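# A minimal usage sketch for do_filter (hypothetical paths): keep only the
# instances that carry both a gloss and a translation line, then report how
# many instances were examined, kept, and rejected.
corp, examined, failures, successes = do_filter(
    ['odin_part1.xml', 'odin_part2.xml'],  # hypothetical input files
    require_gloss=True, require_trans=True)
print("{} examined, {} kept, {} rejected".format(examined, successes, failures))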
def test_gloss_projection_unaligned(self):
    xc = xc_load(os.path.join(testfile_dir, "xigt/project_gloss_lang_tests.xml"))
    igt = xc[0]
    project_gloss_pos_to_lang(igt, tag_method=INTENT_POS_PROJ, unk_handling='keep')

    self.assertEqual('UNK', pos_tag_tier(igt, lang(igt).id, INTENT_POS_PROJ)[-1].value())
def setUp(self):
    self.xc = xc_load(os.path.join(testfile_dir, 'xigt/kor-ex.xml'))
def setUp(self):
    self.xp = xigt_testfile('multiple_alignments.xml')
    self.xc = xc_load(self.xp)
""" import os from unittest import TestCase from intent.alignment.Alignment import Alignment from intent.consts import INTENT_ALN_HEUR, INTENT_ALN_GIZA, INTENT_POS_PROJ, INTENT_ALN_MANUAL from intent.igt.create_tiers import lang, glosses, gloss, trans from intent.igt.parsing import xc_load, parse_odin_inst from intent.igt.references import xigt_find, item_index from intent.igt.igt_functions import pos_tag_tier, project_gloss_pos_to_lang, giza_align_t_g, heur_align_corp, add_pos_tags, tier_tokens, classify_gloss_pos, tag_trans_pos, tier_text, set_bilingual_alignment, \ get_trans_glosses_alignment, copy_xigt from intent.interfaces.mallet_maxent import MalletMaxent from intent.interfaces.stanford_tagger import StanfordPOSTagger from intent.utils.env import posdict, tagger_model, testfile_dir, load_posdict xc = xc_load(os.path.join(testfile_dir, "xigt/kor-ex.xml")) class GlossAlignTest(TestCase): def test_gloss_projection_unaligned(self): xc = xc_load(os.path.join(testfile_dir, "xigt/project_gloss_lang_tests.xml")) igt = xc[0] project_gloss_pos_to_lang(igt, tag_method=INTENT_POS_PROJ, unk_handling='keep') self.assertEqual('UNK', pos_tag_tier(igt, lang(igt).id, INTENT_POS_PROJ)[-1].value()) #=============================================================================== # Unit Tests #===============================================================================
def setUp(self):
    self.xc = xc_load(os.path.join(testfile_dir, 'xigt/morph_align_567.xml'))
def setUp(self):
    self.xc = xc_load(xigt_testfile('word_align.xml'))
def setUp(self):
    self.xc = xc_load(xigt_testfile('no_raw.xml'))
def extract_from_xigt(input_filelist, classifier_prefix=None, classifier_feats=CLASS_FEATS_DEFAULT,
                      cfg_path=None, tagger_prefix=None,
                      dep_prefix=None, pos_method=None, aln_method=None,
                      sent_prefix=None, no_alignment_heur=False, sent_type=SENT_TYPE_T_G, **kwargs):
    # NOTE: the original signature used "input_filelist = list", which makes the
    # default value the list *type* itself; the file list is always required, so
    # it is a plain positional argument here.

    # ------- Dictionaries for keeping track of gloss_pos preprocessing. --------

    # This dictionary will first, be a list of "words" (full word-level)
    subword_dict = SubwordDict()

    # -------------------------------------------
    # Map the argument provided for "dep_pos" to
    # the alignment type that will be searched
    # -------------------------------------------
    use_pos = ARG_POS_MAP[pos_method]
    use_aln = ALN_ARG_MAP[aln_method]

    # -------------------------------------------
    # Get the tagset mapping if provided
    # -------------------------------------------
    tagpath = kwargs.get('tagmap')
    tm = None if tagpath is None else TagMap(tagpath)

    # =============================================================================
    # 1) SET UP
    # =============================================================================
    extracted_tagged_snts = 0
    extracted_parsed_snts = 0
    inst_count = 0

    if dep_prefix or tagger_prefix:
        if use_pos == ARG_POS_NONE:
            EXTRACT_LOG.log(NORM_LEVEL, 'Not using POS tags for extraction.')
        elif use_pos is None:
            EXTRACT_LOG.log(NORM_LEVEL, "Using any available POS tags for extraction.")
        else:
            EXTRACT_LOG.log(NORM_LEVEL, 'Using language line tags produced by method "{}"...'.format(use_pos))

    # Set up the classifier....
    if classifier_prefix is not None:
        EXTRACT_LOG.log(NORM_LEVEL, "Gathering statistics on POS tags...")

    # Set up the tagger training file...
    if tagger_prefix is not None:
        tagger_train_path = tagger_prefix + '_tagger_train.txt'
        tagger_model_path = tagger_prefix + '.tagger'

        EXTRACT_LOG.log(NORM_LEVEL, 'Opening tagger training file at "{}"'.format(tagger_train_path))
        fileutils.makedirs(os.path.dirname(tagger_train_path))
        tagger_train_f = open(tagger_train_path, 'w', encoding='utf-8')

    # Set up the dependency parser output if it's specified...
    dep_train_f = None
    dep_train_path = None
    if dep_prefix is not None:
        dep_train_path = dep_prefix + '_dep_train.txt'
        EXTRACT_LOG.log(NORM_LEVEL, 'Writing dependency parser training data to "{}"'.format(dep_train_path))

        # Make the containing directory if it does not exist.
        fileutils.makedirs(os.path.dirname(dep_prefix))

        # Write out the training file.
        dep_train_f = open(dep_train_path, 'w', encoding='utf-8')

    # Set up the files for writing out alignment.
    if sent_prefix is not None:
        fileutils.makedirs(os.path.dirname(sent_prefix))
        e_f = open(sent_prefix + '_e.txt', 'w', encoding='utf-8')
        f_f = open(sent_prefix + '_f.txt', 'w', encoding='utf-8')

    # Set up the CFG path for writing.
    if cfg_path is not None:
        fileutils.makedirs(os.path.dirname(cfg_path))
        cfg_f = open(cfg_path, 'w', encoding='utf-8')

    # -------------------------------------------
    # Iterate over the provided files.
    # -------------------------------------------
    for path in input_filelist:
        xc = xc_load(path, mode=INCREMENTAL)

        # -------------------------------------------
        # Do the appropriate extraction for each
        # -------------------------------------------
        for inst in xc:
            inst_count += 1

            if tagger_prefix is not None:
                extracted_tagged_snts += extract_tagger_from_instance(inst, tagger_train_f, use_pos, tm)

            if dep_prefix is not None:
                extracted_parsed_snts += extract_parser_from_instance(inst, dep_train_f, use_pos, tm)

            if classifier_prefix is not None:
                gather_gloss_pos_stats(inst, subword_dict, classifier_feats)

            if sent_prefix is not None:
                try:
                    extract_sents_from_inst(inst, e_f, f_f, no_alignment_heur=no_alignment_heur,
                                            sent_type=sent_type, aln_method=use_aln)
                except NoNormLineException:
                    pass

            if cfg_path:
                extract_cfg_rules_from_inst(inst, cfg_f)

    # -------------------------------------------
    # After looping
    # -------------------------------------------
    EXTRACT_LOG.log(NORM_LEVEL, "{} instances processed.".format(inst_count))

    # Add punctuation marks to the tagger.
    if tagger_prefix is not None:
        if extracted_tagged_snts == 0:
            EXTRACT_LOG.error("No tags were found. Not writing out file.")
            tagger_train_f.close()
            unlink(tagger_train_path)
        else:
            for t in ['?', '“', '"', "''", "'", ',', '…', '/', '--', '-', '``', '`', ':', ';', '«', '»']:
                tagger_train_f.write('{}{}{}\n'.format(t, '/', 'PUNC'))

            tagger_train_f.close()
            EXTRACT_LOG.log(NORM_LEVEL, 'Training postagger using "{}"'.format(tagger_train_path))

            # Now, train the POStagger...
            train_postagger(tagger_train_path, tagger_model_path)
            EXTRACT_LOG.log(NORM_LEVEL, "Tagger training complete.")

    # =============================================================================
    # Classifier output...
    # =============================================================================
    if classifier_prefix is not None:

        # The path for the svm-light-based features.
        class_dir = os.path.dirname(classifier_prefix)
        os.makedirs(class_dir, exist_ok=True)

        feat_path = classifier_prefix + '.feats.txt'
        class_path = classifier_prefix + '.classifier'

        write_out_gram_dict(subword_dict, feat_path, classifier_feats)

        EXTRACT_LOG.log(NORM_LEVEL, "Training classifier.")
        train_txt(feat_path, class_path)
        EXTRACT_LOG.log(NORM_LEVEL, "Complete.")

    if cfg_path:
        cfg_f.close()

    # -------------------------------------------
    # Train
    # -------------------------------------------
    if dep_prefix:
        if extracted_parsed_snts == 0:
            EXTRACT_LOG.error("No dependency parses were found. Not training parser.")
            dep_train_f.close()
            unlink(dep_train_path)
        else:
            EXTRACT_LOG.log(NORM_LEVEL, "{} dependency parses found. Training parser...".format(extracted_parsed_snts))
            dep_train_f.close()

            dep_parser_path = dep_prefix + '.depparser'
            mp = MSTParser()
            mp.train(dep_train_path, dep_parser_path)
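# A minimal sketch of driving the extraction above (hypothetical paths, and it
# assumes the pos_method/aln_method values given here are valid keys of
# ARG_POS_MAP and ALN_ARG_MAP): train both a gloss-POS classifier and a
# language-line tagger from a set of enriched Xigt files.
extract_from_xigt(['enriched/kor.xml', 'enriched/deu.xml'],  # hypothetical inputs
                  classifier_prefix='out/gloss_pos',
                  tagger_prefix='out/lang_tagger',
                  pos_method=None, aln_method=None)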
def test_basic_processing(self):
    xc = xc_load(self.path, do_basic_processing=True)
def setUp(self):
    path = xigt_testfile('multiple_line_tests.xml')
    self.xc = xc_load(path)
def setUp(self):
    logging.basicConfig(level=logging.DEBUG)
    self.xc = xc_load(xigt_proj, do_basic_processing=False)
    self.inst1 = self.xc[0]
    self.inst2 = self.xc[1]
def evaluate_intent(filelist, classifier_path=None, eval_alignment=None, eval_ds=None, eval_posproj=None,
                    classifier_feats=CLASS_FEATS_DEFAULT, eval_tagger=None,
                    gold_tagmap=None, trans_tagmap=None, outpath=None):
    """
    Given a list of files that have manual POS tags and manual alignment,
    evaluate the various INTENT methods on those files.

    :param filelist: List of paths to evaluate against.
    :type filelist: list[str]
    :param classifier_path: Path to the classifier model
    :type classifier_path: str
    :param eval_alignment:
    """
    tagger = StanfordPOSTagger(tagger_model)

    outstream = sys.stdout
    if outpath is not None:
        outstream = open(outpath, mode='w', encoding='utf-8')

    # =============================================================================
    # Set up the objects to run as "servers"
    # =============================================================================
    classifier_obj = MalletMaxent(classifier)
    if classifier_path is not None:
        classifier_obj = MalletMaxent(classifier_path)

    class_matches, class_compares = 0, 0

    e_tagger = None
    if eval_tagger is not None:
        e_tagger = StanfordPOSTagger(eval_tagger)

    mas = MultAlignScorer()
    ds_plma = PerLangMethodAccuracies()
    pos_plma = PerLangMethodAccuracies()

    pos_pla = POSEvalDict()

    pos_proj_matrix = POSMatrix()
    pos_class_matrix = POSMatrix()

    # -------------------------------------------
    # If a tag map is specified, let's load it.
    # -------------------------------------------
    g_tm = TagMap(gold_tagmap) if gold_tagmap is not None else None
    t_tm = TagMap(trans_tagmap) if trans_tagmap is not None else None

    # Go through all the files in the list...
    for f in filelist:
        outstream.write('Evaluating on file: {}\n'.format(f))
        xc = xc_load(f, mode=FULL)
        lang = os.path.basename(f)

        # -------------------------------------------
        # Test the classifier if evaluation is requested.
        # -------------------------------------------
        if classifier_path is not None:
            matches, compares, acc = evaluate_classifier_on_instances(xc, classifier_obj, classifier_feats,
                                                                      pos_class_matrix, gold_tagmap=g_tm)
            outstream.write('{},{},{},{:.2f}\n'.format(lang, matches, compares, acc))
            class_matches += matches
            class_compares += compares

        # -------------------------------------------
        # Test alignment if requested.
        # -------------------------------------------
        if eval_alignment:
            mas.add_corpus('gold', INTENT_ALN_MANUAL, lang, xc)
            EVAL_LOG.log(NORM_LEVEL, "Evaluating heuristic methods...")
            evaluate_heuristic_methods_on_file(f, xc, mas, classifier_obj, tagger, lang)
            EVAL_LOG.log(NORM_LEVEL, "Evaluating statistical methods...")
            evaluate_statistic_methods_on_file(f, xc, mas, classifier_obj, tagger, lang)

        # -------------------------------------------
        # Test DS Projection if requested
        # -------------------------------------------
        if eval_ds:
            evaluate_ds_projections_on_file(lang, xc, ds_plma, outstream=outstream)
            outstream.write('{}\n'.format(ds_plma))

        # -------------------------------------------
        # Test POS Projection
        # -------------------------------------------
        if eval_posproj:
            evaluate_pos_projections_on_file(lang, xc, pos_plma, pos_proj_matrix, tagger,
                                             gold_tagmap=g_tm, trans_tagmap=t_tm, outstream=outstream)

        if e_tagger is not None:
            evaluate_lang_pos(lang, xc, e_tagger, pos_pla, gold_tagmap=g_tm, outstream=outstream)

    if eval_alignment:
        mas.eval_all(outstream=outstream)

    if eval_ds:
        outstream.write('{}\n'.format(ds_plma))

    if e_tagger is not None:
        outstream.write('{},{},{},{:.2f}\n'.format(lang, pos_pla.all_matches(), pos_pla.fulltotal(), pos_pla.accuracy()))
        e_tagger.close()

    # Report the POS tagging accuracy...
    if classifier_path is not None:
        outstream.write("ALL...\n")
        # Guard against division by zero when no comparisons were made.
        if class_compares > 0:
            outstream.write('{},{},{:.2f}\n'.format(class_matches, class_compares,
                                                    class_matches / class_compares * 100))
        outstream.write('{}\n'.format(pos_class_matrix))

    if eval_posproj:
        outstream.write('{}\n'.format(pos_proj_matrix))

    outstream.close()
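# A minimal usage sketch for evaluate_intent (hypothetical paths and model):
# evaluate classification and alignment against gold-annotated files, writing
# the CSV-style report to a file instead of stdout.
evaluate_intent(['gold/fra.xml', 'gold/deu.xml'],  # hypothetical gold files
                classifier_path='models/gloss.classifier',
                eval_alignment=True,
                outpath='eval_report.txt')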
def produce_tagger(inpath, out_f, method, kwargs=None):
    # The original defaulted kwargs to None but called .get() on it immediately;
    # fall back to an empty dict so the default is usable.
    if kwargs is None:
        kwargs = {}

    if kwargs.get('xc'):
        xc = kwargs.get('xc')
    else:
        # Load the xigt corpus.
        xc = xc_load(inpath)

    corp_length = len(xc)

    # Before reducing the size of the corpus, filter out
    # instances lacking g/t alignment for classification and projection...
    if method == classification or method in normal_proj:
        xc.require_one_to_one()
        corp_length = len(xc)

    # Also, filter out instances where a translation line is missing
    # if we are projecting. (This overlaps with the above, but leaves
    # direct giza alignments to not require one to one alignment.)
    if method in projection:
        xc.require_trans_lines()
        corp_length = len(xc)

    # The original read "kwargs.get('limit', 0, int)", which relies on an
    # ArgPasser-style mapping whose get() takes a type-coercion argument;
    # an explicit int() works with a plain dict as well.
    limit = int(kwargs.get('limit', 0))
    if limit:
        xc.igts = xc.igts[:limit]
        corp_length = len(xc)

    # Giza Realignment ---------------------------------------------------------
    # If we are using a giza based approach, we will want to
    # realign the corpus now, since it is heuristic by default.
    if method == giza_proj:
        xc.giza_align_t_g(kwargs.get('resume'))
    elif method == giza_direct:
        xc.giza_align_l_t()

    TAGLOG.info('Producing tagfile for "%s"' % os.path.relpath(out_f.name))

    #===========================================================================
    # ADD PUNC
    #===========================================================================
    out_f.write('''./PUNC
?/PUNC
“/PUNC
"/PUNC
''/PUNC
'/PUNC
,/PUNC
…/PUNC
//PUNC
--/PUNC
``/PUNC
:/PUNC
;/PUNC
«/PUNC
»/PUNC
-/PUNC\n''')

    for i, inst in enumerate(xc):
        if i % 25 == 0:
            TAGLOG.info('Processing instance %d' % i)

        # If we are doing classification
        if method == classification:
            inst.classify_gloss_pos(kwargs.get('classifier'), posdict=kwargs.get('posdict'))
            inst.project_gloss_to_lang()

        # If we are doing normal projection via the gloss line
        elif method in normal_proj:
            try:
                inst.project_trans_to_gloss()
            except ProjectionTransGlossException as ptge:
                TAGLOG.warn(ptge)
                continue
            inst.project_gloss_to_lang()

        # Otherwise, we are looking at doing the direct translation
        # to language based approach.
        elif method == giza_direct:
            inst.project_trans_to_lang()

        # Raise an exception if we somehow got a different method.
        else:
            raise TagProductionException('Method "%s" is not defined for producing taggers.' % method)

        # Whichever method, get the gloss line tags:
        sequence = inst.get_lang_sequence()

        # If we get a "skip" and "UNK" appears in the sequence...
        if kwargs.get('skip') and len(sequence) != len([t for t in sequence if t.label != UNK]):
            corp_length -= 1
            continue
        else:
            # Replace the "UNK" with "NOUN". (The inner index is renamed to j
            # so it does not shadow the instance counter i above.)
            for j, pos_token in enumerate(sequence):
                if pos_token.label == 'UNK' and kwargs.get('unk_nouns'):
                    pos_token.label = "NOUN"
                elif pos_token.label == 'UNK' and kwargs.get('unk_classify'):
                    classifier = kwargs.get('classifier')

                    kwargs['prev_gram'] = ''
                    kwargs['next_gram'] = ''

                    if j > 0:
                        kwargs['prev_gram'] = inst.gloss[j - 1].get_content()
                    if j < len(inst.gloss) - 1:
                        kwargs['next_gram'] = inst.gloss[j + 1].get_content()

                    pos_token.label = classifier.classify_string(inst.gloss[j].get_content(), **kwargs).largest()[0]

                out_f.write('%s/%s ' % (pos_token.seq, pos_token.label))
            out_f.write('\n')
            out_f.flush()

    out_f.close()
    return corp_length
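# A hypothetical call sketch for produce_tagger: write a classification-based
# tagger training file. The input path, classifier model, and the assumption
# that `classification` is one of the module's method constants are all
# illustrative; note that produce_tagger closes out_f itself.
out_f = open('ctn_tagger_train.txt', 'w', encoding='utf-8')
n_instances = produce_tagger('ctn_enriched.xml', out_f, classification,
                             kwargs={'classifier': MalletMaxent('gloss.classifier'),
                                     'posdict': load_posdict()})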
# # Get the language line words projected onto the gloss...
# for inst in train_xc:
#     word_align(inst.gloss, inst.lang)
#     inst.project_lang_to_gloss(tagmap='./data/tagset_mappings/ctn.txt')
#     inst.tag_trans_pos(tagger)
#     inst.heur_align()
#     inst.project_trans_to_gloss()
#     fix_ctn_gloss_line(inst, tag_method=INTENT_POS_PROJ)
#
# print("Done.")
#
# xigtxml.dump(open(ctn_train_processed, 'w', encoding='utf-8'), train_xc)
# sys.exit()

print("Loading Processed CTN Train corpus...", end=" ", flush=True)
train_xc = xc_load(ctn_train_processed)
print("Done.")

print("Loading Processed CTN Dev corpus...", end=" ", flush=True)
dev_xc = xc_load(ctn_dev_processed)
print("Done.")

# # =============================================================================
# # 2) Train a classifier based on the projected gloss line.
# # =============================================================================
#
index_list = [35, 70, 106, 141, 284, 569, 854, 1139, 1424, 1708, 1993, 7120]

for train_stop_index in index_list:
def setUp(self):
    self.xc = xc_load(dep_file)
    self.inst = self.xc[3]
def test_inst_2(self):
    xp = xigt_testfile('xigt-projection-tests.xml')
    xc = xc_load(xp)

    do_projection(**{ARG_INFILE: xp, 'aln_method': ARG_ALN_GIZA, ARG_OUTFILE: '/dev/null'})
def broken_german_test(self):
    xc = xc_load(os.path.join(testfile_dir, 'xigt/broken-german-instance.xml'))
    inst = xc[0]

    self.assertIsNotNone(classify_gloss_pos(inst))