def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train morphology generation model')
    parser.add_argument('category', help='Russian word category to predict (R/V/A/N/M)')
    parser.add_argument('model', help='output file for trained model')
    parser.add_argument('--penalty', help='regularization penalty', type=float, default=0.001)
    args = parser.parse_args()
    assert len(args.category) == 1

    # Check that the model file is writable before spending time on training
    with open(args.model, 'w') as f:
        f.write('write test / training...')

    logging.info('Extracting features for training data')
    training_features = []
    training_outputs = []
    for source, target, alignment in read_sentences(sys.stdin):
        for features, output in extract_instances(args.category, source, target, alignment):
            training_features.append(features)
            training_outputs.append(output)

    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(training_features)
    y = training_outputs
    logging.info('Training data size: %d instances x %d features', *X.shape)
    logging.info('Training model for category: %s (%d tags)', args.category, len(set(y)))

    # Note: scikit-learn's C is the inverse of the regularization strength
    model = LogisticRegression(C=args.penalty)
    model.fit(X, y)

    with open(args.model, 'w') as f:
        cPickle.dump((args.category, vectorizer, model), f, protocol=-1)

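A minimal sketch (not part of the original scripts) of how the pickled (category, vectorizer, model) triple written above could be loaded and used to rank tags for a new feature dictionary; score_tags and model_path are hypothetical names, and only standard scikit-learn calls are used.

def score_tags(model_path, feats):
    # feats: a single feature dict, as produced by extract_instances
    with open(model_path) as f:
        category, vectorizer, model = cPickle.load(f)
    X = vectorizer.transform([feats])      # 1 x n_features sparse matrix
    probs = model.predict_proba(X)[0]      # one probability per known tag
    return sorted(zip(probs, model.classes_), reverse=True)
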
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create synthetic phrases'
            ' using trained CRF models and lemma grammar')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('models', nargs='+', help='trained models (category:file)')
    parser.add_argument('sgm', help='original sentences + grammar pointers')
    parser.add_argument('sgm_lem', help='original sentences + lemma grammar pointers')
    parser.add_argument('out', help='grammar output directory')
    args = parser.parse_args()

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Loading inflection prediction models')
    models = load_models(args.models)

    logging.info('Generating extended grammars')
    data = izip(read_sentences(sys.stdin, skip_empty=False),
                read_sgm(args.sgm), read_sgm(args.sgm_lem))
    for (source, _, _), (grm_path, sid, left, right),\
            (lem_grm_path, lem_sid, lem_left, lem_right) in data:
        assert sid == lem_sid and left == lem_left and right == lem_right
        # Create grammar file
        out_path = os.path.join(args.out, 'grammar.{}.gz'.format(sid))
        grammar_file = gzip.open(out_path, 'w')
        # Copy original grammar
        with gzip.open(grm_path) as f:
            for line in f:
                grammar_file.write(line)
        # Generate synthetic phrases from lemma grammar
        for rule in read_grammar(lem_grm_path):
            assert not any(src.startswith('[X,') for src in rule.lhs)  # no gaps, please
            for match in source_match(rule.lhs, source):
                # create (at most) a synthetic rule
                for new_rule in synthetic_rule(rev_map, models, rule, source, match):
                    grammar_file.write(unicode(new_rule).encode('utf8') + '\n')
        grammar_file.close()
        # Write sgm
        new_left = u'<seg grammar="{}" id="{}">{}</seg>'.format(out_path, sid, left)
        print(u' ||| '.join([new_left] + right).encode('utf8'))

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create synthetic phrases'
            ' using trained CRF models and lemma grammar')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('models', nargs='+', help='trained models (category:file)')
    parser.add_argument('sgm', help='original sentences + grammar pointers')
    parser.add_argument('sgm_lem', help='original sentences + lemma grammar pointers')
    parser.add_argument('out', help='grammar output directory')
    args = parser.parse_args()

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Loading inflection prediction models')
    models = load_models(args.models)
    extracted_tags = ''.join(models.keys())
    logging.info('Inflecting categories: {}'.format(extracted_tags))
    # Matches "lemma_C" tokens where C is a category we have a model for
    lemma_re = re.compile('^(.+)_([' + extracted_tags + '])$')

    logging.info('Generating extended grammars')
    data = izip(read_sentences(sys.stdin, skip_empty=False),
                read_sgm(args.sgm), read_sgm(args.sgm_lem))
    for (source, _, _), (grm_path, sid, left, right),\
            (lem_grm_path, lem_sid, lem_left, lem_right) in data:
        assert sid == lem_sid and left == lem_left and right == lem_right
        # Create grammar file
        out_path = os.path.join(args.out, 'grammar.{}.gz'.format(sid))
        grammar_file = gzip.open(out_path, 'w')
        # Copy original grammar
        with gzip.open(grm_path) as f:
            for line in f:
                grammar_file.write(line)
        # Generate synthetic phrases from lemma grammar
        for rule in read_grammar(lem_grm_path):
            assert not any(src.startswith('[X,') for src in rule.lhs)  # no gaps, please
            for match in source_match(rule.lhs, source):
                # create (at most) a synthetic rule
                for new_rule in synthetic_rule(rev_map, models, rule, source, match, lemma_re):
                    grammar_file.write(unicode(new_rule).encode('utf8') + '\n')
        grammar_file.close()
        # Write sgm
        new_left = u'<seg grammar="{}" id="{}">{}</seg>'.format(out_path, sid, left)
        print(u' ||| '.join([new_left] + right).encode('utf8'))

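For illustration only (not in the original script): with models loaded for, say, the categories N and V, extracted_tags is 'NV' and lemma_re splits a lemma-grammar token into its lemma and category letter.

lemma_re = re.compile(u'^(.+)_([NV])$')
match = lemma_re.match(u'дом_N')
assert match.groups() == (u'дом', u'N')
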
def main():
    parser = argparse.ArgumentParser(description='Create source ||| lemma_tag corpus')
    parser.add_argument('--partial', action='store_true',
            help='exclude non-predicted categories from lemmatization')
    args = parser.parse_args()

    def lemmatize(tgt, lemma, tag):
        if args.partial and tag[0] not in config.EXTRACTED_TAGS:
            return tgt
        return lemma + '_' + tag[0]

    for source, target, _ in read_sentences(sys.stdin):
        src = ' '.join(w.token for w in source)
        tgt = ' '.join(lemmatize(tgt, lemma, tag) for tgt, lemma, tag in target)
        print(u'{} ||| {}'.format(src, tgt).encode('utf8'))

def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Create reverse inflection map')
    parser.add_argument('rev_map', help='output file')
    args = parser.parse_args()

    lemma_map = defaultdict(lambda: defaultdict(Counter))

    logging.info('Finding inflections and counting tag/form occurrences...')
    for _, target, _ in read_sentences(sys.stdin):
        for (inflection, lemma, tag) in target:
            if tag[0] not in config.EXTRACTED_TAGS:
                continue
            lemma_map[lemma, tag[0]][tag[1:]][inflection] += 1

    logging.info('Selecting most frequent form for each tag')
    rev_map = {lt: set() for lt in lemma_map.iterkeys()}
    for lt, inflections in lemma_map.iteritems():
        for tag, forms in inflections.iteritems():
            ((best_form, _),) = forms.most_common(1)
            rev_map[lt].add((tag, best_form))

    logging.info('Saving inflection map')
    with open(args.rev_map, 'w') as f:
        cPickle.dump(rev_map, f, protocol=-1)

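A toy illustration (surface forms and counts invented) of the most-frequent-form selection used above; each resulting rev_map entry maps a (lemma, category) pair to a set of (attribute string, best form) pairs. It relies only on collections.Counter, as in the script itself.

forms = Counter({u'дома': 7, u'домов': 2})   # hypothetical counts for one (lemma, tag)
((best_form, _),) = forms.most_common(1)
assert best_form == u'дома'
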
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create cdec CRF grammars and training data')
    parser.add_argument('category', help='Russian word category to predict (R/V/A/N/M)')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('output', help='training output path')
    args = parser.parse_args()
    category = args.category

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    # Create training data paths
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    grammar_path = os.path.join(args.output, 'grammars')
    if not os.path.exists(grammar_path):
        os.mkdir(grammar_path)
    sgm = io.open(os.path.join(args.output, 'train.sgm'), 'w', encoding='utf8')

    fvoc = Vocabulary()

    n_sentences = 0
    logging.info('Generating the grammars')
    for source, target, alignment in read_sentences(sys.stdin):
        n_sentences += 1
        if n_sentences % 1000 == 0:
            if too_much_mem():
                logging.info('Running out of memory')
                break
        for word, features in extract_instances(category, source, target, alignment):
            inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            if (ref_attributes, inflection) not in possible_inflections:
                logging.debug('Skip: %s (%s)', inflection, ref_attributes)
                continue
            # Write sentence grammar
            grammar_name = os.path.join(grammar_path, uuid.uuid1().hex)
            with io.open(grammar_name, 'w', encoding='utf8') as grammar:
                for attributes, _ in possible_inflections:
                    rule = fvoc.make_rule(lemma, category, attributes, features)
                    grammar.write(rule)
            # Write src / ref
            src = lemma + '_' + category
            ref = ' '.join(config.get_attributes(category, ref_attributes))
            sgm.write(u'<seg grammar="{}"> {} ||| {} {} </seg>\n'.format(
                os.path.abspath(grammar_name), src, category, ref))

    logging.info('Processed %d sentences', n_sentences)

    logging.info('Saving weights')
    ff_path = os.path.join(args.output, 'weights.ini')
    with io.open(ff_path, 'w', encoding='utf8') as f:
        for fname, fid in fvoc.iteritems():
            f.write(u'# {}\n'.format(fname))
            f.write(u'F{} 0\n'.format(fid))

    sgm.close()

def main(): logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser(description="Predict using trained models") parser.add_argument("rev_map", help="reverse inflection map") parser.add_argument("models", nargs="+", help="trained models (category:file)") parser.add_argument("--ambiguous", action="store_true", help="evaluate only lemmas with multiple inflections") args = parser.parse_args() logging.info("Loading reverse inflection map") with open(args.rev_map) as f: rev_map = cPickle.load(f) logging.info("Loading inflection prediction models") models = load_models(args.models) logging.info("Loaded models for %d categories", len(models)) stats = {cat: [0, 0, 0, 0, 0] for cat in config.EXTRACTED_TAGS} for source, target, alignment in read_sentences(sys.stdin): for word, features in extract_instances(source, target, alignment): gold_inflection, lemma, tag = word category = tag[0] gold_tag = tag[1:] possible_inflections = rev_map.get((lemma, category), []) if (gold_tag, gold_inflection) not in possible_inflections: print(u"Expected: {} ({}) not found".format(gold_inflection, gold_tag).encode("utf8")) continue if args.ambiguous and len(possible_inflections) == 1: continue model = models[category] scored_inflections = model.score_all(possible_inflections, features) ranked_inflections = sorted(scored_inflections, reverse=True) predicted_score, predicted_tag, predicted_inflection = ranked_inflections[0] gold_rank = 1 + [tag for _, tag, _ in ranked_inflections].index(gold_tag) gold_score = next((score for score, tag, _ in ranked_inflections if tag == gold_tag)) print( u"Expected: {} ({}) r={} score={:.3f} |" " Predicted: {} ({}) score={:.3f}".format( gold_inflection, gold_tag, gold_rank, gold_score, predicted_inflection, predicted_tag, predicted_score, ).encode("utf8") ) stats[category][0] += 1 stats[category][1] += 1 / float(gold_rank) stats[category][2] += gold_inflection == predicted_inflection stats[category][3] += gold_score stats[category][4] += len(ranked_inflections) for category, (n_instances, rrank_sum, n_correct, total_log_prob, n_inflections) in stats.items(): if n_instances == 0: continue mrr = rrank_sum / n_instances accuracy = n_correct / float(n_instances) ppl = math.exp(-total_log_prob / n_instances) avg_inflections = n_inflections / float(n_instances) print( "Category {}: MRR={:.3f} acc={:.1%} ppl={:.2f} ({} instances; avg #infl={:.2f})".format( category, mrr, accuracy, ppl, n_instances, avg_inflections ) )
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train structured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD update rate')
    args = parser.parse_args()
    category = args.category

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Generating the training data')
    # X[k]: feature dict for the k-th instance
    # Y_all: flat list of candidate label dicts (attribute -> 1), shared per lemma
    # Y_lim[k]: (start, end) range of the k-th instance's candidates in Y_all
    # Y_star[k]: index of the reference inflection within that candidate range
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:  # new set of inflections
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {attr: 1 for attr in config.get_attributes(category, attributes)}
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        with open(os.path.join(args.model, 'model.{}.pickle'.format(it + 1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)

    model = StructuredModel(args.category)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter,
                alpha_sgd=args.rate, every_iter=save_model)

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train structured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD update rate')
    parser.add_argument('-c', '--config',
            help='configuration module for supervised models (must be in config directory)')
    parser.add_argument('-a', '--adagrad', action='store_true', default=False,
            help='use the AdaGrad adaptive gradient technique to adjust the rate')
    parser.add_argument('-l', '--l1', type=float, default=0.0,
            help='lambda value for L1 regularization (currently only implemented'
            ' for use with AdaGrad); if none is given, regularization is not used')
    args = parser.parse_args()
    category = args.category
    logging.info('Training inflection model for category {}'.format(category))

    if args.config:
        logging.info('Loading external configuration module {}'.format(args.config))
        sup_config = __import__('config_files.' + args.config,
                                globals(), locals(), ['get_attributes'])
        attr_function = lambda cat, attr: sup_config.get_attributes(cat, attr)
    else:
        attr_function = lambda cat, attr: config.get_attributes(cat, attr)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)
    logging.info('Length of reverse map: {}'.format(len(rev_map)))

    logging.info('Generating the training data')
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:  # new set of inflections
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {attr: 1 for attr in attr_function(category, attributes)}
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        with open(os.path.join(args.model, 'model.{}.pickle'.format(it + 1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)

    if args.config:
        model = StructuredModel(args.category, functools.partial(sup_config.get_attributes))
    else:
        model = StructuredModel(args.category, config.get_attributes)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter, alpha_sgd=args.rate,
                every_iter=save_model, adagrad=args.adagrad, l1=args.l1)

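Based only on how the imported module is used above, an external configuration module under config_files/ needs to expose get_attributes(category, attributes) returning an iterable of attribute names. The sketch below is purely illustrative (module name, naming scheme, and the '-' convention for unset positions are assumptions here); the real modules in the repository may differ.

# config_files/example_config.py (hypothetical)
def get_attributes(category, attributes):
    # Turn the positional tag remainder into named attribute features,
    # e.g. ('N', u'c-sn') -> ['N0=c', 'N2=s', 'N3=n']
    return ['{}{}={}'.format(category, i, value)
            for i, value in enumerate(attributes) if value != u'-']
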