def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train morphology generation model')
    parser.add_argument('category', help='Russian word category to predict (R/V/A/N/M)')
    parser.add_argument('model', help='output file for trained model')
    parser.add_argument('--penalty', help='regularization penalty', type=float, default=0.001)
    args = parser.parse_args()
    assert len(args.category) == 1

    # Check that the model file is writable before spending time on training
    with open(args.model, 'w') as f:
        f.write('write test / training...')

    logging.info('Extracting features for training data')
    training_features = []
    training_outputs = []
    for source, target, alignment in read_sentences(sys.stdin):
        for features, output in extract_instances(args.category, source, target, alignment):
            training_features.append(features)
            training_outputs.append(output)

    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(training_features)
    y = training_outputs
    logging.info('Training data size: %d instances x %d features', *X.shape)
    logging.info('Training model for category: %s (%d tags)', args.category, len(set(y)))

    # Note: scikit-learn's C is the inverse of the regularization strength
    model = LogisticRegression(C=args.penalty)
    model.fit(X, y)

    with open(args.model, 'w') as f:
        cPickle.dump((args.category, vectorizer, model), f, protocol=-1)

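A minimal sketch (not part of the original scripts) of how the pickled (category, vectorizer, model) triple written above could be loaded and used to rank tags for a new feature dictionary; score_tags and model_path are hypothetical names, and only standard scikit-learn calls are used.

def score_tags(model_path, feats):
    # feats: a single feature dict, as produced by extract_instances
    with open(model_path) as f:
        category, vectorizer, model = cPickle.load(f)
    X = vectorizer.transform([feats])      # 1 x n_features sparse matrix
    probs = model.predict_proba(X)[0]      # one probability per known tag
    return sorted(zip(probs, model.classes_), reverse=True)
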
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create synthetic phrases'
            ' using trained CRF models and lemma grammar')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('models', nargs='+', help='trained models (category:file)')
    parser.add_argument('sgm', help='original sentences + grammar pointers')
    parser.add_argument('sgm_lem', help='original sentences + lemma grammar pointers')
    parser.add_argument('out', help='grammar output directory')
    args = parser.parse_args()

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Loading inflection prediction models')
    models = load_models(args.models)

    logging.info('Generating extended grammars')
    data = izip(read_sentences(sys.stdin, skip_empty=False),
                read_sgm(args.sgm), read_sgm(args.sgm_lem))
    for (source, _, _), (grm_path, sid, left, right),\
            (lem_grm_path, lem_sid, lem_left, lem_right) in data:
        assert sid == lem_sid and left == lem_left and right == lem_right
        # Create grammar file
        out_path = os.path.join(args.out, 'grammar.{}.gz'.format(sid))
        grammar_file = gzip.open(out_path, 'w')
        # Copy original grammar
        with gzip.open(grm_path) as f:
            for line in f:
                grammar_file.write(line)
        # Generate synthetic phrases from lemma grammar
        for rule in read_grammar(lem_grm_path):
            assert not any(src.startswith('[X,') for src in rule.lhs)  # no gaps, please
            for match in source_match(rule.lhs, source):
                # create (at most) a synthetic rule
                for new_rule in synthetic_rule(rev_map, models, rule, source, match):
                    grammar_file.write(unicode(new_rule).encode('utf8') + '\n')
        grammar_file.close()
        # Write sgm
        new_left = u'<seg grammar="{}" id="{}">{}</seg>'.format(out_path, sid, left)
        print(u' ||| '.join([new_left] + right).encode('utf8'))

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create synthetic phrases'
            ' using trained CRF models and lemma grammar')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('models', nargs='+', help='trained models (category:file)')
    parser.add_argument('sgm', help='original sentences + grammar pointers')
    parser.add_argument('sgm_lem', help='original sentences + lemma grammar pointers')
    parser.add_argument('out', help='grammar output directory')
    args = parser.parse_args()

    if not os.path.exists(args.out):
        os.mkdir(args.out)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Loading inflection prediction models')
    models = load_models(args.models)
    extracted_tags = ''.join(models.keys())
    logging.info('Inflecting categories: {}'.format(extracted_tags))
    # Matches "lemma_C" tokens where C is a category we have a model for
    lemma_re = re.compile('^(.+)_([' + extracted_tags + '])$')

    logging.info('Generating extended grammars')
    data = izip(read_sentences(sys.stdin, skip_empty=False),
                read_sgm(args.sgm), read_sgm(args.sgm_lem))
    for (source, _, _), (grm_path, sid, left, right),\
            (lem_grm_path, lem_sid, lem_left, lem_right) in data:
        assert sid == lem_sid and left == lem_left and right == lem_right
        # Create grammar file
        out_path = os.path.join(args.out, 'grammar.{}.gz'.format(sid))
        grammar_file = gzip.open(out_path, 'w')
        # Copy original grammar
        with gzip.open(grm_path) as f:
            for line in f:
                grammar_file.write(line)
        # Generate synthetic phrases from lemma grammar
        for rule in read_grammar(lem_grm_path):
            assert not any(src.startswith('[X,') for src in rule.lhs)  # no gaps, please
            for match in source_match(rule.lhs, source):
                # create (at most) a synthetic rule
                for new_rule in synthetic_rule(rev_map, models, rule, source, match, lemma_re):
                    grammar_file.write(unicode(new_rule).encode('utf8') + '\n')
        grammar_file.close()
        # Write sgm
        new_left = u'<seg grammar="{}" id="{}">{}</seg>'.format(out_path, sid, left)
        print(u' ||| '.join([new_left] + right).encode('utf8'))

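For illustration only (not in the original script): with models loaded for, say, the categories N and V, extracted_tags is 'NV' and lemma_re splits a lemma-grammar token into its lemma and category letter.

lemma_re = re.compile(u'^(.+)_([NV])$')
match = lemma_re.match(u'дом_N')
assert match.groups() == (u'дом', u'N')
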
def main():
    parser = argparse.ArgumentParser(description='Create source ||| lemma_tag corpus')
    parser.add_argument('--partial', action='store_true',
            help='exclude non-predicted categories from lemmatization')
    args = parser.parse_args()

    def lemmatize(tgt, lemma, tag):
        if args.partial and tag[0] not in config.EXTRACTED_TAGS:
            return tgt
        return lemma + '_' + tag[0]

    for source, target, _ in read_sentences(sys.stdin):
        src = ' '.join(w.token for w in source)
        tgt = ' '.join(lemmatize(tgt, lemma, tag) for tgt, lemma, tag in target)
        print(u'{} ||| {}'.format(src, tgt).encode('utf8'))

def main():
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description='Create reverse inflection map')
    parser.add_argument('rev_map', help='output file')
    args = parser.parse_args()

    lemma_map = defaultdict(lambda: defaultdict(Counter))

    logging.info('Finding inflections and counting tag/form occurrences...')
    for _, target, _ in read_sentences(sys.stdin):
        for (inflection, lemma, tag) in target:
            if tag[0] not in config.EXTRACTED_TAGS:
                continue
            lemma_map[lemma, tag[0]][tag[1:]][inflection] += 1

    logging.info('Selecting most frequent form for each tag')
    rev_map = {lt: set() for lt in lemma_map.iterkeys()}
    for lt, inflections in lemma_map.iteritems():
        for tag, forms in inflections.iteritems():
            ((best_form, _),) = forms.most_common(1)
            rev_map[lt].add((tag, best_form))

    logging.info('Saving inflection map')
    with open(args.rev_map, 'w') as f:
        cPickle.dump(rev_map, f, protocol=-1)

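A toy illustration (surface forms and counts invented) of the most-frequent-form selection used above; each resulting rev_map entry maps a (lemma, category) pair to a set of (attribute string, best form) pairs. It relies only on collections.Counter, as in the script itself.

forms = Counter({u'дома': 7, u'домов': 2})   # hypothetical counts for one (lemma, tag)
((best_form, _),) = forms.most_common(1)
assert best_form == u'дома'
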
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create cdec CRF grammars and training data')
    parser.add_argument('category', help='Russian word category to predict (R/V/A/N/M)')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('output', help='training output path')
    args = parser.parse_args()
    category = args.category

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    # Create training data paths
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    grammar_path = os.path.join(args.output, 'grammars')
    if not os.path.exists(grammar_path):
        os.mkdir(grammar_path)
    sgm = io.open(os.path.join(args.output, 'train.sgm'), 'w', encoding='utf8')

    fvoc = Vocabulary()

    n_sentences = 0
    logging.info('Generating the grammars')
    for source, target, alignment in read_sentences(sys.stdin):
        n_sentences += 1
        if n_sentences % 1000 == 0:
            if too_much_mem():
                logging.info('Running out of memory')
                break
        for word, features in extract_instances(category, source, target, alignment):
            inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            if (ref_attributes, inflection) not in possible_inflections:
                logging.debug('Skip: %s (%s)', inflection, ref_attributes)
                continue
            # Write sentence grammar
            grammar_name = os.path.join(grammar_path, uuid.uuid1().hex)
            with io.open(grammar_name, 'w', encoding='utf8') as grammar:
                for attributes, _ in possible_inflections:
                    rule = fvoc.make_rule(lemma, category, attributes, features)
                    grammar.write(rule)
            # Write src / ref
            src = lemma + '_' + category
            ref = ' '.join(config.get_attributes(category, ref_attributes))
            sgm.write(u'<seg grammar="{}"> {} ||| {} {} </seg>\n'.format(
                os.path.abspath(grammar_name), src, category, ref))

    logging.info('Processed %d sentences', n_sentences)

    logging.info('Saving weights')
    ff_path = os.path.join(args.output, 'weights.ini')
    with io.open(ff_path, 'w', encoding='utf8') as f:
        for fname, fid in fvoc.iteritems():
            f.write(u'# {}\n'.format(fname))
            f.write(u'F{} 0\n'.format(fid))

    sgm.close()

def main(): logging.basicConfig(level=logging.INFO, format="%(message)s") parser = argparse.ArgumentParser(description="Predict using trained models") parser.add_argument("rev_map", help="reverse inflection map") parser.add_argument("models", nargs="+", help="trained models (category:file)") parser.add_argument("--ambiguous", action="store_true", help="evaluate only lemmas with multiple inflections") args = parser.parse_args() logging.info("Loading reverse inflection map") with open(args.rev_map) as f: rev_map = cPickle.load(f) logging.info("Loading inflection prediction models") models = load_models(args.models) logging.info("Loaded models for %d categories", len(models)) stats = {cat: [0, 0, 0, 0, 0] for cat in config.EXTRACTED_TAGS} for source, target, alignment in read_sentences(sys.stdin): for word, features in extract_instances(source, target, alignment): gold_inflection, lemma, tag = word category = tag[0] gold_tag = tag[1:] possible_inflections = rev_map.get((lemma, category), []) if (gold_tag, gold_inflection) not in possible_inflections: print(u"Expected: {} ({}) not found".format(gold_inflection, gold_tag).encode("utf8")) continue if args.ambiguous and len(possible_inflections) == 1: continue model = models[category] scored_inflections = model.score_all(possible_inflections, features) ranked_inflections = sorted(scored_inflections, reverse=True) predicted_score, predicted_tag, predicted_inflection = ranked_inflections[0] gold_rank = 1 + [tag for _, tag, _ in ranked_inflections].index(gold_tag) gold_score = next((score for score, tag, _ in ranked_inflections if tag == gold_tag)) print( u"Expected: {} ({}) r={} score={:.3f} |" " Predicted: {} ({}) score={:.3f}".format( gold_inflection, gold_tag, gold_rank, gold_score, predicted_inflection, predicted_tag, predicted_score, ).encode("utf8") ) stats[category][0] += 1 stats[category][1] += 1 / float(gold_rank) stats[category][2] += gold_inflection == predicted_inflection stats[category][3] += gold_score stats[category][4] += len(ranked_inflections) for category, (n_instances, rrank_sum, n_correct, total_log_prob, n_inflections) in stats.items(): if n_instances == 0: continue mrr = rrank_sum / n_instances accuracy = n_correct / float(n_instances) ppl = math.exp(-total_log_prob / n_instances) avg_inflections = n_inflections / float(n_instances) print( "Category {}: MRR={:.3f} acc={:.1%} ppl={:.2f} ({} instances; avg #infl={:.2f})".format( category, mrr, accuracy, ppl, n_instances, avg_inflections ) )
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train structured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD update rate')
    args = parser.parse_args()
    category = args.category

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)

    logging.info('Generating the training data')
    # X[k]: feature dict for the k-th instance
    # Y_all: flat list of candidate label dicts (attribute -> 1), shared per lemma
    # Y_lim[k]: (start, end) range of the k-th instance's candidates in Y_all
    # Y_star[k]: index of the reference inflection within that candidate range
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:  # new set of inflections
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {attr: 1 for attr in config.get_attributes(category, attributes)}
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        with open(os.path.join(args.model, 'model.{}.pickle'.format(it + 1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)

    model = StructuredModel(args.category)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter,
                alpha_sgd=args.rate, every_iter=save_model)

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Train structured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD update rate')
    parser.add_argument('-c', '--config',
            help='configuration module for supervised models (must be in config directory)')
    parser.add_argument('-a', '--adagrad', action='store_true', default=False,
            help='use the AdaGrad adaptive gradient technique to adjust the rate')
    parser.add_argument('-l', '--l1', type=float, default=0.0,
            help='lambda value for L1 regularization (currently only implemented'
            ' for use with AdaGrad); if none is given, regularization is not used')
    args = parser.parse_args()
    category = args.category
    logging.info('Training inflection model for category {}'.format(category))

    if args.config:
        logging.info('Loading external configuration module {}'.format(args.config))
        sup_config = __import__('config_files.' + args.config,
                                globals(), locals(), ['get_attributes'])
        attr_function = lambda cat, attr: sup_config.get_attributes(cat, attr)
    else:
        attr_function = lambda cat, attr: config.get_attributes(cat, attr)

    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)
    logging.info('Length of reverse map: {}'.format(len(rev_map)))

    logging.info('Generating the training data')
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:  # new set of inflections
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {attr: 1 for attr in attr_function(category, attributes)}
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        with open(os.path.join(args.model, 'model.{}.pickle'.format(it + 1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)

    if args.config:
        model = StructuredModel(args.category, functools.partial(sup_config.get_attributes))
    else:
        model = StructuredModel(args.category, config.get_attributes)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter, alpha_sgd=args.rate,
                every_iter=save_model, adagrad=args.adagrad, l1=args.l1)

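Based only on how the imported module is used above, an external configuration module under config_files/ needs to expose get_attributes(category, attributes) returning an iterable of attribute names. The sketch below is purely illustrative (module name, naming scheme, and the '-' convention for unset positions are assumptions here); the real modules in the repository may differ.

# config_files/example_config.py (hypothetical)
def get_attributes(category, attributes):
    # Turn the positional tag remainder into named attribute features,
    # e.g. ('N', u'c-sn') -> ['N0=c', 'N2=s', 'N3=n']
    return ['{}{}={}'.format(category, i, value)
            for i, value in enumerate(attributes) if value != u'-']
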