Ejemplo n.º 1
0
 def expand_features(self, category, attributes, features):
     """Yield the sparse feature strings describing one candidate.

     For every attribute returned by
     ``config.get_attributes(category, attributes)`` the generator emits,
     in this order:

     * an indicator for the attribute itself (``F<id>=1``);
     * an indicator for each pair ``<attr>+<other>`` where ``other``
       compares strictly greater than ``attr`` (this skips self-pairs and
       the mirrored ordering of a pair);
     * one conjunction ``<attr>_<fname>`` per entry of ``features``,
       carrying that entry's value (``F<id>=<value>``).

     Feature names are mapped to ids through ``self.convert``.

     :param category: category forwarded to ``config.get_attributes``.
     :param attributes: attribute description of the candidate, forwarded
         to ``config.get_attributes``.
     :param features: mapping of feature name to value; it must provide
         ``iteritems()``.
     :returns: a generator of ``'F<id>=<value>'`` strings.
     """
     # The attribute list does not change while we iterate, so fetch it a
     # single time instead of once per outer iteration.
     morphs = list(config.get_attributes(category, attributes))
     for morph in morphs:
         # target features
         fid = self.convert(morph)
         yield 'F{}=1'.format(fid)
         # pairwise features
         for morph2 in morphs:
             if morph2 <= morph:
                 continue
             fid = self.convert(u'{}+{}'.format(morph, morph2))
             yield 'F{}=1'.format(fid)
         # translation features
         for fname, fval in features.iteritems():
             fid = self.convert(u'{}_{}'.format(morph, fname))
             yield 'F{}={}'.format(fid, fval)
Ejemplo n.º 2
0
    def score_all(self, inflections, features):
        """Score every candidate inflection against one feature set.

        :param inflections: sequence of ``(tag, inflection)`` pairs.
        :param features: feature mapping for the instance; it is wrapped in
            a one-element list and passed to ``self.feature_dict.transform``.
        :returns: list of ``(score, tag, inflection)`` triples, pairing the
            values returned by ``self.model.predict_log_proba`` with the
            candidates by position.
        """
        X = self.feature_dict.transform([features])

        # One {attribute: 1} indicator map per candidate tag.
        candidate_labels = [
            {attr: 1 for attr in config.get_attributes(self.category, tag)}
            for tag, _ in inflections
        ]
        Y_all = self.label_dict.transform(candidate_labels)

        scores = self.model.predict_log_proba(X, Y_all)
        ranked = []
        for score, (tag, inflection) in zip(scores, inflections):
            ranked.append((score, tag, inflection))
        return ranked
Ejemplo n.º 3
0
    def score_all(self, inflections, features):
        """Return a score for each ``(tag, inflection)`` candidate.

        The instance features are vectorised with ``self.feature_dict`` and
        the candidates' attribute indicator maps with ``self.label_dict``;
        both are then handed to ``self.model.predict_log_proba``.

        :param inflections: sequence of ``(tag, inflection)`` pairs.
        :param features: feature mapping describing the instance.
        :returns: list of ``(score, tag, inflection)`` triples in the same
            order as ``inflections``.
        """
        X = self.feature_dict.transform([features])

        def indicator_map(tag):
            # Every attribute of the tag is switched on with value 1.
            return {
                attr: 1
                for attr in config.get_attributes(self.category, tag)
            }

        Y_all = self.label_dict.transform(
            [indicator_map(tag) for tag, _ in inflections])

        scores = self.model.predict_log_proba(X, Y_all)
        results = []
        for score, candidate in zip(scores, inflections):
            tag, inflection = candidate
            results.append((score, tag, inflection))
        return results
Ejemplo n.º 4
0
 def score(self, tag, features):
     """Compute the linear score of ``tag`` for the given features.

     Each attribute of ``tag`` (as listed by ``config.get_attributes`` for
     ``self.category``) is paired with each feature; the feature value is
     multiplied by the weight stored under ``'<attr>_<fname>'`` and the
     products are accumulated. Pairs without a stored weight contribute 0.

     :param tag: tag whose attributes are scored.
     :param features: mapping of feature name to value; it must provide
         ``iteritems()``.
     :returns: the accumulated score (0 when nothing is summed).
     """
     total = 0
     for attribute in config.get_attributes(self.category, tag):
         prefix = attribute+'_'
         for name, value in features.iteritems():
             weight = self.weights.get(prefix+name, 0)
             total += value * weight
     return total
Ejemplo n.º 5
0
def main():
    """Create cdec CRF grammars and training data from aligned sentences.

    Command-line arguments: the word ``category``, the path of the pickled
    reverse inflection map (``rev_map``) and the ``output`` directory.

    Sentences are read from standard input. For every extracted instance
    whose reference ``(attributes, inflection)`` pair is listed in the
    reverse map for its ``(lemma, category)``, the function writes:

    * one grammar file under ``<output>/grammars`` containing a rule per
      possible inflection of the lemma;
    * one ``<seg>`` line in ``<output>/train.sgm`` pointing at that grammar.

    Once the input is exhausted (or ``too_much_mem()`` reports memory
    pressure, checked every 1000 sentences) the feature vocabulary is
    dumped to ``<output>/weights.ini`` with every weight set to 0.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Create cdec CRF grammars and training data')
    parser.add_argument('category', help='Russian word category to (R/V/A/N/M)')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('output', help='training output path')
    args = parser.parse_args()

    category = args.category

    logging.info('Loading reverse inflection map')
    # Pickles are binary data, so read them in binary mode.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(args.rev_map, 'rb') as f:
        rev_map = cPickle.load(f)

    # Create training data paths
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    grammar_path = os.path.join(args.output, 'grammars')
    if not os.path.exists(grammar_path):
        os.mkdir(grammar_path)

    fvoc = Vocabulary()

    n_sentences = 0
    logging.info('Generating the grammars')
    sgm_path = os.path.join(args.output, 'train.sgm')
    # The context manager guarantees train.sgm is flushed and closed even if
    # an instance raises half-way through the corpus.
    with io.open(sgm_path, 'w', encoding='utf8') as sgm:
        for source, target, alignment in read_sentences(sys.stdin):
            n_sentences += 1
            if n_sentences % 1000 == 0:
                if too_much_mem():
                    logging.info('Running out of memory')
                    break
            for word, features in extract_instances(category, source, target, alignment):
                inflection, lemma, tag = word
                # NOTE(review): this rebinds the category that is passed to
                # extract_instances for the following sentences; presumably
                # tag[0] always equals the requested category -- confirm.
                category = tag[0]
                ref_attributes = tag[1:]
                possible_inflections = rev_map.get((lemma, category), [])
                if (ref_attributes, inflection) not in possible_inflections:
                    logging.debug('Skip: %s (%s)', inflection, ref_attributes)
                    continue
                # Write sentence grammar
                grammar_name = os.path.join(grammar_path, uuid.uuid1().hex)
                with io.open(grammar_name, 'w', encoding='utf8') as grammar:
                    for attributes, _ in possible_inflections:
                        rule = fvoc.make_rule(lemma, category, attributes, features)
                        grammar.write(rule)
                # Write src / ref
                src = lemma+'_'+category
                ref = ' '.join(config.get_attributes(category, ref_attributes))
                sgm.write(u'<seg grammar="{}"> {} ||| {} {} </seg>\n'.format(
                    os.path.abspath(grammar_name), src, category, ref))

    logging.info('Processed %d sentences', n_sentences)
    logging.info('Saving weights')
    ff_path = os.path.join(args.output, 'weights.ini')
    with io.open(ff_path, 'w', encoding='utf8') as f:
        for fname, fid in fvoc.iteritems():
            f.write(u'# {}\n'.format(fname))
            f.write(u'F{} 0\n'.format(fid))
Ejemplo n.º 6
0
 def make_rule(self, lemma, category, attributes, features):
     """Format one grammar rule line for a candidate inflection.

     The rule has the shape
     ``[S] ||| <lemma>_<category> ||| <category> <attrs> ||| <features>``
     followed by a newline, where ``<attrs>`` are the space-joined values of
     ``config.get_attributes(category, attributes)`` and ``<features>`` the
     space-joined output of ``self.expand_features``.

     :param lemma: lemma on the source side of the rule.
     :param category: word category, used on both sides of the rule.
     :param attributes: attribute description of the candidate.
     :param features: feature mapping forwarded to ``expand_features``.
     :returns: the rule as a unicode string ending in a newline.
     """
     source_side = lemma+'_'+category
     target_attrs = config.get_attributes(category, attributes)
     target_side = ' '.join(target_attrs)
     feature_side = ' '.join(
         self.expand_features(category, attributes, features))
     template = u'[S] ||| {} ||| {} {} ||| {}\n'
     return template.format(source_side, category, target_side, feature_side)
Ejemplo n.º 7
0
def main():
    """Build training data from stdin and train a ``StructuredModel``.

    Command-line arguments: the target word ``category``, the path of the
    pickled reverse inflection map (``rev_map``), the ``model`` output
    directory, and the optional ``--n_iter`` / ``--rate`` values that are
    forwarded to ``StructuredModel.train``.

    For every extracted instance that is kept, four parallel structures are
    filled:

    * ``X``: the instance features;
    * ``Y_all``: one ``{attribute: 1}`` map per candidate inflection, stored
      once per ``(lemma, category)``;
    * ``Y_lim``: the ``(start, end)`` range of that instance's candidates
      inside ``Y_all``;
    * ``Y_star``: the position of the reference inflection among the
      candidates.

    After each training iteration the model is pickled to
    ``<model>/model.<iteration>.pickle`` through the ``every_iter`` callback.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Trained stuctured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD udpate rate')
    args = parser.parse_args()

    category = args.category

    logging.info('Loading reverse inflection map')
    # Pickles are binary data, so read them in binary mode.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(args.rev_map, 'rb') as f:
        rev_map = cPickle.load(f)

    logging.info('Generating the training data')
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {} # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            # NOTE(review): this rebinds the category that is passed to
            # extract_instances for the following sentences; presumably
            # tag[0] always equals the requested category -- confirm.
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            reference = (ref_attributes, ref_inflection)
            if reference not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None: # new set of inflections
                for attributes, _ in possible_inflections:
                    label = {attr: 1 for attr in config.get_attributes(category, attributes)}
                    Y_all.append(label) # attributes map
                lims = (n, n+len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star: record exactly one index per instance so that Y_star
            # stays aligned with X and Y_lim even when several candidates
            # share the same attributes.
            for i, candidate in enumerate(possible_inflections):
                if candidate == reference:
                    Y_star.append(i)
                    break

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        """Pickle ``model`` after iteration ``it`` (file names are 1-based)."""
        path = os.path.join(args.model, 'model.{}.pickle'.format(it+1))
        # protocol=-1 selects a binary pickle format, which requires a file
        # opened in binary mode.
        with open(path, 'wb') as f:
            cPickle.dump(model, f, protocol=-1)

    model = StructuredModel(args.category)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter,
            alpha_sgd=args.rate, every_iter=save_model)
Ejemplo n.º 8
0
def main():
    """Build training data from stdin and train a ``StructuredModel``.

    Command-line arguments: the target word ``category``, the path of the
    pickled reverse inflection map (``rev_map``), the ``model`` output
    directory, and the optional ``--n_iter`` / ``--rate`` values that are
    forwarded to ``StructuredModel.train``.

    For every extracted instance that is kept, four parallel structures are
    filled:

    * ``X``: the instance features;
    * ``Y_all``: one ``{attribute: 1}`` map per candidate inflection, stored
      once per ``(lemma, category)``;
    * ``Y_lim``: the ``(start, end)`` range of that instance's candidates
      inside ``Y_all``;
    * ``Y_star``: the position of the reference inflection among the
      candidates.

    After each training iteration the model is pickled to
    ``<model>/model.<iteration>.pickle`` through the ``every_iter`` callback.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Trained stuctured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i',
                        '--n_iter',
                        type=int,
                        help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD udpate rate')
    args = parser.parse_args()

    category = args.category

    logging.info('Loading reverse inflection map')
    # Pickles are binary data, so read them in binary mode.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(args.rev_map, 'rb') as f:
        rev_map = cPickle.load(f)

    logging.info('Generating the training data')
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target,
                                                alignment):
            ref_inflection, lemma, tag = word
            # NOTE(review): this rebinds the category that is passed to
            # extract_instances for the following sentences; presumably
            # tag[0] always equals the requested category -- confirm.
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            reference = (ref_attributes, ref_inflection)
            if reference not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:  # new set of inflections
                for attributes, _ in possible_inflections:
                    label = {
                        attr: 1
                        for attr in config.get_attributes(
                            category, attributes)
                    }
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star: record exactly one index per instance so that Y_star
            # stays aligned with X and Y_lim even when several candidates
            # share the same attributes.
            for i, candidate in enumerate(possible_inflections):
                if candidate == reference:
                    Y_star.append(i)
                    break

    # free some memory
    del rev_map

    if not os.path.exists(args.model):
        os.mkdir(args.model)

    def save_model(it, model):
        """Pickle ``model`` after iteration ``it`` (file names are 1-based)."""
        path = os.path.join(args.model, 'model.{}.pickle'.format(it + 1))
        # protocol=-1 selects a binary pickle format, which requires a file
        # opened in binary mode.
        with open(path, 'wb') as f:
            cPickle.dump(model, f, protocol=-1)

    model = StructuredModel(args.category)
    model.train(X,
                Y_all,
                Y_star,
                Y_lim,
                n_iter=args.n_iter,
                alpha_sgd=args.rate,
                every_iter=save_model)
Ejemplo n.º 9
0
 def score(self, tag, features):
     """Return the weighted score of ``tag`` under ``features``.

     For each attribute that ``config.get_attributes`` lists for
     ``self.category`` and ``tag``, and for each ``(name, value)`` entry of
     ``features``, the value is multiplied by the weight looked up under
     ``'<attr>_<name>'`` in ``self.weights`` (0 when absent), and all
     products are added together.

     :param tag: tag whose attributes are scored.
     :param features: mapping of feature name to value; it must provide
         ``iteritems()``.
     :returns: the accumulated score (0 when nothing is summed).
     """
     accumulated = 0
     for attribute in config.get_attributes(self.category, tag):
         for feature_name, feature_value in features.iteritems():
             weight_key = attribute + '_' + feature_name
             accumulated += feature_value * self.weights.get(weight_key, 0)
     return accumulated