Example #1
0
def main():
    """Command-line entry point: train or use a Named Entity tagger.

    With -t/--train: builds or loads word embeddings, assembles the
    feature extractors requested on the command line, trains the network
    and saves the model.  Without it: loads `model` and tags the
    sentences produced by a TaggerReader, writing CoNLL output.
    """

    # set the seed for replicability
    np.random.seed(89) #(42)

    parser = argparse.ArgumentParser(description="Train or use a Named Entity tagger.")

    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')

    parser.add_argument('model', type=str,
                        help='Model file to train/use.')

    # training options
    train = parser.add_argument_group('Train')
    train.add_argument('-t', '--train', type=str, default='',
                        help='File with annotated data for training.')
    train.add_argument('-w', '--window', type=int, default=2,
                        help='Size of the word window (default %(default)s)')
    train.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default %(default)s)',
                        dest='embeddings_size')
    train.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default %(default)s)',
                        dest='iterations')
    train.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default %(default)s)',
                        dest='learning_rate')
    train.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default %(default)s)',
                        dest='hidden')
    train.add_argument('--eps', type=float, default=1e-6,
                        help='Epsilon value for AdaGrad (default %(default)s)')
    train.add_argument('--ro', type=float, default=0.95,
                        help='Ro value for AdaDelta (default %(default)s)')
    train.add_argument('-o', '--output', type=str, default='',
                        help='File where to save embeddings')

    # Embeddings
    embeddings = parser.add_argument_group('Embeddings')
    embeddings.add_argument('--vocab', type=str, default='',
                        help='Vocabulary file, either read or created')
    embeddings.add_argument('--vocab-size', type=int, default=0,
                            help='Maximum size of vocabulary from corpus (default %(default)s)')
    embeddings.add_argument('--vectors', type=str, default='',
                        help='Embeddings file, either read or created')
    embeddings.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary (default %(default)s)',
                        dest='minOccurr')
    embeddings.add_argument('--load', type=str, default='',
                        help='Load previously saved model')
    embeddings.add_argument('--variant', type=str, default='',
                        help='Either "senna" (default), "polyglot" or "word2vec".')

    # Extractors:
    extractors = parser.add_argument_group('Extractors')
    extractors.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--pos', const=1, type=int, nargs='?', default=None,
                        help='Use POS tag. Optionally supply the POS token field index (default %(default)s)')
    extractors.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                            help='Include suffix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffixes', type=str, default='',
                        help='Load suffixes from this file')
    extractors.add_argument('--prefix', const=5, nargs='?', type=int, default=None,
                            help='Include prefix features. Optionally, '\
                            'supply the number of features (default %(default)s)')
    extractors.add_argument('--prefixes', type=str, default='',
                        help='Load prefixes from this file')
    extractors.add_argument('--gazetteer', type=str,
                        help='Load gazetteer from this file')
    extractors.add_argument('--gsize', type=int, default=5,
                        help='Size of gazetteer features (default %(default)s)')

    # reader
    parser.add_argument('--form-field', type=int, default=0,
                        help='Token field containing form (default %(default)s)',
                        dest='formField')

    # common
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default %(default)s)')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = NerReader(args.formField)

        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            if args.vectors and os.path.exists(args.vectors):
                # use supplied embeddings
                embeddings = Embeddings(vectors=args.vectors, vocab_file=args.vocab,
                                        variant=args.variant)
            else:
                # create random embeddings
                embeddings = Embeddings(args.embeddings_size, vocab_file=args.vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            if args.vocab_size:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
                logger.info("Overriding vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
            else:
                tagset = reader.create_tagset(sentence_iter)

        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings = Embeddings(vocab=vocab,
                                        variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
        elif not args.vocab_size:
            logger.error("Missing parameter --vocab-size")
            return
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        # pass just the formField from tokens to the extractor
        converter.add(embeddings, reader.formField)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps), reader.formField)

        if args.pos:
            logger.info("Creating POS features...")
            postags = frozenset((token[args.pos] for sent in sentence_iter for token in sent))
            # tell the extractor which field to use
            converter.add(AttributeExtractor(postags), args.pos) # no variant, preserve case

        if ((args.suffix and not os.path.exists(args.suffixes)) or
            (args.prefix and not os.path.exists(args.prefixes))):
            # Materialize the forms once as a list: a generator would be
            # exhausted by the suffix extractor before the prefix one, and
            # previously `words` was left undefined when --suffix/--prefix
            # was given without a corresponding list file (NameError).
            words = [tok[reader.formField] for sent in sentence_iter for tok in sent]

        if args.suffix:
            if os.path.exists(args.suffixes):
                logger.info("Loading suffix list...")
                extractor = SuffixExtractor(args.suffix, args.suffixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating suffix list...")
                extractor = SuffixExtractor(args.suffix, None, words)
                converter.add(extractor, reader.formField)
                if args.suffixes:
                    logger.info("Saving suffix list to: %s", args.suffixes)
                    extractor.write(args.suffixes)

        if args.prefix:
            if os.path.exists(args.prefixes):
                logger.info("Loading prefix list...")
                extractor = PrefixExtractor(args.prefix, args.prefixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating prefix list...")
                extractor = PrefixExtractor(args.prefix, None, words)
                converter.add(extractor, reader.formField)
                if args.prefixes:
                    logger.info("Saving prefix list to: %s", args.prefixes)
                    extractor.write(args.prefixes)

        if args.gazetteer:
            if os.path.exists(args.gazetteer):
                logger.info("Loading gazetteers")
                for extractor in GazetteerExtractor.create(args.gazetteer, args.gsize):
                    # tell the extractor which field to use
                    converter.add(extractor, reader.formField)
            else:
                logger.info("Creating gazetteer")
                tries = GazetteerExtractor.build(sentence_iter, reader.formField, reader.tagField)
                for tag, trie in tries.items():
                    # tell the extractor which field to use
                    converter.add(GazetteerExtractor(trie, args.gsize), reader.formField)
                logger.info("Saving gazetteer list to: %s", args.gazetteer)
                # Python 3 text I/O replaces the old "print >> file" /
                # .encode('UTF-8') / iteritems() Python 2 idioms.
                with open(args.gazetteer, 'w', encoding='UTF-8') as file:
                    for tag, trie in tries.items():
                        for ngram in trie:
                            print('%s\t%s' % (tag, ' '.join(ngram)), file=file)

        # obtain the tags for each sentence
        tag_index = { t:i for i,t in enumerate(tagset) }
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert(sent))
            tags.append(np.array([tag_index[token[reader.tagField]] for token in sent],
                                 np.int32))
        logger.info("Vocabulary size: %d" % embeddings.dict.size())
        logger.info("Tagset size: %d" % len(tagset))
        trainer = create_trainer(args, converter, tag_index)
        logger.info("Starting training with %d sentences" % len(sentences))

        # report progress about 200 times over the run (integer, at least 1)
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = TaggerReader()
        for sent in reader:
            ConllWriter.write(tagger.tag(sent, reader.tagField))
Example #2
0
def main():
    """Command-line entry point: learn word embeddings / train a tagger.

    With -t/--train: builds or loads embeddings, adds the requested
    feature extractors and trains the network, saving the model.
    Without it: loads `model` and tags sentences from a ConllReader.
    """

    # set the seed for replicability
    np.random.seed(42)

    parser = argparse.ArgumentParser(description="Learn word embeddings.")

    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')

    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default=None,
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                            help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str,
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, '\
                        'supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str,
                        help='Load prefixes from this file')
    parser.add_argument('--gazetteer', type=str,
                        help='Load gazetteer from this file')
    parser.add_argument('--gsize', type=int, default=5,
                        help='Size of gazetteer features (default 5)')
    # common
    parser.add_argument('--vocab', type=str, default=None,
                        help='Vocabulary file, either read or created')
    # was missing: args.vocab_size is read below when building the
    # vocabulary from the corpus, and raised AttributeError
    parser.add_argument('--vocab-size', type=int, default=0,
                        help='Maximum size of vocabulary from corpus (default 0)',
                        dest='vocab_size')
    parser.add_argument('--vectors', type=str, default=None,
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default=None,
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default=None,
                        help='Either "senna" (default), "polyglot", "word2vec" or "gensym".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = NerReader()

        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        elif args.variant == 'word2vec':
            embeddings = Embeddings(vectors=args.vectors,
                                    variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if args.suffix or args.prefix:
            # materialize the forms once: a generator would be exhausted
            # by the first extractor that consumes it
            words = [tok[0] for sent in sentence_iter for tok in sent]

        if args.suffix:
            logger.info("Creating suffix features...")
            extractor = SuffixExtractor(args.suffix, args.suffixes, words)
            converter.add(extractor)

        if args.prefix:
            logger.info("Creating prefix features...")
            # pass the word forms, not whole sentences, consistently with
            # SuffixExtractor above and the other drivers in this file
            extractor = PrefixExtractor(args.prefix, args.prefixes, words)
            converter.add(extractor)

        if args.gazetteer:
            logger.info("Creating gazetteer features")
            for extractor in GazetteerExtractor.create(args.gazetteer, args.gsize):
                converter.add(extractor)

        # obtain the tags for each sentence
        tags_dict = { t:i for i,t in enumerate(tagset) }
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[0] for token in sent]))
            tags.append(np.array([tags_dict[token[-1]] for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))

        # report progress about 200 times over the run (integer, at least 1)
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[0] for x in sent] # extract form
            ConllWriter.write(tagger.tag(sent))
Example #3
0
def main():
    """Command-line entry point: train or use a POS tagger.

    With -t/--train: builds or loads embeddings, adds the requested
    feature extractors and trains the network, saving the model.
    Without it: loads `model` and tags sentences from a ConllReader.
    """

    # set the seed for replicability
    np.random.seed(42)

    parser = argparse.ArgumentParser(description="POS tagger using word embeddings.")

    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')

    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default %(default)s)')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    # training options
    train = parser.add_argument_group('Train')
    train.add_argument('-t', '--train', type=str, default=None,
                        help='File with annotated data for training.')

    train.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default %(default)s)')
    train.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default %(default)s)',
                        dest='embeddings_size')
    train.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default %(default)s)',
                        dest='iterations')
    train.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default %(default)s)',
                        dest='learning_rate')
    train.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default %(default)s)',
                        dest='hidden')
    train.add_argument('--eps', type=float, default=1e-8,
                        help='Epsilon value for AdaGrad (default %(default)s)')
    train.add_argument('--ro', type=float, default=0.95,
                        help='Ro value for AdaDelta (default %(default)s)')
    train.add_argument('-o', '--output', type=str, default='',
                        help='File where to save embeddings')

    # Embeddings
    embeddings = parser.add_argument_group('Embeddings')
    embeddings.add_argument('--vocab', type=str, default='',
                        help='Vocabulary file, either read or created')
    embeddings.add_argument('--vocab-size', type=int, default=0,
                            help='Maximum size of vocabulary (default %(default)s)')
    embeddings.add_argument('--vectors', type=str, default='',
                        help='Embeddings file, either read or created')
    embeddings.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    embeddings.add_argument('--load', type=str, default='',
                        help='Load previously saved model')
    embeddings.add_argument('--variant', type=str, default='',
                        help='Either "senna" (default), "polyglot" or "word2vec".')

    # Extractors:
    extractors = parser.add_argument_group('Extractors')
    extractors.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                            help='Include capitalization features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                            help='Include suffix features. Optionally, supply the number of features (default %(default)s)')
    extractors.add_argument('--suffixes', type=str, default='',
                        help='Load suffixes from this file')
    extractors.add_argument('--prefix', const=5, nargs='?', type=int, default=None,
                            help='Include prefix features. Optionally, '\
                            'supply the number of features (default %(default)s)')
    extractors.add_argument('--prefixes', type=str, default='',
                        help='Load prefixes from this file')

    # reader
    parser.add_argument('--form-field', type=int, default=0,
                        help='Token field containing form (default %(default)s)',
                        dest='formField')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = PosReader(args.formField)
        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            # start with the given vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if args.vectors and os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                        variant=args.variant)
            else:
                # create random embeddings
                embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            if args.vocab_size:
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
                logger.info("Overriding vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
            else:
                vocab = base_vocab
                tagset = reader.create_tagset(sentence_iter)

        elif args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                tagset = reader.create_tagset(sentence_iter)
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            # save only after the Embeddings object exists: previously
            # save_vocabulary was called while `embeddings` still referred
            # to the argparse argument group above (AttributeError)
            if args.vocab:
                logger.info("Creating vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if ((args.suffix and not os.path.exists(args.suffixes)) or
            (args.prefix and not os.path.exists(args.prefixes))):
            # Materialize the forms once as a list: a generator would be
            # exhausted by the suffix extractor before the prefix one, and
            # previously `words` was left undefined when --suffix/--prefix
            # was given without a corresponding list file (NameError).
            words = [tok[reader.formField] for sent in sentence_iter for tok in sent]

        if args.suffix:
            if os.path.exists(args.suffixes):
                logger.info("Loading suffix list...")
                extractor = SuffixExtractor(args.suffix, args.suffixes)
                converter.add(extractor)
            else:
                logger.info("Creating suffix list...")
                extractor = SuffixExtractor(args.suffix, None, words)
                converter.add(extractor)
                if args.suffixes:
                    logger.info("Saving suffix list to: %s", args.suffixes)
                    extractor.write(args.suffixes)

        if args.prefix:
            if os.path.exists(args.prefixes):
                logger.info("Loading prefix list...")
                extractor = PrefixExtractor(args.prefix, args.prefixes)
                converter.add(extractor)
            else:
                logger.info("Creating prefix list...")
                extractor = PrefixExtractor(args.prefix, None, words)
                converter.add(extractor)
                if args.prefixes:
                    logger.info("Saving prefix list to: %s", args.prefixes)
                    extractor.write(args.prefixes)

        # obtain the tags for each sentence
        tag_index = { t:i for i,t in enumerate(tagset) }
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[reader.formField] for token in sent]))
            tags.append(np.array([tag_index[token[reader.tagField]] for token in sent]))

        trainer = create_trainer(args, converter, tag_index)
        logger.info("Starting training with %d sentences" % len(sentences))

        # report progress about 200 times over the run (integer, at least 1)
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        with open(args.model) as file:
            tagger = Tagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[args.formField] for x in sent] # extract form
            ConllWriter.write(tagger.tag_sequence(sent, return_tokens=True))
Example #4
0
def main():
    """Train a Named Entity tagger or apply a trained one to CoNLL input.

    With -t/--train, builds the vocabulary, embeddings and feature
    extractors from the annotated corpus and trains a model saved to the
    positional ``model`` argument.  Without -t, loads ``model`` and tags
    the sentences produced by ConllReader.
    """

    # set the seed for replicability
    np.random.seed(42)

    defaults = {}

    parser = argparse.ArgumentParser(
        description="Train or use a Named Entity tagger.")

    parser.add_argument('-c',
                        '--config',
                        dest='config_file',
                        help='Specify config file',
                        metavar='FILE')

    # args, remaining_argv = parser.parse_known_args()

    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))

    # parser.set_defaults(**defaults)

    parser.add_argument('model', type=str, help='Model file to train/use.')
    parser.add_argument('-w',
                        '--window',
                        type=int,
                        default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s',
                        '--embeddings-size',
                        type=int,
                        default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument(
        '-l',
        '--learning_rate',
        type=float,
        default=0.001,
        help='Learning rate for network weights (default 0.001)',
        dest='learning_rate')
    parser.add_argument('-n',
                        '--hidden',
                        type=int,
                        default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads',
                        type=int,
                        default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t',
                        '--train',
                        type=str,
                        default='',
                        help='File with annotated data for training.')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default='',
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument(
        '--caps',
        const=5,
        nargs='?',
        type=int,
        default=None,
        help=
        'Include capitalization features. Optionally, supply the number of features (default 5)'
    )
    parser.add_argument(
        '--suffix',
        const=5,
        nargs='?',
        type=int,
        default=None,
        help=
        'Include suffix features. Optionally, supply the number of features (default 5)'
    )
    parser.add_argument('--suffixes',
                        type=str,
                        default='',
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, '\
                        'supply the number of features (default 0)')
    parser.add_argument('--prefixes',
                        type=str,
                        default='',
                        help='Load prefixes from this file')
    parser.add_argument('--gazetteer',
                        type=str,
                        help='Load gazetteer from this file')
    parser.add_argument('--gsize',
                        type=int,
                        default=5,
                        help='Size of gazetteer features (default 5)')

    # reader
    parser.add_argument('--form-field',
                        type=int,
                        default=0,
                        dest='formField',
                        help='Token field containing form (default 0)')

    # common
    parser.add_argument('--vocab',
                        type=str,
                        default='',
                        help='Vocabulary file, either read or created')
    parser.add_argument('--vocab-size',
                        type=int,
                        default=0,
                        dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors',
                        type=str,
                        default='',
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr',
                        type=int,
                        default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Load previously saved model')
    parser.add_argument(
        '--variant',
        type=str,
        default='',
        help='Either "senna" (default), "polyglot" or "word2vec".')
    parser.add_argument('-v',
                        '--verbose',
                        help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = NerReader(args.formField)

        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if os.path.exists(args.vocab):
            # start with the given vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        vocab=base_vocab,
                                        variant=args.variant)
            else:
                embeddings = Embeddings(args.embeddings_size,
                                        vocab=base_vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.vocab:
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size,
                                    args.vocab,
                                    args.vectors,
                                    variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(
                    sentence_iter, args.vocab_size, args.minOccurr)
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            # BUG FIX: the embeddings must exist before saving their
            # vocabulary (previously save_vocabulary() was called on an
            # unbound name, raising NameError)
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=vocab,
                                    variant=args.variant)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if ((args.suffixes and not os.path.exists(args.suffixes))
                or (args.prefixes and not os.path.exists(args.prefixes))):
            # materialize the forms once: a list can be iterated by both
            # the suffix and the prefix extractor, whereas a shared
            # generator would be exhausted after the first use
            words = [tok[reader.formField] for sent in sentence_iter
                     for tok in sent]
        if os.path.exists(args.suffixes):
            logger.info("Loading suffix list...")
            extractor = SuffixExtractor.create(args.suffix, args.suffixes)
            converter.add(extractor)
        elif args.suffixes:
            logger.info("Creating suffix list...")
            extractor = SuffixExtractor(args.suffix, None, words)
            converter.add(extractor)
            logger.info("Saving suffix list to: %s", args.suffixes)
            extractor.write(args.suffixes)

        if os.path.exists(args.prefixes):
            logger.info("Loading prefix list...")
            extractor = PrefixExtractor.create(args.prefix, args.prefixes)
            converter.add(extractor)
        elif args.prefixes:
            logger.info("Creating prefix list...")
            extractor = PrefixExtractor(args.prefix, None, words)
            converter.add(extractor)
            logger.info("Saving prefix list to: %s", args.prefixes)
            extractor.write(args.prefixes)

        # guard on args.gazetteer first: its default is None and
        # os.path.exists(None) raises TypeError on Python 3
        if args.gazetteer and os.path.exists(args.gazetteer):
            logger.info("Loading gazetteers")
            for extractor in GazetteerExtractor.create(args.gazetteer,
                                                       args.gsize):
                converter.add(extractor)
        elif args.gazetteer:
            logger.info("Creating gazetteer")
            # strip B-/I-
            classes = sorted([tag[2:] or tag for tag in tagset])
            # gazetteers must be kept in the same order as tags
            gazs = OrderedDict()
            for tag in classes:
                if tag != 'O':
                    gazs[tag] = Counter(
                    )  # we might want to keep the most frequent
            for sent in sentence_iter:
                for tok in sent:
                    tag = tok[reader.tagField]  # last field
                    if tag != 'O':
                        tag = tag[2:]  # strip B-/I-
                        form = tok[reader.formField].lower()  # lowercase
                        gazs[tag][form] += 1  # FORM
            for tag, counter in gazs.items():
                converter.add(GazetteerExtractor(counter.keys(), args.gsize))
            logger.info("Saving gazetteer list to: %s", args.gazetteer)
            # BUG FIX: was Python 2 code (`print >> file` on a binary
            # file and dict.iteritems()); write UTF-8 text instead
            with open(args.gazetteer, 'w', encoding='UTF-8') as file:
                for tag, counter in gazs.items():
                    for w in counter.keys():
                        print('\t'.join((tag, w)), file=file)

        # if args.pos:
        #     converter.add(POS(arg.pos))

        # obtain the tags for each sentence
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(
                converter.convert([token[reader.formField] for token in sent]))
            tags.append(
                np.array([tags_dict[token[reader.tagField]]
                          for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))

        # integer division: report_frequency is an epoch count
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1  # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[args.formField] for x in sent]  # extract form
            ConllWriter.write(tagger.tag(sent))
Example #5
0
def main():
    """Train a Named Entity tagger or apply a trained one to tagged input.

    With -t/--train, builds the vocabulary, embeddings and feature
    extractors from the annotated corpus and trains a model saved to the
    positional ``model`` argument.  Without -t, loads ``model`` and tags
    the sentences produced by TaggerReader.
    """

    # set the seed for replicability
    np.random.seed(89)  #(42)

    defaults = {}

    parser = argparse.ArgumentParser(
        description="Train or use a Named Entity tagger.")

    parser.add_argument('-c',
                        '--config',
                        dest='config_file',
                        help='Specify config file',
                        metavar='FILE')

    # args, remaining_argv = parser.parse_known_args()

    # if args.config_file:
    #     config = ConfigParser.SafeConfigParser()
    #     config.read([args.config_file])
    #     defaults = dict(config.items('Defaults'))

    # parser.set_defaults(**defaults)

    parser.add_argument('model', type=str, help='Model file to train/use.')

    # training options
    train = parser.add_argument_group('Train')
    train.add_argument('-t',
                       '--train',
                       type=str,
                       default='',
                       help='File with annotated data for training.')
    train.add_argument('-w',
                       '--window',
                       type=int,
                       default=2,
                       help='Size of the word window (default %(default)s)')
    train.add_argument(
        '-s',
        '--embeddings-size',
        type=int,
        default=50,
        help='Number of features per word (default %(default)s)',
        dest='embeddings_size')
    train.add_argument('-e',
                       '--epochs',
                       type=int,
                       default=100,
                       help='Number of training epochs (default %(default)s)',
                       dest='iterations')
    train.add_argument(
        '-l',
        '--learning_rate',
        type=float,
        default=0.001,
        help='Learning rate for network weights (default %(default)s)',
        dest='learning_rate')
    train.add_argument('-n',
                       '--hidden',
                       type=int,
                       default=200,
                       help='Number of hidden neurons (default %(default)s)',
                       dest='hidden')
    train.add_argument('--eps',
                       type=float,
                       default=1e-6,
                       help='Epsilon value for AdaGrad (default %(default)s)')
    train.add_argument('--ro',
                       type=float,
                       default=0.95,
                       help='Ro value for AdaDelta (default %(default)s)')
    train.add_argument('-o',
                       '--output',
                       type=str,
                       default='',
                       help='File where to save embeddings')

    # Embeddings
    embeddings = parser.add_argument_group('Embeddings')
    embeddings.add_argument('--vocab',
                            type=str,
                            default='',
                            help='Vocabulary file, either read or created')
    embeddings.add_argument(
        '--vocab-size',
        type=int,
        default=0,
        help='Maximum size of vocabulary from corpus (default %(default)s)')
    embeddings.add_argument('--vectors',
                            type=str,
                            default='',
                            help='Embeddings file, either read or created')
    embeddings.add_argument(
        '--min-occurr',
        type=int,
        default=3,
        help=
        'Minimum occurrences for inclusion in vocabulary (default %(default)s)',
        dest='minOccurr')
    embeddings.add_argument('--load',
                            type=str,
                            default='',
                            help='Load previously saved model')
    embeddings.add_argument(
        '--variant',
        type=str,
        default='',
        help='Either "senna" (default), "polyglot" or "word2vec".')

    # Extractors:
    extractors = parser.add_argument_group('Extractors')
    extractors.add_argument(
        '--caps',
        const=5,
        nargs='?',
        type=int,
        default=None,
        help=
        'Include capitalization features. Optionally, supply the number of features (default %(default)s)'
    )
    extractors.add_argument(
        '--pos',
        const=1,
        type=int,
        nargs='?',
        default=None,
        help=
        'Use POS tag. Optionally supply the POS token field index (default %(default)s)'
    )
    extractors.add_argument(
        '--suffix',
        const=5,
        nargs='?',
        type=int,
        default=None,
        help=
        'Include suffix features. Optionally, supply the number of features (default %(default)s)'
    )
    extractors.add_argument('--suffixes',
                            type=str,
                            default='',
                            help='Load suffixes from this file')
    extractors.add_argument('--prefix', const=5, nargs='?', type=int, default=None,
                            help='Include prefix features. Optionally, '\
                            'supply the number of features (default %(default)s)')
    extractors.add_argument('--prefixes',
                            type=str,
                            default='',
                            help='Load prefixes from this file')
    extractors.add_argument('--gazetteer',
                            type=str,
                            help='Load gazetteer from this file')
    extractors.add_argument(
        '--gsize',
        type=int,
        default=5,
        help='Size of gazetteer features (default %(default)s)')

    # reader
    parser.add_argument(
        '--form-field',
        type=int,
        default=0,
        help='Token field containing form (default %(default)s)',
        dest='formField')

    # common
    parser.add_argument('--threads',
                        type=int,
                        default=1,
                        help='Number of threads (default %(default)s)')
    parser.add_argument('-v',
                        '--verbose',
                        help='Verbose mode',
                        action='store_true')

    # Use this for obtaining defaults from config file:
    #args = arguments.get_args()
    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # merge args with config

    if args.train:
        reader = NerReader(args.formField)

        # a generator (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab and os.path.exists(args.vocab):
            if args.vectors and os.path.exists(args.vectors):
                # use supplied embeddings
                embeddings = Embeddings(vectors=args.vectors,
                                        vocab_file=args.vocab,
                                        variant=args.variant)
            else:
                # create random embeddings
                embeddings = Embeddings(args.embeddings_size,
                                        vocab_file=args.vocab,
                                        variant=args.variant)
            # add the ngrams from the corpus
            # build vocabulary and tag set
            if args.vocab_size:
                vocab, tagset = reader.create_vocabulary(
                    sentence_iter, args.vocab_size, args.minOccurr)
                embeddings.merge(vocab)
                logger.info("Overriding vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
            else:
                tagset = reader.create_tagset(sentence_iter)

        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(
                    sentence_iter, args.vocab_size, args.minOccurr)
                embeddings.merge(vocab)
            else:
                vocab, tagset = reader.create_vocabulary(
                    sentence_iter, args.vocab_size, args.minOccurr)
                embeddings = Embeddings(vocab=vocab, variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)
        elif not args.vocab_size:
            logger.error("Missing parameter --vocab-size")
            return
        else:
            # build vocabulary and tag set
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size,
                                    vocab=vocab,
                                    variant=args.variant)
            if args.vocab:
                logger.info("Saving vocabulary in %s" % args.vocab)
                embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        # pass just the formField from tokens to the extractor
        converter.add(embeddings, reader.formField)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps), reader.formField)

        if args.pos:
            logger.info("Creating POS features...")
            postags = frozenset(
                (token[args.pos] for sent in sentence_iter for token in sent))
            # tell the extractor which field to use
            converter.add(AttributeExtractor(postags),
                          args.pos)  # no variant, preserve case

        # BUG FIX: key the guard on --suffix/--prefix (like the branches
        # below), not on --suffixes/--prefixes; otherwise `words` was
        # unbound when a list had to be created without a list file.
        # Materialize a list so both extractors can iterate it (a shared
        # generator would be exhausted after the first use).
        if ((args.suffix and not os.path.exists(args.suffixes))
                or (args.prefix and not os.path.exists(args.prefixes))):
            # collect the forms once
            words = [tok[reader.formField] for sent in sentence_iter
                     for tok in sent]

        if args.suffix:
            if os.path.exists(args.suffixes):
                logger.info("Loading suffix list...")
                extractor = SuffixExtractor(args.suffix, args.suffixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating suffix list...")
                extractor = SuffixExtractor(args.suffix, None, words)
                converter.add(extractor, reader.formField)
                if args.suffixes:
                    logger.info("Saving suffix list to: %s", args.suffixes)
                    extractor.write(args.suffixes)

        if args.prefix:
            if os.path.exists(args.prefixes):
                logger.info("Loading prefix list...")
                extractor = PrefixExtractor(args.prefix, args.prefixes)
                converter.add(extractor, reader.formField)
            else:
                logger.info("Creating prefix list...")
                extractor = PrefixExtractor(args.prefix, None, words)
                converter.add(extractor, reader.formField)
                if args.prefixes:
                    logger.info("Saving prefix list to: %s", args.prefixes)
                    extractor.write(args.prefixes)

        if args.gazetteer:
            if os.path.exists(args.gazetteer):
                logger.info("Loading gazetteers")
                for extractor in GazetteerExtractor.create(
                        args.gazetteer, args.gsize):
                    # tell the extractor which field to use
                    converter.add(extractor, reader.formField)
            else:
                logger.info("Creating gazetteer")
                tries = GazetteerExtractor.build(sentence_iter,
                                                 reader.formField,
                                                 reader.tagField)
                for tag, trie in tries.items():
                    # tell the extractor which field to use
                    converter.add(GazetteerExtractor(trie, args.gsize),
                                  reader.formField)
                logger.info("Saving gazetteer list to: %s", args.gazetteer)
                # BUG FIX: print() writes str, so open in text mode with
                # an explicit encoding ('wb' raised TypeError), and use
                # items() (iteritems() is Python 2 only)
                with open(args.gazetteer, 'w', encoding='UTF-8') as file:
                    for tag, trie in tries.items():
                        for ngram in trie:
                            print('%s\t%s' % (tag, ' '.join(ngram)),
                                  file=file)

        # if args.pos:
        #     converter.add(POS(arg.pos))

        # obtain the tags for each sentence
        tag_index = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert(sent))
            tags.append(
                np.array([tag_index[token[reader.tagField]] for token in sent],
                         np.int32))
        logger.info("Vocabulary size: %d" % embeddings.dict.size())
        logger.info("Tagset size: %d" % len(tagset))
        trainer = create_trainer(args, converter, tag_index)
        logger.info("Starting training with %d sentences" % len(sentences))

        # integer division: report_frequency is an epoch count
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1  # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = TaggerReader()
        for sent in reader:
            ConllWriter.write(tagger.tag(sent, reader.tagField))
Example #6
0
def main():
    """Train a Named Entity tagger, or tag input with a previously saved model.

    In training mode (-t/--train given) the annotated corpus is read, a
    vocabulary, word embeddings and the requested feature extractors are
    built, the network is trained and saved to the `model` argument.
    Otherwise the model is loaded and sentences from a ConllReader are
    tagged and written out in CoNLL format.
    """
    # fixed seed for replicability of training runs
    np.random.seed(42)

    parser = argparse.ArgumentParser(description="Train or use a Named Entity tagger.")

    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # TODO(review): defaults read from the config file below are not yet
    # merged into the parser (see parser.set_defaults).

    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default='',
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default='',
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                        help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str, default='',
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, '\
                        'supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str, default='',
                        help='Load prefixes from this file')
    parser.add_argument('--gazetteer', type=str,
                        help='Load gazetteer from this file')
    parser.add_argument('--gsize', type=int, default=5,
                        help='Size of gazetteer features (default 5)')

    # reader
    parser.add_argument('--form-field', type=int, default=0,
                        dest='formField',
                        help='Token field containing form (default 0)')

    # common
    parser.add_argument('--vocab', type=str, default='',
                        help='Vocabulary file, either read or created')
    parser.add_argument('--vocab-size', type=int, default=0,
                        dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors', type=str, default='',
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default='',
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default='',
                        help='Either "senna" (default), "polyglot" or "word2vec".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # TODO(review): merge args with config

    if args.train:
        reader = NerReader(args.formField)

        # a generator over the corpus (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if os.path.exists(args.vocab):
            # start from an existing vocabulary
            base_vocab = reader.load_vocabulary(args.vocab)
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab,
                                        variant=args.variant)
            else:
                embeddings = Embeddings(args.embeddings_size, vocab=base_vocab,
                                        variant=args.variant)
            # extend it with the ngrams from the corpus
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            embeddings.merge(vocab)
            logger.info("Overriding vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.vocab:
            # vocabulary file does not exist yet: vectors are required
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        elif args.variant == 'word2vec':
            if os.path.exists(args.vectors):
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                         args.vocab_size,
                                                         args.minOccurr)
                embeddings.merge(vocab)
            else:
                embeddings = Embeddings(vectors=args.vectors,
                                        variant=args.variant)
                tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set from the corpus
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)
            # BUGFIX: save only after creating the embeddings; the original
            # code referenced `embeddings` before it was assigned.
            logger.info("Creating vocabulary in %s" % args.vocab)
            embeddings.save_vocabulary(args.vocab)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if os.path.exists(args.suffixes):
            logger.info("Loading suffix list...")
            extractor = SuffixExtractor.create(args.suffix, args.suffixes)
            converter.add(extractor)
        elif args.suffixes:
            logger.info("Creating suffix list...")
            # collect the word forms from the corpus
            words = (tok[reader.formField] for sent in sentence_iter for tok in sent)
            extractor = SuffixExtractor(args.suffix, None, words)
            converter.add(extractor)
            logger.info("Saving suffix list to: %s", args.suffixes)
            extractor.write(args.suffixes)

        if os.path.exists(args.prefixes):
            logger.info("Loading prefix list...")
            extractor = PrefixExtractor.create(args.prefix, args.prefixes)
            converter.add(extractor)
        elif args.prefixes:
            logger.info("Creating prefix list...")
            # BUGFIX: build a fresh generator here; the one built for the
            # suffix extractor may already have been exhausted.
            words = (tok[reader.formField] for sent in sentence_iter for tok in sent)
            extractor = PrefixExtractor(args.prefix, None, words)
            converter.add(extractor)
            logger.info("Saving prefix list to: %s", args.prefixes)
            extractor.write(args.prefixes)

        # BUGFIX: --gazetteer has no default, so guard against None before
        # calling os.path.exists (TypeError in Python 3).
        if args.gazetteer and os.path.exists(args.gazetteer):
            logger.info("Loading gazetteers")
            for extractor in GazetteerExtractor.create(args.gazetteer, args.gsize):
                converter.add(extractor)
        elif args.gazetteer:
            logger.info("Creating gazetteer")
            # strip B-/I- prefixes to obtain the entity classes
            classes = sorted([tag[2:] or tag for tag in tagset])
            # gazetteers must be kept in the same order as tags
            gazs = OrderedDict()
            for tag in classes:
                if tag != 'O':
                    gazs[tag] = Counter() # we might want to keep the most frequent
            for sent in sentence_iter:
                for tok in sent:
                    tag = tok[reader.tagField] # last field
                    if tag != 'O':
                        tag = tag[2:] # strip B-/I-
                        form = tok[reader.formField].lower() # lowercase
                        gazs[tag][form] += 1 # FORM
            for tag, counter in gazs.items():
                converter.add(GazetteerExtractor(counter.keys(), args.gsize))
            logger.info("Saving gazetteer list to: %s", args.gazetteer)
            with open(args.gazetteer, 'wb') as file:
                # BUGFIX: Python 3 dicts have no iteritems() and
                # `print >> file` is invalid syntax; also the file is opened
                # in binary mode, so write encoded bytes directly.
                for tag, counter in gazs.items():
                    for w in counter.keys():
                        file.write(('%s\t%s\n' % (tag, w)).encode('UTF-8'))

        # map each tag to a numeric index and vectorize the corpus
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[reader.formField] for token in sent]))
            tags.append(np.array([tags_dict[token[reader.tagField]] for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))

        # report roughly 200 times over the training run (integer division:
        # `/` would produce a float in Python 3)
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        # tagging mode: load the saved model and tag input sentences
        with open(args.model) as file:
            tagger = NerTagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[args.formField] for x in sent] # extract form
            ConllWriter.write(tagger.tag(sent))
Example #7
0
def main():
    """Learn word embeddings by training a POS tagger, or tag with a model.

    In training mode (-t/--train given) a PosReader corpus is read, a
    vocabulary and embeddings are built, optional feature extractors are
    added and the network is trained and saved to the `model` argument.
    Otherwise the model is loaded and sentences from a ConllReader are
    tagged and written out.
    """
    # fixed seed for replicability of training runs
    np.random.seed(42)          # DEBUG

    parser = argparse.ArgumentParser(description="Learn word embeddings.")

    parser.add_argument('-c', '--config', dest='config_file',
                        help='Specify config file', metavar='FILE')
    # TODO(review): defaults read from the config file below are not yet
    # merged into the parser (see parser.set_defaults).

    parser.add_argument('model', type=str,
                        help='Model file to train/use.')
    parser.add_argument('-w', '--window', type=int, default=5,
                        help='Size of the word window (default 5)')
    parser.add_argument('-s', '--embeddings-size', type=int, default=50,
                        help='Number of features per word (default 50)',
                        dest='embeddings_size')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='Number of training epochs (default 100)',
                        dest='iterations')
    parser.add_argument('-l', '--learning_rate', type=float, default=0.001,
                        help='Learning rate for network weights (default 0.001)',
                        dest='learning_rate')
    parser.add_argument('-n', '--hidden', type=int, default=200,
                        help='Number of hidden neurons (default 200)',
                        dest='hidden')
    parser.add_argument('--threads', type=int, default=1,
                        help='Number of threads (default 1)')
    parser.add_argument('-t', '--train', type=str, default=None,
                        help='File with annotated data for training.')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='File where to save embeddings')

    # Extractors:
    parser.add_argument('--caps', const=5, nargs='?', type=int, default=None,
                        help='Include capitalization features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffix', const=5, nargs='?', type=int, default=None,
                        help='Include suffix features. Optionally, supply the number of features (default 5)')
    parser.add_argument('--suffixes', type=str,
                        help='Load suffixes from this file')
    parser.add_argument('--prefix', const=0, nargs='?', type=int, default=None,
                        help='Include prefix features. Optionally, '\
                        'supply the number of features (default 0)')
    parser.add_argument('--prefixes', type=str,
                        help='Load prefixes from this file')
    # common
    parser.add_argument('--vocab', type=str, default=None,
                        help='Vocabulary file, either read or created')
    # BUGFIX: --vocab-size was missing although args.vocab_size is used
    # below; without it the vocabulary-building branch raises AttributeError.
    parser.add_argument('--vocab-size', type=int, default=0,
                        dest='vocab_size',
                        help='Size of vocabulary to create')
    parser.add_argument('--vectors', type=str, default=None,
                        help='Embeddings file, either read or created')
    parser.add_argument('--min-occurr', type=int, default=3,
                        help='Minimum occurrences for inclusion in vocabulary',
                        dest='minOccurr')
    parser.add_argument('--load', type=str, default=None,
                        help='Load previously saved model')
    parser.add_argument('--variant', type=str, default=None,
                        help='Either "senna" (default), "polyglot" or "word2vec".')
    parser.add_argument('-v', '--verbose', help='Verbose mode',
                        action='store_true')

    args = parser.parse_args()

    log_format = '%(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(format=log_format, level=log_level)
    logger = logging.getLogger("Logger")

    config = ConfigParser()
    if args.config_file:
        config.read(args.config_file)

    # TODO(review): merge args with config

    if args.train:
        reader = PosReader()
        # a generator over the corpus (can be iterated several times)
        sentence_iter = reader.read(args.train)

        if args.vocab:
            # an explicit vocabulary requires explicit vectors
            if not args.vectors:
                logger.error("No --vectors specified")
                return
            embeddings = Embeddings(args.embeddings_size, args.vocab,
                                    args.vectors, variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        elif args.variant == 'word2vec':
            embeddings = Embeddings(vectors=args.vectors,
                                    variant=args.variant)
            tagset = reader.create_tagset(sentence_iter)
        else:
            # build vocabulary and tag set from the corpus
            vocab, tagset = reader.create_vocabulary(sentence_iter,
                                                     args.vocab_size,
                                                     args.minOccurr)
            logger.info("Creating word embeddings")
            embeddings = Embeddings(args.embeddings_size, vocab=vocab,
                                    variant=args.variant)

        converter = Converter()
        converter.add(embeddings)

        if args.caps:
            logger.info("Creating capitalization features...")
            converter.add(CapsExtractor(args.caps))

        if args.suffix:
            logger.info("Creating suffix features...")
            # collect the word forms (field 0 of each token)
            words = (tok[0] for sent in sentence_iter for tok in sent)
            extractor = SuffixExtractor(args.suffix, args.suffixes, words)
            converter.add(extractor)

        if args.prefix:
            logger.info("Creating prefix features...")
            # BUGFIX: pass word forms, not whole sentences, mirroring the
            # suffix extractor above.
            words = (tok[0] for sent in sentence_iter for tok in sent)
            extractor = PrefixExtractor(args.prefix, args.prefixes, words)
            converter.add(extractor)

        # map each tag to a numeric index and vectorize the corpus
        tags_dict = {t: i for i, t in enumerate(tagset)}
        sentences = []
        tags = []
        for sent in sentence_iter:
            sentences.append(converter.convert([token[0] for token in sent]))
            tags.append(np.array([tags_dict[token[-1]] for token in sent]))

        trainer = create_trainer(args, converter, tags_dict)
        logger.info("Starting training with %d sentences" % len(sentences))

        # report roughly 200 times over the training run (integer division:
        # `/` would produce a float in Python 3)
        report_frequency = max(args.iterations // 200, 1)
        report_frequency = 1    # DEBUG
        trainer.train(sentences, tags, args.iterations, report_frequency,
                      args.threads)

        logger.info("Saving trained model ...")
        trainer.saver(trainer)
        logger.info("... to %s" % args.model)

    else:
        # tagging mode: load the saved model and tag input sentences
        with open(args.model) as file:
            tagger = Tagger.load(file)
        reader = ConllReader()
        for sent in reader:
            sent = [x[0] for x in sent] # extract form
            ConllWriter.write(tagger.tag_sequence(sent, return_tokens=True))