def fit(self, dataset, epochs=10, dev=None):
    """
    Trains a BIST model on an annotated dataset in CoNLL file format.

    Args:
        dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
        epochs (int, optional): Number of learning iterations.
        dev (str, optional): Path to development dataset for conducting evaluations.
    """
    if dev:
        dev = validate_existing_filepath(dev)
    dataset = validate_existing_filepath(dataset)
    validate((epochs, int, 0, None))

    print("\nRunning fit on " + dataset + "...\n")
    words, w2i, pos, rels = utils.vocab(dataset)
    self.params = words, w2i, pos, rels, self.options

    from nlp_architect.models.bist.mstlstm import MSTParserLSTM

    self.model = MSTParserLSTM(*self.params)
    for epoch in range(epochs):
        print("Starting epoch", epoch + 1)
        self.model.train(dataset)
        if dev:
            ext = dev.rindex(".")
            res_path = dev[:ext] + "_epoch_" + str(epoch + 1) + "_pred" + dev[ext:]
            utils.write_conll(res_path, self.model.predict(dev))
            utils.run_eval(dev, res_path)
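# Usage sketch (not from the original source): the import path and class name below are
# assumptions, since the snippet does not show the enclosing class, and the CoNLL file
# paths are hypothetical placeholders.
from nlp_architect.models.bist_parser import BISTModel  # assumed location of the wrapper class

parser = BISTModel()
parser.fit("train.conllu", epochs=5, dev="dev.conllu")  # per-epoch prediction files are written next to dev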
def __init__(self, model_path, settings_path, spacy_model="en", batch_size=32, use_cudnn=False):
    _model_path = path.join(path.dirname(path.realpath(__file__)), model_path)
    validate_existing_filepath(_model_path)
    _settings_path = path.join(path.dirname(path.realpath(__file__)), settings_path)
    validate_existing_filepath(_settings_path)
    nlp = spacy.load(spacy_model)
    # keep only the tagger from the loaded pipeline and add sentence boundaries
    for p in nlp.pipe_names:
        if p not in ["tagger"]:
            nlp.remove_pipe(p)
    nlp.add_pipe(nlp.create_pipe("sentencizer"), first=True)
    nlp.add_pipe(
        NPAnnotator.load(_model_path, _settings_path, batch_size=batch_size, use_cudnn=use_cudnn),
        last=True,
    )
    self.nlp = nlp
@classmethod
def load(cls, model_path: str, parameter_path: str, batch_size: int = 32, use_cudnn: bool = False):
    """
    Load an NPAnnotator annotator

    Args:
        model_path (str): path to trained model
        parameter_path (str): path to model parameters
        batch_size (int, optional): inference batch_size
        use_cudnn (bool, optional): use gpu for inference (cudnn cells)

    Returns:
        NPAnnotator class with loaded model
    """
    _model_path = path.join(path.dirname(path.realpath(__file__)), model_path)
    validate_existing_filepath(_model_path)
    _parameter_path = path.join(path.dirname(path.realpath(__file__)), parameter_path)
    validate_existing_filepath(_parameter_path)
    model = SequenceChunker(use_cudnn=use_cudnn)
    model.load(_model_path)
    with open(_parameter_path, "rb") as fp:
        model_params = pickle.load(fp)
        word_vocab = model_params["word_vocab"]
        chunk_vocab = model_params["chunk_vocab"]
        char_vocab = model_params.get("char_vocab", None)
    return cls(model, word_vocab, char_vocab, chunk_vocab, batch_size)
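# Usage sketch for NPAnnotator.load(), mirroring the pipeline construction in the
# __init__ above; the model/parameter file names are hypothetical placeholders, and
# load() resolves relative paths against this module's directory.
import spacy

nlp = spacy.load("en")
annotator = NPAnnotator.load("chunker_model.h5", "chunker_model.params", batch_size=64)
nlp.add_pipe(annotator, last=True)
doc = nlp("The quick brown fox jumped over the lazy dog.")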
def predict(self, dataset, evaluate=False):
    """
    Runs inference with the BIST model on a dataset in CoNLL file format.

    Args:
        dataset (str): Path to input CoNLL file.
        evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.

    Returns:
        res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
    """
    dataset = validate_existing_filepath(dataset)
    validate((evaluate, bool))

    print("\nRunning predict on " + dataset + "...\n")
    res = list(self.model.predict(conll_path=dataset))
    if evaluate:
        ext = dataset.rindex(".")
        pred_path = dataset[:ext] + "_pred" + dataset[ext:]
        utils.write_conll(pred_path, res)
        utils.run_eval(dataset, pred_path)
    return res
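# Usage sketch for predict(), continuing the hypothetical BISTModel example above with a
# placeholder test file; with evaluate=True a *_pred file plus evaluation output are
# written next to the input dataset.
sentences = parser.predict("test.conllu", evaluate=True)
for entry in sentences[0]:
    print(entry)  # each ConllEntry carries the predicted head and dependency relation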
         'word representations. '
         'Set `max_n` to be less than `min_n` to avoid char ngrams being used.')
arg_parser.add_argument(
    '--word_ngrams',
    default=1,
    type=int,
    choices=[0, 1],
    help='fasttext training hyperparameter. If 1, enriches word vectors with subword '
         '(ngrams) information. If 0, this is equivalent to word2vec training.')
args = arg_parser.parse_args()

if args.corpus_format != 'conll2000':
    validate_existing_filepath(args.corpus)

np2vec = NP2vec(
    args.corpus,
    args.corpus_format,
    args.mark_char,
    args.word_embedding_type,
    args.sg,
    args.size,
    args.window,
    args.alpha,
    args.min_alpha,
    args.min_count,
    args.sample,
    args.workers,
    args.hs,
def validate_existing_filepath(prefix, suffix=None):
    """Validates that the file at the path constructed from `prefix.suffix` exists,
    in case prefix is not None or empty."""
    if prefix is not None and prefix:
        io.validate_existing_filepath(fix_path(add_suffix(prefix, suffix)))
def _validate_paths(self, data_path):
    validate_existing_directory(data_path)
    for f in self.dataset_files:
        _f_path = path.join(data_path, self.dataset_files[f])
        validate_existing_filepath(_f_path)
        self.dataset_files[f] = _f_path
parser.add_argument('--output', type=str, help='location where to create the dump file',
                    required=True)
args = parser.parse_args()


def vo_dump():
    vo_file = args.vo
    out_file = args.output
    mentions_event_gold_file = [args.mentions]
    vocab = load_mentions_vocab(mentions_event_gold_file, True)
    vo = VerboceanRelationExtraction.load_verbocean_file(vo_file)
    vo_for_vocab = {}
    for word in vocab:
        if word in vo:
            vo_for_vocab[word] = vo[word]
    logger.info('Found %d words from vocabulary', len(vo_for_vocab.keys()))
    logger.info('Preparing to save refDict output file')
    with open(out_file, 'w') as f:
        json.dump(vo_for_vocab, f)
    logger.info('Done, saved to %s', out_file)


if __name__ == '__main__':
    io.validate_existing_filepath(args.mentions)
    # the output file is created by vo_dump(), so only its parent directory must exist
    io.validate_parent_exists(args.output)
    io.validate_existing_filepath(args.vo)
    vo_dump()
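# Example invocation of this dump script (hypothetical script name and file names; the
# --vo and --mentions flags are assumed to be defined by add_argument calls above this
# snippet):
#
#     python vo_dump.py --vo verbocean.txt --mentions mentions.json --output vo_dump.json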
                                      target_names=data.labels_0.columns.values))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_path", type=str, default="./",
                        help="file_path where the files to parse are located")
    parser.add_argument("--data_type", type=str, default="amazon", choices=["amazon"],
                        help="dataset source")
    parser.add_argument(
        "--epochs",
        type=int,
        default=10,
        help="Number of epochs for both models",
        action=check_size(1, 20000),
    )
    args_in = parser.parse_args()
    # Check file path
    if args_in.file_path:
        validate_existing_filepath(args_in.file_path)
    if args_in.data_type == "amazon":
        data_in = Amazon_Reviews(args_in.file_path)
    ensemble_models(data_in, args_in)
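# Example invocation (hypothetical script name and data location), using only the flags
# defined above:
#
#     python ensemble_sentiment.py --file_path ./reviews/ --data_type amazon --epochs 10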
parser.add_argument('--model_name', default='chunker_model', type=str, required=True,
                    help='Model name (used for saving the model)')
parser.add_argument('-b', type=int, action=check_size(1, 9999), default=1,
                    help='inference batch size')
args = parser.parse_args()

model_path = path.join(path.dirname(path.realpath(__file__)),
                       '{}.h5'.format(str(args.model_name)))
settings_path = path.join(path.dirname(path.realpath(__file__)),
                          '{}.params'.format(str(args.model_name)))
validate_existing_filepath(model_path)
validate_existing_filepath(settings_path)

# load model and parameters
model = SequenceChunker()
model.load(model_path)
word_length = model.max_word_len
with open(settings_path, 'rb') as fp:
    model_params = pickle.load(fp)
    word_vocab = model_params['word_vocab']
    chunk_vocab = model_params['chunk_vocab']
    char_vocab = model_params.get('char_vocab', None)

# parse documents and get tokens
nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])
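# Example invocation (hypothetical script name); the model and settings files are
# expected to sit next to the script as <model_name>.h5 and <model_name>.params:
#
#     python inference_chunker.py --model_name chunker_model -b 32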
    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)

    with open(out_file, 'wb') as f:
        pickle.dump(elmo_ecb_embeddings, f)

    logger.info('Saving dump to file %s', out_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Create Elmo Embedding dataset only dump')
    parser.add_argument('--mentions', type=str, help='mentions file', required=True)
    parser.add_argument('--output', type=str, help='location where to create the dump file',
                        required=True)

    args = parser.parse_args()
    if os.path.isdir(args.mentions):
        io.validate_existing_directory(args.mentions)
    else:
        io.validate_existing_filepath(args.mentions)
    elmo_dump()
    print('Done!')
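# Example invocation (hypothetical script name and paths); --mentions may point either
# to a single file or to a directory, as handled above:
#
#     python elmo_dump.py --mentions mentions.json --output elmo_embeddings.pickle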
                    type=float,
                    default=1e-8,
                    help='epsilon used to avoid divide by zero in softmax renormalization.',
                    action=check_size(1e-100, 1e-2))
parser.add_argument('--model_file', default='memn2n_weights.npz',
                    help='File to load model weights from.', type=str)
parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000), (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(path=data_dir,
                   task=args.task,
                   oov=args.use_oov,
                   use_match_type=args.use_match_type,
                   cache_match_type=args.cache_match_type,
                   cache_vectorized=args.cache_vectorized)

weight_saver = Saver()

# Set num iterations to 1 epoch since we loop over epochs & shuffle
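# Example invocation (hypothetical script name; flags other than --model_file and --eps
# are assumed to be defined by earlier add_argument calls not shown in this snippet):
#
#     python train_memn2n_dialog.py --data_dir ./babi_dialog --task 5 \
#         --model_file memn2n_weights.npz --eps 1e-8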