def fit(self, dataset, epochs=10, dev=None):
    """
    Trains a BIST model on an annotated dataset in CoNLL file format.

    Args:
        dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
        epochs (int, optional): Number of learning iterations.
        dev (str, optional): Path to development dataset for conducting evaluations.
    """
    if dev:
        dev = validate_existing_filepath(dev)
    dataset = validate_existing_filepath(dataset)
    validate((epochs, int, 0, None))

    print('\nRunning fit on ' + dataset + '...\n')
    words, w2i, pos, rels = utils.vocab(dataset)
    self.params = words, w2i, pos, rels, self.options

    # local import so the parser backend is loaded only when training starts
    from nlp_architect.models.bist.mstlstm import MSTParserLSTM
    self.model = MSTParserLSTM(*self.params)
    for epoch in range(epochs):
        print('Starting epoch', epoch + 1)
        self.model.train(dataset)
        if dev:
            ext = dev.rindex('.')
            res_path = dev[:ext] + '_epoch_' + str(epoch + 1) + '_pred' + dev[ext:]
            utils.write_conll(res_path, self.model.predict(dev))
            utils.run_eval(dev, res_path)
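# A minimal usage sketch for the fit/predict API above; not from the source.
# The import path and CoNLL file names are assumptions for illustration.
from nlp_architect.models.bist_parser import BISTModel  # assumed module path

model = BISTModel()  # default hyperparameters: tanh activation, 2 LSTM layers
model.fit('train.conll', epochs=5, dev='dev.conll')   # hypothetical paths
results = model.predict('test.conll', evaluate=True)  # list of ConllEntry lists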
def to_conll(self, doc_text):
    """Converts a document to CoNLL format with spacy POS tags.

    Args:
        doc_text (str): raw document text.

    Yields:
        list of ConllEntry: The next sentence in the document in CoNLL format.
    """
    validate((doc_text, str))
    for sentence in self.spacy_parser(doc_text).sents:
        sentence_conll = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS',
                                     '_', -1, 'rroot', '_', '_')]
        i_tok = 0
        for tok in sentence:
            if self.verbose:
                print(tok.text + '\t' + tok.tag_)
            if not tok.is_space:
                pos = tok.tag_
                text = tok.text
                if text != '-' or pos != 'HYPH':
                    pos = _spacy_pos_to_ptb(pos, text)
                token_conll = ConllEntry(i_tok + 1, text, tok.lemma_, pos, pos,
                                         tok.ent_type_, -1, '_', '_', tok.idx)
                sentence_conll.append(token_conll)
                i_tok += 1
        if self.verbose:
            print('-----------------------\ninput conll form:')
            for entry in sentence_conll:
                print(str(entry.id) + '\t' + entry.form + '\t' + entry.pos + '\t')
        yield sentence_conll
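# A hedged sketch of consuming the to_conll() generator above. Assumes
# `parser` is an instance of the enclosing class (e.g. SpacyBISTParser);
# the sample text is illustrative.
for sent in parser.to_conll('The cat sat on the mat. It purred.'):
    # sent[0] is the synthetic '*root*' entry; real tokens start at id 1
    print([entry.form for entry in sent[1:]])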
def _spacy_pos_to_ptb(pos, text):
    """
    Converts a Spacy part-of-speech tag to a Penn Treebank part-of-speech tag.

    Args:
        pos (str): Spacy POS tag.
        text (str): The token text.

    Returns:
        ptb_tag (str): Standard PTB POS tag.
    """
    validate((pos, str, 0, 30), (text, str, 0, 1000))
    ptb_tag = pos
    if text in ['...', '—']:
        ptb_tag = ':'
    elif text == '*':
        ptb_tag = 'SYM'
    elif pos == 'AFX':
        ptb_tag = 'JJ'
    elif pos == 'ADD':
        ptb_tag = 'NN'
    elif text != pos and text in [',', '.', ':', '``', '-RRB-', '-LRB-']:
        ptb_tag = text
    elif pos in ['NFP', 'HYPH', 'XX']:
        ptb_tag = 'SYM'
    return ptb_tag
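# Illustrative calls showing the normalization above; the expected values
# follow directly from the branches of _spacy_pos_to_ptb.
assert _spacy_pos_to_ptb('NFP', '...') == ':'          # ellipsis -> colon tag
assert _spacy_pos_to_ptb('AFX', 'un') == 'JJ'          # affixes -> adjective
assert _spacy_pos_to_ptb('ADD', 'http://x.y') == 'NN'  # URLs/emails -> noun
assert _spacy_pos_to_ptb('XX', 'blah') == 'SYM'        # leftover tags -> symbol
assert _spacy_pos_to_ptb('NN', 'dog') == 'NN'          # regular tags pass through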
def __init__(self, activation='tanh', lstm_layers=2, lstm_dims=125, pos_dims=25):
    validate((activation, str), (lstm_layers, int, 0, None),
             (lstm_dims, int, 0, 1000), (pos_dims, int, 0, 1000))
    self.options = get_options_dict(activation, lstm_dims, lstm_layers, pos_dims)
    self.params = None
    self.model = None
def parse(self, doc_text, show_tok=True, show_doc=True):
    """Parse a raw text document.

    Args:
        doc_text (str)
        show_tok (bool, optional): Specifies whether to include token text in output.
        show_doc (bool, optional): Specifies whether to include document text in output.

    Returns:
        CoreNLPDoc: The annotated document.
    """
    validate((doc_text, str), (show_tok, bool), (show_doc, bool))
    doc_conll = self.to_conll(doc_text)
    parsed_doc = CoreNLPDoc()
    if show_doc:
        parsed_doc.doc_text = doc_text

    for sent_conll in self.bist_parser.predict_conll(doc_conll):
        parsed_sent = []
        conj_governors = {'and': set(), 'or': set()}

        for tok in sent_conll:
            gov_id = int(tok.pred_parent_id)
            rel = tok.pred_relation

            if tok.form != '*root*':
                if tok.form.lower() == 'and':
                    conj_governors['and'].add(gov_id)
                if tok.form.lower() == 'or':
                    conj_governors['or'].add(gov_id)

                if rel == 'conj':
                    if gov_id in conj_governors['and']:
                        rel += '_and'
                    if gov_id in conj_governors['or']:
                        rel += '_or'

                parsed_tok = {'start': tok.misc, 'len': len(tok.form),
                              'pos': tok.pos, 'ner': tok.feats,
                              'lemma': tok.lemma, 'gov': gov_id - 1,
                              'rel': rel}
                if show_tok:
                    parsed_tok['text'] = tok.form
                parsed_sent.append(parsed_tok)
        if parsed_sent:
            parsed_doc.sentences.append(parsed_sent)
    return parsed_doc
def __init__(self, verbose=False, spacy_model='en', bist_model=None):
    validate((verbose, bool), (spacy_model, str, 0, 1000),
             (bist_model, (type(None), str), 0, 1000))
    if not bist_model:
        print("Using pre-trained BIST model.")
        _download_pretrained_model()
        bist_model = SpacyBISTParser.pretrained

    self.verbose = verbose
    self.bist_parser = BISTModel()
    self.bist_parser.load(bist_model if bist_model else SpacyBISTParser.pretrained)
    self.spacy_parser = SpacyInstance(spacy_model,
                                      disable=['ner', 'vectors', 'textcat']).parser
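# A hedged end-to-end sketch for the parser defined above. The import path is
# an assumption; the token dict keys mirror parsed_tok as built in parse().
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser  # assumed path

parser = SpacyBISTParser()  # downloads the pre-trained BIST model if needed
doc = parser.parse('The quick brown fox jumped over the lazy dog.')
for sentence in doc.sentences:
    for tok in sentence:
        print(tok['text'], tok['pos'], tok['rel'], tok['gov'])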
def predict(self, dataset, evaluate=False):
    """
    Runs inference with the BIST model on a dataset in CoNLL file format.

    Args:
        dataset (str): Path to input CoNLL file.
        evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.

    Returns:
        res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
    """
    dataset = validate_existing_filepath(dataset)
    validate((evaluate, bool))

    print('\nRunning predict on ' + dataset + '...\n')
    res = list(self.model.predict(conll_path=dataset))
    if evaluate:
        ext = dataset.rindex('.')
        pred_path = dataset[:ext] + '_pred' + dataset[ext:]
        utils.write_conll(pred_path, res)
        utils.run_eval(dataset, pred_path)
    return res
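# A hedged sketch of predict() with evaluation, assuming `model` is a trained
# BISTModel (see the fit example above); the file name is a placeholder.
sentences = model.predict('en-ud-test.conllu', evaluate=True)
# evaluate=True also writes 'en-ud-test_pred.conllu' next to the input file
# and runs utils.run_eval against the gold annotations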
def validate_input_args(args):
    validate((args.b, int, 1, 100000))
    validate((args.e, int, 1, 100000))
    validate((args.tag_num, int, 1, 1000))
    validate((args.sentence_length, int, 1, 10000))
    validate((args.word_length, int, 1, 100))
    validate((args.word_embedding_dims, int, 1, 10000))
    validate((args.character_embedding_dims, int, 1, 1000))
    validate((args.char_features_lstm_dims, int, 1, 10000))
    validate((args.entity_tagger_lstm_dims, int, 1, 10000))
    validate((args.dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_path))
    validate_parent_exists(model_path)
    model_info_path = path.join(path.dirname(path.realpath(__file__)),
                                str(args.model_info_path))
    validate_parent_exists(model_info_path)
def validate_input_args():
    global model_path
    validate((args.b, int, 1, 100000000))
    validate((args.e, int, 1, 100000000))
    validate((args.sentence_length, int, 1, 10000))
    validate((args.token_emb_size, int, 1, 10000))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.encoder_depth, int, 1, 10))
    validate((args.decoder_depth, int, 1, 10))
    validate((args.encoder_dropout, float, 0, 1))
    validate((args.decoder_dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_path))
    validate_parent_exists(model_path)
    model_info_path = path.join(path.dirname(path.realpath(__file__)),
                                str(args.model_info_path))
    validate_parent_exists(model_info_path)
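# The validate() calls above follow a (value, type[, lower, upper]) tuple
# convention: bounds apply to the value itself for numbers and to len(value)
# for strings. A minimal sketch of such a checker, as an assumption about the
# behavior of nlp_architect.utils.io.validate, for illustration only:
def _validate_sketch(*specs):
    for spec in specs:
        value, types = spec[0], spec[1]
        lower = spec[2] if len(spec) > 2 else None
        upper = spec[3] if len(spec) > 3 else None
        if not isinstance(value, types):
            raise ValueError('expected {}, got {!r}'.format(types, value))
        if value is None:
            continue
        size = value if isinstance(value, (int, float)) else len(value)
        if lower is not None and size < lower:
            raise ValueError('{!r} is below the minimum {}'.format(value, lower))
        if upper is not None and size > upper:
            raise ValueError('{!r} is above the maximum {}'.format(value, upper))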
                    help='use OOV test set')
parser.add_argument('--eps', type=float, default=1e-8,
                    help='epsilon used to avoid divide by zero in softmax renormalization.',
                    action=check_size(1e-100, 1e-2))
parser.add_argument('--model_file', default='memn2n_weights.npz',
                    help='File to load model weights from.', type=str)
parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000), (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(path=data_dir, task=args.task, oov=args.use_oov,
                   use_match_type=args.use_match_type,
                   cache_match_type=args.cache_match_type,
                   cache_vectorized=args.cache_vectorized)
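# check_size(min, max) is used above as an argparse action factory that
# range-checks a parsed value. A hedged sketch of what such a factory can
# look like (not the library's actual implementation):
import argparse

def _check_size_sketch(minimum, maximum):
    class CheckSize(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            if not minimum <= values <= maximum:
                parser.error('{} must be in [{}, {}]'.format(
                    option_string, minimum, maximum))
            setattr(namespace, self.dest, values)
    return CheckSize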
    print(' '.join(['%-{}s'.format(x[2]) % x[1] for x in print_helper]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=validate_existing_filepath, required=True,
                        help='Model file path')
    parser.add_argument('--dataset_path', type=validate_existing_directory, required=True,
                        help='dataset directory')
    parser.add_argument('--embedding_model', type=validate_existing_filepath,
                        help='Path to word embedding model')
    parser.add_argument('--embedding_size', type=int,
                        help='Word embedding model vector size')
    args = parser.parse_args()
    if args.embedding_size is not None:
        validate((args.embedding_size, int, 1, 10000))

    model = IntentExtractionModel()
    model.load(args.model_path)

    ds = SNIPS(path=args.dataset_path)
    nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    emb_vectors = None
    if args.embedding_model is not None:
        print('Loading external word embedding model')
        emb_vectors, emb_size = load_word_embeddings(args.embedding_model)

    while True:
        text = input('>> ')
        tokens = process_text(text)
parser.add_argument('--interactive', default=False, action='store_true',
                    help='enable interactive mode at the end of training.')
parser.add_argument('--test', default=False, action='store_true',
                    help='evaluate on the test set at the end of training.')
parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000), (args.eps, float, 1e-15, 1e-2),
         (args.lr, float, 1e-8, 10), (args.grad_clip_norm, float, 1e-3, 1e5))

# Validate inputs
validate_parent_exists(args.log_file)
log_file = args.log_file
validate_parent_exists(args.weights_save_path)
weights_save_path = args.weights_save_path
validate_parent_exists(args.data_dir)
data_dir = args.data_dir
assert weights_save_path.endswith('.npz')
assert log_file.endswith('.txt')
gradient_clip_norm = args.grad_clip_norm

babi = BABI_Dialog(
# parse the command line arguments
tf.flags.DEFINE_integer("task", 1,
                        "the task ID to train/test on from bAbI-dialog dataset (1-6)")
tf.flags.DEFINE_integer("emb_size", 20, "Size of the word-embedding used in the model.")
tf.flags.DEFINE_integer("nhops", 3, "Number of memory hops in the network")
tf.flags.DEFINE_boolean("use_match_type", False, "use match type features")
tf.flags.DEFINE_boolean("cache_match_type", False, "cache match type answers")
tf.flags.DEFINE_boolean("cache_vectorized", False, "cache vectorized data")
tf.flags.DEFINE_boolean("use_oov", False, "use OOV test set")
tf.flags.DEFINE_string("data_dir", "data/", "Directory containing the bAbI-dialog data.")
tf.flags.DEFINE_string("weights_save_path", "saved_tf/", "File to save model weights to.")

FLAGS = tf.flags.FLAGS

validate((FLAGS.task, int, 1, 7), (FLAGS.nhops, int, 1, 100),
         (FLAGS.emb_size, int, 1, 10000))

# Validate inputs
current_dir = os.path.dirname(os.path.realpath(__file__))
weights_save_path = os.path.join(current_dir, FLAGS.weights_save_path)
validate_parent_exists(weights_save_path)
data_dir = os.path.join(current_dir, FLAGS.data_dir)
validate_parent_exists(data_dir)

babi = BABI_Dialog(
    path=data_dir, task=FLAGS.task, oov=FLAGS.use_oov,
    use_match_type=FLAGS.use_match_type,
    cache_match_type=FLAGS.cache_match_type,
    cache_vectorized=FLAGS.cache_vectorized,
        'word_vocab': dataset.word_vocab,
        'pos_vocab': dataset.pos_vocab,
        'chunk_vocab': dataset.chunk_vocab
    }
    if args.char_features is True:
        model_params.update({'char_vocab': dataset.char_vocab})

    with open(settings_path, 'wb') as fp:
        pickle.dump(model_params, fp)
    model.save(model_path)


if __name__ == '__main__':
    # read input args and validate
    parser = create_argument_parser()
    args = parser.parse_args()
    validate((args.sentence_length, int, 1, 1000))
    validate((args.feature_size, int, 1, 10000))
    validate((args.b, int, 1, 100000))
    validate((args.e, int, 1, 100000))
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           '{}.h5'.format(str(args.model_name)))
    settings_path = path.join(path.dirname(path.realpath(__file__)),
                              '{}.params'.format(str(args.model_name)))
    validate_parent_exists(model_path)

    # load dataset and get tokens/chunks/pos tags
    dataset = CONLL2000(data_path=args.data_dir,
                        sentence_length=args.sentence_length,
                        extract_chars=args.char_features,
                        max_word_length=args.max_word_length)
    train_set = dataset.train_set
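# A hedged sketch of reloading the artifacts saved above: the pickle written
# to settings_path holds the vocabularies under the keys used by the training
# script, and the model weights live in the '{model_name}.h5' file.
import pickle

with open(settings_path, 'rb') as fp:
    saved_params = pickle.load(fp)
word_vocab = saved_params['word_vocab']  # also 'pos_vocab', 'chunk_vocab',
                                         # and 'char_vocab' when char features were used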
def validate_input_args():
    global model_path, settings_path
    validate((args.sentence_len, int, 1, 1000))
    validate((args.lstm_depth, int, 1, 10))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.token_embedding_size, int, 1, 10000))
    validate((args.pos_embedding_size, int, 1, 1000))
    validate((args.vocab_size, int, 1, 100000000))
    validate((args.char_hidden_size, int, 1, 1000))
    validate((args.max_char_word_length, int, 1, 100))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_name))
    settings_path = path.join(path.dirname(path.realpath(__file__)), str(args.settings))
    validate_parent_exists(model_path)
    validate_parent_exists(settings_path)
def add_page(search_page, phrase):
    try:
        if search_page is not None:
            if phrase not in result_dump:
                result_dump[phrase] = []
                result_dump[phrase].append(search_page)
            else:
                pages = result_dump[phrase]
                for page in pages:
                    if page.pageid == search_page.pageid:
                        return
                result_dump[phrase].append(search_page)
            logger.info('page-%s added', str(search_page))
    except Exception:
        logger.error('could not extract wiki info from phrase-%s', search_page.orig_phrase)


if __name__ == '__main__':
    io.validate_existing_filepath(args.mentions)
    io.validate_existing_filepath(args.output)
    if args.host:
        io.validate((args.host, str, 1, 1000))
    if args.port:
        io.validate((args.port, int, 1, 65536))
    if args.index:
        io.validate((args.index, str, 1, 10000))
    wiki_dump_from_gs()
tf.flags.DEFINE_integer("epochs", 100, "Number of epochs between saving model weights.") tf.flags.DEFINE_boolean("restore", False, "Restore weights if found.") tf.flags.DEFINE_boolean("interactive", False, "enable interactive mode at the end of training.") tf.flags.DEFINE_boolean("test", False, "evaluate on the test set at the end of training.") FLAGS = tf.flags.FLAGS # Validate inputs validate( (FLAGS.task, int, 1, 7), (FLAGS.nhops, int, 1, 10), (FLAGS.batch_size, int, 1, 32000), (FLAGS.emb_size, int, 1, 10000), (FLAGS.eps, float, 1e-15, 1e-2), (FLAGS.lr, float, 1e-8, 10), (FLAGS.grad_clip_norm, float, 1e-3, 1e5), (FLAGS.epochs, int, 1, 1e10), (FLAGS.save_epochs, int, 1, 1e10), ) current_dir = os.path.dirname(os.path.realpath(__file__)) log_file = os.path.join(current_dir, FLAGS.log_file) validate_parent_exists(log_file) weights_save_path = os.path.join(current_dir, FLAGS.weights_save_path) validate_parent_exists(weights_save_path) data_dir = os.path.join(current_dir, FLAGS.data_dir) validate_parent_exists(data_dir) assert log_file.endswith(".txt")
def validate_arguments(args):
    """Validate input arguments"""
    io.validate((args.num_units, int, 1, None))
    io.validate((args.num_layers, int, 1, None))
    io.validate((args.num_encoder_layers, (int, type(None)), 0, None))
    io.validate((args.num_decoder_layers, (int, type(None)), 0, None))
    io.validate((args.num_embeddings_partitions, int, 0, None))
    io.validate((args.learning_rate, float, 0.0, None))
    io.validate((args.num_train_steps, int, 1, None))
    io.validate((args.warmup_steps, int, 0, args.num_train_steps))
    io.validate((args.init_weight, float))
    io.validate((args.src, (str, type(None)), 1, 256))
    io.validate((args.tgt, (str, type(None)), 1, 256))
    io.validate((args.sos, str, 1, 256))
    io.validate((args.eos, str, 1, 256))
    io.validate((args.src_max_len, int, 1, None))
    io.validate((args.tgt_max_len, int, 1, None))
    io.validate((args.src_max_len_infer, (int, type(None)), 1, None))
    io.validate((args.tgt_max_len_infer, (int, type(None)), 1, None))
    io.validate((args.forget_bias, float, 0.0, None))
    io.validate((args.dropout, float, 0.0, 1.0))
    io.validate((args.max_gradient_norm, float, 0.000000001, None))
    io.validate((args.batch_size, int, 1, None))
    io.validate((args.steps_per_stats, int, 1, None))
    io.validate((args.max_train, int, 0, None))
    io.validate((args.num_buckets, int, 1, None))
    io.validate((args.num_sampled_softmax, int, 0, None))
    io.validate((args.num_gpus, int, 0, None))
    io.validate((args.metrics, str, 1, 256))
    io.validate((args.inference_list, (str, type(None)), 0, 256))
    io.validate((args.steps_per_external_eval, (int, type(None)), 1, None))
    io.validate((args.scope, (str, type(None)), 1, 256))
    io.validate((args.random_seed, (int, type(None))))
    io.validate((args.num_keep_ckpts, int, 0, None))
    io.validate((args.infer_batch_size, int, 1, None))
    io.validate((args.beam_width, int, 0, None))
    io.validate((args.length_penalty_weight, float, 0.0, None))
    io.validate((args.sampling_temperature, float, 0.0, None))
    io.validate((args.num_translations_per_input, int, 1, None))
    io.validate((args.jobid, int, 0, None))
    io.validate((args.num_workers, int, 1, None))
    io.validate((args.num_inter_threads, int, 0, None))
    io.validate((args.num_intra_threads, int, 0, None))
    io.validate((args.pruning_hparams, (str, type(None)), 1, 256))

    suffixes = [args.src]
    if not args.language_model:
        suffixes.append(args.tgt)
    for suffix in suffixes:
        validate_existing_filepath(args.train_prefix, suffix)
        validate_existing_filepath(args.dev_prefix, suffix)
        validate_existing_filepath(args.test_prefix, suffix)
        validate_existing_filepath(args.vocab_prefix, suffix)
        validate_existing_filepath(args.embed_prefix, suffix)
    validate_existing_filepath(args.inference_ref_file)
    validate_existing_filepath(args.inference_input_file)
    validate_existing_filepath(args.hparams_path)
    validate_parent_exists(args.ckpt)
    validate_parent_exists(args.inference_output_file)
    validate_parent_exists(args.out_dir)
                    help='sizes of windows PER SIDE around words to generate. eg 1 or ' +
                         '1,2,3. ie "-w 1" for "hey world hey" produces "hey <NULL> hey"')
parser.add_argument('-d', '--double_dict', type=str, default='3',
                    help='specifies whether to use a second dictionary for words within ' +
                         'specified extended window. ie for "-w 1 -d 2", the ' +
                         'sentence "hello world how are things" creates a window of "2:hello ' +
                         '1:world <NULL> 1:are 2:things"')
parser.add_argument('-t', '--num_threads', type=int, default=4,
                    help='number of threads to use', action=check_size(1, 10))
args = vars(parser.parse_args())

validate_parent_exists(args['data_dir'])
if args['entities']:
    validate_parent_exists(args['entities'])
validate((args['window_size'], str, 1, 100), (args['double_dict'], str, 1, 100))

beg = time.time()
if args['data_dir']:
    # also set the entities and input file here
    args['entities'] = os.path.expanduser(
        args['data_dir'] + '/movieqa/knowledge_source/entities.txt')
    args['input_file'] = [os.path.expanduser(
        args['data_dir'] + '/movieqa/knowledge_source/wiki.txt')]
else:
    raise ValueError("No data_dir given.")

with open(os.path.expanduser(
        args['data_dir'] + '/movieqa/lower_wiki-w=0-d=3-m-4.txt'), 'w') as out:
    try:
    model.save(model_file_path)
    # set evaluation error rates
    loss, binary_accuracy, precision, recall, f1 = model.eval(dataset.test_set)
    print('loss = %.1f%%' % (loss))
    print('Test binary_accuracy rate = %.1f%%' % (binary_accuracy * 100))
    print('Test precision rate = %.1f%%' % (precision * 100))
    print('Test recall rate = %.1f%%' % (recall * 100))
    print('Test f1 rate = %.1f%%' % (f1 * 100))


if __name__ == "__main__":
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.set_defaults(epochs=200)
    parser.add_argument('--data', type=validate_existing_filepath,
                        help='Path to the CSV file where the prepared dataset is saved')
    parser.add_argument('--model_path', type=validate_parent_exists,
                        help='Path to save the model')
    args = parser.parse_args()
    validate((args.epochs, int, 1, 100000))
    data_path = absolute_path(args.data)
    model_path = absolute_path(args.model_path)
    num_epochs = args.epochs
    # load data sets from file
    data_set = NpSemanticSegData(data_path, train_to_test_ratio=0.8)
    # train the mlp classifier
    train_mlp_classifier(data_set, model_path, args.epochs)