Example No. 1
    def fit(self, dataset, epochs=10, dev=None):
        """
        Trains a BIST model on an annotated dataset in CoNLL file format.

        Args:
            dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
            epochs (int, optional): Number of learning iterations.
            dev (str, optional): Path to development dataset for conducting evaluations.
        """
        if dev:
            dev = validate_existing_filepath(dev)
        dataset = validate_existing_filepath(dataset)
        validate((epochs, int, 0, None))

        print('\nRunning fit on ' + dataset + '...\n')
        words, w2i, pos, rels = utils.vocab(dataset)
        self.params = words, w2i, pos, rels, self.options
        self.model = MSTParserLSTM(*self.params)

        for epoch in range(epochs):
            print('Starting epoch', epoch + 1)
            self.model.train(dataset)
            if dev:
                ext = dev.rindex('.')
                res_path = dev[:ext] + '_epoch_' + str(epoch + 1) + '_pred' + dev[ext:]
                utils.write_conll(res_path, self.model.predict(dev))
                utils.run_eval(dev, res_path)
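
Below is a minimal usage sketch for the fit() method shown above. The import path (nlp_architect.models.bist_parser) and the CoNLL file names are assumptions for illustration, not part of the excerpt.

# Hedged usage sketch: the import path and the file names are assumptions.
from nlp_architect.models.bist_parser import BISTModel

bist = BISTModel()                       # default hyperparameters (see the __init__ examples below)
bist.fit('train.conllu', epochs=5,       # hypothetical CoNLL/U training file
         dev='dev.conllu')               # optional development set, evaluated after each epoch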
Example No. 2
    def fit(self, dataset, epochs=10, dev=None):
        """
        Trains a BIST model on an annotated dataset in CoNLL file format.

        Args:
            dataset (str): Path to input dataset for training, formatted in CoNLL/U format.
            epochs (int, optional): Number of learning iterations.
            dev (str, optional): Path to development dataset for conducting evaluations.
        """
        if dev:
            dev = validate_existing_filepath(dev)
        dataset = validate_existing_filepath(dataset)
        validate((epochs, int, 0, None))

        print("\nRunning fit on " + dataset + "...\n")
        words, w2i, pos, rels = utils.vocab(dataset)
        self.params = words, w2i, pos, rels, self.options

        from nlp_architect.models.bist.mstlstm import MSTParserLSTM

        self.model = MSTParserLSTM(*self.params)

        for epoch in range(epochs):
            print("Starting epoch", epoch + 1)
            self.model.train(dataset)
            if dev:
                ext = dev.rindex(".")
                res_path = dev[:ext] + "_epoch_" + str(epoch + 1) + "_pred" + dev[ext:]
                utils.write_conll(res_path, self.model.predict(dev))
                utils.run_eval(dev, res_path)
Example No. 3
    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                                         -1, 'rroot', '_', '_')]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + '\t' + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != '-' or pos != 'HYPH':
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(i_tok + 1, text, tok.lemma_, pos, pos,
                                                 tok.ent_type_, -1, '_', '_', tok.idx)
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print('-----------------------\ninput conll form:')
                for entry in sentence_conll:
                    print(str(entry.id) + '\t' + entry.form + '\t' + entry.pos + '\t')
            yield sentence_conll
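
The generator above is easiest to follow with a short driver. The sketch below assumes SpacyBISTParser lives in nlp_architect.pipelines.spacy_bist and that a spaCy English model is installed; both are assumptions about the surrounding package.

# Hedged sketch: the import path and installed spaCy model are assumptions.
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

parser = SpacyBISTParser()               # downloads a pre-trained BIST model when none is supplied
for sentence in parser.to_conll('The cat sat on the mat. It purred.'):
    # each yielded item is a list of ConllEntry objects, starting with the *root* entry
    print([entry.form for entry in sentence])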
Example No. 4
def _spacy_pos_to_ptb(pos, text):
    """
    Converts a Spacy part-of-speech tag to a Penn Treebank part-of-speech tag.

    Args:
        pos (str): Spacy POS tag.
        text (str): The token text.

    Returns:
        ptb_tag (str): Standard PTB POS tag.
    """
    validate((pos, str, 0, 30), (text, str, 0, 1000))
    ptb_tag = pos
    if text in ['...', '—']:
        ptb_tag = ':'
    elif text == '*':
        ptb_tag = 'SYM'
    elif pos == 'AFX':
        ptb_tag = 'JJ'
    elif pos == 'ADD':
        ptb_tag = 'NN'
    elif text != pos and text in [',', '.', ":", '``', '-RRB-', '-LRB-']:
        ptb_tag = text
    elif pos in ['NFP', 'HYPH', 'XX']:
        ptb_tag = 'SYM'
    return ptb_tag
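
A few concrete mappings make the branch order clearer. The asserts below follow directly from the function; only the import path of the private helper is an assumption.

# Hedged demo: the import path is an assumption; expected values follow the branches above.
from nlp_architect.pipelines.spacy_bist import _spacy_pos_to_ptb

assert _spacy_pos_to_ptb('NFP', '...') == ':'        # ellipsis text wins over the NFP/SYM branch
assert _spacy_pos_to_ptb('AFX', 'un') == 'JJ'        # affix tag mapped to adjective
assert _spacy_pos_to_ptb('NNP', '-LRB-') == '-LRB-'  # bracket-like text is used verbatim
assert _spacy_pos_to_ptb('NN', 'cat') == 'NN'        # ordinary tags pass through unchanged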
Example No. 5
    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_',
                                         -1, 'rroot', '_', '_')]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + '\t' + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != '-' or pos != 'HYPH':
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(i_tok + 1, text, tok.lemma_, pos, pos,
                                                 tok.ent_type_, -1, '_', '_', tok.idx)
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print('-----------------------\ninput conll form:')
                for entry in sentence_conll:
                    print(str(entry.id) + '\t' + entry.form + '\t' + entry.pos + '\t')
            yield sentence_conll
Example No. 6
def _spacy_pos_to_ptb(pos, text):
    """
    Converts a Spacy part-of-speech tag to a Penn Treebank part-of-speech tag.

    Args:
        pos (str): Spacy POS tag.
        text (str): The token text.

    Returns:
        ptb_tag (str): Standard PTB POS tag.
    """
    validate((pos, str, 0, 30), (text, str, 0, 100))
    ptb_tag = pos
    if text == '...':
        ptb_tag = ':'
    elif text == '*':
        ptb_tag = 'SYM'
    elif pos == 'AFX':
        ptb_tag = 'JJ'
    elif pos == 'ADD':
        ptb_tag = 'NN'
    elif text != pos and text in [',', '.', ":", '``', '-RRB-', '-LRB-']:
        ptb_tag = text
    elif pos in ['NFP', 'HYPH', 'XX']:
        ptb_tag = 'SYM'
    return ptb_tag
Example No. 7
def _spacy_pos_to_ptb(pos, text):
    """
    Converts a Spacy part-of-speech tag to a Penn Treebank part-of-speech tag.

    Args:
        pos (str): Spacy POS tag.
        text (str): The token text.

    Returns:
        ptb_tag (str): Standard PTB POS tag.
    """
    validate((pos, str, 0, 30), (text, str, 0, 1000))
    ptb_tag = pos
    if text in ["...", "—"]:
        ptb_tag = ":"
    elif text == "*":
        ptb_tag = "SYM"
    elif pos == "AFX":
        ptb_tag = "JJ"
    elif pos == "ADD":
        ptb_tag = "NN"
    elif text != pos and text in [",", ".", ":", "``", "-RRB-", "-LRB-"]:
        ptb_tag = text
    elif pos in ["NFP", "HYPH", "XX"]:
        ptb_tag = "SYM"
    return ptb_tag
Example No. 8
    def __init__(self,
                 activation='tanh',
                 lstm_layers=2,
                 lstm_dims=125,
                 pos_dims=25):
        validate((activation, str), (lstm_layers, int, 0, None),
                 (lstm_dims, int, 0, 1000), (pos_dims, int, 0, 1000))
        self.options = get_options_dict(activation, lstm_dims, lstm_layers,
                                        pos_dims)
        self.params = None
        self.model = None
Example No. 9
    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw text document.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {'and': set(), 'or': set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != '*root*':
                    if tok.form.lower() == 'and':
                        conj_governors['and'].add(gov_id)
                    if tok.form.lower() == 'or':
                        conj_governors['or'].add(gov_id)

                    if rel == 'conj':
                        if gov_id in conj_governors['and']:
                            rel += '_and'
                        if gov_id in conj_governors['or']:
                            rel += '_or'

                    parsed_tok = {
                        'start': tok.misc,
                        'len': len(tok.form),
                        'pos': tok.pos,
                        'ner': tok.feats,
                        'lemma': tok.lemma,
                        'gov': gov_id - 1,
                        'rel': rel
                    }

                    if show_tok:
                        parsed_tok['text'] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
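
For context, here is a hedged end-to-end sketch of calling parse(). The import path is assumed; the printed keys mirror the parsed_tok dictionary built above.

# Hedged usage sketch: the import path is an assumption.
from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

parser = SpacyBISTParser()
doc = parser.parse('The cat sat on the mat and purred.')
for sentence in doc.sentences:           # CoreNLPDoc keeps one list of token dicts per sentence
    for token in sentence:
        print(token['text'], token['pos'], token['gov'], token['rel'])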
Example No. 10
    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw text document.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {"and": set(), "or": set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != "*root*":
                    if tok.form.lower() == "and":
                        conj_governors["and"].add(gov_id)
                    if tok.form.lower() == "or":
                        conj_governors["or"].add(gov_id)

                    if rel == "conj":
                        if gov_id in conj_governors["and"]:
                            rel += "_and"
                        if gov_id in conj_governors["or"]:
                            rel += "_or"

                    parsed_tok = {
                        "start": tok.misc,
                        "len": len(tok.form),
                        "pos": tok.pos,
                        "ner": tok.feats,
                        "lemma": tok.lemma,
                        "gov": gov_id - 1,
                        "rel": rel,
                    }

                    if show_tok:
                        parsed_tok["text"] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
Example No. 11
    def __init__(self, verbose=False, spacy_model='en', bist_model=None):
        validate((verbose, bool), (spacy_model, str, 0, 1000),
                 (bist_model, (type(None), str), 0, 1000))
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser.pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser.pretrained)
        self.spacy_parser = SpacyInstance(spacy_model,
                                          disable=['ner', 'vectors', 'textcat']).parser
Example No. 12
    def __init__(self, verbose=False, spacy_model="en", bist_model=None):
        validate(
            (verbose, bool), (spacy_model, str, 0, 1000), (bist_model, (type(None), str), 0, 1000)
        )
        if not bist_model:
            print("Using pre-trained BIST model.")
            _download_pretrained_model()
            bist_model = SpacyBISTParser._pretrained

        self.verbose = verbose
        self.bist_parser = BISTModel()
        self.bist_parser.load(bist_model if bist_model else SpacyBISTParser._pretrained)
        self.spacy_parser = SpacyInstance(spacy_model, disable=["ner", "vectors", "textcat"]).parser
Example No. 13
    def parse(self, doc_text, show_tok=True, show_doc=True):
        """Parse a raw text document.

        Args:
            doc_text (str): Raw text document.
            show_tok (bool, optional): Specifies whether to include token text in output.
            show_doc (bool, optional): Specifies whether to include document text in output.

        Returns:
            CoreNLPDoc: The annotated document.
        """
        validate((doc_text, str), (show_tok, bool), (show_doc, bool))
        doc_conll = self.to_conll(doc_text)
        parsed_doc = CoreNLPDoc()

        if show_doc:
            parsed_doc.doc_text = doc_text

        for sent_conll in self.bist_parser.predict_conll(doc_conll):
            parsed_sent = []
            conj_governors = {'and': set(), 'or': set()}

            for tok in sent_conll:
                gov_id = int(tok.pred_parent_id)
                rel = tok.pred_relation

                if tok.form != '*root*':
                    if tok.form.lower() == 'and':
                        conj_governors['and'].add(gov_id)
                    if tok.form.lower() == 'or':
                        conj_governors['or'].add(gov_id)

                    if rel == 'conj':
                        if gov_id in conj_governors['and']:
                            rel += '_and'
                        if gov_id in conj_governors['or']:
                            rel += '_or'

                    parsed_tok = {'start': tok.misc, 'len': len(tok.form),
                                  'pos': tok.pos, 'ner': tok.feats,
                                  'lemma': tok.lemma, 'gov': gov_id - 1,
                                  'rel': rel}

                    if show_tok:
                        parsed_tok['text'] = tok.form
                    parsed_sent.append(parsed_tok)
            if parsed_sent:
                parsed_doc.sentences.append(parsed_sent)
        return parsed_doc
Example No. 14
    def to_conll(self, doc_text):
        """Converts a document to CoNLL format with spacy POS tags.

        Args:
            doc_text (str): raw document text.

        Yields:
            list of ConllEntry: The next sentence in the document in CoNLL format.
        """
        validate((doc_text, str))
        for sentence in self.spacy_parser(doc_text).sents:
            sentence_conll = [
                ConllEntry(0, "*root*", "*root*", "ROOT-POS", "ROOT-CPOS", "_",
                           -1, "rroot", "_", "_")
            ]
            i_tok = 0
            for tok in sentence:
                if self.verbose:
                    print(tok.text + "\t" + tok.tag_)

                if not tok.is_space:
                    pos = tok.tag_
                    text = tok.text

                    if text != "-" or pos != "HYPH":
                        pos = _spacy_pos_to_ptb(pos, text)
                        token_conll = ConllEntry(
                            i_tok + 1,
                            text,
                            tok.lemma_,
                            pos,
                            pos,
                            tok.ent_type_,
                            -1,
                            "_",
                            "_",
                            tok.idx,
                        )
                        sentence_conll.append(token_conll)
                        i_tok += 1

            if self.verbose:
                print("-----------------------\ninput conll form:")
                for entry in sentence_conll:
                    print(str(entry.id) + "\t" + entry.form + "\t" + entry.pos + "\t")
            yield sentence_conll
Example No. 15
    def predict(self, dataset, evaluate=False):
        """
        Runs inference with the BIST model on a dataset in CoNLL file format.

        Args:
            dataset (str): Path to input CoNLL file.
            evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.
        Returns:
            res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
        """
        dataset = validate_existing_filepath(dataset)
        validate((evaluate, bool))

        print('\nRunning predict on ' + dataset + '...\n')
        res = list(self.model.predict(conll_path=dataset))
        if evaluate:
            ext = dataset.rindex('.')
            pred_path = dataset[:ext] + '_pred' + dataset[ext:]
            utils.write_conll(pred_path, res)
            utils.run_eval(dataset, pred_path)
        return res
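
A brief inference sketch for the method above. The import path and file names are assumptions; load() is shown elsewhere in these examples being called on a BISTModel instance.

# Hedged usage sketch: the import path and file names are assumptions.
from nlp_architect.models.bist_parser import BISTModel

model = BISTModel()
model.load('bist.model')                 # hypothetical path to previously saved BIST weights
sentences = model.predict('test.conllu', evaluate=True)  # also writes test_pred.conllu next to the input
print(len(sentences), 'sentences parsed')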
Example No. 16
    def predict(self, dataset, evaluate=False):
        """
        Runs inference with the BIST model on a dataset in CoNLL file format.

        Args:
            dataset (str): Path to input CoNLL file.
            evaluate (bool, optional): Write prediction and evaluation files to dataset's folder.
        Returns:
            res (list of list of ConllEntry): The list of input sentences with predicted
            dependencies attached.
        """
        dataset = validate_existing_filepath(dataset)
        validate((evaluate, bool))

        print("\nRunning predict on " + dataset + "...\n")
        res = list(self.model.predict(conll_path=dataset))
        if evaluate:
            ext = dataset.rindex(".")
            pred_path = dataset[:ext] + "_pred" + dataset[ext:]
            utils.write_conll(pred_path, res)
            utils.run_eval(dataset, pred_path)
        return res
Example No. 17
def validate_input_args(args):
    validate((args.b, int, 1, 100000))
    validate((args.e, int, 1, 100000))
    validate((args.tag_num, int, 1, 1000))
    validate((args.sentence_length, int, 1, 10000))
    validate((args.word_length, int, 1, 100))
    validate((args.word_embedding_dims, int, 1, 10000))
    validate((args.character_embedding_dims, int, 1, 1000))
    validate((args.char_features_lstm_dims, int, 1, 10000))
    validate((args.entity_tagger_lstm_dims, int, 1, 10000))
    validate((args.dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_path))
    validate_parent_exists(model_path)
    model_info_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_info_path))
    validate_parent_exists(model_info_path)
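
The validate((value, type, min, max)) tuples used throughout these examples follow a single convention: a type check plus optional lower/upper bounds, with the bounds read as length limits for strings. A minimal standalone sketch of that convention, not the library's actual implementation, might look like this:

# Standalone illustration of the (value, type, lower, upper) convention used above;
# a sketch only, not nlp_architect's actual validate() implementation.
def validate_sketch(*checks):
    for check in checks:
        value, expected_type = check[0], check[1]
        lower = check[2] if len(check) > 2 else None
        upper = check[3] if len(check) > 3 else None
        if not isinstance(value, expected_type):
            raise ValueError('{!r} is not of type {}'.format(value, expected_type))
        if value is None:                # allowed when type(None) is part of the type tuple
            continue
        measured = len(value) if isinstance(value, str) else value
        if lower is not None and measured < lower:
            raise ValueError('{!r} is below the allowed minimum {}'.format(value, lower))
        if upper is not None and measured > upper:
            raise ValueError('{!r} is above the allowed maximum {}'.format(value, upper))


validate_sketch((10, int, 1, 100000), (0.5, float, 0, 1), ('tanh', str, 0, 30))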
Example No. 18
def validate_input_args():
    global model_path
    validate((args.b, int, 1, 100000000))
    validate((args.e, int, 1, 100000000))
    validate((args.sentence_length, int, 1, 10000))
    validate((args.token_emb_size, int, 1, 10000))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.encoder_depth, int, 1, 10))
    validate((args.decoder_depth, int, 1, 10))
    validate((args.encoder_dropout, float, 0, 1))
    validate((args.decoder_dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           str(args.model_path))
    validate_parent_exists(model_path)
    model_info_path = path.join(path.dirname(path.realpath(__file__)),
                                str(args.model_info_path))
    validate_parent_exists(model_info_path)
Example No. 19
                    help='use OOV test set')
parser.add_argument(
    '--eps',
    type=float,
    default=1e-8,
    help='epsilon used to avoid divide by zero in softmax renormalization.',
    action=check_size(1e-100, 1e-2))
parser.add_argument('--model_file',
                    default='memn2n_weights.npz',
                    help='File to load model weights from.',
                    type=str)

parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000), (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(path=data_dir,
                   task=args.task,
                   oov=args.use_oov,
                   use_match_type=args.use_match_type,
                   cache_match_type=args.cache_match_type,
                   cache_vectorized=args.cache_vectorized)
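
check_size(...) is used here as an argparse action factory that range-checks a numeric argument after type conversion. A hedged sketch of that pattern, not necessarily the library's exact implementation:

# Sketch of a range-checking argparse action factory in the spirit of check_size();
# an illustration under stated assumptions, not nlp_architect's exact code.
import argparse


def check_size_sketch(minimum, maximum):
    class CheckSize(argparse.Action):
        def __call__(self, arg_parser, namespace, values, option_string=None):
            if not minimum <= values <= maximum:
                arg_parser.error('{} must be in [{}, {}]'.format(option_string, minimum, maximum))
            setattr(namespace, self.dest, values)
    return CheckSize


sketch_parser = argparse.ArgumentParser()
sketch_parser.add_argument('--eps', type=float, default=1e-8,
                           action=check_size_sketch(1e-100, 1e-2))
print(sketch_parser.parse_args(['--eps', '1e-6']))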
Example No. 20
    def __init__(self, activation='tanh', lstm_layers=2, lstm_dims=125, pos_dims=25):
        validate((activation, str), (lstm_layers, int, 0, None), (lstm_dims, int, 0, 1000),
                 (pos_dims, int, 0, 1000))
        self.options = get_options_dict(activation, lstm_dims, lstm_layers, pos_dims)
        self.params = None
        self.model = None
Example No. 21
    print(' '.join(['%-{}s'.format(x[2]) % x[1] for x in print_helper]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=validate_existing_filepath, required=True,
                        help='Model file path')
    parser.add_argument('--dataset_path', type=validate_existing_directory, required=True,
                        help='dataset directory')
    parser.add_argument('--embedding_model', type=validate_existing_filepath,
                        help='Path to word embedding model')
    parser.add_argument('--embedding_size', type=int,
                        help='Word embedding model vector size')
    args = parser.parse_args()
    if args.embedding_size is not None:
        validate((args.embedding_size, int, 1, 10000))

    model = IntentExtractionModel()
    model.load(args.model_path)

    ds = SNIPS(path=args.dataset_path)
    nlp = SpacyInstance(disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    emb_vectors = None
    if args.embedding_model is not None:
        print('Loading external word embedding model')
        emb_vectors, emb_size = load_word_embeddings(args.embedding_model)

    while True:
        text = input('>> ')
        tokens = process_text(text)
Example No. 22
parser.add_argument(
    '--interactive',
    default=False,
    action='store_true',
    help='enable interactive mode at the end of training.')
parser.add_argument(
    '--test',
    default=False,
    action='store_true',
    help='evaluate on the test set at the end of training.')

parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000),
         (args.eps, float, 1e-15, 1e-2),
         (args.lr, float, 1e-8, 10),
         (args.grad_clip_norm, float, 1e-3, 1e5))

# Validate inputs
validate_parent_exists(args.log_file)
log_file = args.log_file
validate_parent_exists(args.weights_save_path)
weights_save_path = args.weights_save_path
validate_parent_exists(args.data_dir)
data_dir = args.data_dir
assert weights_save_path.endswith('.npz')
assert log_file.endswith('.txt')

gradient_clip_norm = args.grad_clip_norm

babi = BABI_Dialog(
Example No. 23
# parse the command line arguments
tf.flags.DEFINE_integer(
    "task", 1, "the task ID to train/test on from bAbI-dialog dataset (1-6)")
tf.flags.DEFINE_integer("emb_size", 20,
                        "Size of the word-embedding used in the model.")
tf.flags.DEFINE_integer("nhops", 3, "Number of memory hops in the network")
tf.flags.DEFINE_boolean("use_match_type", False, "use match type features")
tf.flags.DEFINE_boolean("cache_match_type", False, "cache match type answers")
tf.flags.DEFINE_boolean("cache_vectorized", False, "cache vectorized data")
tf.flags.DEFINE_boolean("use_oov", False, "use OOV test set")
tf.flags.DEFINE_string("data_dir", "data/", "File to save model weights to.")
tf.flags.DEFINE_string("weights_save_path", "saved_tf/",
                       "File to save model weights to.")
FLAGS = tf.flags.FLAGS

validate((FLAGS.task, int, 1, 7), (FLAGS.nhops, int, 1, 100),
         (FLAGS.emb_size, int, 1, 10000))

# Validate inputs
current_dir = os.path.dirname(os.path.realpath(__file__))
weights_save_path = os.path.join(current_dir, FLAGS.weights_save_path)
validate_parent_exists(weights_save_path)
data_dir = os.path.join(current_dir, FLAGS.data_dir)
validate_parent_exists(data_dir)

babi = BABI_Dialog(
    path=data_dir,
    task=FLAGS.task,
    oov=FLAGS.use_oov,
    use_match_type=FLAGS.use_match_type,
    cache_match_type=FLAGS.cache_match_type,
    cache_vectorized=FLAGS.cache_vectorized,
Example No. 24
        'word_vocab': dataset.word_vocab,
        'pos_vocab': dataset.pos_vocab,
        'chunk_vocab': dataset.chunk_vocab
    }
    if args.char_features is True:
        model_params.update({'char_vocab': dataset.char_vocab})
    with open(settings_path, 'wb') as fp:
        pickle.dump(model_params, fp)
    model.save(model_path)


if __name__ == '__main__':
    # read input args and validate
    parser = create_argument_parser()
    args = parser.parse_args()
    validate((args.sentence_length, int, 1, 1000))
    validate((args.feature_size, int, 1, 10000))
    validate((args.b, int, 1, 100000))
    validate((args.e, int, 1, 100000))
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           '{}.h5'.format(str(args.model_name)))
    settings_path = path.join(path.dirname(path.realpath(__file__)),
                              '{}.params'.format(str(args.model_name)))
    validate_parent_exists(model_path)

    # load dataset and get tokens/chunks/pos tags
    dataset = CONLL2000(data_path=args.data_dir,
                        sentence_length=args.sentence_length,
                        extract_chars=args.char_features,
                        max_word_length=args.max_word_length)
    train_set = dataset.train_set
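
The parameters pickled above can be restored later. The pickle keys below mirror the snippet exactly; the file path and the commented model-restore step are assumptions about the surrounding training script, since the model class itself is not shown in this excerpt.

# Hedged loading counterpart: the pickle keys match the save code above; the path and
# the commented restore call are assumptions.
import pickle

settings_path = 'chunker_model.params'   # hypothetical path written by the training script
with open(settings_path, 'rb') as fp:
    model_params = pickle.load(fp)
word_vocab = model_params['word_vocab']
pos_vocab = model_params['pos_vocab']
chunk_vocab = model_params['chunk_vocab']
char_vocab = model_params.get('char_vocab')  # present only when --char_features was enabled
# A matching model object would then be rebuilt and restored, e.g. model.load('<model_name>.h5').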
Example No. 25
def validate_input_args():
    global model_path, settings_path
    validate((args.sentence_len, int, 1, 1000))
    validate((args.lstm_depth, int, 1, 10))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.token_embedding_size, int, 1, 10000))
    validate((args.pos_embedding_size, int, 1, 1000))
    validate((args.vocab_size, int, 1, 100000000))
    validate((args.char_hidden_size, int, 1, 1000))
    validate((args.max_char_word_length, int, 1, 100))
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           str(args.model_name))
    settings_path = path.join(path.dirname(path.realpath(__file__)),
                              str(args.settings))
    validate_parent_exists(model_path)
    validate_parent_exists(settings_path)
Example No. 26
def validate_input_args(input_args):
    validate((input_args.b, int, 1, 100000))
    validate((input_args.e, int, 1, 100000))
    validate((input_args.tag_num, int, 1, 1000))
    validate((input_args.sentence_length, int, 1, 10000))
    validate((input_args.word_length, int, 1, 100))
    validate((input_args.word_embedding_dims, int, 1, 10000))
    validate((input_args.character_embedding_dims, int, 1, 1000))
    validate((input_args.char_features_lstm_dims, int, 1, 10000))
    validate((input_args.entity_tagger_lstm_dims, int, 1, 10000))
    validate((input_args.dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)),
                           str(input_args.model_path))
    validate_parent_exists(model_path)
    model_info_path = path.join(path.dirname(path.realpath(__file__)),
                                str(input_args.model_info_path))
    validate_parent_exists(model_info_path)
Example No. 27
def add_page(search_page, phrase):
    try:
        if search_page is not None:
            if phrase not in result_dump:
                result_dump[phrase] = []
                result_dump[phrase].append(search_page)
            else:
                pages = result_dump[phrase]
                for page in pages:
                    if page.pageid == search_page.pageid:
                        return

                result_dump[phrase].append(search_page)

            logger.info('page-%s added', str(search_page))
    except Exception:
        logger.error('could not extract wiki info from phrase-%s', search_page.orig_phrase)


if __name__ == '__main__':
    io.validate_existing_filepath(args.mentions)
    io.validate_existing_filepath(args.output)
    if args.host:
        io.validate((args.host, str, 1, 1000))
    if args.port:
        io.validate((args.port, int, 1, 65536))
    if args.index:
        io.validate((args.index, str, 1, 10000))

    wiki_dump_from_gs()
Example No. 28
def validate_input_args():
    global model_path
    validate((args.b, int, 1, 100000000))
    validate((args.e, int, 1, 100000000))
    validate((args.sentence_length, int, 1, 10000))
    validate((args.token_emb_size, int, 1, 10000))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.encoder_depth, int, 1, 10))
    validate((args.decoder_depth, int, 1, 10))
    validate((args.save_epochs, int, 1, 1000))
    validate((args.encoder_dropout, float, 0, 1))
    validate((args.decoder_dropout, float, 0, 1))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_path))
    validate_parent_exists(model_path)
Example No. 29
    help='specifies whether to use a second dictionary for words within ' +
    'specified extended window. ie for "-w 1 -d 2", the ' +
    'sentence "hello world how are things" creates a window of "2:hello ' +
    '1:world <NULL> 1:are 2:things"')
parser.add_argument('-t',
                    '--num_threads',
                    type=int,
                    default=4,
                    help='number of threads to use',
                    action=check_size(1, 10))
args = vars(parser.parse_args())

validate_parent_exists(args['data_dir'])
if args['entities']:
    validate_parent_exists(args['entities'])
validate((args['window_size'], str, 1, 100),
         (args['double_dict'], str, 1, 100))

beg = time.time()

if args['data_dir']:
    # also set the entities and input file here
    args['entities'] = os.path.expanduser(
        args['data_dir'] + '/movieqa/knowledge_source/entities.txt')
    args['input_file'] = [
        os.path.expanduser(args['data_dir'] +
                           '/movieqa/knowledge_source/wiki.txt')
    ]
else:
    raise ValueError("No data_dir given.")

with open(
Example No. 30
def validate_input_args():
    global model_path, settings_path
    validate((args.sentence_len, int, 1, 1000))
    validate((args.lstm_depth, int, 1, 10))
    validate((args.lstm_hidden_size, int, 1, 10000))
    validate((args.token_embedding_size, int, 1, 10000))
    validate((args.pos_embedding_size, int, 1, 1000))
    validate((args.vocab_size, int, 1, 100000000))
    validate((args.char_hidden_size, int, 1, 1000))
    validate((args.max_char_word_length, int, 1, 100))
    model_path = path.join(path.dirname(path.realpath(__file__)), str(args.model_name))
    settings_path = path.join(path.dirname(path.realpath(__file__)), str(args.settings))
    validate_parent_exists(model_path)
    validate_parent_exists(settings_path)
Example No. 31
                        type=validate_existing_filepath,
                        required=True,
                        help='Model file path')
    parser.add_argument('--dataset_path',
                        type=validate_existing_directory,
                        required=True,
                        help='dataset directory')
    parser.add_argument('--embedding_model',
                        type=validate_existing_filepath,
                        help='Path to word embedding model')
    parser.add_argument('--embedding_size',
                        type=int,
                        help='Word embedding model vector size')
    args = parser.parse_args()
    if args.embedding_size is not None:
        validate((args.embedding_size, int, 1, 10000))

    model = IntentExtractionModel()
    model.load(args.model_path)

    ds = SNIPS(path=args.dataset_path)
    nlp = SpacyInstance(
        disable=['tagger', 'ner', 'parser', 'vectors', 'textcat'])

    emb_vectors = None
    if args.embedding_model is not None:
        print('Loading external word embedding model')
        emb_vectors, emb_size = load_word_embeddings(args.embedding_model)

    while True:
        text = input('>> ')
Example No. 32
tf.flags.DEFINE_integer("epochs", 100,
                        "Number of epochs between saving model weights.")
tf.flags.DEFINE_boolean("restore", False, "Restore weights if found.")
tf.flags.DEFINE_boolean("interactive", False,
                        "enable interactive mode at the end of training.")
tf.flags.DEFINE_boolean("test", False,
                        "evaluate on the test set at the end of training.")
FLAGS = tf.flags.FLAGS

# Validate inputs
validate(
    (FLAGS.task, int, 1, 7),
    (FLAGS.nhops, int, 1, 10),
    (FLAGS.batch_size, int, 1, 32000),
    (FLAGS.emb_size, int, 1, 10000),
    (FLAGS.eps, float, 1e-15, 1e-2),
    (FLAGS.lr, float, 1e-8, 10),
    (FLAGS.grad_clip_norm, float, 1e-3, 1e5),
    (FLAGS.epochs, int, 1, 1e10),
    (FLAGS.save_epochs, int, 1, 1e10),
)

current_dir = os.path.dirname(os.path.realpath(__file__))
log_file = os.path.join(current_dir, FLAGS.log_file)
validate_parent_exists(log_file)
weights_save_path = os.path.join(current_dir, FLAGS.weights_save_path)
validate_parent_exists(weights_save_path)
data_dir = os.path.join(current_dir, FLAGS.data_dir)
validate_parent_exists(data_dir)
assert log_file.endswith(".txt")
Example No. 33
def validate_arguments(args):
    """Validate input arguments"""
    io.validate((args.num_units, int, 1, None))
    io.validate((args.num_layers, int, 1, None))
    io.validate((args.num_encoder_layers, (int, type(None)), 0, None))
    io.validate((args.num_decoder_layers, (int, type(None)), 0, None))
    io.validate((args.num_embeddings_partitions, int, 0, None))
    io.validate((args.learning_rate, float, 0.0, None))
    io.validate((args.num_train_steps, int, 1, None))
    io.validate((args.warmup_steps, int, 0, args.num_train_steps))
    io.validate((args.init_weight, float))
    io.validate((args.src, (str, type(None)), 1, 256))
    io.validate((args.tgt, (str, type(None)), 1, 256))
    io.validate((args.sos, str, 1, 256))
    io.validate((args.eos, str, 1, 256))
    io.validate((args.src_max_len, int, 1, None))
    io.validate((args.tgt_max_len, int, 1, None))
    io.validate((args.src_max_len_infer, (int, type(None)), 1, None))
    io.validate((args.tgt_max_len_infer, (int, type(None)), 1, None))
    io.validate((args.forget_bias, float, 0.0, None))
    io.validate((args.dropout, float, 0.0, 1.0))
    io.validate((args.max_gradient_norm, float, 0.000000001, None))
    io.validate((args.batch_size, int, 1, None))
    io.validate((args.steps_per_stats, int, 1, None))
    io.validate((args.max_train, int, 0, None))
    io.validate((args.num_buckets, int, 1, None))
    io.validate((args.num_sampled_softmax, int, 0, None))
    io.validate((args.num_gpus, int, 0, None))
    io.validate((args.metrics, str, 1, 256))
    io.validate((args.inference_list, (str, type(None)), 0, 256))
    io.validate((args.steps_per_external_eval, (int, type(None)), 1, None))
    io.validate((args.scope, (str, type(None)), 1, 256))
    io.validate((args.random_seed, (int, type(None))))
    io.validate((args.num_keep_ckpts, int, 0, None))
    io.validate((args.infer_batch_size, int, 1, None))
    io.validate((args.beam_width, int, 0, None))
    io.validate((args.length_penalty_weight, float, 0.0, None))
    io.validate((args.sampling_temperature, float, 0.0, None))
    io.validate((args.num_translations_per_input, int, 1, None))
    io.validate((args.jobid, int, 0, None))
    io.validate((args.num_workers, int, 1, None))
    io.validate((args.num_inter_threads, int, 0, None))
    io.validate((args.num_intra_threads, int, 0, None))
    io.validate((args.pruning_hparams, (str, type(None)), 1, 256))

    suffixes = [args.src]
    if not args.language_model:
        suffixes.append(args.tgt)

    for suffix in suffixes:
        validate_existing_filepath(args.train_prefix, suffix)
        validate_existing_filepath(args.dev_prefix, suffix)
        validate_existing_filepath(args.test_prefix, suffix)
        validate_existing_filepath(args.vocab_prefix, suffix)
        validate_existing_filepath(args.embed_prefix, suffix)
    validate_existing_filepath(args.inference_ref_file)
    validate_existing_filepath(args.inference_input_file)
    validate_existing_filepath(args.hparams_path)
    validate_parent_exists(args.ckpt)
    validate_parent_exists(args.inference_output_file)
    validate_parent_exists(args.out_dir)
Example No. 34
                    help='sizes of windows PER SIDE around words to generate. eg 1 or ' +
                    '1,2,3. ie "-w 1" for "hey world hey" produces "hey <NULL> hey"')
parser.add_argument('-d', '--double_dict', type=str, default='3',
                    help='specifies whether to use a second dictionary for words within ' +
                    'specified extended window. ie for "-w 1 -d 2", the ' +
                    'sentence "hello world how are things" creates a window of "2:hello ' +
                    '1:world <NULL> 1:are 2:things"')
parser.add_argument('-t', '--num_threads', type=int, default=4,
                    help='number of threads to use',
                    action=check_size(1, 10))
args = vars(parser.parse_args())

validate_parent_exists(args['data_dir'])
if args['entities']:
    validate_parent_exists(args['entities'])
validate((args['window_size'], str, 1, 100), (args['double_dict'], str, 1, 100))

beg = time.time()

if args['data_dir']:
    # also set the entities and input file here
    args['entities'] = os.path.expanduser(args['data_dir'] +
                                          '/movieqa/knowledge_source/entities.txt')
    args['input_file'] = [os.path.expanduser(args['data_dir'] +
                                             '/movieqa/knowledge_source/wiki.txt')]
else:
    raise ValueError("No data_dir given.")

with open(os.path.expanduser(args['data_dir'] +
                                     '/movieqa/lower_wiki-w=0-d=3-m-4.txt'), 'w') as out:
    try:
Example No. 35
    model.save(model_file_path)
    # set evaluation error rates
    loss, binary_accuracy, precision, recall, f1 = model.eval(dataset.test_set)
    print('loss = %.1f%%' % (loss))
    print('Test binary_accuracy rate = %.1f%%' % (binary_accuracy * 100))
    print('Test precision rate = %.1f%%' % (precision * 100))
    print('Test recall rate = %.1f%%' % (recall * 100))
    print('Test f1 rate = %.1f%%' % (f1 * 100))


if __name__ == "__main__":
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.set_defaults(epochs=200)
    parser.add_argument(
        '--data',
        type=validate_existing_filepath,
        help='Path to the CSV file where the prepared dataset is saved')
    parser.add_argument('--model_path',
                        type=validate_parent_exists,
                        help='Path to save the model')
    args = parser.parse_args()
    validate((args.epochs, int, 1, 100000))
    data_path = absolute_path(args.data)
    model_path = absolute_path(args.model_path)
    num_epochs = args.epochs
    # load data sets from file
    data_set = NpSemanticSegData(data_path, train_to_test_ratio=0.8)
    # train the mlp classifier
    train_mlp_classifier(data_set, model_path, args.epochs)
Example No. 36
parser.add_argument(
    '--eps',
    type=float,
    default=1e-8,
    help='epsilon used to avoid divide by zero in softmax renormalization.',
    action=check_size(1e-100, 1e-2))
parser.add_argument(
    '--model_file',
    default='memn2n_weights.npz',
    help='File to load model weights from.',
    type=str)

parser.set_defaults(batch_size=32, epochs=200)
args = parser.parse_args()

validate((args.emb_size, int, 1, 10000),
         (args.eps, float, 1e-15, 1e-2))

# Sanitize inputs
validate_existing_filepath(args.model_file)
model_file = args.model_file
assert model_file.endswith('.npz')
validate_parent_exists(args.data_dir)
data_dir = args.data_dir

babi = BABI_Dialog(
    path=data_dir,
    task=args.task,
    oov=args.use_oov,
    use_match_type=args.use_match_type,
    cache_match_type=args.cache_match_type,
    cache_vectorized=args.cache_vectorized)