Example #1
def extract_ngrams(corpus):
    """
    input: whole dataset
    output: two dictionaries, key: tweet_id, value: 1-dimensional binary feature vector
    Out-of-vocabulary (OOV) words are mapped to the extra final index.
    """
    if not os.path.exists('baseline/unigram_vocab.txt') or not os.path.exists(
            'baseline/bigram_vocab.txt'):
        construct_vocabulary(corpus)

    # key: word, value: index
    unigram_vocab = read_vocabulary('baseline/unigram_vocab.txt')
    bigram_vocab = read_vocabulary('baseline/bigram_vocab.txt')

    unigram_dict = {}
    bigram_dict = {}
    for data in corpus:
        tokens = data.tweet_words()
        lower_tokens = [t.lower() for t in tokens]
        _id = data.tweet_id
        # +1 for OOV
        unigram_dict[_id] = np.zeros(len(unigram_vocab) + 1).tolist()
        bigram_dict[_id] = np.zeros(len(bigram_vocab) + 1).tolist()
        for idx, ele in enumerate(lower_tokens):
            # unigram
            unigram_dict[_id][unigram_vocab.get(ele, len(unigram_vocab))] = 1.

            if idx == len(lower_tokens) - 1:
                continue

            # bigram
            bigram_dict[_id][bigram_vocab.get((ele, lower_tokens[idx + 1]),
                                              len(bigram_vocab))] = 1.

    return unigram_dict, bigram_dict
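
Example #1 (and Example #9 below) assumes that read_vocabulary returns a word-to-index dictionary built from a file previously written by construct_vocabulary. A minimal sketch compatible with that usage might look like the following; the one-entry-per-line file format (two whitespace-separated tokens for a bigram) is an assumption, not taken from the source.

def read_vocabulary(path):
    """Hypothetical helper: map each vocabulary entry to its line index."""
    vocab = {}
    with open(path, 'r') as inf:
        for idx, line in enumerate(inf):
            parts = line.split()
            if not parts:
                continue
            # A single token becomes a unigram key; two tokens become a
            # bigram tuple key, matching the lookups in extract_ngrams above.
            key = parts[0] if len(parts) == 1 else tuple(parts)
            vocab[key] = idx
    return vocab
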
Example #2
def predict():
    sep_word = thulac.thulac(seg_only=True)

    model = Seq2Seq(batch_size=1, forward_only=True)

    model_path = './models/0612/'

    vocab_en, _ = utils.read_vocabulary(config.TRAIN_ENC_VOCABULARY)
    _, vocab_de = utils.read_vocabulary(config.TRAIN_DEC_VOCABULARY)

    with tf.Session() as sess:
        # Restore the previous training checkpoint
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt is not None:
            print('Found model:', ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Model not found")

        while True:
            input_string = raw_input('me > ')
            # Exit on 'quit'
            if input_string == 'quit':
                exit()
            personal_ans = utils.check_pre_ques(input_string.decode('utf-8'))
            if personal_ans is not None:
                print('AI > ' + personal_ans)
                continue

            input_string_vec = []
            aseq = sep_word.cut(input_string, text=True)
            for words in aseq.split(' '):
                input_string_vec.append(vocab_en.get(words, config.UNK_ID))
            bucket_id = min([
                b for b in range(len(config.BUCKETS))
                if config.BUCKETS[b][0] > len(input_string_vec)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(input_string_vec, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            if config.EOS_ID in outputs:
                outputs = outputs[:outputs.index(config.EOS_ID)]

            response = "".join(
                [tf.compat.as_str(vocab_de[output]) for output in outputs])
            print('AI > ' + response)
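
One fragile step in Example #2 is the bucket selection: the min(...) comprehension raises a ValueError when the segmented input is longer than the largest bucket's encoder length. A hedged sketch of a fallback follows, assuming config.BUCKETS is a list of (encoder_length, decoder_length) tuples sorted in ascending order; the helper name pick_bucket is hypothetical.

def pick_bucket(input_len, buckets):
    """Illustrative only: choose the smallest bucket that fits the input."""
    candidates = [b for b, (enc_len, _) in enumerate(buckets)
                  if enc_len > input_len]
    if candidates:
        return min(candidates)
    # Fall back to the largest bucket; the caller would then need to
    # truncate the input to that bucket's encoder length.
    return len(buckets) - 1
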
Example #3
def train():
    # Get the vocabulary
    index_to_word, _ = utils.read_vocabulary()

    # Create the model
    if resume_training:
        model = ChatbotModel(weights_file=weights_file)
    else:
        model = ChatbotModel(
            embedding_matrix=utils.read_embedding_matrix(index_to_word))

    # Load the data
    q, a = utils.read_training_sequences()

    print("Total training sequences:", q.shape[0])

    print("Example context-answer pair")
    print(utils.seq_to_text(q[0], index_to_word))
    print(utils.seq_to_text(a[0], index_to_word))

    q_val = q[:N_VAL, :]
    a_val = a[:N_VAL, :]
    q = q[N_VAL:, :]
    a = a[N_VAL:, :]

    # q and a already exclude the validation rows sliced off above
    n_train = len(q)

    step = round(n_train / NUM_SUBSETS)

    # Prepare validation data
    Q_val, A_val, Y_val = prepare_fit_data(q_val, a_val)

    # Train
    for m in range(EPOCHS):
        print("\nStarting epoch", m + 1, "\n")
        # Loop over training subsets so it fits in RAM
        for n in range(0, n_train, step):
            print("Training epoch: %d. Data slice: %d - %d" %
                  (m + 1, n, n + step))

            Q, A, Y = prepare_fit_data(q[n:n + step], a[n:n + step])
            model.fit([Q, A], Y, batch_size=BATCH_SIZE, epochs=1)

            # Make sure memory is cleared
            del Q
            del A
            del Y
            gc.collect()

        print("Evaluating on validation set...")
        loss, acc = model.evaluate([Q_val, A_val], Y_val, verbose=0)
        print("Validation accuracy: %f, loss = %f" % (acc, loss))

        model.save_weights(weights_file, overwrite=True)
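
For reference, the seq_to_text helper used above to print the example context-answer pair might look roughly like the sketch below; the padding index (0) and the list-based index_to_word layout are assumptions, not taken from the source.

def seq_to_text(seq, index_to_word, pad_index=0):
    """Hypothetical sketch: turn a padded index sequence back into text."""
    return " ".join(index_to_word[int(i)] for i in seq if int(i) != pad_index)
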
Example #4
    def __init__(self):
        self.logger = logging.getLogger('trainlogger')
        self.logger.setLevel(logging.INFO)
        formatter = logging.Formatter(
                fmt='%(levelname)s\t%(asctime)s\t%(message)s',
                datefmt='%Y-%m-%dT%H:%M:%S')
        handler = logging.FileHandler('./logs/predict.log', 'a')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)


        self.model = Seq2Seq(batch_size=1, forward_only=True)
        model_path = './models/0612/'
        self.vocab_en, _ = utils.read_vocabulary(config.TRAIN_ENC_VOCABULARY)
        _, self.vocab_de = utils.read_vocabulary(config.TRAIN_DEC_VOCABULARY)
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt is not None:
            self.logger.info('Found model: ' + ckpt.model_checkpoint_path)
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            self.logger.error("Model not found")
            raise RuntimeError('No checkpoint found in ' + model_path)
        self.sep_word = thulac.thulac(seg_only=True)
Example #5
def testRNN(vocabulary_file, training_dir):
    print("Reading vocabulary " + vocabulary_file + "...")
    words, dictionary = read_vocabulary(vocabulary_file, MAX_VOCAB_SIZE)
    print("Reading sentences and training RNN...")
    start = timer()

    rnn = RNNExtended(len(words), HIDDEN_LAYER_SIZE)
    num_words = 0
    for i in range(NUM_ITER):
        sentences = tokenize_files(dictionary, training_dir)    
        for sentence in itertools.islice(sentences, MAX_SENTENCES):
            # TODO: create a context window for each sentence?
            rnn.train(sentence)
            num_words += len(sentence)

        print("Iteration " + str(i + 1) + "/" + str(NUM_ITER) + " finished (" + str(num_words) + " words)")
        num_words = 0

    print("- Took %.2f sec" % (timer() - start))
Example #6
def testSkipGram(vocabulary_file, training_dir):
    last_sentence = None
    print("Reading vocabulary " + vocabulary_file + "...")
    words, dictionary = read_vocabulary(vocabulary_file, MAX_VOCAB_SIZE)
    print("Reading sentences and training SkipGram...")
    start = timer()
    skip_gram = SkipGram(len(words), WINDOW_SIZE, HIDDEN_LAYER_SIZE)
    num_words = 0
    for i in range(NUM_ITER):
        sentences = tokenize_files(dictionary, training_dir)    
        for sentence in itertools.islice(sentences, MAX_SENTENCES):
            last_sentence = sentence
            skip_gram.train(sentence)
            num_words += len(sentence)

        ll = skip_gram.train(last_sentence, compute_ll=True)
        print("Iteration " + str(i + 1) + "/" + str(NUM_ITER) + " finished (" + str(num_words) + " words)")
        print("Log-likelihood: " + str(ll))

        num_words = 0

    print("- Took %.2f sec" % (timer() - start))
Example #7
def predict_babelnet(input_path: str, output_path: str,
                     resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    print("Predicting Babelnet...")
    out_vocab = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_BN))
    bn2wn = utils.read_map(os.path.join(resources_path,
                                        config.BABELNET2WORDNET),
                           reverse=False)

    dense_layer = 0
    is_bn = True

    _prediction(
        input_path,
        output_path,
        resources_path,
        out_vocab,
        dense_layer,
        is_bn,
        bn2domain=bn2wn,
    )
Example #8
def main(input_file, vocabulary_file):
    """Automatically check and correct the spelling of a file."""
    vocabulary = utils.read_vocabulary(vocabulary_file)
    logging.info("Read %i words.", len(vocabulary))
    text = utils.read_text(input_file)
    check(text, vocabulary)
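
Example #8 is the most compact use of read_vocabulary. A hypothetical command-line wrapper around it could look like the sketch below; the argument names, help strings, and log configuration are illustrative and not taken from the source.

if __name__ == "__main__":
    import argparse
    import logging

    arg_parser = argparse.ArgumentParser(description="Spell-check a text file.")
    arg_parser.add_argument("input_file", help="text file to check")
    arg_parser.add_argument("vocabulary_file", help="file listing known words")
    args = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    main(args.input_file, args.vocabulary_file)
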
Example #9
def dependency(corpus, num_cluster=NUM_CLUSTER, min_freq=MIN_FREQ):
    """
    input: whole dataset
    output: two dictionaries, key: tweet_id, value: 1-dimensional binary numpy array
    """
    sorted_corpus = sorted(corpus, key=lambda tweet: tweet.tweet_id)

    depend_fn = 'baseline/dependency_A.txt.predict'
    if not os.path.exists(depend_fn):
        print('dependency parser file does not exist; run the repo first')
        sys.exit(1)

    # reading vocab
    cluster_fn = 'baseline/brown_cluster_{}.txt'.format(num_cluster)
    if not os.path.exists(cluster_fn):
        print('brown cluster file does not exist; run the repo first')
        sys.exit(1)

    if not os.path.exists('baseline/unigram_vocab.txt'):
        construct_vocabulary(corpus)

    cluster_vocab = read_brown_cluster(cluster_fn, min_freq)
    unigram_vocab = read_vocabulary('baseline/unigram_vocab.txt')

    # dict to store the features
    word_dict = {}
    cluster_dict = {}

    # NOTE: this requires the tweets to be sorted from 1 to n when written to the txt file
    idx = 1
    with open(depend_fn, 'r') as inf:
        word_tmp = np.zeros((len(unigram_vocab) + 1, len(unigram_vocab) + 1))
        cluster_tmp = np.zeros(
            (len(cluster_vocab) + 1, len(cluster_vocab) + 1))
        valid_arc = {}
        tweet_word_dict = {}
        tweet_tokens = []
        for line in inf:
            if line.strip():
                word_idx, word, _, _, tag, _, arc_idx, _ = line.split('\t')
                tweet_word_dict[word_idx] = word
                tweet_tokens.append(word)
                try:
                    int_arc_idx = int(arc_idx)
                except (TypeError, ValueError):
                    # non-numeric heads (e.g. '_') mark tokens without an arc
                    int_arc_idx = -1
                if int_arc_idx > 0:
                    valid_arc[word_idx] = arc_idx
            else:
                # tweets are separated by an empty line

                # There might be some exceptions due to space
                # lower_tweet_words = [t.lower() for t in sorted_corpus[idx-1].tweet_words()]
                # assert tweet_tokens == lower_tweet_words, \
                #     ' '.join(tweet_tokens) + '\n' + ' '.join(lower_tweet_words)
                # when an empty line is encountered, summarize and store the last chunk, then re-initialize for the next one
                # summary
                for k, v in valid_arc.items():
                    dim1_word, dim2_word = tweet_word_dict[k], tweet_word_dict[
                        v]
                    word_tmp[
                        unigram_vocab.get(dim1_word, len(unigram_vocab)),
                        unigram_vocab.get(dim2_word, len(unigram_vocab))] = 1.
                    cluster_tmp[
                        cluster_vocab.get(dim1_word, len(cluster_vocab)),
                        cluster_vocab.get(dim2_word, len(cluster_vocab))] = 1.

                # flatten
                # TODO: sparse representation needed
                word_dict[idx] = word_tmp.flatten()
                cluster_dict[idx] = cluster_tmp.flatten()

                # init for next chunk
                # plus 1 for OOV
                valid_arc = {}
                tweet_word_dict = {}
                tweet_tokens = []
                word_tmp = np.zeros(
                    (len(unigram_vocab) + 1, len(unigram_vocab) + 1))
                cluster_tmp = np.zeros(
                    (len(cluster_vocab) + 1, len(cluster_vocab) + 1))
                idx += 1
                if idx % int(len(corpus) * 0.1) == 0:
                    print(idx / int(len(corpus) * 0.1))

    return word_dict, cluster_dict
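
The per-tweet arc matrices in Example #9 are dense arrays of shape (|V|+1, |V|+1) that get flattened, which is why the code carries a "sparse representation needed" TODO. A sketch of the same bookkeeping with scipy.sparse follows; scipy is an added dependency and the helper name sparse_arc_matrix is hypothetical.

from scipy import sparse

def sparse_arc_matrix(valid_arc, tweet_word_dict, vocab):
    """Sketch: build one tweet's arc-indicator matrix without a dense array."""
    size = len(vocab) + 1  # +1 for OOV, as in the dense version
    mat = sparse.lil_matrix((size, size))
    for k, v in valid_arc.items():
        row = vocab.get(tweet_word_dict[k], len(vocab))
        col = vocab.get(tweet_word_dict[v], len(vocab))
        mat[row, col] = 1.0
    # Convert to CSR for compact storage; downstream code would need to
    # accept sparse features instead of flattened dense vectors.
    return mat.tocsr()
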
Example #10
def _prediction(
    input_path: str,
    output_path: str,
    resources_path: str,
    out_vocab: Dict,
    i_dense: int,
    is_bn: bool,
    bn2domain: Dict = None,
) -> None:
    """
    This method is used to handle the prediction of a task
    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param out_vocab: sense inventory
    :param i_dense: index of the fully connected layer whose predictions are used:
                    0 -> Babelnet
                    1 -> Wordnet domains
                    2 -> Lexicographer
    :param is_bn: if True, predicts Babelnet
    :param bn2domain: a map Babelnet to domain
    :return: None
    """
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    tf.keras.backend.set_session(tf.Session(config=config_tf))

    test_set = parser.parser_test_set(input_path)

    vocab = utils.read_vocabulary(os.path.join(resources_path, config.VOCAB))
    wn2bn = utils.read_map(os.path.join(resources_path,
                                        config.BABELNET2WORDNET),
                           reverse=True)

    out_vocab_bn = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_BN))
    out_vocab_wnd = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_WND))
    out_vocab_lex = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_LEX))
    out_vocab_pos = utils.read_vocabulary(
        os.path.join(resources_path, config.POS_VOCAB))

    pre_trained = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(
        resources_path, config.SQUEEZED_EMB),
                                                                  binary=True)

    print("Downloading ELMo...")
    model = models.build_model(
        vocab_size=len(vocab),
        out_size_bn=len(out_vocab_bn),
        out_size_wnd=len(out_vocab_wnd),
        out_size_lex=len(out_vocab_lex),
        out_size_pos=len(out_vocab_pos),
        word2vec=pre_trained,
        is_elmo=config.IS_ELMO,
        attention=config.ATTENTION,
        is_sense_emb=config.SENSE_EMB,
    )

    reversed_vocab = utils.reverse_vocab(out_vocab_bn)
    model.load_weights(str(os.path.join(resources_path, config.MODEL_WEIGHTS)))

    with open(str(output_path), mode="w") as file:
        for row in tqdm(test_set):

            if not config.IS_ELMO:
                tmp = preprocesser.text2id([row[0]], vocab)
            else:
                tmp = np.array([row[0]])

            tmp_row = list(row[2])

            inp = tmp
            if config.SENSE_EMB:
                sens_emb = np.ones((1, len(inp[0].split())), dtype=int)
                inp_pos = np.array([row[1]])
                inp = [inp, sens_emb, inp_pos]

            predictions = model.predict(inp, verbose=0)[i_dense]

            for senses in tmp_row:

                sense_position = utils.senses_position_from_vocab(
                    senses["lemma"], out_vocab_bn, bn2domain)

                synsets = [reversed_vocab[x] for x in sense_position]

                if not is_bn:
                    synsets = [
                        bn2domain.get(syn.split("_")[-1]) for syn in synsets
                    ]
                    sense_position = [out_vocab[syn] for syn in synsets]

                to_compute = np.array([
                    predictions[0][senses["position"]][sen_pos]
                    for sen_pos in sense_position
                ])

                if len(to_compute) != 0:
                    file.write(senses["id"] + " " +
                               synsets[to_compute.argmax()].split("_")[-1] +
                               "\n")
                else:
                    file.write(senses["id"] + " " + utils.most_frequent_sense(
                        senses["lemma"],
                        senses["pos"],
                        wn2bn,
                        bn2domain=bn2domain,
                        is_bn=is_bn,
                    ) + "\n")
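
At its core, the prediction loop in Example #10 restricts the softmax scores at a target token's position to the lemma's candidate senses and takes the argmax over that subset. A stripped-down illustration of that step is sketched below; the function and argument names are illustrative only.

import numpy as np

def best_candidate(probs, candidate_indices, reversed_vocab):
    """Illustrative only: pick the highest-scoring candidate sense.

    probs: 1-D array of softmax scores for one token position.
    candidate_indices: output-vocabulary rows admissible for the lemma.
    """
    scores = np.array([probs[i] for i in candidate_indices])
    best_index = candidate_indices[int(scores.argmax())]
    return reversed_vocab[best_index]
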
Example #11
import numpy as np
import nltk

import config as cfg
import utils
import textprocessor
from model import ChatbotModel

# Set a random seed for reproducibility
np.random.seed(1337)

index_to_word, word_to_index = utils.read_vocabulary()

# Init our keras model and load the weights from file
#weights_file = "model_weights_low-training-acc33.h5"
#weights_file = "model_weights_halfway-training-acc62.h5"
weights_file = "model_weights_overfit-training-acc86.h5"

model = ChatbotModel(weights_file=weights_file)
print("Model loaded.")
""" Create a sequence from a given raw string. The returned sequence can be fed directly into the bot """


def create_sequence(query):
    # Use NLTK to get word tokens
    tokenized = nltk.word_tokenize(query)
    # Replace out-of-vocabulary words with the UNKNOWN token
    tokenized = [
        w if w in word_to_index else cfg.TOKEN_UNKNOWN for w in tokenized
    ]
    # Map the words to their respective indices