Example #1
def preprocess_questions(args):
    '''Preprocess questions and save them to h5 file'''
    #Open questions json file
    with open(args.questions_json, 'r') as f:
        questions = json.load(f)['questions']

    #If a vocabulary file is given, load it; otherwise build a new one.
    if args.read_vocab:
        print('Loading vocabulary from {}'.format(args.read_vocab))
        with open(args.read_vocab, 'r') as f:
            vocab = json.load(f)
    else:
        print('Building vocabulary')
        question_token_vocab = create_vocab((q['question'] for q in questions))
        answer_token_vocab = create_vocab((q['answer'] for q in questions))
        vocab = {'q': question_token_vocab, 'a': answer_token_vocab}
        with open(args.save_vocab_to, 'w') as f:
            json.dump(vocab, f)

    #Encode questions and answers
    print('Encoding questions')
    questions_encoded = []
    idxs = []
    image_idxs = []
    answers = []

    for idx, q in enumerate(questions):
        idxs.append(idx)
        image_idxs.append(q['image_index'])

        q_tokenized = tokenize(q['question'])
        q_encoded = encode(q_tokenized, vocab['q'])
        questions_encoded.append(q_encoded)
        if 'answer' in q:
            answers.append(vocab['a'][q['answer']])

    #Pad questions with <NULL> tokens so that all questions have the same length
    max_length = max(len(q) for q in questions_encoded)
    for q in questions_encoded:
        while len(q) < max_length:
            q.append(0)

    #Save questions to h5py file
    with h5py.File(args.save_questions_h5_to, 'w') as f:
        f.create_dataset('idxs', data=np.asarray(idxs))
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('questions', data=np.asarray(questions_encoded))
        if answers:
            f.create_dataset('answers', data=np.asarray(answers))
    return
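This snippet relies on helper functions `create_vocab`, `tokenize`, and `encode` that are not shown. Below is a minimal sketch of what they might look like, assuming whitespace tokenization and a token-to-index vocabulary whose `<NULL>` entry is index 0 (consistent with padding the encoded questions with 0 above); the real project may define them differently.

def tokenize(text, delim=' '):
    # Lower-case, strip basic punctuation, and split on whitespace.
    text = text.lower()
    for punct in ('?', '.', ',', ';'):
        text = text.replace(punct, '')
    return text.split(delim)

def create_vocab(sequences, min_count=1):
    # Count token occurrences and assign indices, reserving index 0 for
    # <NULL> (the padding value used above) and index 1 for <UNK>.
    counts = {}
    for seq in sequences:
        for token in tokenize(seq):
            counts[token] = counts.get(token, 0) + 1
    vocab = {'<NULL>': 0, '<UNK>': 1}
    for token in sorted(counts):
        if counts[token] >= min_count:
            vocab[token] = len(vocab)
    return vocab

def encode(tokens, vocab):
    # Map each token to its index, falling back to <UNK> for unseen tokens.
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]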
Example #2
def input_load(mode="train"):
    """
    Load the input text and the corresponding feature labels

    :param mode: whether to gather data for training and evaluation or for synthesis
    :return: the text labels, the text lengths, and the audio file paths
    """
    # creates vocab conversion dictionaries
    char2idx, _ = create_vocab()
    fpaths, text_lengths, texts = [], [], []

    # the path to the dataset
    base_path = os.path.join(DATA_PATH, 'wavs')
    # the path to the text
    transcript = os.path.join(DATA_PATH, 'metadata.csv')

    # training or evaluation
    if mode in ("train", "eval"):
        # Each epoch
        for _ in range(NUM_EPOCHS):
            # open the text file
            lines = codecs.open(transcript, 'r', ENCODING).readlines()
            for line in lines:
                fname, _, text = line.strip().split("|")

                # get the wav file paths
                fpath = os.path.join(base_path, fname + ".wav")
                fpaths.append(fpath)

                # clean and normalize the text
                text = normalize_text(text) + "$"  # "$" marks the end of sentence (EOS)
                text = [char2idx[char] for char in text]
                text_lengths.append(len(text))
                texts.append(np.array(text, np.int32).tostring())
        return fpaths, text_lengths, texts
    else:  # synthesis

        # Parse
        lines = codecs.open(TEST_DATA, 'r', 'utf-8').readlines()[1:]

        # Normalize text: $ is EOS
        sents = [
            normalize_text(line.split(" ", 1)[-1]).strip() + "$"
            for line in lines
        ]
        lengths = [len(sent) for sent in sents]
        maxlen = max(lengths)

        # Pad the text
        texts = np.zeros((len(sents), maxlen), np.int32)
        for i, sent in enumerate(sents):
            texts[i, :len(sent)] = [char2idx[char] for char in sent]
        # return just the text, no lengths or paths needed
        return texts
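`create_vocab` is assumed here to return character-level lookup tables (`char2idx`, `idx2char`). A minimal sketch under that assumption, with a hypothetical character inventory whose `$` entry matches the EOS marker appended above:

# Hypothetical character set: a padding symbol, the characters produced by
# normalize_text, and '$' as the end-of-sentence marker.
VOCAB = "P abcdefghijklmnopqrstuvwxyz'.?$"

def create_vocab():
    # Build character-to-index and index-to-character tables over VOCAB.
    char2idx = {char: idx for idx, char in enumerate(VOCAB)}
    idx2char = {idx: char for idx, char in enumerate(VOCAB)}
    return char2idx, idx2char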
Example #3
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)

    print('Loading data...')
    data = pd.read_csv(infile)

    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')

    # Split each sentence into words
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)

    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)

    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)

    # word2id
    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents

    # Convert the labels to one-hot form
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label

    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
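Here `create_vocab` is expected to write a vocabulary of at most 50000 entries to `vocab_file`, and `read_vocab` to load it back as a word-to-id mapping. A rough sketch of that pair, assuming `segmentData` returns a list of token lists and that the file stores one word per line; the `<PAD>`/`<UNK>` entries are an assumption, not necessarily what the original project uses.

from collections import Counter

def create_vocab(seg_contents, vocab_file, vocab_size):
    # Count words over the segmented contents and keep the most frequent ones.
    counter = Counter(word for content in seg_contents for word in content)
    words = ['<PAD>', '<UNK>'] + [w for w, _ in counter.most_common(vocab_size - 2)]
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words))

def read_vocab(vocab_file):
    # Rebuild the word-to-id mapping from the saved file.
    with open(vocab_file, encoding='utf-8') as f:
        words = f.read().splitlines()
    w2i = {word: idx for idx, word in enumerate(words)}
    return w2i, words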
Example #4
def test_embedding():
    config = configparser.ConfigParser()
    config.read(global_config_path)
    stopword_path = config["GENERAL"]["stop_word_path"]
    datapath = config["GENERAL"]["test_path"]
    pretrain_path = config["WORD_EMBED"]["pretrain_path"]
    x, _ = utils.preprocessing(datapath)
    vo, sents = utils.create_vocab(x, stopword_path)
    vecs = utils.word2vec(vo, sents)
    print(sents[0])
    pretrainVecLayer = wordEmbed.PreTrainEmbedding(
        vo, pretrain_embedding_path=pretrain_path)
    result = pretrainVecLayer.forward(torch.LongTensor([vecs[0]]))
    print(result)
    # manually reproduce bag-of-words averaging with bow=False
    pretrainVecLayer_nobow = wordEmbed.PreTrainEmbedding(
        vo, pretrain_embedding_path=pretrain_path, bow=False)
    temp = pretrainVecLayer_nobow.forward(torch.LongTensor([vecs[0]]))
    temp = torch.sum(temp, 1) / len(vecs[0])
    print(temp)
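`utils.word2vec(vo, sents)` is presumably just the index lookup that Example #8 below performs by hand, i.e. turning each tokenized sentence into a list of vocabulary indices. A short sketch under that assumption:

def word2vec(vo, sents):
    # Map every token of every sentence to its index in the vocabulary `vo`.
    return [[vo[token] for token in sent] for sent in sents]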
Example #5
def main(train_path, test_path, from_save_path, to_save_path, k, lr, n_epochs,
         freq_eval, topk, batch_test_size):
    # load model from save_path
    model, user2id = cPickle.load(open(from_save_path, 'rb'))
    print('loaded model from %s ...' % from_save_path)

    # Read the training data from file; the training file must contain a
    # header row, and the same holds for the testing data.
    train_dataset = pd.read_csv(train_path)
    assert train_dataset.columns.tolist() == ['user_name_1', 'user_name_2']
    print('-- loaded training dataset with %d samples ...' %
          train_dataset.shape[0])

    if test_path != '':
        test_dataset = pd.read_csv(test_path)
        assert test_dataset.columns.tolist() == ['user_name_1', 'user_name_2']

        print('-- loaded testing dataset with %d samples ...' %
              test_dataset.shape[0])
    else:
        test_dataset = None

    # create vocabulary (user2id)
    new_user2id = utils.create_vocab(
        raw_datas=train_dataset.to_records(False, False))
    print('-- created mapping vocabulary with %d unique names ...' %
          len(new_user2id))

    # update the old mapping vocabulary with entries from the newer one
    for name in new_user2id:
        if name not in user2id:
            user2id[name] = len(user2id)

    train.train(train_dataset=train_dataset,
                test_dataset=test_dataset,
                user2id=user2id,
                model=model,
                batch_test_size=batch_test_size,
                freq_eval=freq_eval,
                topk=topk,
                n_epochs=n_epochs,
                save_path=to_save_path)
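This example and the next both call `utils.create_vocab(raw_datas=...)` to build the `user2id` mapping from `(user_name_1, user_name_2)` records. A plausible sketch, assuming ids are assigned in order of first appearance:

def create_vocab(raw_datas):
    # Assign an integer id to every distinct user name, in order of first appearance.
    user2id = {}
    for user_name_1, user_name_2 in raw_datas:
        for name in (user_name_1, user_name_2):
            if name not in user2id:
                user2id[name] = len(user2id)
    return user2id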
Example #6
def main(train_path, test_path, save_path, k, lr, n_epochs,
         batch_test_size=100, freq_eval=100, topk=100):
    # Read the training data from file; the training file must contain a
    # header row, and the same holds for the testing data.
    train_dataset = pd.read_csv(train_path)
    assert train_dataset.columns.tolist() == ['user_name_1', 'user_name_2']
    print('-- loaded training dataset with %d samples ...' %
          train_dataset.shape[0])

    if test_path != '':
        test_dataset = pd.read_csv(test_path)
        assert test_dataset.columns.tolist() == ['user_name_1', 'user_name_2']

        print('-- loaded testing dataset with %d samples ...' %
              test_dataset.shape[0])
    else:
        test_dataset = None

    # create vocabulary (user2id)
    user2id = utils.create_vocab(
        raw_datas=train_dataset.to_records(False, False))
    print('-- created mapping vocabulary with %d unique names ...' %
          len(user2id))

    model = ISGD(n_user=len(user2id) + 1, n_item=len(user2id) + 1, k=k,
                 learning_rate=lr)

    train(train_dataset=train_dataset,
          test_dataset=test_dataset,
          user2id=user2id,
          model=model,
          batch_test_size=batch_test_size,
          freq_eval=freq_eval,
          topk=topk,
          n_epochs=n_epochs,
          save_path=save_path)
Example #7
    def load_text(self, text=None):
        """
        Clean up and load the text. If no text, loads from a text file
        :param text: the text to load
        :return: the cleaned up text
        """
        # Clean up the text
        if text is None:
            lines = codecs.open(TEST_DATA, 'r', ENCODING).readlines()
        else:
            lines = text

        char2idx, _ = create_vocab()
        input_lines = [normalize_text(line.strip()) + "$" for line in lines]  # text normalization, $: EOS
        lengths = [len(line_in) for line_in in input_lines]
        maxlen = max(lengths)
        texts = np.zeros((len(input_lines), maxlen), np.int32)

        # Convert to int
        for i, line in enumerate(input_lines):
            texts[i, :len(line)] = [char2idx[char] for char in line]

        self.text = texts
Example #8
def test_utils():
    config = configparser.ConfigParser()
    config.read(global_config_path)
    stopword_path = config["GENERAL"]["stop_word_path"]
    datapath = config["GENERAL"]["test_path"]
    sents, _ = preprocessing(datapath)
    vo, sents = create_vocab(sents, stopword_path)
    print(sents)
    vecs = []
    for sent in sents:
        vec = []
        for token in sent:
            vec.append(vo[token])
        vecs.append(vec)
    randomVec = wordEmbed.RandomWordVec(bow=False)
    for i in range(len(vecs)):
        input_tensor = torch.LongTensor(vecs[i])
        print(input_tensor)
        temp = randomVec.forward(input_tensor)
        # average the token embeddings: sum them up, then divide by the count
        sum_of_tensor = temp[0]
        for j in range(1, len(temp)):
            sum_of_tensor += temp[j]
        sum_of_tensor /= len(temp)
        print(sum_of_tensor)
Example #9
def train():
    data = np.load("../dataset/dev.npy")
    labels = np.load("../dataset/dev_transcripts.npy")

    # temporary dataset
    data = data[0:2]
    labels = labels[0:2]
    # temporary dataset

    vocab = create_vocab(labels)

    labels = create_labels(labels, vocab)

    shuffle_index = np.arange(len(data))
    shuffle(shuffle_index)

    batch_size = cfg.BATCH_SIZE
    learning_rate = cfg.LEARNING_RATE

    # my_listener = Listener(40, 256, 0.0)
    # my_speller  = Speller(33, 512, 512, 256, 3)

    if isfile("../weights/listener.pt"):
        with open("../weights/listener.pt", 'rb') as fl:
            my_listener = torch.load(fl)
        with open("../weights/speller.pt", 'rb') as fs:
            my_speller = torch.load(fs)
        print("model loading completed.")
    else:
        my_listener = Listener(40, 256, 0.0)
        my_speller = Speller(33, 512, 512, 256, 3)

    loss_fn = torch.nn.CrossEntropyLoss(reduce=False)
    my_optimizer = torch.optim.Adam([{
        'params': my_speller.parameters()
    }, {
        'params': my_listener.parameters()
    }],
                                    lr=cfg.LEARNING_RATE)

    start_index = 0
    for epoch in range(cfg.EPOCH):
        losses = 0.0
        start_index = 0
        while (start_index + batch_size <= len(data)):
            batch_data = data[shuffle_index[start_index:start_index +
                                            batch_size]]
            batch_labels = labels[shuffle_index[start_index:start_index +
                                                batch_size]]
            batch_data, batch_labels, batch_lengths, batch_label_lengths = preprocess(
                batch_data, batch_labels)
            one_hot_batch_labels = OneHot(batch_labels, 33)
            listener_output = my_listener(batch_data, batch_lengths)

            speller_output = my_speller(batch_labels.size(1), listener_output,
                                        one_hot_batch_labels)

            batch_loss = loss_fn(
                speller_output[0].contiguous().view(-1, 33),
                torch.autograd.Variable(batch_labels).view(-1, ))
            batch_loss = batch_loss.view(speller_output[0].size(0),
                                         speller_output[0].size(1))
            mask = torch.zeros(batch_loss.size())
            for i in range(batch_label_lengths.size(0)):
                mask[i, :batch_label_lengths[i]] = 1.0
            batch_loss = torch.mul(batch_loss, torch.autograd.Variable(mask))
            batch_loss = torch.sum(batch_loss) / torch.sum(mask)
            print("epoch {} batch_loss == {:.5f}".format(
                epoch, batch_loss.data[0]))
            my_optimizer.zero_grad()  # clear accumulated gradients before backprop
            batch_loss.backward()
            losses += batch_loss.data.cpu().numpy()
            my_optimizer.step()

            start_index += batch_size
            # break
        if (epoch % 3 == 0):
            with open("../weights/listener.pt", 'wb') as fl:
                torch.save(my_listener, fl)
            with open("../weights/speller.pt", 'wb') as fs:
                torch.save(my_speller, fs)
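`create_vocab` and `create_labels` are assumed to build a character-level vocabulary over the transcripts and to encode each transcript as an array of character indices (the `Speller(33, ...)` argument suggests 33 output classes). A rough sketch under those assumptions:

import numpy as np

def create_vocab(labels):
    # Collect every character that appears in the transcripts and assign indices.
    chars = sorted({char for transcript in labels for char in str(transcript)})
    return {char: idx for idx, char in enumerate(chars)}

def create_labels(labels, vocab):
    # Encode each transcript as an array of character indices.
    return np.array([np.array([vocab[char] for char in str(transcript)])
                     for transcript in labels], dtype=object)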
Example #10
    def __init__(self,
                 data,
                 split='train',
                 vocab=None,
                 word2idx=None,
                 pre_process=None,
                 device=None,
                 debug=False):
        """
        Args:
                pre_process: ['remove_stopwords', 'stemming', 'lemmatization']
        """
        # data = data
        self.device = device
        self.pre_process = pre_process

        if debug:
            self.labels = torch.from_numpy(data['labels'][:100]).to(device)
            # tokenize data
            self.q1s = tokenize_data(data['q1s'][:100])
            self.q2s = tokenize_data(data['q2s'][:100])
        else:
            self.labels = torch.from_numpy(data['labels']).to(device)
            # tokenize data
            self.q1s = tokenize_data(data['q1s'])
            self.q2s = tokenize_data(data['q2s'])

        if pre_process:
            if 'remove_stopwords' in pre_process:
                self.q1s = remove_stopwords(self.q1s)
                self.q2s = remove_stopwords(self.q2s)
            elif 'stemming' in pre_process:
                self.q1s = stemming(self.q1s)
                self.q2s = stemming(self.q2s)
            elif 'lemmatization' in pre_process:
                self.q1s = lemmatization(self.q1s)
                self.q2s = lemmatization(self.q2s)

        # create vocab
        if split == 'train':
            # word2idx maps each word to its integer id
            self.vocab, self.word2idx = create_vocab(self.q1s,
                                                     self.q2s,
                                                     k=None)
            print("Vocabulary size: ", len(self.vocab))
        else:
            self.vocab, self.word2idx = vocab, word2idx
            self.q1s = handle_oov(self.q1s, self.vocab)
            self.q2s = handle_oov(self.q2s, self.vocab)

        # create mask
        max_seq_len, self.q1_lengths = max_sent_len(self.q1s)
        self.q1_lengths = torch.from_numpy(self.q1_lengths).to(self.device)
        self.q1_mask = torch.zeros((len(self.labels), max_seq_len))
        for i, l in enumerate(self.q1_lengths):
            self.q1_mask[i, :l] = 1.

        # padding q1s
        for i in range(len(self.q1s)):
            for j in range(max_seq_len - self.q1_lengths[i]):
                self.q1s[i].append('PAD')

        max_seq_len, self.q2_lengths = max_sent_len(self.q2s)
        self.q2_lengths = torch.from_numpy(self.q2_lengths).to(self.device)
        self.q2_mask = torch.zeros((len(self.labels), max_seq_len))
        for i, l in enumerate(self.q2_lengths):
            self.q2_mask[i, :l] = 1.

        # padding q2s
        for i in range(len(self.q2s)):
            for j in range(max_seq_len - self.q2_lengths[i]):
                self.q2s[i].append('PAD')

        self.q1s = sent_word2word_idx(self.q1s, self.word2idx)
        self.q2s = sent_word2word_idx(self.q2s, self.word2idx)
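The `create_vocab(q1s, q2s, k=None)` call above is expected to return both the vocabulary and the word-to-index mapping, and the padding loops append the literal token 'PAD', so the vocabulary presumably contains it. A minimal sketch, where `k` (if given) would cap the vocabulary at the k most frequent words:

from collections import Counter

def create_vocab(q1s, q2s, k=None):
    # Count word frequencies over both question sets (each a list of token lists).
    counter = Counter(word for sent in q1s + q2s for word in sent)
    vocab = ['PAD', 'UNK'] + [word for word, _ in counter.most_common(k)]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    return vocab, word2idx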