Example 1
def build_char_lang():
    """Build a character-level Lang vocabulary."""
    lang = Lang()
    # reset the vocabulary created by the constructor so indices start at 0
    lang.word2index = dict()
    lang.index2word = dict()
    lang.n_words = 0
    chars = "!\"$%&'()*+,-./0123456789:;<>?[]abcdefghijklmnopqrstuvwxyz"
    for c in chars:
        lang.addWord(c)
    return lang
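All of these examples build on the `Lang` vocabulary helper from the PyTorch seq2seq tutorial, which maps words to indices and back. For reference, here is a minimal sketch of the tutorial version (the repositories above may modify it, e.g. by changing the constructor arguments); it also shows why Example 1 resets `word2index`, `index2word`, and `n_words`: the constructor reserves indices 0 and 1 for SOS/EOS.

class Lang:
    """Vocabulary mapping words to indices (PyTorch seq2seq tutorial version)."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1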
Example 2
    def __init__(self,
                 word_vectors,
                 max_length,
                 char_embed=False,
                 seeder=int(time.time())):
        super(PreTrainedEmbeddingEncoderBiRNN,
              self).__init__(word_vectors.vector_size,
                             max_length,
                             seeder=seeder)
        self.model_type = 'pre_trained_embedding'

        # define word vector embedding
        self.word_vectors = word_vectors

        # empty vector for OOV words; self.empty_vector is presumably
        # initialised by the parent class before being wrapped here
        self.empty_vector = Variable(torch.Tensor(self.empty_vector)).view(
            1, 1, -1)

        # optional character-level word encoder
        self.char_embed = char_embed
        if self.char_embed:
            lang = Lang()
            lang.word2index = dict()
            lang.index2word = dict()
            lang.n_words = 0
            chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
            for c in chars:
                lang.addWord(c)
            self.charbased_model = WordEncoderBiRNN(self.hidden_size // 2,
                                                    params.CHAR_LENGTH,
                                                    lang,
                                                    seeder=seeder)

        # word vector for start of string
        sos = torch.ones(self.hidden_size)
        self.sos_vector = Variable(sos).view(1, 1, -1)

        # word vector for end of string
        eos = torch.ones(self.hidden_size) * -1
        self.eos_vector = Variable(eos).view(1, 1, -1)

        if params.USE_CUDA:
            self.cuda()
            self.empty_vector = self.empty_vector.cuda()
            self.sos_vector = self.sos_vector.cuda()
            self.eos_vector = self.eos_vector.cuda()

        self.cache_dict = dict()
        self.cache_dict[params.SOS_TOKEN] = self.sos_vector
        self.cache_dict[params.EOS_TOKEN] = self.eos_vector
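The fields initialised above (pre-trained vectors, cache, empty vector, optional char encoder) suggest a lookup path along these lines. This is a hypothetical sketch, not the class's actual method: the name `get_word_vector` and the call signature of `charbased_model` are assumptions.

    def get_word_vector(self, word):
        # hypothetical helper showing how the pieces from __init__ could
        # fit together; not part of the original class
        if word in self.cache_dict:               # SOS/EOS and cached words
            return self.cache_dict[word]
        if word in self.word_vectors:             # gensim KeyedVectors lookup
            vec = Variable(torch.from_numpy(
                self.word_vectors[word].copy())).view(1, 1, -1)
        elif self.char_embed:
            vec = self.charbased_model(word)      # assumed char-encoder call
        else:
            vec = self.empty_vector               # OOV with no char fallback
        self.cache_dict[word] = vec
        return vec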
Example 3
print("Nr of English sentences: ", len(english_data))
print("Nr of French sentences: ", len(french_data))

# use the Lang data structure from the PyTorch seq2seq tutorial
english = Lang(english_data)
french = Lang(french_data)

# create parallel sentence pairs
sentences = list(zip(french_data, english_data))
for sent in sentences:
    fr_sent = sent[0].split(' ')
    en_sent = sent[1].split(' ')
    english.addSentence(en_sent)
    french.addSentence(fr_sent)
english.addWord('UNK')
french.addWord('UNK')

# example: print a sentence before and after reverting BPE
test = sentences[1]
print(test[0])
print(revert_BPE(test[0]))

input_voc_size = french.n_words
output_voc_size = english.n_words

print("Size of English vocabulary: ", output_voc_size)
print("Size of French vocabulary: ", input_voc_size)

#encoder = PositionalEncoder(input_voc_size, word_embedding_size, pos_embedding_size, maximum_length)
encoder = RNNEncoder(input_voc_size, word_embedding_size)
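`revert_BPE` is not shown in this snippet. Assuming the corpus was segmented with subword-nmt, which marks subword boundaries with a trailing "@@ ", a minimal sketch could look like this (the separator convention is an assumption, not the repository's confirmed scheme):

def revert_BPE(sentence):
    # merge subword units back into words by stripping the "@@ " markers
    # used by the subword-nmt BPE convention (assumed here)
    return sentence.replace('@@ ', '').replace('@@', '')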
Example 4
class FashionDataSet(Dataset):

    def __init__(self, directory):
        self.word_lang = Lang("normal word")
        self.num_normal_word = -1
        self.MAX_LENGTH = MAX_LENGTH
        self.MAX_MEM_SIZE = MAX_MEM_SIZE
        with open(directory, "rb") as pickle_d:
            self.raw_data = pickle.load(pickle_d)
        self.prepare_lang()

    def prepare_lang(self):
        tuples = []
        with open('../dataset/total_dataset.p', "rb") as pickle_d:
            self.total = pickle.load(pickle_d)
        # add sentence first
        for pair in self.total:
            pair = filter_keywors(pair)
            sen = pair[1]
            self.word_lang.addSentence(sen)
            for tup in pair[0]:
                tuples.append(tup)

        self.num_normal_word = self.word_lang.n_words

        # add tuples last
        for (category, keyword) in tuples:
            self.word_lang.addWord(category)
            self.word_lang.addWord(keyword)

        print("Counted words:")
        print(self.word_lang.name, self.word_lang.n_words)

    def indexes_from_sentence(self, sentence):
        return [self.word_lang.word2index[word] for word in sentence.split(' ')]

    def tensor_from_sentence(self, sentence):
        indexes = self.indexes_from_sentence(sentence)
        indexes.append(EOS_token)
        indexes.insert(0, SOS_token)
        # padding for batch
        for i in range(len(indexes), MAX_LENGTH):
            indexes.append(Padding_token)
        return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

    def tensors_from_pair(self, pair):
        keyword_pairs = pair[0]
        sentence = pair[1]
        categories = [self.word_lang.word2index[category] for (category, _) in keyword_pairs]
        keywords = [self.word_lang.word2index[keyword] for (_, keyword) in keyword_pairs]
        # padding for batch
        for i in range(len(categories), MAX_MEM_SIZE):
            categories.append(Padding_token)
            keywords.append(Padding_token)
        categories = torch.tensor(categories[:MAX_MEM_SIZE], dtype=torch.long, device=device).view(-1, 1)
        keywords = torch.tensor(keywords[:MAX_MEM_SIZE], dtype=torch.long, device=device).view(-1, 1)
        tags = ['NN'] + extract_tags(sentence.split()) + ['NN']
        tags = tags[:MAX_LENGTH]
        sentence = self.tensor_from_sentence(sentence)
        if len(tags) < MAX_LENGTH:
            for i in range(len(tags), MAX_LENGTH):
                tags.append('NN')
        # 0 where the target token appears among the keywords, 1 otherwise;
        # the first and last positions (SOS/EOS) keep the default 0
        g_ground_truth = torch.zeros(sentence.size(0), device=device)

        for di in range(1, sentence.size(0) - 1):
            if sentence[di][0] in keywords.view(-1):
                g_ground_truth[di] = 0
            else:
                g_ground_truth[di] = 1
        return {"categories": categories, "keywords":keywords, "memory_size": min(len(keyword_pairs), MAX_MEM_SIZE), "sentence": sentence, "tags": tags, "g_truth": g_ground_truth}

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, index):
        return self.tensors_from_pair(self.raw_data[index])
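Because `FashionDataSet` implements `__len__` and `__getitem__`, it plugs directly into a `torch.utils.data.DataLoader`. A minimal usage sketch follows; the pickle path is hypothetical, and `batch_size=1` keeps the mixed tensor/string samples simple to collate:

from torch.utils.data import DataLoader

# hypothetical path; the real dataset file may be named differently
dataset = FashionDataSet('../dataset/train_dataset.p')
loader = DataLoader(dataset, batch_size=1, shuffle=True)

for batch in loader:
    # each sample is already padded to MAX_LENGTH / MAX_MEM_SIZE
    print(batch['sentence'].shape, batch['memory_size'])
    break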