Example #1
    def create_vocab(self, data):
        assert self.split == 'train', \
            "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i, i2w = dict(), dict()

        special_tokens = [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for program in data:
            tokens = program.split()
            w2c.update(tokens)

        for w, c in w2c.items():
            if c > self.min_occ:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)
        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        return vocab
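
Every snippet in this listing assumes an OrderedCounter helper defined elsewhere in its repository. A minimal sketch, following the well-known recipe of combining collections.Counter with collections.OrderedDict (the exact class in each project may differ):

from collections import Counter, OrderedDict

class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order in which keys are first encountered."""

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))

    def __reduce__(self):
        # Make pickling restore the insertion order as well.
        return self.__class__, (OrderedDict(self),)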
Example #2
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        patients = np.load(self.raw_data_path)[()]
        
        for patient in patients.keys():
            for visit in patients[patient]:
                w2c.update(visit)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." %len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
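
Example #2 reads its data with np.load(self.raw_data_path)[()]: the dict of patients was stored via np.save, which wraps it in a 0-d object array, and indexing with an empty tuple unwraps it. A small self-contained illustration of that idiom (the file name here is made up; on recent NumPy versions allow_pickle=True is required when loading):

import numpy as np

# Hypothetical round trip showing the np.load(...)[()] idiom from Example #2.
patients = {'patient_0': [['code_a', 'code_b'], ['code_c']]}
np.save('patients.npy', patients)  # the dict is wrapped in a 0-d object array

loaded = np.load('patients.npy', allow_pickle=True)[()]  # [()] unwraps the dict
assert loaded == patients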
Example #3
    def _create_vocab(self):
        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = PunktSentenceTokenizer(preserve_case=False)
        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
        with open(self.raw_data_path, 'r') as file:
            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)
            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)
        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
        self._load_vocab()
Example #4
    def f_create_vocab(self, vocab_obj, train_reviews):
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # train_reviews = train_df.review
       
        max_line = self.m_max_line
        line_i = 0
        for review in train_reviews:
            words = tokenizer.tokenize(review)
            w2c.update(words)

            if line_i > max_line:
                break

            line_i += 1

        print("max line", max_line)

        for w, c in w2c.items():
            if c > self.m_min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
Example #5
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        data_folder = self.raw_data_path + "/" + self.split

        line_num = 0

        for filename in os.listdir(data_folder):
            if "news.en" not in filename:
                continue

            if line_num > self.max_line:
                break

            full_filename = os.path.join(data_folder, filename)
            print("file", full_filename)

            file = open(full_filename, "r")

            print("max line", self.max_line)

            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)

                line_num += 1
                if line_num > self.max_line:
                    break

        print("line_num", line_num)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #6
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        self.w2c = OrderedCounter()
        self.w2i = dict()
        self.i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            self.i2w[len(self.w2i)] = st
            self.w2i[st] = len(self.w2i)


        labels = ['0', '1']
        for l in labels:
            print("updating vocab with sentences of label {}".format(l))
            file = open(self.raw_data_path + l, 'r')
            num_lines = self.num_lines_0 if l=='0' else self.num_lines_1

            for i, line in enumerate(tqdm(file, total=num_lines)):
                
                if(i == num_lines):
                    break

                words = tokenizer.tokenize(line)

                if(len(words) > self.max_sequence_length):
                    continue

                self.w2c.update(words)

            file.close()


        print("done creating w2c")
        for w, c in tqdm(self.w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                self.i2w[len(self.w2i)] = w
                self.w2i[w] = len(self.w2i)

        print("done creating w2i")

        assert len(self.w2i) == len(self.i2w)

        print("Vocablurary of %i keys created." % len(self.w2i))

        
        vocab = dict(w2i=self.w2i, i2w=self.i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
        self.v_size = len(self.w2i)
Example #7
    def create_vocab(self, vocab_file):
        self.vocab_file = vocab_file
        df = pd.DataFrame()

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        preprocess_word = Preprocess_Word()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(self.vocab_file, 'r') as file:
            lines = []
            for i, line in enumerate(file):
                lines.append(line)

                #line = rewrite_to_toklen(line)
                #words = tokenizer.tokenize(line)
                #words = [c for c in line]

                words = preprocess_word.to_words(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        self.w2i = w2i
        self.i2w = i2w

        self.dump()

        if self.train_with_vocab:
            df['url'] = lines
            self.create_data(df)

        elif self.train_file is not None:
            df = pd.read_csv(self.train_file, names=['url'])
            self.create_data(df)
Example #8
    def _create_vocab(self, raw_data_file, **kwargs):

        assert self.split == 'train', "Only for training data Vocablurary can be created."

        print("Creating New Vocablurary.")

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(raw_data_file) as file:

            for i, line in enumerate(file):
                line = self._preprocess(line)
                question, answer = line.split('|||')
                question = tokenizer.tokenize(question)
                question = question[:self.max_utterance_length]
                answer = tokenizer.tokenize(answer)
                answer = answer[:self.max_utterance_length -
                                1]  # sos or eos token will be added
                words = question + answer
                w2c.update(words)

                if i > 1000000:
                    break

            for w, c in w2c.items():
                if c > self.min_occ:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab_file_path = os.path.join(self.root, self.vocab_file_name)
        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(vocab_file_path, 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #9
 def __generate_features_dictionaries__(self):
     for feature in self.getFeatureNames():
         self.features_dicts[feature] = OrderedCounter()
     for graph in self.graphs:
         for edge in graph.get_edges():
             self.__basic_features_dictionaries__(edge)
     self.__add_basic_missing_tags__()
Example #10
    def f_create_vocab(self, vocab_obj):

        # assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(self.m_raw_train_data_path, 'r') as file:

            max_i = 0

            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)

                max_i = i

                if i > self.m_max_line:
                    break

            print("max_i", max_i)

            for w, c in w2c.items():
                if c > self.m_min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.m_data_dir, self.m_vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
Example #11
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        df = pandas.read_csv(self.raw_data_path)

        for i in range(len(df)):
            poem = df.iloc[i]["Poem"]
            poem = poem.replace("\r\r\n", " <nl> ")

            words = tokenizer.tokenize(poem)

            # Filter out poems that don't have newlines
            if words.count('<nl>') <= 2:
                continue

            w2c.update(words)

        for w, c in w2c.items():
            if c >= self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #12
    def create_vocab(self, hints, test_hints):
        w2i = dict()
        i2w = dict()
        w2c = OrderedCounter()

        special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for hint in hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

        if test_hints is not None:
            for hint in test_hints:
                hint_tokens = hint.split()
                w2c.update(hint_tokens)

        # sort tokens so that different instantiations of the dataset are compatible
        for w, c in sorted(list(w2c.items())):
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

        assert len(w2i) == len(i2w) == len(w2c)+len(special_tokens)
        vocab = dict(w2i=w2i, i2w=i2w, w2c=w2c)
        self.vocab = vocab

        logging.info('Created vocab with %d words.' % len(w2c))
Example #13
    def create_vocab(self, hints, test_hints):
        w2i = dict()
        i2w = dict()
        w2c = OrderedCounter()

        special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for hint in hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

        if test_hints is not None:
            for hint in test_hints:
                hint_tokens = hint.split()
                w2c.update(hint_tokens)

        for w, c in list(w2c.items()):
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)
        vocab = dict(w2i=w2i, i2w=i2w)
        self.vocab = vocab

        logging.info('Created vocab with %d words.' % len(w2c))
Example #14
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
        #print("PATH: ", self.raw_data_path)
        if self.rows > 0:
            file = pd.read_csv(self.raw_data_path, nrows=self.rows)['text']
        else:
            file = pd.read_csv(self.raw_data_path)['text']
        #print("Data size: ", file.shape, )
        file = file.dropna(axis=0)
        for i, line in enumerate(file):
            #if i == 27054:
            #    continue
            words = tokenizer.tokenize(line)
            w2c.update(words)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        #print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.vocab_directory, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #15
    def _create_vocab(self, vocab_obj, train_reviews):
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # train_reviews = train_df.review

        max_line = self.m_max_line
        line_i = 0
        for review in train_reviews:
            words = tokenizer.tokenize(review)
            w2c.update(words)

            if line_i > max_line:
                break

            line_i += 1

        print("max line", max_line)

        for w, c in w2c.items():
            if c > self.m_min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        # print("vocabulary of %i keys created"%len(w2i))

        # vocab = dict(w2i=w2i, i2w=i2w)
        # with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
        #     data = json.dumps(vocab, ensure_ascii=False)
        #     vocab_file.write(data.encode('utf8', 'replace'))

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
Example #16
    def _create_vocab(self, dataset_raw_file, **kwargs):

        assert self.split == 'train', "Vocabulary can only be created from training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        # add special tokens to vocab
        special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(dataset_raw_file, 'r') as file:

            # read data and count token occurrences
            for line in file.readlines():
                tokens = tokenizer.tokenize(line)
                w2c.update(tokens)

            # create vocab entries for tokens that occur more often than min_occ
            for w, c in w2c.items():
                if c > self.min_occ:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        vocab = dict(w2i=w2i, i2w=i2w)

        # save vocab to file
        vocab_file_path = os.path.join(self.root, 'vocab.json')
        with io.open(vocab_file_path, 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        print(
            "Vocablurary created with %i tokens. Minimum occurence criterion = %i."
            % (len(w2i), self.min_occ))

        self._load_vocab()
Example #17
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # NOTE: the special tokens in this vocab are also likely to appear across the
        #       ConditionEncoder, so their ids should be kept consistent
        assert w2i['<pad>'] == PAD_INDEX
        assert w2i['<unk>'] == UNK_INDEX
        assert w2i['<sos>'] == SOS_INDEX
        assert w2i['<eos>'] == EOS_INDEX

        with open(self.raw_data_path, 'r') as file:

            for i, line in enumerate(file):
                words = self.tokenize(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #18
    def _create_combined_vocab(self):
        # this function uses both snli + yelp to create vocab

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # first for yelp
        with open(self.yelp_raw_data_path, 'r') as file:

            for i, line in enumerate(tqdm(file, total=self.num_lines)):
                if (i == self.num_lines):
                    break
                words = tokenizer.tokenize(line)
                w2c.update(words)

        # now for snli
        with open(self.snli_raw_data_path, 'r') as file:

            for i, line in enumerate(tqdm(file, total=self.num_lines)):
                if (i == self.num_lines):
                    break
                words = tokenizer.tokenize(line)
                w2c.update(words)

        for w, c in tqdm(w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)

        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #19
class Yelp(Dataset):

    def __init__(self, split, create_data=False, have_vocab=False, **kwargs):

        super().__init__()
        self.data_dir = "./data/yelp/"
        self.save_model_path = "./saved_vae_models"
        self.split = split

        if(split == "train"):
            self.num_lines_0 = 176787
            self.num_lines_1 = 267314
        else:
            self.num_lines_0 = 50278
            self.num_lines_1 = 76392

        self.filter_sentiment_words = True
        self.filter_stop_words = True
        self.embedding_size = 300
        self.max_sequence_length = 15
        self.min_occ = kwargs.get('min_occ', 2)    

        self.have_vocab = have_vocab
        self.raw_data_path = "./data/yelp/sentiment." + split + '.'
        self.preprocessed_data_file = 'yelp.'+split+'.json'
        self.vocab_file = 'yelp.vocab.json'
        self.path_to_w2v_embds = './data/yelp/yelp_w2v_embeddings'
        self.path_to_w2v_weights = './data/yelp/yelp_w2v_weights'

        if create_data:
            print("Creating new %s ptb data." % split.upper())
            self._create_data()

        elif not os.path.exists(os.path.join(self.data_dir, self.preprocessed_data_file)):
            print("%s preprocessed file not found at %s. Creating new." % (
                split.upper(), os.path.join(self.data_dir, self.preprocessed_data_file)))
            self._create_data()

        else:
            print(" found preprocessed files, no need tooo create data!")
            self._load_data()

        # load bow vocab
        with open("./data/yelp/bow.json") as f:
            self.bow_filtered_vocab_indices = json.load(f)

        self.bow_hidden_dim = len(self.bow_filtered_vocab_indices)

        
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        idx = str(idx)

        return {
            'input': np.asarray(self.data[idx]['input']),
            'target': np.asarray(self.data[idx]['target']),
            'bow': self._get_bow_representations(self.data[idx]['input']),
            # 'label': np.asarray(self.data[idx]['label']),
            'label': np.asarray([1-self.data[idx]['label'], self.data[idx]['label']]), # we need to make it 2 dim to match predicted label dim.
            'length': self.data[idx]['length']
        }

    @property
    def vocab_size(self):
        return len(self.w2i)

    @property
    def pad_idx(self):
        return self.w2i['<pad>']

    @property
    def sos_idx(self):
        return self.w2i['<sos>']

    @property
    def eos_idx(self):
        return self.w2i['<eos>']

    @property
    def unk_idx(self):
        return self.w2i['<unk>']

    def get_w2i(self):
        return self.w2i

    def get_i2w(self):
        return self.i2w

    def _load_data(self, vocab=True):

        print("loading preprocessed json data...")

        with open(os.path.join(self.data_dir, self.preprocessed_data_file), 'r') as file:
            self.data = json.load(file)
        if vocab:
            with open(os.path.join(self.data_dir, self.vocab_file), 'r') as file:
                vocab = json.load(file)
            self.w2i, self.i2w = vocab['w2i'], vocab['i2w']


    def _load_vocab(self):
        with open(os.path.join(self.data_dir, self.vocab_file), 'r') as vocab_file:
            vocab = json.load(vocab_file)

        self.w2i, self.i2w = vocab['w2i'], vocab['i2w']

    def _create_data(self):

        if not self.have_vocab and self.split == 'train':
            print("creating vocab for train!")
            self._create_vocab()
            print("finished creating vocab!")
            print("creating bow vocab for train!")
            self.create_bow_vocab(self.w2i)
            print("finished creating bow vocab!")           
            print("creating w2v embs matrix")
            self.create_w2v_weight_matrix()
            print("finished creating w2v embs matrix!")
        else:
            self._load_vocab()
            print("loaded vocab from mem!")

        tokenizer = TweetTokenizer(preserve_case=False)
        data = defaultdict(dict)

        labels = ['0', '1']

        for l in labels: 
            print("import data with label {}".format(l))
            file = open(self.raw_data_path + l, 'r')

            num_lines = self.num_lines_0 if l=='0' else self.num_lines_1

            for i, line in enumerate(tqdm(file, total=num_lines)):

                if(i == num_lines):
                    break

                words = tokenizer.tokenize(line)

                # skip sentences longer than the max sequence length
                if(len(words) > self.max_sequence_length):
                    continue

                input = ['<sos>'] + words
                input = input[:self.max_sequence_length]

                target = words[:self.max_sequence_length-1]
                target = target + ['<eos>']

                assert len(input) == len(target), "%i, %i" % (len(input), len(target))
                length = len(input)

                input.extend(['<pad>'] * (self.max_sequence_length-length))
                target.extend(['<pad>'] * (self.max_sequence_length-length))

                input = [self.w2i.get(w, self.w2i['<unk>']) for w in input]
                target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

                id = len(data)
                data[id]['input'] = input
                data[id]['label'] = int(l)
                data[id]['target'] = target
                data[id]['length'] = length
            
            file.close()

        # shuffle the combined data
        print("Shuffling the combined data!")
        data = self.shuffle(data)

        with io.open(os.path.join(self.data_dir, self.preprocessed_data_file), 'wb') as preprocessed_data_file:
            data = json.dumps(data, ensure_ascii=False)
            preprocessed_data_file.write(data.encode('utf8', 'replace'))


        self._load_data(vocab=False)
    
    def shuffle(self, data):
        
        keys = [i for i in range(len(data))]
        random.shuffle(keys)
        data_shuffled = defaultdict(dict)

        i = 0
        for k in keys:
            if(data[k] is None):
                print("error in shuffle")
                exit()
            data_shuffled[i] = data[k]
            i = i+1

        return data_shuffled

    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        self.w2c = OrderedCounter()
        self.w2i = dict()
        self.i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            self.i2w[len(self.w2i)] = st
            self.w2i[st] = len(self.w2i)


        labels = ['0', '1']
        for l in labels:
            print("updating vocab with sentences of label {}".format(l))
            file = open(self.raw_data_path + l, 'r')
            num_lines = self.num_lines_0 if l=='0' else self.num_lines_1

            for i, line in enumerate(tqdm(file, total=num_lines)):
                
                if(i == num_lines):
                    break

                words = tokenizer.tokenize(line)

                if(len(words) > self.max_sequence_length):
                    continue

                self.w2c.update(words)

            file.close()


        print("done creating w2c")
        for w, c in tqdm(self.w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                self.i2w[len(self.w2i)] = w
                self.w2i[w] = len(self.w2i)

        print("done creating w2i")

        assert len(self.w2i) == len(self.i2w)

        print("Vocablurary of %i keys created." % len(self.w2i))

        
        vocab = dict(w2i=self.w2i, i2w=self.i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
        self.v_size = len(self.w2i)

    def create_w2v_weight_matrix(self):

        self.emb_matrix = np.zeros((self.v_size, self.embedding_size))
        # load the pretrained word embeddings
        w2v_model = KeyedVectors.load_word2vec_format(self.path_to_w2v_embds)

        found = 0
        not_found = 0

        for index in range(self.v_size):
            word = self.i2w[str(index)]
            
            if w2v_model.has_index_for(word):
                self.emb_matrix[index] = w2v_model.get_vector(word)
                found += 1
            else:
                self.emb_matrix[index] = np.random.randn(self.embedding_size)
                # print("word: {} was not found ".format(word))
                not_found += 1

        np.save(self.path_to_w2v_weights, self.emb_matrix)
        print("Done creating w2v embedding matrix. {} found and {} unfound".format(found, not_found))


   
    def _get_bow_representations(self, text_sequence):
        """
        Returns BOW representation of every sequence of the batch
        """
        # self.bow_hidden_dim = len(self.bow_filtered_vocab_indices)
        sequence_bow_representation = np.zeros(shape=self.bow_hidden_dim, dtype=np.float32)
     
        # Iterate over each word in the sequence
        for index in text_sequence:

            if str(index) in self.bow_filtered_vocab_indices:
                bow_index = self.bow_filtered_vocab_indices[str(index)]
                sequence_bow_representation[bow_index] += 1
        
        # normalise by the total count (clamped to at least 1); the scaling does not affect correctness
        sequence_bow_representation /= np.max([np.sum(sequence_bow_representation), 1])

        return np.asarray(sequence_bow_representation)

    def create_bow_vocab(self, word_index):
        """
        Creates a dict of vocab indices of non-stopwords and non-sentiment words
        """
        blacklisted_words = set()
        bow_filtered_vocab_indices = dict()
        # The '|' operator on sets in python acts as a union operator
        # blacklisted_words |= set(self.predefined_word_index.values())
        if self.filter_sentiment_words:
            blacklisted_words |= self._get_sentiment_words()
        if self.filter_stop_words:
            blacklisted_words |= self._get_stopwords()

        allowed_vocab = word_index.keys() - blacklisted_words
        i = 0

        for word in allowed_vocab:
            vocab_index = word_index[word]
            bow_filtered_vocab_indices[vocab_index] = i
            i += 1

        self.bow_hidden_dim = len(allowed_vocab)
        print("Created word index blacklist for BoW")
        print("BoW size: {}".format(self.bow_hidden_dim))
        
        # saving bow vocab
        with open('./data/yelp/bow.json', 'w') as json_file:
            json.dump(bow_filtered_vocab_indices, json_file)
        
        print("Saved bow.json at {}".format('./data/yelp/bow.json'))

    def _get_sentiment_words(self):
        """
        Returns all the sentiment words (positive and negative)
        which are excluded from the main vocab to form the BoW vocab
        """
        with open(file='./data/lexicon/positive-words.txt', mode='r', encoding='ISO-8859-1') as pos_sentiment_words_file,\
            open(file='./data/lexicon/negative-words.txt', mode='r', encoding='ISO-8859-1') as neg_sentiment_words_file:
            pos_words = pos_sentiment_words_file.readlines()
            neg_words = neg_sentiment_words_file.readlines()
            words = pos_words + neg_words
        words = set(word.strip() for word in words)

        return words

    def _get_stopwords(self):
        """
        Returns all the stopwords which are excluded from the
        main vocab to form the BoW vocab
        """
        nltk_stopwords = set(stopwords.words('english'))
        sklearn_stopwords = stop_words.ENGLISH_STOP_WORDS

        all_stopwords = set()
        # The '|' operator on sets in python acts as a union operator
        all_stopwords |= spacy_stopwords
        all_stopwords |= nltk_stopwords
        all_stopwords |= sklearn_stopwords

        return all_stopwords
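
The Yelp class above follows the usual PyTorch Dataset pattern (a __len__/__getitem__ pair returning dicts of arrays), so it would typically be consumed through a DataLoader. A hedged usage sketch, assuming Dataset here is torch.utils.data.Dataset and that the preprocessed files already exist on disk:

from torch.utils.data import DataLoader

# Hypothetical consumer of the Yelp dataset defined in Example #19.
train_set = Yelp(split='train', create_data=False)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

for batch in train_loader:
    # the default collate_fn stacks each dict field into a batched tensor
    inputs, lengths = batch['input'], batch['length']
    break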
Example #20
    def _create_vocab(self, bert):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        a2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        a2i = dict()
        i2a = dict()

        if bert:
            self.pad = '[PAD]'
            self.unk = '[UNK]'
            self.sos = '[CLS]'
            self.eos = '[SEP]'
        else:
            self.pad = '[PAD]'
            self.unk = '[UNK]'
            self.sos = '[SOS]'
            self.eos = '[EOS]'

        special_tokens = [self.pad, self.unk, self.sos, self.eos]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
            i2a[len(a2i)] = st
            a2i[st] = len(a2i)

        with open(self.raw_definition_path, 'r') as file:

            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        with open(self.raw_word_path, 'r') as file:

            for i, line in enumerate(file):
                words = list(line.strip())
                a2c.update(words)

        for w, c in a2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2a[len(a2i)] = w
                a2i[w] = len(a2i)

        assert len(a2i) == len(i2a)

        print("Vocabulary of %i keys created." % len(w2i))
        print("Alphabet of %i keys created." % len(a2i))

        vocab = dict(w2i=w2i, i2w=i2w, a2i=a2i, i2a=i2a)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab(bert)
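
Most of the examples above serialise the vocabulary as JSON with w2i and i2w keys and later reload it through a _load_vocab helper. A minimal sketch of such a consumer, assuming the same file layout (the function names below are illustrative, not taken from any of the repositories):

import json
import os

def load_vocab(data_dir, vocab_file):
    # Read back the vocab dict written by the examples above.
    with open(os.path.join(data_dir, vocab_file), 'r') as f:
        vocab = json.load(f)
    # JSON turns the integer keys of i2w into strings, so index it with str(idx).
    return vocab['w2i'], vocab['i2w']

def encode(tokens, w2i, unk='<unk>'):
    # Map tokens to ids, falling back to the <unk> id for out-of-vocabulary words.
    return [w2i.get(t, w2i[unk]) for t in tokens]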