def f_create_vocab(self, vocab_obj, train_reviews):
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # train_reviews = train_df.review
       
        max_line = self.m_max_line
        line_i = 0
        for review in train_reviews:
            words = tokenizer.tokenize(review)
            w2c.update(words)

            if line_i > max_line:
                break

            line_i += 1

        print("max line", max_line)

        for w, c in w2c.items():
            if c > self.m_min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
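Note: every example in this listing relies on an OrderedCounter helper that the snippets themselves do not define. A minimal sketch, assuming the standard Counter/OrderedDict recipe from the Python collections documentation:

from collections import Counter, OrderedDict

class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order in which elements were first seen."""

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))

    def __reduce__(self):
        return self.__class__, (OrderedDict(self),)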
Example #2
    def create_vocab(self, hints, test_hints):
        w2i = dict()
        i2w = dict()
        w2c = OrderedCounter()

        special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for hint in hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

        if test_hints is not None:
            for hint in test_hints:
                hint_tokens = hint.split()
                w2c.update(hint_tokens)

        # sort tokens so that different instantiations of the dataset are compatible
        for w, c in sorted(list(w2c.items())):
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

        assert len(w2i) == len(i2w) == len(w2c)+len(special_tokens)
        vocab = dict(w2i=w2i, i2w=i2w, w2c=w2c)
        self.vocab = vocab

        logging.info('Created vocab with %d words.' % len(w2c))
Example #3
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        patients = np.load(self.raw_data_path, allow_pickle=True)[()]  # object array holding a dict, so allow_pickle is needed
        
        for patient in patients.keys():
            for visit in patients[patient]:
                w2c.update(visit)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." %len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
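The _load_vocab method called above is not part of this listing. A minimal sketch of what it might look like, assuming the JSON layout written by _create_vocab (top-level 'w2i' and 'i2w' keys):

    def _load_vocab(self):
        # Read back the vocab JSON written by _create_vocab.
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'r', encoding='utf-8') as vocab_file:
            vocab = json.load(vocab_file)
        # Note: JSON serialization turns the integer keys of i2w into strings.
        self.w2i, self.i2w = vocab['w2i'], vocab['i2w']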
Example #4
    def create_vocab(self, hints, test_hints):
        w2i = dict()
        i2w = dict()
        w2c = OrderedCounter()

        special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for hint in hints:
            hint_tokens = hint.split()
            w2c.update(hint_tokens)

        if test_hints is not None:
            for hint in test_hints:
                hint_tokens = hint.split()
                w2c.update(hint_tokens)

        for w, c in list(w2c.items()):
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)
        vocab = dict(w2i=w2i, i2w=i2w)
        self.vocab = vocab

        logging.info('Created vocab with %d words.' % len(w2c))
Example #5
    def __generate_features_dictionaries__(self):
        for feature in self.getFeatureNames():
            self.features_dicts[feature] = OrderedCounter()
        for graph in self.graphs:
            for edge in graph.get_edges():
                self.__basic_features_dictionaries__(edge)
        self.__add_basic_missing_tags__()
Example #6
    def create_vocab(self, data):
        assert self.split == 'train', \
            "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i, i2w = dict(), dict()

        special_tokens = [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        for program in data:
            tokens = program.split()
            w2c.update(tokens)

        for w, c in w2c.items():
            if c > self.min_occ:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)
        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        return vocab
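For context, a hypothetical encode helper (not part of the original code) showing how the returned w2i map is typically consumed downstream, assuming the special tokens are the literal strings '<pad>', '<unk>', '<sos>', '<eos>':

def encode(sentence, w2i, max_len=20):
    # Look up each token, falling back to <unk>, and frame with <sos>/<eos>.
    tokens = sentence.split()[:max_len - 2]
    ids = [w2i['<sos>']]
    ids += [w2i.get(t, w2i['<unk>']) for t in tokens]
    ids.append(w2i['<eos>'])
    # Pad to a fixed length so sequences can be batched together.
    ids += [w2i['<pad>']] * (max_len - len(ids))
    return ids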
Example #7
    def _create_vocab(self):
        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = TweetTokenizer(preserve_case=False)
        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
        with open(self.raw_data_path, 'r') as file:
            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)
            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)
        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))
        self._load_vocab()
Example #8
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        data_folder = self.raw_data_path + "/" + self.split

        line_num = 0

        for filename in os.listdir(data_folder):
            if "news.en" not in filename:
                continue

            if line_num > self.max_line:
                break

            full_filename = os.path.join(data_folder, filename)
            print("file", full_filename)

            print("max line", self.max_line)

            with open(full_filename, "r") as file:
                for i, line in enumerate(file):
                    words = tokenizer.tokenize(line)
                    w2c.update(words)

                    line_num += 1
                    if line_num > self.max_line:
                        break

        print("line_num", line_num)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #9
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        self.w2c = OrderedCounter()
        self.w2i = dict()
        self.i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            self.i2w[len(self.w2i)] = st
            self.w2i[st] = len(self.w2i)


        labels = ['0', '1']
        for l in labels:
            print("updating vocab with sentences of label {}".format(l))
            file = open(self.raw_data_path + l, 'r')
            num_lines = self.num_lines_0 if l=='0' else self.num_lines_1

            for i, line in enumerate(tqdm(file, total=num_lines)):
                
                if(i == num_lines):
                    break

                words = tokenizer.tokenize(line)

                if(len(words) > self.max_sequence_length):
                    continue

                self.w2c.update(words)

            file.close()


        print("done creating w2c")
        for w, c in tqdm(self.w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                self.i2w[len(self.w2i)] = w
                self.w2i[w] = len(self.w2i)

        print("done creating w2i")

        assert len(self.w2i) == len(self.i2w)

        print("Vocablurary of %i keys created." % len(self.w2i))

        
        vocab = dict(w2i=self.w2i, i2w=self.i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
        self.v_size = len(self.w2i)
Example #10
    def _create_combined_vocab(self):
        # this function uses both snli + yelp to create vocab

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # first for yelp
        with open(self.yelp_raw_data_path, 'r') as file:

            for i, line in enumerate(tqdm(file, total=self.num_lines)):
                if (i == self.num_lines):
                    break
                words = tokenizer.tokenize(line)
                w2c.update(words)

        # now for snli
        with open(self.snli_raw_data_path, 'r') as file:

            for i, line in enumerate(tqdm(file, total=self.num_lines)):
                if (i == self.num_lines):
                    break
                words = tokenizer.tokenize(line)
                w2c.update(words)

        for w, c in tqdm(w2c.items()):
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)

        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #11
    def _create_vocab(self, raw_data_file, **kwargs):

        assert self.split == 'train', "Vocabulary can only be created from training data."

        print("Creating New Vocablurary.")

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(raw_data_file) as file:

            for i, line in enumerate(file):
                line = self._preprocess(line)
                question, answer = line.split('|||')
                question = tokenizer.tokenize(question)
                question = question[:self.max_utterance_length]
                answer = tokenizer.tokenize(answer)
                answer = answer[:self.max_utterance_length -
                                1]  # sos or eos token will be added
                words = question + answer
                w2c.update(words)

                if i > 1000000:
                    break

            for w, c in w2c.items():
                if c > self.min_occ:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab_file_path = os.path.join(self.root, self.vocab_file_name)
        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(vocab_file_path, 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #12
    def create_vocab(self, vocab_file):
        self.vocab_file = vocab_file
        df = pd.DataFrame()

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        preprocess_word = Preprocess_Word()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(self.vocab_file, 'r') as file:
            lines = []
            for i, line in enumerate(file):
                lines.append(line)

                #line = rewrite_to_toklen(line)
                #words = tokenizer.tokenize(line)
                #words = [c for c in line]

                words = preprocess_word.to_words(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        self.w2i = w2i
        self.i2w = i2w

        self.dump()

        if self.train_with_vocab:
            df['url'] = lines
            self.create_data(df)

        elif self.train_file is not None:
            df = pd.read_csv(self.train_file, names=['url'])
            self.create_data(df)
Example #13
    def f_create_vocab(self, vocab_obj):

        # assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(self.m_raw_train_data_path, 'r') as file:

            max_i = 0

            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)

                max_i = i

                if i > self.m_max_line:
                    break

            print("max_i", max_i)

            for w, c in w2c.items():
                if c > self.m_min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.m_data_dir, self.m_vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
Example #14
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        df = pandas.read_csv(self.raw_data_path)

        for i in range(len(df)):
            poem = df.iloc[i]["Poem"]
            poem = poem.replace("\r\r\n", " <nl> ")

            words = tokenizer.tokenize(poem)

            # Filter out poems that don't have newlines
            if words.count('<nl>') <= 2:
                continue

            w2c.update(words)

        for w, c in w2c.items():
            if c >= self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #15
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
        #print("PATH: ", self.raw_data_path)
        if self.rows > 0:
            file = pd.read_csv(self.raw_data_path, nrows=self.rows)['text']
        else:
            file = pd.read_csv(self.raw_data_path)['text']
        #print("Data size: ", file.shape, )
        file = file.dropna(axis=0)
        for i, line in enumerate(file):
            #if i == 27054:
            #    continue
            words = tokenizer.tokenize(line)
            w2c.update(words)

        for w, c in w2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        #print("Vocabulary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.vocab_directory, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #16
    def _create_vocab(self, vocab_obj, train_reviews):
        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # train_reviews = train_df.review

        max_line = self.m_max_line
        line_i = 0
        for review in train_reviews:
            words = tokenizer.tokenize(review)
            w2c.update(words)

            if line_i > max_line:
                break

            line_i += 1

        print("max line", max_line)

        for w, c in w2c.items():
            if c > self.m_min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        # print("vocabulary of %i keys created"%len(w2i))

        # vocab = dict(w2i=w2i, i2w=i2w)
        # with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file:
        #     data = json.dumps(vocab, ensure_ascii=False)
        #     vocab_file.write(data.encode('utf8', 'replace'))

        print("len(i2w)", len(i2w))
        vocab_obj.f_set_vocab(w2i, i2w)
Example #17
    def _create_vocab(self, dataset_raw_file, **kwargs):

        assert self.split == 'train', "Vocabulary can only be created from training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        # add special tokens to vocab
        special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        with open(dataset_raw_file, 'r') as file:

            # read data and count token occurrences
            for line in file.readlines():
                tokens = tokenizer.tokenize(line)
                w2c.update(tokens)

            # create vocab entries for tokens that occur more than min_occ times
            for w, c in w2c.items():
                if c > self.min_occ:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        vocab = dict(w2i=w2i, i2w=i2w)

        # save vocab to file
        vocab_file_path = os.path.join(self.root, 'vocab.json')
        with io.open(vocab_file_path, 'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        print(
            "Vocabulary created with %i tokens. Minimum occurrence criterion = %i."
            % (len(w2i), self.min_occ))

        self._load_vocab()
Example #18
    def _create_vocab(self):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        w2c = OrderedCounter()
        w2i = dict()
        i2w = dict()

        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        # NOTE: the special tokens in this vocab are also likely to appear across
        #       the ConditionEncoder, so their ids should be kept consistent.
        assert w2i['<pad>'] == PAD_INDEX
        assert w2i['<unk>'] == UNK_INDEX
        assert w2i['<sos>'] == SOS_INDEX
        assert w2i['<eos>'] == EOS_INDEX

        with open(self.raw_data_path, 'r') as file:

            for i, line in enumerate(file):
                words = self.tokenize(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab()
Example #19
    def _create_vocab(self, bert):

        assert self.split == 'train', "Vocabulary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2c = OrderedCounter()
        a2c = OrderedCounter()
        w2i = dict()
        i2w = dict()
        a2i = dict()
        i2a = dict()

        if bert:
            self.pad = '[PAD]'
            self.unk = '[UNK]'
            self.sos = '[CLS]'
            self.eos = '[SEP]'
        else:
            self.pad = '[PAD]'
            self.unk = '[UNK]'
            self.sos = '[SOS]'
            self.eos = '[EOS]'

        special_tokens = [self.pad, self.unk, self.sos, self.eos]
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)
            i2a[len(a2i)] = st
            a2i[st] = len(a2i)

        with open(self.raw_definition_path, 'r') as file:

            for i, line in enumerate(file):
                words = tokenizer.tokenize(line)
                w2c.update(words)

            for w, c in w2c.items():
                if c > self.min_occ and w not in special_tokens:
                    i2w[len(w2i)] = w
                    w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        with open(self.raw_word_path, 'r') as file:

            for i, line in enumerate(file):
                words = list(line.strip())
                a2c.update(words)

        for w, c in a2c.items():
            if c > self.min_occ and w not in special_tokens:
                i2a[len(a2i)] = w
                a2i[w] = len(a2i)

        assert len(a2i) == len(i2a)

        print("Vocabulary of %i keys created." % len(w2i))
        print("Alphabet of %i keys created." % len(a2i))

        vocab = dict(w2i=w2i, i2w=i2w, a2i=a2i, i2a=i2a)
        with io.open(os.path.join(self.data_dir, self.vocab_file),
                     'wb') as vocab_file:
            data = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(data.encode('utf8', 'replace'))

        self._load_vocab(bert)