Example #1
from typing import Dict, Iterable, List

# tf.keras is assumed here; the standalone keras package exposes the same class.
from tensorflow.keras.preprocessing.text import Tokenizer as KTokenizer


# `Tokenizer` is the project's own abstract base class, not the Keras one.
class KerasTokenizer(Tokenizer):
    def __init__(self, **kwargs):
        self._keras_tokenizer = KTokenizer(**kwargs)

    def encode(self, text: str) -> List[int]:
        return self._keras_tokenizer.texts_to_sequences([text])[0]

    def decode(self, sequence: List[int]) -> str:
        return self._keras_tokenizer.sequences_to_texts([sequence])[0]

    @property
    def vocab_size(self) -> int:
        # word_index contains every word seen during fit(), regardless of any
        # num_words limit passed to the underlying Keras tokenizer.
        return len(self._keras_tokenizer.word_index)

    def fit(self, texts: Iterable[str]):
        self._keras_tokenizer.fit_on_texts(texts)

    @property
    def token_index(self) -> Dict[str, int]:
        return self._keras_tokenizer.word_index
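A quick round trip with the wrapper above, for illustration only: the texts, the
keyword arguments and the printed ids are hypothetical, and KerasTokenizer is the
class defined in this example.

tok = KerasTokenizer(num_words=1000, oov_token='<UNK>')
tok.fit(["the cat sat on the mat", "the dog ate my homework"])
ids = tok.encode("the cat ate the mat")  # e.g. [1, 3, 8, 1, 6]
print(tok.decode(ids))                   # "the cat ate the mat"
print(tok.vocab_size)                    # number of distinct words seen in fit()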
Example #2
    def run(self):

        X = None

        for data_file in self.args.data_files:
            ds = pd.read_csv(data_file,
                             sep=self.args.sep,
                             keep_default_na=False)
            if X is None:
                X = ds[self.args.text_field].values
            else:
                X = np.append(X, ds[self.args.text_field].values, axis=0)

        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X)

        if self.args.w2v:
            # Round-trip through the tokenizer so the word2vec corpus uses the
            # same lower-casing and punctuation filtering as the fitted vocabulary.
            X = tokenizer.texts_to_sequences(X)
            X = tokenizer.sequences_to_texts(X)
            self.build_fit_w2v(X)
        else:
            self.build_embedding(tokenizer.word_index)
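The texts_to_sequences / sequences_to_texts round trip above (also used in
Example #4) pushes the corpus through the tokenizer before word2vec training.
A minimal, self-contained sketch of its effect, with made-up sentences and
tf.keras assumed:

from tensorflow.keras.preprocessing.text import Tokenizer

corpus = ["Hello, World!", "hello there, here is a rare word"]
tok = Tokenizer(num_words=4)          # keep only the 3 most frequent words
tok.fit_on_texts(corpus)
seqs = tok.texts_to_sequences(corpus)
# Lower-cased, punctuation stripped, out-of-vocabulary words dropped.
print(tok.sequences_to_texts(seqs))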
Example #3
import os

import imageio
import numpy as np
from matplotlib import pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

# `config` is the project's configuration module (dataset_path, vocab_path,
# vocab_size); it is not part of Keras.


class DataHandler:
    def __init__(self):
        self.formula_path = os.path.join(config.dataset_path, 'formulas')
        self.images_path = os.path.join(config.dataset_path, 'images')

        self.beg_token = '<BOS>'
        self.end_token = '<EOS>'
        self.unk_token = '<UNK>'
        self.tokenizer = None

        self.__fit_tokenizer()

    def __fit_tokenizer(self):
        if os.path.isfile(config.vocab_path):
            with open(config.vocab_path, 'r') as f:
                json_content = f.read()
                self.tokenizer = tokenizer_from_json(json_content)
        else:
            # Repeat the special tokens so they rank among the most frequent
            # words and are guaranteed to survive the num_words cut.
            tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
            docs = [tmp_doc, self.__read_raw_formulas('train')]
            num_tokens = config.vocab_size - 3  # reserve room for the beg, end and unk tokens
            self.tokenizer = Tokenizer(num_words=num_tokens,
                                       filters='\t\n',
                                       lower=False,
                                       oov_token=self.unk_token)
            self.tokenizer.fit_on_texts(docs)
            with open(config.vocab_path, 'w+') as f:
                f.write(self.tokenizer.to_json())

    def get_path(self, mode):
        formulas_path = os.path.join(self.formula_path,
                                     '{}_formulas.txt'.format(mode))
        images_folder = os.path.join(self.images_path,
                                     'images_{}'.format(mode))
        return formulas_path, images_folder

    def __read_raw_formulas(self, mode, split=False):
        path = self.get_path(mode)[0]
        try:
            with open(path, 'r') as f:
                content = f.read()
                if split:
                    lines = content.split('\n')
                    if not lines[-1]:
                        lines = lines[:-1]
                    return lines
                return content
        except OSError:
            # A missing formula file yields an empty result instead of crashing.
            return [] if split else ''

    def pad_token(self):
        # Padding reuses the <EOS> token id.
        return self.tokenizer.word_index[self.end_token]

    def start_token(self):
        return self.tokenizer.word_index[self.beg_token]

    def read_formulas(self, mode):
        lines = self.__read_raw_formulas(mode, split=True)
        for i in range(len(lines)):
            lines[i] = '{} {} {}'.format(self.beg_token, lines[i],
                                         self.end_token)
        result = self.tokenizer.texts_to_sequences(lines)
        return result

    def read_images(self, mode, index):
        dir_path = self.get_path(mode)[1]
        images_data = []
        for i in index:
            file_path = os.path.join(dir_path, str(i) + '.png')
            if os.path.isfile(file_path):
                image = imageio.imread(file_path)
                images_data.append(image)
        data = np.array(images_data)
        data = 255 - data  # invert grayscale so the white background becomes 0
        return data

    def decode_formula(self, sequences):
        def normalize(formula):
            # Strip the leading <BOS> marker and anything from <EOS> onwards.
            start_idx, end_idx = 0, len(formula)
            if formula.startswith(self.beg_token + ' '):
                start_idx = len(self.beg_token) + 1
            try:
                end_idx = formula.index(self.end_token)
            except ValueError:
                pass
            return formula[start_idx:end_idx]

        sequences_list = sequences.tolist()
        formulas = self.tokenizer.sequences_to_texts(sequences_list)
        formulas = [normalize(formula) for formula in formulas]
        return formulas

    def plot_sample_sizes(self):
        lines = self.__read_raw_formulas('train', split=True)
        training_size = len(lines)
        lines += self.__read_raw_formulas('validation', split=True)
        validation_size = len(lines) - training_size
        print('Training set size: ', training_size)
        print('Validation set size: ', validation_size)

        # Character length of each raw formula line.
        sample_sizes = [len(line) for line in lines]

        # the histogram of the data
        n, bins, patches = plt.hist(sample_sizes,
                                    20,
                                    facecolor='g',
                                    alpha=0.75)
        plt.xlabel('length of formula')
        plt.ylabel('sample size')
        plt.title('Histogram of Length of formulas')
        plt.grid(True)
        plt.show()
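A self-contained sketch of the special-token trick in __fit_tokenizer above:
repeating <BOS> and <EOS> many times before fitting makes them the most frequent
tokens, so they get the lowest indices and survive the num_words cut. The formula
string and the num_words value are illustrative.

from tensorflow.keras.preprocessing.text import Tokenizer

docs = [('<BOS> <EOS> ') * 100, 'x ^ 2 + y ^ 2 = r ^ 2']
tok = Tokenizer(num_words=7, filters='\t\n', lower=False, oov_token='<UNK>')
tok.fit_on_texts(docs)
# The oov token is forced to index 1; the most frequent tokens follow.
print(tok.word_index['<BOS>'], tok.word_index['<EOS>'])  # e.g. 2 3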
Example #4
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing.text import Tokenizer

# `UnsupervisedBaseModel` and the args/helpers used below come from the
# surrounding project.


class UnsupervisedKmeansAvgBaseModel(UnsupervisedBaseModel):
    def __init__(self, task):
        super(UnsupervisedKmeansAvgBaseModel, self).__init__(task)
        self.num_clusters = 4  # combinations of social and agency
        self.clf_model = KMeans(init='k-means++',
                                n_clusters=self.num_clusters,
                                n_init=10,
                                random_state=self.args.random_state)

    def augment_features(self, X_text, X_all_feats):

        if not self.args.use_allfeats:
            return X_text

        X_all = np.concatenate([X_text, X_all_feats[:, 2:]], axis=1)

        return X_all

    def train(self, X, y=None):
        X, y = self.augment_instances(X, y)

        #X_text = self.text_repr_model.fit_transform(X[:, self.args.TEXT_COL])

        X_text = X[:, self.args.TEXT_COL]

        # Restrict the corpus to the top max_features words: the round trip drops
        # out-of-vocabulary words and normalises the text before word2vec training.
        self.max_features = 4000
        self.tokenizer = Tokenizer(num_words=self.max_features)
        self.tokenizer.fit_on_texts(X_text)
        X_text = self.tokenizer.texts_to_sequences(X_text)
        X_text = self.tokenizer.sequences_to_texts(X_text)

        self.text_rep_model = self.build_fit_w2v(X_text)

        X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)

        X_all_feats = self.augment_features(X_text, X)

        # Use the principal components (one per cluster) as the initial k-means
        # centres, then run a single k-means pass.
        pca = PCA(n_components=self.num_clusters,
                  random_state=self.args.random_state)
        pca.fit(X_all_feats)

        model = KMeans(init=pca.components_,
                       n_clusters=self.num_clusters,
                       n_init=1,
                       random_state=self.args.random_state)
        model.fit(X_all_feats)

        self.clf_model = model

    def predict(self, X):

        X_text = X[:, self.args.TEXT_COL]

        #X_text = self.text_rep_model.transform(X[:, self.args.TEXT_COL])
        X_text = self.transform_text_to_w2v(self.text_rep_model, X_text)

        X_all_feats = self.augment_features(X_text, X)
        y_pred = self.clf_model.predict(X_all_feats)

        # Unpack each cluster id (0-3) into its two low-order bits, giving two
        # binary label columns per sample (least significant bit first).
        y = y_pred.astype(np.uint8)
        y = np.unpackbits(y)
        y = y.reshape(y_pred.shape[0], 8)
        y = y[:, -2:]
        y = y[:, ::-1]

        return y
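The bit-unpacking at the end of predict() turns each cluster id (0-3) into two
binary columns, matching the social/agency combinations mentioned in __init__.
A standalone sketch with dummy cluster ids:

import numpy as np

y_pred = np.array([0, 1, 2, 3], dtype=np.uint8)
bits = np.unpackbits(y_pred).reshape(y_pred.shape[0], 8)  # 8 bits per id, MSB first
labels = bits[:, -2:][:, ::-1]                            # keep the 2 low bits, LSB first
print(labels)  # rows: [0 0], [1 0], [0 1], [1 1]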
Example #5
    # Convert the two aligned sequences into integer id arrays with the fitted tokenizer.
    s1 = np.array(tokenizer.texts_to_sequences(s1))
    s2 = np.array(tokenizer.texts_to_sequences(s2))
    print(s1)
    print(s2)
    return [s1, s2]


# get_vocab, the trained `model`, `x_valid` and `word_length` are defined
# elsewhere in the full script.
import gensim
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer

w2v = gensim.models.KeyedVectors.load_word2vec_format('./alignment_vec.txt',
                                                      binary=False)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(get_vocab('atcgx'))

layer_outputs = [layer.output for layer in model.layers[2:]]
print(layer_outputs)
for layer in layer_outputs:
    print(layer)
activation_model = Model(inputs=model.input, outputs=layer_outputs)
activations = activation_model.predict(
    get_test_alignment(x_valid, 0, 1, tokenizer))
#print(activations)
prediction = activations[-1]
print(prediction)
activation_1 = activations[7]
#print(activation_1)
for kernel in activation_1[0]:
    words = np.reshape(kernel, (-1, word_length))
    print('Learned alignment motifs:')
    # Round the activations to integer token ids and map them back to text.
    print(np.array(tokenizer.sequences_to_texts(np.round(words).astype(int))))

    #print(w2v.similar_by_vector(words[0], topn=1)[0])
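The motif decoding above relies on sequences_to_texts to map (near-)integer
activations back to tokens. A tiny, self-contained illustration with a stand-in
vocabulary mirroring get_vocab('atcgx') and hard-coded values:

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer()
tok.fit_on_texts(['a t c g x'])
ids = np.array([[1.2, 2.1, 2.9], [3.4, 4.0, 5.1]])        # stand-in for the rounded activations
print(tok.sequences_to_texts(np.round(ids).astype(int)))  # e.g. ['a t c', 'c g x']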