Example 1
    def filter_and_count(self, filepath_in, filepath_out):
        print("Filtering numbers ...")
        import os      # used below; not imported in the original snippet
        import glob
        import codecs  # used below; not imported in the original snippet

        # directory mode: merge every *.txt file into one output dump
        if os.path.isdir(filepath_in):
            files = glob.glob(os.path.join(filepath_in, '*.txt'))
        else:
            files = [filepath_in]

        output = open(filepath_out, 'w', encoding='utf-8')

        for fpath in files:

            with codecs.open(fpath, 'r', encoding='utf-8') as file:
                s = file.read()
                sent = s.strip().split()
                sent_filtered = []
                for token in sent:

                    if is_numeral(token):
                        # normalize, e.g. '-32.000' -> '-32.0'
                        # double check to reject '-haha'-like tokens
                        number = str(to_numeral(token))
                        self.nc[number] = self.nc.get(number, 0) + 1
                        sent_filtered.append(number)

                    else:
                        self.wc[token] = self.wc.get(token, 0) + 1
                        sent_filtered.append(token)

                output.write(' '.join(sent_filtered) + '\n')

        output.close()
        print("filtering corpus done")
    def get_item(self, iword, owords):
        """
        form a proper data structure
        :param iword:
        :param owords:
        :return:
        """
        item = [None, [], 0, None, [0] * 2 * self.window, []]
        # [
        #   iword,
        #   [list of owords],
        #   0 or 1, indicating whether iword is a numeral,
        #   None if iword is a token, numeral float if iword is a numeral,
        #   [one-hot indicator of owords],
        #   [list of numerals]
        # ]
        #
        # For example: if She is the center word and the window size is 2
        # oh , (She) is 1.67 m
        # [12, [99, 4, 5, 0], 0, None, [0,0,0,1], [1.67]]

        if is_numeral(iword):
            item[0] = self.word2idx[self.unk]
            item[2] = 1
            item[3] = to_numeral(iword)

        else:
            item[0] = self.word2idx[iword]

        for j, ow in enumerate(owords):
            flag, oword = to_numeral_if_possible(ow)

            if flag:
                item[1].append(self.word2idx[self.unk])
                item[4][j] = 1
                item[5].append(oword)
            else:
                item[1].append(self.word2idx[oword])

        return item
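
A minimal walk-through of the layout documented in the comment, assuming `get_item` and the numeral helpers (`is_numeral`, `to_numeral_if_possible`, the latter returning a `(bool, value)` pair) are in scope; the toy indices below are made up to mirror the comment's own example:

# HYPOTHETICAL usage sketch, not part of the original source.
class _Toy:
    window = 2
    unk = '<unk>'
    word2idx = {'<unk>': 0, 'oh': 99, ',': 4, 'is': 5, 'She': 12}

item = get_item(_Toy(), 'She', ['oh', ',', 'is', '1.67'])
print(item)  # -> [12, [99, 4, 5, 0], 0, None, [0, 0, 0, 1], [1.67]]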
Example 3
if __name__ == '__main__':

    # nc path
    nc = pickle.load(
        open(
            '../data/wikipedia/preprocess0.05Bnotable/NumeralAsNumeral/nc.dat',
            'rb'))
    gmm_save_dir = 'gmm'
    if not os.path.exists(gmm_save_dir):
        os.makedirs(gmm_save_dir)

    random.seed(100)
    # unfold and shuffle nc data
    data = []
    for k, v in nc.items():
        if to_numeral(k) is None:
            print('invalid numeral {}'.format(k))
        else:
            data += [[to_numeral(k)]] * v

    print('total number of different numerals: ', len(nc))
    print('total number of numeral samples: ', len(data))

    random.shuffle(data)
    data = np.array(data).reshape(-1, 1)

    prototypes = pickle.load(
        open(
            '../data/wikipedia/preprocess0.05Bnotable/NumeralAsNumeral/som/prototypes-50-0.6-1.0.dat',
            'rb'))
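    # NOTE: args (preprocess_dir, ss_t, weights) presumably comes from an
    # argparse setup not shown in this excerpt; the module-level imports
    # (pickle, os, random, numpy as np) are likewise elided.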
    idx2word = pickle.load(
        open(os.path.join(args.preprocess_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.preprocess_dir, 'wc.dat'), 'rb'))
    nc = pickle.load(open(os.path.join(args.preprocess_dir, 'nc.dat'), 'rb'))

    # filter nc
    for k, v in nc.copy().items():
        # must be float32 because later calculations use float32
        f = np.float32(k)
        if np.isnan(f) or np.isinf(f):
            nc.pop(k)
            print(f)

    numeral2idx = {
        to_numeral(numeral): idx
        for idx, numeral in enumerate(list(nc.keys()))
    }

    wf = np.array([wc[word] for word in idx2word])
    w_sum = wf.sum()
    wf = wf / w_sum
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    token_weights = wf if args.weights else None

    nf = np.array(list(nc.values()))
    n_sum = nf.sum()
    nf = nf / n_sum
    numerals = np.array(list(nc.keys()))
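
The `ws` line above implements the word2vec-style subsampling rule: a token with normalized frequency f is discarded with probability 1 - sqrt(t / f), clipped to [0, 1], where t is the threshold `args.ss_t`. A tiny numeric check (the frequencies here are made up for illustration):

import numpy as np

wf = np.array([0.05, 0.001, 0.00001])  # frequent, medium, rare token
ss_t = 1e-5                            # threshold, as in args.ss_t
ws = np.clip(1 - np.sqrt(ss_t / wf), 0, 1)
print(ws)  # ~[0.9859, 0.9, 0.0]: frequent tokens are dropped most often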
Example 4
    def filter_and_count(self, filepath_in, filepath_out):
        print("Filtering numbers ...")
        import re
        import os      # used below; not imported in the original snippet
        import glob
        import codecs  # used below; not imported in the original snippet

        step = 0

        # regex matching every supported numeral token: plain numbers (with
        # optional sign, thousands separators, decimals) or fractions
        RE_NUM = re.compile(
            r"(((-?\d+(,\d{3})*(\.\d+)?)\/(-?\d+(,\d{3})*(\.\d+)?))|(-?\d+(,\d{3})*(\.\d+)?))",
            re.UNICODE)

        # directory mode: merge every *.txt file into one output dump
        if os.path.isdir(filepath_in):
            files = glob.glob(os.path.join(filepath_in, '*.txt'))
        else:
            files = [filepath_in]

        output = open(filepath_out, 'w', encoding='utf-8')

        for fpath in files:

            with codecs.open(fpath, 'r', encoding='utf-8') as file:

                for line in file:
                    step += 1
                    if not step % 1000:
                        print("\n working on {}kth line in file {}".format(
                            step // 1000, fpath))

                    line = line.strip()
                    if not line:
                        continue
                    sent = line.split()
                    sent_filtered = []
                    for token in sent:
                        # we treat word and numerals differently
                        # match numerals
                        res = RE_NUM.findall(token)
                        if res:
                            target = number_handler(token)
                            # we do not want nc to record ''
                            if target == '':
                                continue

                            if type(target) is list:
                                # 'u-32' to ['u', '-', '32']
                                # '1997/07' to ['1997', '/', '7']
                                for i in target:
                                    ww = str(
                                        to_numeral(i)) if is_numeral(i) else i
                                    self.wc[ww] = self.wc.get(
                                        ww, 0) + 1  # numerals counted in wc here
                                    sent_filtered.append(ww)

                            elif is_numeral(target):
                                # normalize, e.g. '-32.000' -> '-32.0'
                                # double check to reject '-haha'-like tokens
                                number = str(to_numeral(target))
                                self.wc[number] = self.wc.get(
                                    number, 0) + 1  # numerals counted in wc here
                                sent_filtered.append(number)

                        else:
                            self.wc[token] = self.wc.get(token, 0) + 1
                            sent_filtered.append(token)

                    output.write(' '.join(sent_filtered) + '\n')
        output.close()
        print("filtering and counting done")
    def train_gmm(self,
                  components=20,
                  iters=100,
                  gmm_init_mode='rd',
                  gmm_type='soft',
                  prototype_path=None,
                  log_space=False):

        # print('<<<<<<<<<<INITIALIZING>>>>>>>>>> \n means: {} \n sigma: {}\n, weights: {}'.format(gmm.means_, gmm.covariances_, gmm.weights_))
        assert gmm_init_mode in ['rd', 'fp', 'km']
        assert gmm_type in ['soft', 'hard']
        nc = pickle.load(open(os.path.join(self.save_dir, 'nc.dat'), 'rb'))

        # we use a fixed random seed
        # random.seed(100)
        # unfold and shuffle nc data
        data = []
        for k, v in nc.items():
            if to_numeral(k) is None:
                print('invalid numeral {}'.format(k))
            else:
                data += [[to_numeral(k)]] * v

        print('total number of different numerals: ', len(nc))
        print('total number of numeral samples: ', len(data))

        # shuffle and subsample to keep memory use bounded
        random.shuffle(data)

        if len(data) > 2000000:
            data = data[:2000000]

        if log_space:
            data = [weighted_log(x[0]) for x in data]

        print('subsampled to {}'.format(len(data)))

        data = np.array(data).reshape(-1, 1)
        # getting initialization parameters
        if gmm_init_mode == 'km':

            if gmm_type == 'soft':
                gmm = GaussianMixture(components,
                                      max_iter=iters,
                                      n_init=1,
                                      verbose=10,
                                      init_params='kmeans')
            else:
                gmm = HardEMGaussianMixture(components,
                                            max_iter=iters,
                                            n_init=1,
                                            verbose=10,
                                            init_params='kmeans')

        else:
            # random select means
            if gmm_init_mode == 'rd':
                prototypes = np.random.choice(data.reshape(-1), components)
            else:
                assert prototype_path is not None
                if log_space:
                    path = os.path.join(self.save_dir, 'som_log')
                else:
                    path = os.path.join(self.save_dir, 'som')

                path = os.path.join(path, prototype_path)
                prototypes = pickle.load(open(path, 'rb'))

                assert len(prototypes) == components

            mus = prototypes
            min_sigma = 1e-6

            diff = np.abs(data.reshape(len(data)) - mus[:, np.newaxis])

            amin = np.argmin(diff, axis=0)

            K = len(prototypes)
            # seed each cluster with a single 0 so that no cluster is empty
            # (this slightly biases the cluster statistics toward 0)
            clusters = [[0] for i in range(K)]
            for i in range(len(data)):
                clusters[amin[i]].append(data[i])

            means = np.array([np.mean(i) for i in clusters]).reshape(-1, 1)

            # precisions_init expects inverse covariance matrices, so use the
            # variance (not the standard deviation) of each cluster
            covs = np.array([
                np.var(i) if len(i) > 1 else min_sigma for i in clusters
            ]).reshape(-1, 1, 1)
            precision = np.linalg.inv(covs)

            weights = np.array([len(c) for c in clusters])
            weights = weights / np.sum(weights)

            if gmm_type == 'soft':
                gmm = GaussianMixture(components,
                                      max_iter=iters,
                                      n_init=1,
                                      verbose=10,
                                      means_init=means,
                                      precisions_init=precision,
                                      weights_init=weights)
            else:
                gmm = HardEMGaussianMixture(components,
                                            max_iter=iters,
                                            n_init=1,
                                            verbose=10,
                                            means_init=means,
                                            precisions_init=precision,
                                            weights_init=weights)

        gmm.fit(data)
        if log_space:
            gmm_save_dir = os.path.join(self.save_dir, 'gmm_log')
        else:
            gmm_save_dir = os.path.join(self.save_dir, 'gmm')

        if not os.path.exists(gmm_save_dir):
            os.makedirs(gmm_save_dir)

        def single_variable_gaussian(x, mu, sigma):
            return 1. / (np.sqrt(2. * np.pi) * sigma) * np.exp(-np.power(
                (x - mu) / sigma, 2.) / 2)

        def draw(gmm, X):  # optional diagnostic plots; not invoked below
            x_min, x_max = min(X), max(X)
            # x = np.linspace(x_min, x_max, 10000)
            # x = np.array([])
            # for i in range(len(gmm.means_)):
            #     range_min, range_max = gmm.means_[i][0]-2 * gmm.covariances_[i][0], gmm.means_[i][0] + 2 * gmm.covariances_[i][0]
            #     x = np.append(x, np.linspace(range_min, range_max, 20))
            # x.sort()
            # print(x)
            print(gmm.means_)
            print(gmm.covariances_)
            print(gmm.weights_)

            X.sort()
            sum_y = np.zeros_like(X)
            plt.figure(0)
            plt.title('components')
            for i in range(len(gmm.means_)):
                # covariances_ holds variances; the pdf expects a std dev
                y = single_variable_gaussian(X, gmm.means_[i][0],
                                             np.sqrt(gmm.covariances_[i][0]))
                y[y > 1] = 0  # suppress extreme peaks for a readable plot
                sum_y += y * gmm.weights_[i]
                # yp = single_variable_gaussian(X, gmm.means_[i][0], gmm.covariances_[i][0])
                # yp[yp > 1] = 0
                # sum_yp += yp
                plt.plot(X, y)
            plt.savefig(
                os.path.join(gmm_save_dir,
                             'components-{}.png'.format(components)))

            plt.figure(1)
            plt.title('mixtures')

            plt.plot(X, sum_y, 'g-')
            plt.savefig(
                os.path.join(gmm_save_dir,
                             'mixture-{}.png'.format(components)))

        # 'rd' stands for random initialization, 'fp' for 'from prototypes',
        # 'km' for k-means initialization

        pickle.dump(
            gmm,
            open(
                os.path.join(
                    gmm_save_dir,
                    'gmm-{}-{}-{}.dat'.format(components, gmm_init_mode,
                                              gmm_type)), 'wb'))
        print('means: {}\nsigma: {}\nweights: {}'.format(
            gmm.means_, gmm.covariances_, gmm.weights_))

        if log_space:
            data_points = np.array([
                weighted_log(x)
                for x in np.array(list(nc.keys()), dtype=np.float32)
            ]).reshape(-1, 1)
        else:
            data_points = np.array(list(nc.keys()),
                                   dtype=np.float32).reshape(-1, 1)

        posterior = gmm.predict_proba(data_points)
        path = os.path.join(
            gmm_save_dir,
            'gmm_posterior-{}-{}-{}.dat'.format(components, gmm_init_mode,
                                                gmm_type))
        pickle.dump(posterior, open(path, 'wb'))
        print('...Saved GMM posterior to {}'.format(path))
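
`weighted_log` (used here and in `train_som` below for the log-space variants) is not defined in these excerpts. A plausible stand-in, clearly an assumption rather than the author's definition, is a signed log transform that compresses magnitudes while handling negative numerals and zero:

import numpy as np

# HYPOTHETICAL definition; the original weighted_log is not shown here.
def weighted_log(x):
    # sign(x) * log(1 + |x|): monotone, odd, and finite at 0
    return np.sign(x) * np.log1p(np.abs(x))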
Example 6
    def train_som(self,
                  prototypes=10,
                  sigma=0.03,
                  lr=0.3,
                  iters=10000,
                  log_space=False):
        """
        :param nc_path: path under the save_directory
        :param prototypes: number of SOM neurons
        :param sigma:  sigma of SOM
        :param lr:  learning rate of SOM
        :return: None

        Train a simple SOM, and save it's neuron weights as prototypes, given numeral counts

        """
        nc = pickle.load(open(os.path.join(self.save_dir, 'nc.dat'), 'rb'))

        # unfold and shuffle nc data
        data = []
        for k, v in nc.items():
            if to_numeral(k) is None:
                print('invalid numeral {}'.format(k))
            else:
                data += [[to_numeral(k)]] * v

        print('total number of different numerals: ', len(nc))
        print('total number of numeral samples: ', len(data))

        random.shuffle(data)
        if log_space:
            data = [[weighted_log(x[0])] for x in data]

        som = SOM(prototypes,
                  1,
                  1,
                  sigma=sigma,
                  learning_rate=lr,
                  random_seed=random_seed)  # random_seed: module-level constant not shown in this excerpt

        print("Training SOMs...")
        # som.random_weights_init(data)
        som.train_random(data, iters)  # trains the SOM for `iters` random samples
        print("...Ready!")
        # win_map = som.win_map(data)
        self.prototypes = som.get_weights().reshape(prototypes)  # nd array
        if log_space:
            som_save_dir = os.path.join(self.save_dir, 'som_log')
        else:
            som_save_dir = os.path.join(self.save_dir, 'som')

        if not os.path.exists(som_save_dir):
            os.makedirs(som_save_dir)
        print('prototypes: \n{}'.format(self.prototypes))
        pickle.dump(
            self.prototypes,
            open(
                os.path.join(
                    som_save_dir,
                    'prototypes-{}-{}-{}.dat'.format(prototypes, sigma, lr)),
                'wb'))
        print('...Saving Prototypes')
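
Putting the last two methods together, a hypothetical call sequence (the `prep` instance and its configured save_dir are assumptions): train SOM prototypes first, then initialize a GMM from them. The filename handed to `train_gmm` follows the 'prototypes-{prototypes}-{sigma}-{lr}.dat' pattern that `train_som` saves above:

# HYPOTHETICAL usage, assuming `prep` is an instance of the class that
# defines train_som / train_gmm, with save_dir and nc.dat in place.
prep.train_som(prototypes=30, sigma=0.6, lr=1.0, iters=10000)
prep.train_gmm(components=30,
               iters=100,
               gmm_init_mode='fp',  # initialize means from SOM prototypes
               prototype_path='prototypes-30-0.6-1.0.dat')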