def sparse_tensor_batches(batch_size=1000, symmetric=symmetric):
    """Yield sparse PPMI tensor (indices, values) pairs, one per sentence batch.

    Each batch of sentences is converted into a positive (shifted) PMI
    tensor by the enclosing scope's `gatherer`.
    """
    sentence_batches = batch_generator2(self.model,
                                        self.sentences_generator(),
                                        batch_size=batch_size)
    for sentence_batch in sentence_batches:
        # `nonneg` disables negative sampling entirely; otherwise sample 5%.
        yield gatherer.create_pmi_tensor(
            batch=sentence_batch,
            positive=True,
            debug=False,
            symmetric=symmetric,
            log_info=False,
            neg_sample_percent=0.0 if nonneg else 0.05,
            pmi=True,
            shift=shift,
        )
def sparse_tensor_batches(batch_size=1000, symmetric=symmetric):
    """Yield sparse tensor (indices, values) batches.

    GloVe mode: gather the full (raw-count) tensor once, take log of the
    counts, then repeatedly shuffle and re-chunk it for 50 epochs.
    Otherwise: stream sentence batches and build a PPMI tensor per batch.
    """
    if is_glove:
        def chunks(size, iterable):
            # Successive fixed-size tuples from `iterable`; last may be short.
            it = iter(iterable)
            piece = tuple(itertools.islice(it, size))
            while piece:
                yield piece
                piece = tuple(itertools.islice(it, size))

        # One full pass: raw co-occurrence counts (pmi=False), then log them.
        (indices, values) = gatherer.create_pmi_tensor(
            batch=None,
            positive=True,
            debug=False,
            symmetric=True,
            log_info=True,
            pmi=False,
        )
        values = np.log(values)
        for epoch in range(50):
            # sklearn's shuffle keeps indices/values aligned.
            shuffled_idx, shuffled_vals = shuffle(indices, values)
            print('GloVe iteration number {}...'.format(epoch))
            paired_chunks = zip(chunks(batch_size, shuffled_idx),
                                chunks(batch_size, shuffled_vals))
            for idx_chunk, val_chunk in paired_chunks:
                yield (idx_chunk, val_chunk)
    else:  # not is_glove
        for batch in batch_generator2(self.model,
                                      self.sentences_generator(),
                                      batch_size=batch_size):
            yield gatherer.create_pmi_tensor(
                batch=batch,
                positive=True,
                debug=False,
                symmetric=symmetric,
                log_info=False,
                neg_sample_percent=neg_sample_percent,
                pmi=True,
                shift=shift,
            )
def sparse_tensor_batches(batch_size=1000):
    """Yield, per sentence batch, a pair of parallel lists: one PPMI
    indices array and one values array for each (shift, gatherer) pair."""
    batch_stream = batch_generator2(
        self.model,
        self.sentences_generator(num_sents=self.num_sents),
        batch_size=batch_size)
    for batch in batch_stream:
        all_indices = []
        all_values = []
        # Build one shifted-PPMI tensor per configured gatherer.
        for shift, gatherer in zip(shifts, gatherers):
            idx, vals = gatherer.create_pmi_tensor(
                batch=batch,
                positive=True,
                debug=False,
                symmetric=True,
                log_info=False,
                neg_sample_percent=0.0,
                pmi=True,
                shift=shift,
            )
            all_indices.append(idx)
            all_values.append(vals)
        yield (all_indices, all_values)
def get_pmi_gatherer(self, n):
    """Return a PMIGatherer for n-grams of order `n`, using a disk cache.

    If a pickled gatherer for (num_articles, min_count, n) exists, load it
    with dill; otherwise build one by counting co-occurrences over the
    article stream and pickle it for next time.

    Fixes vs. previous version:
      - gc.enable() now runs in a `finally`, so a failed dill load/dump no
        longer leaves garbage collection disabled for the whole process.
      - The cache filename is built once instead of three times.
    """
    import gc
    cache_path = 'gatherer_{}_{}_{}.pkl'.format(self.num_articles,
                                                self.min_count, n)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            t = time.time()
            # Disabling GC speeds up deserializing large object graphs.
            gc.disable()
            try:
                gatherer = dill.load(f)
            finally:
                gc.enable()
            print('Loading gatherer took {} secs'.format(time.time() - t))
    else:
        # batch_size doesn't matter. But higher is probably better
        # (in terms of threading & speed)
        batches = batch_generator2(
            self.model,
            self.sentences_generator(num_articles=self.num_articles),
            batch_size=1000)
        gatherer = PMIGatherer(self.model, n=n)
        if self.num_articles <= 1e4:
            gatherer.populate_counts(batches, huge_vocab=False)
        else:
            # Large corpora: prune rare tokens while counting to bound memory.
            gatherer.populate_counts(batches, huge_vocab=True, min_count=5)
        with open(cache_path, 'wb') as f:
            t = time.time()
            gc.disable()
            try:
                dill.dump(gatherer, f)
            finally:
                gc.enable()
            print('Dumping gatherer took {} secs'.format(time.time() - t))
    return gatherer