def create_ngram(sentences, n):
    """Count every contiguous window of ``n`` items across ``sentences``.

    Args:
        sentences: Object providing ``astype('int64')`` whose result is
            iterated row by row (presumably a 2-D NumPy array of integer
            token ids -- confirm against callers).
        n: Window length; also passed to the ``Ngram`` constructor.

    Returns:
        Whatever ``Ngram.norm()`` returns for the accumulated counts.
    """
    counts = Ngram(n)
    for row in sentences.astype('int64'):
        # A row shorter than n yields no windows (empty range).
        window_total = len(row) - n + 1
        for start in range(window_total):
            window = row[start:start + n]
            counts[tuple(window)] += 1
    return counts.norm()
def retrieve_ngram(sequence_loader, n):
    """Tally the second element of every loader item into an ``Ngram``.

    Each item produced by ``sequence_loader`` is unpacked into two values;
    the first is ignored and the second is iterated sample by sample.  A
    sample is moved with ``.to('cpu')``, converted with ``.numpy()`` and the
    resulting sequence is used *whole* as the key -- ``n`` is only handed to
    the ``Ngram`` constructor and is not used to slice the sample.

    Args:
        sequence_loader: Iterable of 2-item batches (presumably a torch
            ``DataLoader`` yielding ``(inputs, targets)`` -- confirm).
        n: Order passed to ``Ngram``.

    Returns:
        Whatever ``Ngram.norm()`` returns for the accumulated counts.
    """
    counts = Ngram(n)
    for batch in sequence_loader:
        _, targets = batch
        for target in targets:
            key = tuple(target.to('cpu').numpy())
            counts[key] += 1
    return counts.norm()
def get_brown_ngram(n=3, dim=6):
    """Build a character n-gram from ``brown.words()``.

    All words are concatenated without any separator and lower-cased, so
    windows run across word boundaries.  Every character outside the first
    ``dim`` letters of ``'etaoinsrhl'`` is removed before counting.

    Args:
        n: Window length; also passed to the ``Ngram`` constructor.
        dim: How many letters of ``'etaoinsrhl'`` to keep (slice length).

    Returns:
        Whatever ``Ngram.norm()`` returns for the accumulated counts.
    """
    corpus = ''.join(brown.words()).lower()
    alphabet = 'etaoinsrhl'[:dim]
    # Delete every run of characters that are not in the kept alphabet.
    kept = re.sub('[^' + alphabet + ']+', '', corpus)

    counts = Ngram(n)
    window_total = len(kept) - n + 1
    for start in range(window_total):
        chunk = kept[start:start + n]
        counts[strtotuple(chunk)] += 1
    return counts.norm()
# NOTE(review): this module defines `randomized_ngram` again further down;
# the later definition replaces this one when the module is imported.
def randomized_ngram(n, size, out_dim=10, min_var=0):
    """Create a randomized n-gram by rejection sampling.

    A candidate is filled with random keys (``n`` integers drawn from
    ``[0, out_dim)``) and random weights until ``ngram.size()`` reaches
    ``size``.  The candidate is discarded and redrawn from scratch unless

    * every symbol ``0 .. out_dim - 1`` occurs in at least one key, and
    * after ``ngram.norm()``, the variance of its values (computed with
      ``size`` as the divisor) is not below ``min_var``.

    Rejected candidates are retried in a loop instead of by recursion, so a
    low acceptance rate can no longer exhaust the interpreter's recursion
    limit.

    Args:
        n: Length of each key; also passed to the ``Ngram`` constructor.
        size: Number of entries the n-gram must hold.
        out_dim: Number of distinct symbols; keys use ``0 .. out_dim - 1``.
        min_var: Minimum accepted variance of the normalised values.

    Returns:
        The accepted ``Ngram`` instance, after ``norm()`` was called on it.

    Raises:
        ValueError: If the arguments make acceptance impossible: ``size``
            keys of length ``n`` cannot mention ``out_dim`` symbols, or
            fewer than ``size`` distinct keys exist.

    Note:
        A ``min_var`` that no candidate can reach makes this function loop
        forever; the default ``min_var=0`` never rejects on variance.
    """
    # Both guards assume Ngram.size() counts stored keys (the mean below
    # divides by `size`, which points the same way) -- TODO confirm.
    if n * size < out_dim:
        raise ValueError(
            f"{size} keys of length {n} cannot cover {out_dim} symbols"
        )
    if size > out_dim ** n:
        raise ValueError(
            f"only {out_dim ** n} distinct keys exist, cannot store {size}"
        )

    while True:
        ngram = Ngram(n)
        while ngram.size() < size:
            ngram[tuple(np.random.randint(0, out_dim, n))] = np.random.random()

        # Reject candidates that never mention some symbol.
        symbols = {symbol for key in ngram for symbol in key}
        if len(symbols) != out_dim:
            continue

        # norm() is called for its effect on `ngram`; its result is unused.
        ngram.norm()
        mu = sum(ngram.values()) / size
        var = sum((x - mu) ** 2 for x in ngram.values()) / size
        if var < min_var:
            continue
        return ngram
# NOTE(review): this module defines `randomized_ngram` again further down;
# the later definition replaces this one when the module is imported.
def randomized_ngram(n, entries, out_dim=10):
    """Create a randomized n-gram in which every symbol appears.

    A candidate is filled with random keys (``n`` integers drawn from
    ``[0, out_dim)``) and random weights until ``ngram.size()`` reaches
    ``entries``.  Candidates whose keys do not mention every symbol
    ``0 .. out_dim - 1`` are discarded and redrawn from scratch.

    Rejected candidates are retried in a loop instead of by recursion, so a
    low acceptance rate can no longer exhaust the interpreter's recursion
    limit.

    Args:
        n: Length of each key; also passed to the ``Ngram`` constructor.
        entries: Number of entries the n-gram must hold.
        out_dim: Number of distinct symbols; keys use ``0 .. out_dim - 1``.

    Returns:
        Whatever ``Ngram.norm()`` returns for the accepted candidate.

    Raises:
        ValueError: If the arguments make acceptance impossible:
            ``entries`` keys of length ``n`` cannot mention ``out_dim``
            symbols, or fewer than ``entries`` distinct keys exist.
    """
    # Both guards assume Ngram.size() counts stored keys -- TODO confirm.
    if n * entries < out_dim:
        raise ValueError(
            f"{entries} keys of length {n} cannot cover {out_dim} symbols"
        )
    if entries > out_dim ** n:
        raise ValueError(
            f"only {out_dim ** n} distinct keys exist, cannot store {entries}"
        )

    while True:
        ngram = Ngram(n)
        while ngram.size() < entries:
            ngram[tuple(np.random.randint(0, out_dim, n))] = np.random.random()

        # Accept only when every symbol occurs in at least one key.
        symbols = {symbol for key in ngram for symbol in key}
        if len(symbols) == out_dim:
            return ngram.norm()
def randomized_ngram(n, entries, out_dim=10):
    """Fill an ``Ngram`` with random keys and weights, then normalise it.

    Keys are tuples of ``n`` integers drawn from ``[0, out_dim)``; each is
    assigned a weight from ``np.random.random()``.  Drawing continues until
    ``Ngram.size()`` is no longer below ``entries``.

    Args:
        n: Length of each key; also passed to the ``Ngram`` constructor.
        entries: Value ``Ngram.size()`` must reach before drawing stops.
        out_dim: Exclusive upper bound for the integers inside a key.

    Returns:
        Whatever ``Ngram.norm()`` returns for the filled n-gram.

    NOTE(review): presumably ``size()`` counts distinct keys; if so, asking
    for more entries than distinct keys exist would never finish -- confirm.
    """
    table = Ngram(n)
    while table.size() < entries:
        # The weight is drawn before the key: in `table[key] = weight` the
        # right-hand side is evaluated first, and keeping that order keeps
        # seeded runs reproducible.
        weight = np.random.random()
        key = tuple(np.random.randint(0, out_dim, n))
        table[key] = weight
    return table.norm()