Beispiel #1
0
def create_ngram(sentences, n):
    """Count every length-``n`` window found in ``sentences``.

    Args:
        sentences: Object exposing ``astype``; it is cast to ``int64`` and
            iterated row by row (presumably a 2-D NumPy array of token
            ids -- confirm against callers).
        n: Window length; also passed to the ``Ngram`` constructor.

    Returns:
        The result of calling ``norm()`` on the populated ``Ngram``.
    """
    counts = Ngram(n)
    for row in sentences.astype('int64'):
        # Iterate over the exclusive end index of each window.
        for stop in range(n, len(row) + 1):
            window = row[stop - n:stop]
            counts[tuple(window)] += 1
    return counts.norm()
Beispiel #2
0
def retrieve_ngram(sequence_loader, n):
    """Tally the target sequences yielded by a data loader.

    Args:
        sequence_loader: Iterable yielding two-element items; only the
            second element of each item is used.
        n: Order passed to the ``Ngram`` constructor.

    Returns:
        The result of calling ``norm()`` on the populated ``Ngram``.
    """
    counts = Ngram(n)
    for batch in sequence_loader:
        _, targets = batch
        for sample in targets:
            # Each sample is moved to the CPU and converted to NumPy so it
            # can be turned into a hashable tuple key.
            values = sample.to('cpu').numpy()
            key = tuple(values)
            counts[key] += 1
    return counts.norm()
Beispiel #3
0
def get_brown_ngram(n=3, dim=6):
    """Build a character n-gram table from the Brown corpus.

    The corpus words are concatenated and lower-cased, then every character
    outside the first ``dim`` letters of ``'etaoinsrhl'`` is removed before
    the length-``n`` windows are counted.

    Args:
        n: Window length; also passed to the ``Ngram`` constructor.
        dim: Number of leading letters of ``'etaoinsrhl'`` to keep.

    Returns:
        The result of calling ``norm()`` on the populated ``Ngram``.
    """
    alphabet = 'etaoinsrhl'[:dim]
    corpus = ''.join(brown.words()).lower()
    # Strip every run of characters that is not part of the kept alphabet.
    filtered = re.sub('[^' + alphabet + ']+', '', corpus)

    counts = Ngram(n)
    for stop in range(n, len(filtered) + 1):
        key = strtotuple(filtered[stop - n:stop])
        counts[key] += 1
    return counts.norm()
Beispiel #4
0
def randomized_ngram(n, size, out_dim=10, min_var=0, max_attempts=1000):
    """Create a randomized n-gram by rejection sampling.

    A candidate is filled with random keys and random weights until
    ``ngram.size()`` reaches ``size``. It is accepted only if every symbol
    in ``range(out_dim)`` occurs in at least one key and, after ``norm()``,
    the population variance of its values is at least ``min_var``.

    Rejected candidates are retried in a loop rather than by recursion, so
    an unlucky or unsatisfiable request no longer ends in a
    ``RecursionError`` and rejected candidates are not kept alive on the
    call stack.

    Args:
        n: Length of each key tuple; also passed to ``Ngram``.
        size: Number of entries to generate; also the divisor used for the
            mean and the variance.
        out_dim: Exclusive upper bound of the symbols drawn for the keys.
        min_var: Smallest accepted variance of the normalised values.
        max_attempts: Maximum number of candidates to try before giving up.

    Returns:
        The accepted ``Ngram`` instance, after ``norm()`` was called on it.

    Raises:
        RuntimeError: If no candidate was accepted within ``max_attempts``.
    """
    for _ in range(max_attempts):
        ngram = Ngram(n)
        while ngram.size() < size:
            # Draw the weight before the key: this is the order in which
            # ``ngram[key_expr] = value_expr`` evaluates, so seeded runs
            # consume the random stream exactly as before.
            weight = np.random.random()
            ngram[tuple(np.random.randint(0, out_dim, n))] = weight

        # Every symbol in range(out_dim) has to appear in some key.
        symbols = {symbol for idx in ngram for symbol in idx}
        if len(symbols) != out_dim:
            continue

        # The return value of norm() is discarded; the values read below
        # are taken from the same object.
        ngram.norm()
        mu = sum(ngram.values()) / size
        var = sum((x - mu) ** 2 for x in ngram.values()) / size
        if var < min_var:
            continue
        return ngram

    raise RuntimeError(
        f"randomized_ngram: no candidate satisfied the constraints within "
        f"{max_attempts} attempts (n={n}, size={size}, out_dim={out_dim}, "
        f"min_var={min_var})"
    )
Beispiel #5
0
def randomized_ngram(n, entries, out_dim=10, max_attempts=1000):
    """Create a randomized n-gram whose keys use every output symbol.

    A candidate is filled with random keys and random weights until
    ``ngram.size()`` reaches ``entries``. It is accepted only if every
    symbol in ``range(out_dim)`` occurs in at least one key.

    Rejected candidates are retried in a loop rather than by recursion, so
    an unlucky or unsatisfiable request no longer ends in a
    ``RecursionError``.

    Args:
        n: Length of each key tuple; also passed to ``Ngram``.
        entries: Number of entries to generate.
        out_dim: Exclusive upper bound of the symbols drawn for the keys.
        max_attempts: Maximum number of candidates to try before giving up.

    Returns:
        The result of calling ``norm()`` on the accepted ``Ngram``.

    Raises:
        RuntimeError: If no candidate was accepted within ``max_attempts``.
    """
    for _ in range(max_attempts):
        ngram = Ngram(n)
        while ngram.size() < entries:
            # Draw the weight before the key: this is the order in which
            # ``ngram[key_expr] = value_expr`` evaluates, so seeded runs
            # consume the random stream exactly as before.
            weight = np.random.random()
            ngram[tuple(np.random.randint(0, out_dim, n))] = weight

        # Every symbol in range(out_dim) has to appear in some key.
        symbols = {symbol for idx in ngram for symbol in idx}
        if len(symbols) == out_dim:
            return ngram.norm()

    raise RuntimeError(
        f"randomized_ngram: no candidate covered all {out_dim} symbols "
        f"within {max_attempts} attempts (n={n}, entries={entries})"
    )
Beispiel #6
0
def randomized_ngram(n, entries, out_dim=10):
    """Fill an ``Ngram`` with random keys and random weights.

    Args:
        n: Length of each key tuple; also passed to ``Ngram``.
        entries: Entries are added until ``size()`` reaches this value.
        out_dim: Exclusive upper bound of the symbols drawn for the keys.

    Returns:
        The result of calling ``norm()`` on the populated ``Ngram``.
    """
    table = Ngram(n)
    while table.size() < entries:
        # The weight is drawn first, then the key, matching the evaluation
        # order of a single ``table[key_expr] = value_expr`` statement.
        weight = np.random.random()
        symbols = np.random.randint(0, out_dim, n)
        table[tuple(symbols)] = weight
    return table.norm()