Example #1 (Python 3)
def ngramgen(source, *cuttoff_info):
    """Generate n-grams with provided cuttoff"""

    try:
        fh = open(source, 'r', encoding='UTF-8')
    except OSError:
        print(f'File not found: {source}')
        return

    try:
        # Build a real list up front: in Python 3 a lazy map() would both
        # defer the ValueError and be exhausted by the first len() call.
        cuttoff_info = [int(c) for c in cuttoff_info]
    except ValueError:
        print('Invalid cutoff info provided, list of integers needed')
        return

    if len(cuttoff_info) == 0:
        print('Cutoff info provided is zero length')
        return

    sent_idx = _cached_sentences_index(source)

    fh.seek(0)
    lines = (line for line in fh)
    isents = text.iter_sentences(lines, sent_idx)

    itokens = (t for t, s, tid in text.iter_tokens(isents))
    res = ngramgenmod.multi_ngram(itokens, len(cuttoff_info))
    fh.close()

    res = ngramgenmod.cutt_ngrams(res, cuttoff_info)

    for dict_ in res:
        for tpl, v in dict_.items():
            print(f"{' '.join(tpl)} {unicode(v).encode('utf-8')}")
Example #2 (Python 2)
def ngramgen(source, *cuttoff_info):
    """Generate n-grams with provided cuttoff"""

    try:
        fh = open(source, 'r')
    except IOError:
        print 'File not found:', source
        return

    try:
        cuttoff_info = map(int, cuttoff_info)
    except ValueError:
        print 'Invalid cutoff info provided, list of integers needed'
        return

    if len(cuttoff_info) == 0:
        print 'Cutoff info provided is zero length'
        return

    sent_idx = _cached_sentences_index(source)

    fh.seek(0)
    lines = (line.decode('utf-8') for line in fh)
    isents = text.iter_sentences(lines, sent_idx)

    itokens = (t for t, s, tid in text.iter_tokens(isents))
    res = ngramgenmod.multi_ngram(itokens, len(cuttoff_info))
    fh.close()

    res = ngramgenmod.cutt_ngrams(res, cuttoff_info)

    for dict_ in res:
        for tpl, v in dict_.iteritems():
            print (' '.join(tpl) + ' ' + unicode(v)).encode('utf-8')
Example #3 (Python 2)
def _cached_vocab(filepath):
    sig = _cache_sig(filepath)
    try:
        # Fast path: reuse a previously cached, compressed vocabulary.
        with open('%s/%s.vocab' % (_CACHE, sig), 'rb') as f:
            vocab_bin = f.read()
    except IOError:
        # Cache miss: rebuild the vocabulary from the source text and
        # store it back as a compressed JSON blob.
        sent_idx = _cached_sentences_index(filepath)
        with open(filepath, 'r') as fh:
            lines = (line.decode('utf-8') for line in fh)
            vocab = text.vocabulary(text.iter_tokens(text.iter_sentences(lines, sent_idx)))
        vocab_bin = compress(json.dumps(vocab))
        with open('%s/%s.vocab' % (_CACHE, sig), 'wb') as f:
            f.write(vocab_bin)
    else:
        # Cache hit: decompress and parse the stored vocabulary.
        vocab = json.loads(decompress(vocab_bin))

    return vocab
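
A hypothetical call, with a made-up path: the first call builds the vocabulary and writes the compressed cache; later calls for the same, unchanged file read it back.

vocab = _cached_vocab('corpus.txt')
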
Example #4 (Python 3)
def _cached_vocab(filepath):
    sig = _cache_sig(filepath)
    try:
        with open(f'{_CACHE}/{sig}.vocab', 'rb') as f:
            vocab_bin = f.read()
    except IOError:
        sent_idx = _cached_sentences_index(filepath)
        with open(filepath, 'r') as fh:
            lines = (line for line in fh)
            vocab = text.vocabulary(
                text.iter_tokens(text.iter_sentences(lines, sent_idx)))
        vocab_bin = compress(json.dumps(vocab))
        with open(f'{_CACHE}/{sig}.vocab', 'wb') as f:
            f.write(vocab_bin)
    else:
        vocab = json.loads(decompress(vocab_bin))

    return vocab
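
Neither variant shows the module-level helpers it relies on (_CACHE, _cache_sig, compress, decompress). A minimal Python 3 sketch of what they might look like, assuming zlib compression and a path-plus-size-plus-mtime cache signature; every detail below is an assumption, not the original implementation.

import hashlib
import os
import zlib

# Assumed cache directory; the real module defines its own _CACHE.
_CACHE = os.path.expanduser('~/.ngram_cache')

def _cache_sig(filepath):
    # Assumed scheme: key the cache on path, size and mtime, so entries
    # are invalidated whenever the source file changes.
    st = os.stat(filepath)
    key = '%s:%d:%d' % (filepath, st.st_size, int(st.st_mtime))
    return hashlib.sha1(key.encode('utf-8')).hexdigest()

def compress(text_):
    # Assumed helper: zlib-compress a JSON string into bytes.
    return zlib.compress(text_.encode('utf-8'))

def decompress(blob):
    # Assumed helper: inverse of compress(); bytes back to a JSON string.
    return zlib.decompress(blob).decode('utf-8')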