def ngramgen(source, *cuttoff_info):
    """Generate n-grams with the provided cutoff values."""
    try:
        fh = open(source, 'r', encoding='utf-8')
    except OSError:
        print(f'File not found: {source}')
        return
    # Force evaluation here: a lazy map() would defer the ValueError and be
    # exhausted after its first use.
    try:
        cuttoff_info = list(map(int, cuttoff_info))
    except ValueError:
        print('Invalid cutoff info provided, list of integers needed')
        return
    if len(cuttoff_info) == 0:
        print('Cutoff info provided is zero length')
        return
    sent_idx = _cached_sentences_index(source)
    fh.seek(0)
    lines = (line for line in fh)
    isents = text.iter_sentences(lines, sent_idx)
    itokens = (t for t, s, tid in text.iter_tokens(isents))
    # multi_ngram consumes the token iterator, so the file must stay open until here.
    res = ngramgenmod.multi_ngram(itokens, len(cuttoff_info))
    fh.close()
    res = ngramgenmod.cutt_ngrams(res, cuttoff_info)
    for dict_ in res:
        for tpl, v in dict_.items():
            print(f"{' '.join(tpl)} {v}")
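# Usage sketch (illustrative only; 'corpus.txt' and the cutoff values are
# placeholders, not part of this module): passing three cutoffs requests
# unigram, bigram and trigram counts, and cutt_ngrams presumably drops
# n-grams that occur fewer times than the matching cutoff, e.g.
#
#     ngramgen('corpus.txt', 2, 2, 1)
#     # or, with string arguments as they would arrive from a command line:
#     ngramgen('corpus.txt', '2', '2', '1')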
def _cached_vocab(filepath):
    sig = _cache_sig(filepath)
    try:
        with open(f'{_CACHE}/{sig}.vocab', 'rb') as f:
            vocab_bin = f.read()
    except OSError:
        # Cache miss: build the vocabulary from the source file and store it.
        sent_idx = _cached_sentences_index(filepath)
        with open(filepath, 'r', encoding='utf-8') as fh:
            lines = (line for line in fh)
            vocab = text.vocabulary(
                text.iter_tokens(text.iter_sentences(lines, sent_idx)))
        # Assumes compress()/decompress() operate on bytes (zlib-style), hence the encode.
        vocab_bin = compress(json.dumps(vocab).encode('utf-8'))
        with open(f'{_CACHE}/{sig}.vocab', 'wb') as f:
            f.write(vocab_bin)
    else:
        vocab = json.loads(decompress(vocab_bin))
    return vocab
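# Note on the cache plumbing assumed above (a sketch, not this module's actual
# definitions): _CACHE is expected to name a writable directory and
# _cache_sig(filepath) to return a stable per-file signature, e.g. something like
#
#     import hashlib, os
#     _CACHE = os.path.expanduser('~/.ngram_cache')
#
#     def _cache_sig(filepath):
#         st = os.stat(filepath)
#         key = f'{os.path.abspath(filepath)}:{st.st_size}:{st.st_mtime}'
#         return hashlib.md5(key.encode('utf-8')).hexdigest()
#
# so that editing the corpus invalidates its cached '<sig>.vocab' entry.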