def _read_probs_from_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key. The key is parsed with ``literal_eval``. The file is
    read twice: the first pass feeds every frequency into a ``PreshCounter``
    (keyed by 1-based line number) and sums them; the second pass emits a
    log-probability for each entry that passes the filters.

    Args:
        loc: Path-like object providing ``exists()`` and ``open()``. A name
            ending in ``gz`` is read through ``gzip`` in text mode.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero. ``({}, 0.0)`` is returned, after
        printing a warning, when ``loc`` does not exist.

    Raises:
        ValueError: If a line does not have three fields, a count is not an
            integer, or the summed frequency is zero (``math.log`` rejects it).
    """
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0

    def open_freqs():
        # gzip.open defaults to binary mode; request text mode so both
        # branches yield str lines that can be split on a str tab.
        if str(loc).endswith('gz'):
            return gzip.open(str(loc), 'rt')
        return loc.open()

    counts = PreshCounter()
    total = 0
    # Pass 1: accumulate every frequency, regardless of the filters.
    with open_freqs() as file_:
        for i, line in enumerate(file_):
            # Unpacking all three fields keeps the malformed-line check.
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)

    # Pass 2: keep only the entries that pass the frequency/length filters.
    probs = {}
    with open_freqs() as file_:
        for line in file_:
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key. The key is parsed with ``literal_eval``. The file is
    read twice: the first pass feeds every frequency into a ``PreshCounter``
    (keyed by 1-based line number) and sums them; the second pass emits a
    log-probability for each entry that passes the filters.

    Args:
        loc: Path-like object providing ``exists()`` and ``open()``. A name
            ending in ``gz`` is read through ``gzip`` in text mode, in both
            passes.
        max_length: Entries whose key text (line terminator excluded) is this
            long or longer are skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero. ``({}, 0.0)`` is returned, after
        printing a warning, when ``loc`` does not exist.

    Raises:
        ValueError: If a line does not have three fields, a count is not an
            integer, or the summed frequency is zero (``math.log`` rejects it).
    """
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0

    def open_freqs():
        # One opener for both passes, so a gzip file is never re-read as
        # raw compressed data. Text mode makes gzip yield str lines.
        if str(loc).endswith('gz'):
            return gzip.open(str(loc), 'rt')
        return loc.open()

    counts = PreshCounter()
    total = 0
    # Pass 1: accumulate every frequency, regardless of the filters.
    with open_freqs() as file_:
        for i, line in enumerate(file_):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)

    # Pass 2: keep only the entries that pass the frequency/length filters.
    probs = {}
    with open_freqs() as file_:
        for line in file_:
            # rstrip() drops the line terminator so it no longer counts
            # toward max_length (it previously did, except on a final line
            # that lacked one).
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq >= min_doc_freq and freq >= min_freq
                    and len(key) < max_length):
                word = literal_eval(key)
                smooth_count = counts.smoother(freq)
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key. The file is read twice: the first pass feeds every
    frequency into a ``PreshCounter`` (keyed by 1-based line number) and sums
    them; the second pass, iterated through ``tqdm``, emits a log-probability
    for each entry that passes the filters.

    Keys are parsed with ``literal_eval``. A key that is not a valid literal
    is retried wrapped in single quotes; if that fails as well (for example
    the key itself contains a quote), the raw key text is used as the word.

    Args:
        freqs_loc: Path-like object providing ``open()``.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero.

    Raises:
        ValueError: If a line does not have three fields, a count is not an
            integer, or the summed frequency is zero (``math.log`` rejects it).
    """
    counts = PreshCounter()
    total = 0
    # Pass 1: accumulate every frequency, regardless of the filters.
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)

    # Pass 2: keep only the entries that pass the frequency/length filters.
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq < min_doc_freq or freq < min_freq
                    or len(key) >= max_length):
                continue
            try:
                word = literal_eval(key)
            except (SyntaxError, ValueError):
                # Take odd strings literally. literal_eval raises ValueError
                # (not SyntaxError) for input that parses but is not a
                # literal, such as a bare word.
                try:
                    word = literal_eval("'%s'" % key)
                except (SyntaxError, ValueError):
                    # Quoting fails when the key contains a quote or an
                    # invalid escape; fall back to the raw text.
                    word = key
            smooth_count = counts.smoother(freq)
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def test_smooth_prob():
    """Check raw and smoothed PreshCounter values on a hand-built histogram."""
    counter = PreshCounter()
    # Each row: (highest key, number of items, frequency given to each item).
    #   freq 1 -> 10 items
    #   freq 2 ->  6 items
    #   freq 3 ->  4 items
    #   freq 5 ->  2 items
    #   freq 8 ->  1 item
    histogram = (
        (100, 10, 1),
        (90, 6, 2),
        (80, 4, 3),
        (70, 2, 5),
        (60, 1, 8),
    )
    for top_key, n_items, freq in histogram:
        for offset in range(n_items):
            counter.inc(top_key - offset, freq)

    expected_total = sum(n_items * freq for _, n_items, freq in histogram)
    assert counter.total == expected_total

    # Before smoothing: probabilities are plain relative frequencies, and a
    # key that was never incremented (200) has probability zero.
    assert counter.prob(100) == 1.0 / counter.total
    assert counter.prob(200) == 0.0
    assert counter.prob(60) == 8.0 / counter.total

    counter.smooth()

    # After smoothing: every smoothed count is below the raw count, and a
    # key that was never incremented (1000) ranks below a key seen once.
    assert counter.smoother(1) < 1.0
    assert counter.smoother(8) < 8.0
    assert counter.prob(1000) < counter.prob(100)
    ascending = sorted(counter, key=lambda item: item[1])
    for _event, count in ascending[::-1]:
        assert counter.smoother(count) < count
def test_smooth_prob():
    """Verify PreshCounter probabilities before and after ``smooth()``."""
    counter = PreshCounter()
    # Frequency -> how many distinct keys receive it:
    #   1 -> 10,  2 -> 6,  3 -> 4,  5 -> 2,  8 -> 1
    # Each group counts its keys downwards from the listed starting key.
    groups = [
        {"start": 100, "size": 10, "freq": 1},
        {"start": 90, "size": 6, "freq": 2},
        {"start": 80, "size": 4, "freq": 3},
        {"start": 70, "size": 2, "freq": 5},
        {"start": 60, "size": 1, "freq": 8},
    ]
    for group in groups:
        for step in range(group["size"]):
            counter.inc(group["start"] - step, group["freq"])

    grand_total = 0
    for group in groups:
        grand_total += group["size"] * group["freq"]
    assert counter.total == grand_total

    # Unsmoothed: relative frequencies; key 200 was never incremented.
    assert counter.prob(100) == 1.0 / counter.total
    assert counter.prob(200) == 0.0
    assert counter.prob(60) == 8.0 / counter.total

    counter.smooth()

    # Smoothed counts must fall below the raw counts.
    assert counter.smoother(1) < 1.0
    assert counter.smoother(8) < 8.0
    # Key 1000 was never incremented; key 100 was incremented once.
    assert counter.prob(1000) < counter.prob(100)
    entries = sorted(counter, key=lambda pair: pair[1])
    entries.reverse()
    for _key, count in entries:
        assert counter.smoother(count) < count
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key (a Python literal, parsed with ``literal_eval``). The
    file is read twice: once to fill and smooth a ``PreshCounter`` keyed by
    1-based line number, and once (iterated through ``tqdm``) to compute the
    log-probability of each entry that passes the filters.

    Args:
        freqs_loc: Path-like object providing ``open()``.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero.
    """
    print("Counting frequencies...")
    counter = PreshCounter()
    freq_sum = 0
    # Pass 1: every line contributes to the counter and the running total.
    with freqs_loc.open() as handle:
        for index, row in enumerate(handle):
            raw_freq, _raw_doc_freq, _raw_key = row.rstrip().split('\t', 2)
            n = int(raw_freq)
            counter.inc(index + 1, n)
            freq_sum += n
    counter.smooth()
    log_total = math.log(freq_sum)

    # Pass 2: emit log-probabilities for the entries that pass the filters.
    probs = {}
    with freqs_loc.open() as handle:
        for row in tqdm(handle):
            raw_freq, raw_doc_freq, raw_key = row.rstrip().split('\t', 2)
            n_docs = int(raw_doc_freq)
            n = int(raw_freq)
            if n_docs < min_doc_freq:
                continue
            if n < min_freq:
                continue
            if len(raw_key) >= max_length:
                continue
            word = literal_eval(raw_key)
            probs[word] = math.log(counter.smoother(n)) - log_total
    oov_prob = math.log(counter.smoother(0)) - log_total
    return probs, oov_prob
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key. The file is read twice: the first pass feeds every
    frequency into a ``PreshCounter`` (keyed by 1-based line number) and sums
    them; the second pass, iterated through ``tqdm``, emits a log-probability
    for each entry that passes the filters.

    Keys are parsed with ``literal_eval``. A key that is not a valid literal
    is retried wrapped in single quotes; if that fails as well (for example
    the key itself contains a quote), the raw key text is used as the word.

    Args:
        freqs_loc: Path-like object providing ``open()``.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero.

    Raises:
        ValueError: If a line does not have three fields, a count is not an
            integer, or the summed frequency is zero (``math.log`` rejects it).
    """
    counts = PreshCounter()
    total = 0
    # Pass 1: accumulate every frequency, regardless of the filters.
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)

    # Pass 2: keep only the entries that pass the frequency/length filters.
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if (doc_freq < min_doc_freq or freq < min_freq
                    or len(key) >= max_length):
                continue
            try:
                word = literal_eval(key)
            except (SyntaxError, ValueError):
                # Take odd strings literally. literal_eval raises ValueError
                # (not SyntaxError) for input that parses but is not a
                # literal, such as a bare word.
                try:
                    word = literal_eval("'%s'" % key)
                except (SyntaxError, ValueError):
                    # Quoting fails when the key contains a quote or an
                    # invalid escape; fall back to the raw text.
                    word = key
            smooth_count = counts.smoother(freq)
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
def test_large_freqs():
    """Exercise PreshCounter on the file named by ``TEST_FILE_LOC``.

    Returns ``None`` without checking anything when the ``TEST_FILE_LOC``
    environment variable is not set. Otherwise the first whitespace-separated
    token of every non-blank line is read as an integer frequency and stored
    under the 1-based line number. The test then checks that a key one past
    the last line number has zero probability before ``smooth()`` and a
    positive one afterwards.
    """
    if 'TEST_FILE_LOC' not in os.environ:
        return None
    loc = os.environ['TEST_FILE_LOC']

    counts = PreshCounter()
    # Context manager so the handle is closed even if a line fails to parse.
    with open(loc) as file_:
        for i, line in enumerate(file_):
            line = line.strip()
            if not line:
                continue
            freq = int(line.split()[0])
            counts.inc(i + 1, freq)
    # NOTE: an empty file leaves ``i`` unbound, so the next line raises
    # NameError; the test expects a non-empty frequencies file.
    oov = i + 2  # one past the largest key that can have been inserted
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
def test_large_freqs():
    """Exercise PreshCounter on the file named by ``TEST_FILE_LOC``.

    Returns ``None`` without checking anything when the ``TEST_FILE_LOC``
    environment variable is not set. Otherwise the first whitespace-separated
    token of every non-blank line is read as an integer frequency and stored
    under the 1-based line number. The test then checks that a key one past
    the last line number has zero probability before ``smooth()`` and a
    positive one afterwards.
    """
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None

    counts = PreshCounter()
    # Context manager so the handle is closed even if a line fails to parse.
    with open(loc) as freqs_file:
        for i, line in enumerate(freqs_file):
            line = line.strip()
            if not line:
                continue
            freq = int(line.split()[0])
            counts.inc(i + 1, freq)
    # NOTE: an empty file leaves ``i`` unbound, so the next line raises
    # NameError; the test expects a non-empty frequencies file.
    oov = i + 2  # one past the largest key that can have been inserted
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    """Read smoothed log-probabilities from a tab-separated frequencies file.

    Every line must hold three tab-separated fields: frequency, document
    frequency and key (a Python literal, parsed with ``literal_eval``). The
    input is obtained from ``check_unzip(freqs_path)`` twice: once to fill
    and smooth a ``PreshCounter`` keyed by 1-based line number, and once to
    compute the log-probability of each entry that passes the filters.

    Args:
        freqs_path: Location handed to ``check_unzip``.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero.
    """
    counter = PreshCounter()
    freq_sum = 0
    # NOTE(review): check_unzip presumably returns an open file of text
    # lines; it is never closed explicitly here - confirm whether it should be.
    rows = check_unzip(freqs_path)
    # Pass 1: every line contributes to the counter and the running total.
    for index, row in enumerate(rows):
        raw_freq, _raw_doc_freq, _raw_key = row.rstrip().split('\t', 2)
        n = int(raw_freq)
        counter.inc(index + 1, n)
        freq_sum += n
    counter.smooth()
    log_total = math.log(freq_sum)

    # Pass 2: re-read from the start and keep the entries passing the filters.
    rows = check_unzip(freqs_path)
    probs = {}
    for row in rows:
        raw_freq, raw_doc_freq, raw_key = row.rstrip().split('\t', 2)
        n_docs = int(raw_doc_freq)
        n = int(raw_freq)
        if n_docs < min_doc_freq or n < min_freq or len(raw_key) >= max_length:
            continue
        word = literal_eval(raw_key)
        probs[word] = math.log(counter.smoother(n)) - log_total
    oov_prob = math.log(counter.smoother(0)) - log_total
    return probs, oov_prob
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    """Build a word -> smoothed log-probability table from a frequencies file.

    The file has one entry per line with three tab-separated fields:
    frequency, document frequency and key (a Python literal, parsed with
    ``literal_eval``). A first pass loads every frequency into a
    ``PreshCounter`` under its 1-based line number and sums them; after
    ``smooth()`` a second pass (iterated through ``tqdm``) scores the entries
    that pass the filters.

    Args:
        freqs_loc: Path-like object providing ``open()``.
        max_length: Entries whose raw key text is this long or longer are
            skipped.
        min_doc_freq: Minimum document frequency for an entry to be kept.
        min_freq: Minimum frequency for an entry to be kept.

    Returns:
        A ``(probs, oov_prob)`` tuple. ``probs`` maps each kept word to
        ``log(counts.smoother(freq)) - log(total)``; ``oov_prob`` is the same
        expression for a frequency of zero.
    """
    print("Counting frequencies...")
    tally = PreshCounter()
    grand_total = 0
    with freqs_loc.open() as stream:
        for line_no, text in enumerate(stream, 1):
            # Unpacking all three fields rejects malformed lines early.
            freq_field, _doc_field, _key_field = text.rstrip().split('\t', 2)
            count = int(freq_field)
            tally.inc(line_no, count)
            grand_total += count
    tally.smooth()
    log_total = math.log(grand_total)

    probs = {}
    with freqs_loc.open() as stream:
        for text in tqdm(stream):
            freq_field, doc_field, key_field = text.rstrip().split('\t', 2)
            doc_count = int(doc_field)
            count = int(freq_field)
            frequent_enough = doc_count >= min_doc_freq and count >= min_freq
            if frequent_enough and len(key_field) < max_length:
                word = literal_eval(key_field)
                smoothed = tally.smoother(count)
                probs[word] = math.log(smoothed) - log_total
    oov_prob = math.log(tally.smoother(0)) - log_total
    return probs, oov_prob