Example #1
def count_frequencies(lang_id, corpus_dir):
    """
    Counts the token frequency and document frequency of each word in the corpus.

    :param lang_id: language or model identifier passed to spacy.load
    :param corpus_dir: directory with text files
    :return: (token counts, document counts, tokenizer)
    """
    nlp = spacy.load(lang_id)
    vocab = nlp.vocab
    tokenizer = nlp.tokenizer

    counts = PreshCounter()
    doccounts = PreshCounter()
    for filename in os.listdir(corpus_dir):
        with codecs.open(os.path.join(corpus_dir, filename),
                         encoding='utf-8') as f:
            data = f.read()
            doc = tokenizer(data)
            doc.count_by(ORTH, counts=counts)
            doccount = doc.count_by(ORTH)
            for key in doccount:  # each distinct word counts once per document
                doccounts.inc(key, 1)

    return counts, doccounts, tokenizer
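
A hedged usage sketch for count_frequencies; the model name and corpus directory below are hypothetical, and the counted words are resolved back to strings through the returned tokenizer's vocab.

counts, doccounts, tokenizer = count_frequencies('en_core_web_sm', 'corpus/')
for word_id, freq in counts:
    print(freq, doccounts[word_id], tokenizer.vocab.strings[word_id])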
Example #2
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
                random.shuffle(sent_strs)
                for sent_str in sent_strs:
                    yield sent_str.split()
Example #3
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
 
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
                random.shuffle(sent_strs)
                for sent_str in sent_strs:
                    yield sent_str.split()
Example #4
def countTheWords(doc):
    counts = PreshCounter()

    for word in doc:
        counts.inc(word.orth, 1)  # word.orth is the token's integer key
    for word_id, count in counts:
        # Resolve the string through the doc's own vocab rather than a global nlp
        print(count, doc.vocab.strings[word_id])
    return counts
Example #5
def test_unsmooth_prob():
    counter = PreshCounter()
    assert counter.prob(12) == 0.0
    counter.inc(12, 1)
    assert counter.prob(12) == 1.0
    counter.inc(14, 10)
    assert counter.prob(14) == 10 / 11
    assert counter.prob(12) == 1.0 / 11
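
A small sketch of the members this test exercises: unsmoothed prob is simply a key's count divided by the counter's running total (the same total attribute the smoothing tests further down rely on).

from preshed.counter import PreshCounter

counter = PreshCounter()
counter.inc(12, 1)    # one event under key 12
counter.inc(14, 10)   # ten events under key 14
print(counter.total)     # 11
print(counter[14])       # 10
print(counter.prob(14))  # 10 / 11, i.e. count / total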
Example #6
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with codecs.open(loc, 'r', 'utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with codecs.open(out_loc, 'w', 'utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
Example #7
def merge_counts(locs, out_loc):
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                orth = string_map[word]
                counts.inc(orth, int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            string = string_map[orth]
            file_.write('%d\t%s\n' % (count, string))
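
A hedged usage sketch for merge_counts; the file names are made up. Each input file holds one count<TAB>word pair per line (the same format written by the count_freqs-style scripts further down), and the merged totals are written back out in that format.

merge_counts(['counts.part0.tsv', 'counts.part1.tsv'], 'counts.merged.tsv')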
Example #8
def get_features(docs, n_docs, max_length=100):
    Xs = np.zeros((n_docs, max_length), dtype=np.int32)
    counts = PreshCounter()

    for i, doc in enumerate(docs):
        for j, token in enumerate(doc[:max_length]):
            if token.has_vector:
                Xs[i, j] = token.rank
                counts.inc(token.rank, 1)
            else:
                Xs[i, j] = 0
    return Xs, counts
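
A hedged usage sketch for get_features; the model name is an assumption, and any spaCy pipeline whose tokens carry vectors (token.has_vector / token.rank) would do.

import spacy

nlp = spacy.load('en_core_web_md')   # hypothetical model with word vectors
docs = list(nlp.pipe(['a short example', 'another document']))
Xs, counts = get_features(docs, n_docs=len(docs))
print(Xs.shape)   # (2, 100): each cell is a token's row index in the vector table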
Example #9
def _read_probs_from_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc), 'rt')  # text mode so lines are str, not bytes
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i + 1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc), 'rt')
    else:
        file_ = loc.open()
    probs = {}
    for line in file_:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(
                key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #10
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    print("Counting frequencies...")
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(
                    key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
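
The frequencies file read above (and by the read_freqs / read_probs variants below) is tab-separated: raw frequency, document frequency, then a Python-literal key. A small sketch of how one line is decoded; the line itself is made up.

from ast import literal_eval

line = "12000\t340\t'the'\n"
freq, doc_freq, key = line.rstrip().split('\t', 2)
print(int(freq), int(doc_freq), literal_eval(key))   # 12000 340 the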
Example #11
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(
                    key) < max_length:
                try:
                    word = literal_eval(key)
                except SyntaxError:
                    # Take odd strings literally.
                    word = literal_eval("'%s'" % key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #12
class Corpus(object):
    def __init__(self, directory):
        self.directory = directory
        self.counts = PreshCounter()

    def count_doc(self, doc):
        for word in doc:
            self.counts.inc(word.orth, 1)

    def __iter__(self):
        for text_loc in self.directory.rglob("*.txt"):
            with io.open(text_loc, "r", encoding="utf8") as file_:
                text = file_.read()

            yield text
Example #13
def test_unsmooth_prob():
    counter = PreshCounter()
    assert counter.prob(12) == 0.0
    counter.inc(12, 1)
    assert counter.prob(12) == 1.0
    counter.inc(14, 10)
    assert counter.prob(14) == 10 / 11
    assert counter.prob(12) == 1.0 / 11
Example #14
def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=200):
    if not loc.exists():
        print("Warning: Frequencies file not found")
        return {}, 0.0
    counts = PreshCounter()
    total = 0
    if str(loc).endswith('gz'):
        file_ = gzip.open(str(loc), 'rt')  # text mode so lines are str, not bytes
    else:
        file_ = loc.open()
    for i, line in enumerate(file_):
        freq, doc_freq, key = line.split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    # Re-open for the second pass (gzip-aware, matching the first pass)
    file_ = gzip.open(str(loc), 'rt') if str(loc).endswith('gz') else loc.open()
    for line in file_:
        freq, doc_freq, key = line.split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            log_smooth_count = math.log(smooth_count)
            probs[word] = log_smooth_count - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #15
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split("\t", 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                try:
                    word = literal_eval(key)
                except SyntaxError:
                    # Take odd strings literally.
                    word = literal_eval("'%s'" % key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #16
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))
    
    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
Example #17
def test_large_freqs():
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None
    counts = PreshCounter()
    for i, line in enumerate(open(loc)):
        line = line.strip()
        if not line:
            continue
        freq = int(line.split()[0])
        counts.inc(i + 1, freq)
    oov = i + 2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
Example #18
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text
Example #19
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                text = file_.read()
            yield text
Example #20
def main(patterns_loc, text_loc, counts_loc, n=10000000):
    nlp = English(parser=False, tagger=False, entity=False)
    print("Make matcher")
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc, n=n)
    counts = PreshCounter()
    t1 = time.time()
    for mwe in get_matches(nlp.tokenizer, phrases, read_text(text_loc)):
        counts.inc(hash_string(mwe.text), 1)
    t2 = time.time()
    print("10m tokens in %d s" % (t2 - t1))

    with codecs.open(counts_loc, 'w', 'utf8') as file_:
        for phrase in read_gazetteer(nlp.tokenizer, patterns_loc, n=n):
            text = phrase.string
            key = hash_string(text)
            count = counts[key]
            if count != 0:
                file_.write('%d\t%s\n' % (count, text))
Example #21
def test_count():
    counter = PreshCounter()
    assert counter[12] == 0
    counter.inc(12, 1)
    assert counter[12] == 1
    counter.inc(14, 10)
    counter.inc(9, 10)
    counter.inc(12, 4)
    assert counter[12] == 5
    assert counter[14] == 10
    assert counter[9] == 10
Example #22
    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
 
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n
Example #23
    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]
            n += count
        return n
Example #24
    def count_doc(self, words):
        doc_counts = PreshCounter()
        doc_strings = {}

        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            corpus_count = self.counts[key]

            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]

            n += count

        return n
Example #25
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        doc_counts = PreshCounter()
        doc_strings = {}

        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word

        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            corpus_count = self.counts[key]

            if corpus_count >= self.min_freq and (corpus_count -
                                                  count) < self.min_freq:
                self.strings[key] = doc_strings[key]

            n += count

        return n

    def __iter__(self):
        for text_loc in trainModel.iterDir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
                random.shuffle(sent_strs)
                for sent_str in sent_strs:
                    yield sent_str.split()
Example #26
def process(batch_id, inputs, output_dir, lang, n_threads, batch_size,
            min_ngram, max_ngram):
    logging.info('Processing batch_id: {}'.format(batch_id))
    subtrees = PreshCounter()
    subtrees_string_map = StringStore()
    noun_chunks = PreshCounter()
    noun_chunks_string_map = StringStore()

    if lang.lower() == "en":
        from spacy.en import English
        NLU = English()
        NLU.matcher = None
    elif lang.lower() == "id":
        from spacy.id import Indonesian
        NLU = Indonesian()
        NLU.matcher = None

    for i, doc in enumerate(
            NLU.pipe(inputs, batch_size=batch_size, n_threads=n_threads)):
        phrases = set()
        for tok in doc:
            st_len = len(list(tok.subtree))
            if min_ngram <= st_len <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws)
                              for t in tok.subtree]).strip()
                orth = subtrees_string_map[st]
                subtrees.inc(orth, 1)
        for np in doc.noun_chunks:
            if min_ngram <= len(np) <= max_ngram:
                st = ''.join([rep_text(t.text_with_ws) for t in np]).strip()
                orth = noun_chunks_string_map[st]
                noun_chunks.inc(orth, 1)

        if i % batch_size == 0:
            logging.info('Processing batch_id: {}, doc: {}'.format(
                batch_id, i))

    output_fname = path.join(output_dir, 'batch{}.st.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in subtrees:
            st = subtrees_string_map[orth]
            if count >= 5 and '!LONGWORD!' not in st:
                out.write('{}\t{}\n'.format(count, st))

    output_fname = path.join(output_dir, 'batch{}.np.freq'.format(batch_id))
    with io.open(output_fname, 'w', encoding='utf-8') as out:
        for orth, count in noun_chunks:
            if count >= 5:
                st = noun_chunks_string_map[orth]
                out.write('{}\t{}\n'.format(count, st))
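
rep_text is a helper from the original script that is not shown here. A hedged stand-in, guessing from the '!LONGWORD!' filter above that overly long tokens are replaced with a marker:

def rep_text(text, max_len=20):
    # Assumed behaviour, not the original helper: keep ordinary tokens
    # (with their trailing whitespace) and mark very long ones.
    return text if len(text.strip()) <= max_len else '!LONGWORD! '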
Example #27
def test_large_freqs():
    if 'TEST_FILE_LOC' in os.environ:
        loc = os.environ['TEST_FILE_LOC']
    else:
        return None
    counts = PreshCounter()
    for i, line in enumerate(open(loc)):
        line = line.strip()
        if not line:
            continue
        freq = int(line.split()[0])
        counts.inc(i+1, freq)
    oov = i+2
    assert counts.prob(oov) == 0.0
    assert counts.prob(1) < 0.1
    counts.smooth()
    assert counts.prob(oov) > 0
    assert counts.prob(oov) < counts.prob(i)
Example #28
def count_freqs(Language, input_loc, output_loc):
    tokenizer = Language.Defaults.create_tokenizer()

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
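
iter_comments is a helper from the original script and is not shown here. A minimal stand-in, assuming (from the json_comment['body'] access above) one JSON object per line with a 'body' text field:

import io
import json

def iter_comments(loc):
    # Assumed input layout: one JSON object per line, each carrying a 'body' field.
    with io.open(loc, encoding='utf8') as file_:
        for line in file_:
            line = line.strip()
            if line:
                yield json.loads(line)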
Example #29
def test_count():
    counter = PreshCounter()
    assert counter[12] == 0
    counter.inc(12, 1)
    assert counter[12] == 1
    counter.inc(14, 10)
    counter.inc(9, 10)
    counter.inc(12, 4)
    assert counter[12] == 5
    assert counter[14] == 10
    assert counter[9] == 10
Example #30
def count_freqs(input_loc, output_loc):
    print(output_loc)
    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    nlp.vocab.lexeme_props_getter = null_props

    counts = PreshCounter()
    tokenizer = nlp.tokenizer
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = nlp.vocab.strings[orth]
            file_.write('%d\t%s\n' % (freq, repr(string)))
Example #31
def count_freqs(input_loc, output_loc):
    print(output_loc)
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)

    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
Example #32
def _tokenize(language, path, output_path):
    '''Tokenize a text file with a customized spaCy tokenizer and count tokens.'''
    counter = PreshCounter()
    defaults = spacy.blank(language).Defaults

    rules = defaults.tokenizer_exceptions
    token_match = defaults.token_match
    prefix_search = (spacy.util.compile_prefix_regex(defaults.prefixes).search
                     if defaults.prefixes else None)
    suffix_search = (spacy.util.compile_suffix_regex(defaults.suffixes).search
                     if defaults.suffixes else None)

    # Correct for the fact that spacy does not preserve infix hyphens by default
    punct = r'?";:=,.'
    ignore_infix = r'(?<=[{a}])[{p}]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS, p=punct)

    # Correctly support ending sentences: 1) Start of token followed by a punct, 2)
    # punct followed by punct, 3) two consecutive non-punct chars followed by a
    # punct char, 4) Start of token followed by non-punct char, followed by punct
    punct_infix = r'^[{p}]|(?<=[{p}])[{p}]|(?<=[^{p}]{{2}})[{p}]|(?<=^[^{p}])[{p}]$'.format(p=punct)
    infixes = (
        [punct_infix] +
        [infix for infix in defaults.infixes if infix != ignore_infix]
    )
    infix_finditer = spacy.util.compile_infix_regex(infixes).finditer

    tokenizer = spacy.tokenizer.Tokenizer(
        defaults.create_vocab(), rules=rules,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=token_match
    )

    with ExitStack() as stack:
        input_file = stack.enter_context(open(path, 'rt'))
        output_file = stack.enter_context(open(output_path, 'wt'))
        for text in tokenizer.pipe(input_file):
            tokenized = ' '.join([token.text for token in text if not token.is_space])
            text.count_by(spacy.attrs.ORTH, counts=counter)
            output_file.write(tokenized + '\n')

    return Counter({
        tokenizer.vocab[i].text: c
        for i, c in counter
        if not tokenizer.vocab[i].is_space
    })
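
A hedged usage sketch for _tokenize; the language code and paths are hypothetical. The function writes one whitespace-joined tokenized line per input line and returns a collections.Counter of non-space token frequencies.

token_counts = _tokenize('en', 'raw_text.txt', 'tokenized_text.txt')
print(token_counts.most_common(10))   # ten most frequent tokens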
Example #33
def count_freqs(input_loc, output_loc, LangClass):
    start = time.time()
    print('INFO: Processing ', input_loc)
    vocab = LangClass.Defaults.create_vocab()
    tokenizer = LangClass.Defaults.create_tokenizer()
    #Tokenizer(vocab,path.join(LangClass.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    for text in iter_comments(input_loc):
        doc = tokenizer(text)
        doc.count_by(ORTH, counts=counts)

    with io.open(output_loc, 'w', encoding='utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
    end = time.time() - start
    print('INFO: File {} took {} min '.format(input_loc, end / 60))
Example #34
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #35
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
    counts = PreshCounter()
    total = 0
    freqs_file = check_unzip(freqs_path)
    for i, line in enumerate(freqs_file):
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        freq = int(freq)
        counts.inc(i+1, freq)
        total += freq
    counts.smooth()
    log_total = math.log(total)
    freqs_file = check_unzip(freqs_path)
    probs = {}
    for line in freqs_file:
        freq, doc_freq, key = line.rstrip().split('\t', 2)
        doc_freq = int(doc_freq)
        freq = int(freq)
        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
            word = literal_eval(key)
            smooth_count = counts.smoother(int(freq))
            probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #36
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    print("Counting frequencies...")
    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:
        for i, line in enumerate(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            freq = int(freq)
            counts.inc(i + 1, freq)
            total += freq
    counts.smooth()
    log_total = math.log(total)
    probs = {}
    with freqs_loc.open() as f:
        for line in tqdm(f):
            freq, doc_freq, key = line.rstrip().split('\t', 2)
            doc_freq = int(doc_freq)
            freq = int(freq)
            if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
                word = literal_eval(key)
                smooth_count = counts.smoother(int(freq))
                probs[word] = math.log(smooth_count) - log_total
    oov_prob = math.log(counts.smoother(0)) - log_total
    return probs, oov_prob
Example #37
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq
Example #38
    def __init__(self, directory):
        self.directory = directory
        self.counts = PreshCounter()
Example #39
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq
Example #40
def test_smooth_prob():
    p = PreshCounter()
    # 1 10
    # 2 6
    # 3 4
    # 5 2
    # 8 1
    for i in range(10):
        p.inc(100-i, 1) # 10 items of freq 1
    for i in range(6):
        p.inc(90 - i, 2) # 6 items of freq 2
    for i in range(4):
        p.inc(80 - i, 3) # 4 items of freq 3
    for i in range(2):
        p.inc(70 - i, 5) # 2 items of freq 5
    for i in range(1):
        p.inc(60 - i, 8) # 1 item of freq 8

    assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8)

    assert p.prob(100) == 1.0 / p.total
    assert p.prob(200) == 0.0
    assert p.prob(60) == 8.0 / p.total

    p.smooth()

    assert p.smoother(1) < 1.0
    assert p.smoother(8) < 8.0
    assert p.prob(1000) < p.prob(100)

    for event, count in reversed(sorted(p, key=lambda it: it[1])):
        assert p.smoother(count) < count
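
A small sketch of the same behaviour outside the test: once smooth() has been called, raw counts are damped and unseen keys pick up a small non-zero probability. The keys and counts below are arbitrary.

from preshed.counter import PreshCounter

p = PreshCounter()
for key in range(12):
    p.inc(1000 + key, 1)   # a dozen singletons
for key in range(5):
    p.inc(2000 + key, 2)   # five keys seen twice
for key in range(3):
    p.inc(3000 + key, 3)   # three keys seen three times
p.inc(4000, 6)             # one key seen six times

print(p.prob(9999))        # 0.0: unseen keys have no unsmoothed probability
p.smooth()
print(p.prob(9999))        # small but non-zero after smoothing
print(p.smoother(1))       # damped estimate, below the raw count of 1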
Example #41
def test_smooth_prob():
    p = PreshCounter()
    # 1 10
    # 2 6
    # 3 4
    # 5 2
    # 8 1
    for i in range(10):
        p.inc(100 - i, 1)  # 10 items of freq 1
    for i in range(6):
        p.inc(90 - i, 2)  # 6 items of freq 2
    for i in range(4):
        p.inc(80 - i, 3)  # 4 items of freq 3
    for i in range(2):
        p.inc(70 - i, 5)  # 2 items of freq 5
    for i in range(1):
        p.inc(60 - i, 8)  # 1 item of freq 8

    assert p.total == (10 * 1) + (6 * 2) + (4 * 3) + (2 * 5) + (1 * 8)

    assert p.prob(100) == 1.0 / p.total
    assert p.prob(200) == 0.0
    assert p.prob(60) == 8.0 / p.total

    p.smooth()

    assert p.smoother(1) < 1.0
    assert p.smoother(8) < 8.0
    assert p.prob(1000) < p.prob(100)

    for event, count in reversed(sorted(p, key=lambda it: it[1])):
        assert p.smoother(count) < count