def decompose(self, corpus):
    skip_counts = bounter(size_mb=1024)
    word_counts = bounter(size_mb=1024)
    for line in corpus:
        wds = line.split()
        skips = list(skipgrams(wds, 2, 5))
        skips = ["#".join(t) for t in skips]
        if len(wds) > 0 and len(skips) > 0:
            skip_counts.update(skips)
            word_counts.update(wds)
    vocabulary = list(word_counts)
    shift = 1  # a shift of 1 does nothing, since log(1) == 0.0
    # Build the shifted co-occurrence matrix (presumably shifted PMI;
    # a hypothetical sketch of this helper follows below).
    M = count_skipgrams(skip_counts, word_counts, vocabulary, shift)
    # TODO: eigen something trick
    # Singular value decomposition.
    # U, _, V = svds(M, k=256)  # U, S, V
    U, _, V = sparsesvd(M, 300)
    # Add the context vectors to U.
    word_vecs = U.T + V.T
    del U
    del V
    # Normalize rows: unit L2 norm per word vector.
    word_vecs_norm = word_vecs / np.sqrt(
        np.sum(word_vecs * word_vecs, axis=1, keepdims=True))
    del word_vecs
    return vocabulary, word_vecs_norm
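# `count_skipgrams` is not defined in the snippet above. A minimal sketch
# of what it plausibly does, assuming it builds a sparse shifted-PPMI
# matrix over the vocabulary (the usual input to this SVD factorization).
# The name `count_skipgrams_sketch`, the csc layout, and the PMI
# normalization are assumptions, not the original implementation.
import numpy as np
from scipy.sparse import csc_matrix


def count_skipgrams_sketch(skip_counts, word_counts, vocabulary, shift):
    index = {w: i for i, w in enumerate(vocabulary)}
    total = skip_counts.total()  # total number of observed skipgrams
    rows, cols, vals = [], [], []
    for pair, n_xy in skip_counts.items():
        x, y = pair.split("#")
        # PMI(x, y) = log(p(x, y) / (p(x) p(y))), approximated with raw
        # counts, then shifted by log(shift).
        pmi = np.log(n_xy * total / (word_counts[x] * word_counts[y]))
        val = pmi - np.log(shift)
        if val > 0:  # keep positive entries only (PPMI)
            rows.append(index[x])
            cols.append(index[y])
            vals.append(val)
    n = len(vocabulary)
    # sparsesvd expects a scipy.sparse.csc_matrix
    return csc_matrix((vals, (rows, cols)), shape=(n, n))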
def init_parameters(self):
    """Initialize the model parameters."""
    # 1. Initialize the theta parameters.
    # Enumerate every possible candidate phrase and count its frequency.
    counter = bounter.bounter(size_mb=self.memory_size_mb)
    for i in range(1, len(self.C) + 1):
        for j in range(i + 1, min(i + self.length_threshold + 1, len(self.C) - 1)):
            if not self.Q.startswith(self.C.word(i, j)):
                break
            counter.update((self.C.word(i, j),))
    # Frequency distribution of the candidates.
    distr = np.array([c for c in counter.values()])
    _mean, _std = distr.mean(), distr.std()
    norm = truncnorm((0. - _mean) / _std, (1. - _mean) / _std,
                     loc=_mean, scale=_std)
    # theta maps each candidate phrase to its parameter index.
    theta = bounter.bounter(size_mb=self.memory_size_mb)
    idx_theta = 0  # zero-based index
    for candidate in counter.keys():
        theta[candidate] = idx_theta
        idx_theta += 1
    del counter
    # Randomly initialize theta from the truncated normal distribution.
    param_theta = norm.rvs(size=idx_theta)

    # 2. Initialize the delta parameters.
    # Collect all possible POS-tag bigrams.
    delta = self.Tx_Ty()
    # Randomly initialize delta from a uniform distribution.
    param_delta = np.random.uniform(size=len(delta))

    self._delta = delta
    self._theta = theta
    self._param_delta = param_delta
    self._param_theta = param_theta
    # Track parameter convergence.
    self._trace_delta.append(
        compare_arrays(param_delta, np.ones(param_delta.size)))
    self._trace_theta.append(
        compare_arrays(param_theta, np.ones(param_theta.size)))
def ngram_counter(self, remove_stopwords, memory_size_mb):
    # Bounter is a probabilistic frequency counter and cannot be relied
    # on for exact counts; see
    # https://github.com/RaRe-Technologies/bounter
    count = bounter.bounter(size_mb=memory_size_mb)
    if remove_stopwords:
        stopwords = one_token_a_line(fname='stopwords.txt')
    else:
        stopwords = None
    process_bar = tqdm.tqdm(self.corpus, ascii=True)
    for doc in process_bar:
        process_bar.set_description('Counting n-grams')
        if not isinstance(doc, Sentences):
            err = 'doc must be a Sentences object, but got: {}'.format(
                type(doc))
            raise TypeError(err)
        for sent in doc:
            # Filter stopwords after each n-gram token is formed,
            # rather than removing them from the raw corpus.
            if remove_stopwords and stopwords is not None:
                ngram_tokens = [
                    t for t in ngram(sent, self.ngram_size)
                    if t not in stopwords
                ]
            else:
                ngram_tokens = ngram(sent, self.ngram_size)
            count.update(ngram_tokens)
    return count
def __init__(self, dumpfile=None, lowfreq_threshold=0,
             highfreq_threshold=65535, hash_size=7 + 10**8,
             hash_dtype=np.dtype([('counter', 'u2'), ('n-gram', bytes, 5)])):
    self.hash_dtype = hash_dtype
    self.hash_size = hash_size
    self.hash_dumpfile = dumpfile
    self.lowfreq_threshold = lowfreq_threshold
    self.highfreq_threshold = highfreq_threshold
    self.hash_add_tries = 0     # total number of hash-add calls
    self.hash_added_keys = 0    # number of buckets in use
    self.hash_relookups = 0     # total re-lookups across all hash-add calls
    self.hash_collisions = 0    # hash-add calls that fail on all re-lookups
    self.hash_ceilings = 0      # hash-add calls whose counter overflows highfreq_threshold
    self.hash_overwrites = 0    # hash-add calls where a key overwrites another and its counter resets to 1
    self.hash_counter_lost = 0  # sum of the counter values lost when keys are overwritten
    self.bnt = bounter(need_counts=False)  # use the HLL algorithm only
    self.bnt_count = 0
    self.ht = None
    # np.random.randint(10**9, 10**10, 8)
    self.hash_seeds = [2819948058, 5686873063, 1769651746, 8745608315,
                       2414950003, 3583714723, 1224464945, 2514535028]
    self.hash_funcs_num = len(self.hash_seeds)
    if dumpfile:
        self.ht = self.load(dumpfile)
    if self.ht is None:
        self.ht = self.init()
    if self.ht is not None:
        print("hash_table dtype(%s)" % self.ht.dtype)
        print("hash_size(%d): %s" % (len(self.ht), self.ht))
    else:
        print("hash_table load() or init() failed.")
def num_documents_with_ngram(self, memory_size_mb):
    if self._candidates is None:
        raise ValueError(
            'n-gram candidates are None; cannot count the number of '
            'documents containing an n-gram without them.')
    # Fewer candidates mean less memory usage.
    count = bounter.bounter(size_mb=memory_size_mb // 2)
    process_bar = tqdm.tqdm(self.corpus, ascii=True)
    for doc in process_bar:
        process_bar.set_description('Counting docs with ngram')
        tokens_doc_level = set()
        if not isinstance(doc, Sentences):
            err = 'doc must be a Sentences object, but got: {}'.format(
                type(doc))
            raise TypeError(err)
        for sent in doc:
            tokens_sent_level = set(
                t for t in ngram(sent, self.ngram_size)
                if t in self._candidates)
            tokens_doc_level.update(tokens_sent_level)
        if tokens_doc_level:
            count.update(tokens_doc_level)
    return count
def test_sanity_nocount(self):
    counter = bounter(need_counts=False)
    counter.update([u'foo', u'bar', u'foo'])
    self.assertEqual(counter.total(), 3)
    self.assertEqual(counter.cardinality(), 2)
    with self.assertRaises(NotImplementedError):
        print(counter[u'foo'])
def update_delta(self, B):
    assert isinstance(B, list)
    assert len(B) > 1
    pre_param = self._param_delta
    m = len(B)
    n = len(self.C)
    TxTy_numerator = bounter.bounter(size_mb=self.memory_size_mb // 2)
    TxTy_denominator = bounter.bounter(size_mb=self.memory_size_mb // 2)
    bar = tqdm.tqdm(range(1, m - 1), ascii=True)
    for i in bar:
        bar.set_description('delta 1/3')
        for j in range(B[i], B[i + 1] - 1):
            TxTy_numerator.increment(self.C.tag(j, j + 2))
    bar = tqdm.tqdm(range(1, n), ascii=True)
    for i in bar:
        bar.set_description('delta 2/3')
        TxTy_denominator.increment(self.C.tag(i, i + 2))
    bar = tqdm.tqdm(self._delta.items(), total=len(self._delta), ascii=True)
    for txty, idx in bar:
        bar.set_description('delta 3/3')
        if TxTy_numerator[txty] != 0:
            new_delta_value = TxTy_numerator[txty] / TxTy_denominator[txty]
            self._param_delta[idx] = new_delta_value
        else:
            self._param_delta[idx] = 0.
    self._trace_delta.append(compare_arrays(self._param_delta, pre_param))
    del pre_param
def count_number_worker(fname, size_mb=200):
    """
    Worker for multiprocessing the counting operation for speed.

    :param fname: the bz2 file to open.
    :param size_mb: the maximum memory footprint of the bounter.
    :return: bounter object keyed on subreddit names
    """
    b = bounter.bounter(size_mb=size_mb)
    with bz2.open(fname, "rt", encoding="utf8") as F:
        b.update(json.loads(line)["subreddit"] for line in F)
    return b
def get_word_cnts_gt_min_cnt(self, sentences, min_cnt):
    flattened = [word for sentence in sentences for word in sentence]
    counter = bounter(size_mb=4096)
    counter.update(flattened)
    # Materialize the (word, count) pairs from the bounter.
    array_cnt = np.array(list(counter.iteritems()))
    words = array_cnt[:, 0]
    cnts = array_cnt[:, 1].astype(int)
    # Keep only the words whose count exceeds min_cnt.
    valid = np.where(cnts > min_cnt)
    valid_words = words[valid]
    valid_cnts = cnts[valid]
    word_cnts = list(zip(valid_words, valid_cnts))
    return word_cnts, valid_words, len(flattened)
def count_ngrams(self, sentences, n, use_bounter=True, sep='\t',
                 **bounterargs):
    """Counts n-gram occurrences in a corpus.

    Counts n-gram occurrences in a corpus and inserts the output into
    an SQLite database.

    Parameters
    ----------
    sentences : iterable
        Iterable of sentences. Each sentence must be a list of strings
        representing word features separated by the character passed
        to the 'sep' argument of this function.
    n : int or list of int
        Length(s) of the n-grams.
    use_bounter : bool, default=True
        If True, counting is performed with bounter, a probabilistic,
        memory-efficient counter. If False, a regular Counter is used.
        Bounter is strongly recommended when working with a large
        corpus.
    sep : str, default '\t'
        The character that separates the features of each word in the
        input.
    **bounterargs
        Keyword arguments passed to the bounter constructor, if used.
    """
    messages.msg("Counting ngrams of length {}...".format(n))
    if use_bounter:
        bounterargs.setdefault('size_mb', 1024)
        counter = bounter(**bounterargs)
    else:
        counter = Counter()
    for sent in sentences:
        if isinstance(n, list):
            ngrams = list()
            for i in n:
                ngrams += NgramCounter._gen_ngrams(sent, i)
        else:
            ngrams = NgramCounter._gen_ngrams(sent, n)
        counter.update(ngrams)
    messages.done()
    self._counts_to_db(counter, sep)
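# `NgramCounter._gen_ngrams` is referenced above but not shown. A
# plausible stand-in, assuming pre-tokenized sentences and n-grams
# joined on a separator; this is hypothetical, not the project's actual
# helper:
def _gen_ngrams_sketch(sent, n, sep=' '):
    # A sentence shorter than n yields no n-grams.
    return [sep.join(sent[i:i + n]) for i in range(len(sent) - n + 1)]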
def _index_words(df):
    all_words = np.concatenate(df['mecab'].values)
    if self.use_min_cnt:
        cnt = bounter(size_mb=4096)
        cnt.update(all_words)
        words_cnt = np.array(list(cnt.iteritems()))
        words = words_cnt[:, 0]
        cnts = words_cnt[:, 1].astype(int)
        unique_words = words[np.where(cnts >= self.word_min_cnt)]
    else:
        unique_words = pd.unique(all_words)
    print('number of unique_words: {}'.format(unique_words.shape[0]))
    w2i, i2w = {}, {}
    # Word indices start at 1, leaving index 0 unused (e.g. for padding).
    for i, w in enumerate(unique_words):
        w2i[w] = i + 1
        i2w[i + 1] = w
    return unique_words, w2i, i2w
def _frequencies_from_records(records, ids=None, size_mb=None):
    """Frequencies from a records generator."""
    if ids is not None:
        # Normalize ids to a set of indices.
        x = set()
        if isinstance(ids, numbers.Integral):
            x.add(ids)
        else:
            x.update(ids)
        ids = x
    is_1d = None

    def is_sequence(obj):
        # Treat strings and non-sequences as scalar (1-d) records.
        if isinstance(obj, str):
            return True
        return not isinstance(obj, collections.abc.Sequence)

    def stringify(features):
        nonlocal is_1d
        if is_1d is None:
            is_1d = is_sequence(features)
        if is_1d:
            return str(features)
        if ids:
            # Select only the configured set of feature indices.
            return ' '.join(str(x) for j, x in enumerate(features) if j in ids)
        return ' '.join(str(x) for x in features)

    if callable(records):
        records = records()
    if size_mb:
        # Approximate counting using bounter.
        counts = bounter(size_mb=size_mb)
    else:
        counts = collections.Counter()
    counts.update(stringify(row) for row in records)
    return counts
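# A small usage sketch for `_frequencies_from_records` (records and
# values are illustrative): an exact Counter by default, an approximate
# bounter when size_mb is given, and `ids` restricting which feature
# columns are counted.
records = [('a', 'b', 'c'), ('a', 'b', 'd'), ('a', 'b', 'c')]
exact = _frequencies_from_records(records)  # collections.Counter
approx = _frequencies_from_records(records, ids=[0, 1], size_mb=64)
# exact['a b c'] == 2; approx['a b'] is ~3 (within bounter's error bounds)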
def do_gen_mfw(paths_freqs, estimate=True, n=None, by_ntext=False,
               by_fpm=False, progress=False, desc='', num_proc=1,
               floatpad=100000):
    from bounter import bounter
    from collections import Counter
    from tqdm import tqdm

    countd = bounter(1024) if estimate else Counter()
    for freqs in pmap_iter(
            getfreqs,
            paths_freqs,
            kwargs=dict(by_ntext=by_ntext, by_fpm=by_fpm),
            progress=progress,
            num_proc=num_proc,
            desc='Computing most frequent words across all texts'):
        freqs = dict((w, c) for w, c in freqs.items() if is_valid_mfw_word(w))
        # Bounter needs integer counts; a plain Counter handles floats
        # fine, so only intervene when estimating.
        typs = {type(c) for w, c in freqs.items()}
        if typs != {int} and estimate:
            # Scale each float count (fpm) up to an integer, then expand
            # into a token list that bounter.update() can consume.
            freqs_int = dict((w, int(math.ceil(c * floatpad)))
                             for w, c in freqs.items())
            freqs = [w for w, c in freqs_int.items() for _ in range(c)]
        countd.update(freqs)
    return countd
def create_counts(input):
    qout = Queue(cpus * 2)
    workers = []
    logging.info("Spawning {} count processes on {}".format(cpus, input))
    for i in range(cpus):
        p = Process(target=counter, args=(qout, i, input, "count"))
        p.start()
        workers.append(p)
    wordcounter = bounter.bounter(memory)
    while True:
        try:
            input_dict = qout.get_nowait()
            logging.debug("inputting queue of length {} from worker".format(
                len(input_dict)))
            wordcounter.update(input_dict)
        except queue.Empty:
            if running_processes(workers):
                time.sleep(1 / 100)
            else:
                break
        except ValueError:
            # Replay the batch item by item to expose the offending entry.
            for k, v in input_dict.items():
                print("'{}'\t'{}'".format(k, v))
                wordcounter.update({k: v})
            raise
        except TypeError:
            for k, v in input_dict.items():
                print("'{}'\t'{}'".format(k, v))
                wordcounter.update({k: v})
            raise
    return wordcounter
def update_theta(self, B):
    assert isinstance(B, list)
    assert len(B) > 1
    pre_param = self._param_theta
    m = len(B)
    numerator = bounter.bounter(size_mb=self.memory_size_mb // 2)
    denominator = collections.defaultdict(int)
    bar = tqdm.tqdm(range(m - 1), ascii=True)
    for i in bar:
        bar.set_description('theta 1/2')
        numerator.increment(self.C.word(B[i], B[i + 1]))
        denominator[B[i + 1] - B[i]] += 1
    bar = tqdm.tqdm(self._theta.items(), total=len(self._theta), ascii=True)
    for candidate, idx in bar:
        bar.set_description('theta 2/2')
        if numerator[candidate] != 0:
            u_len = candidate.count(' ') + 1
            new_theta_value = numerator[candidate] / denominator[u_len]
            self._param_theta[idx] = new_theta_value
        else:
            self._param_theta[idx] = 0.
    self._trace_theta.append(compare_arrays(self._param_theta, pre_param))
    del pre_param
def count_posts(num_threads):
    """
    Main function to multiprocess the subreddit counting. Reads the
    Reddit Public Comment Corpus bz2 files under the "reddit" folder
    (subfolders are allowed, but they should be the ONLY files in that
    directory) and saves the counts to "Subreddit Post Counts.csv".

    :param num_threads: int
        How many worker processes to use when processing the data.
        More --> faster, but also uses more memory.
    :return: bounter object with subreddit counts
    """
    files = list(recursive_scan("reddit"))
    # Count posts per subreddit in parallel across files.
    with multiprocessing.Pool(processes=num_threads) as pool:
        bounters = list(
            tqdm(pool.imap(count_number_worker, files),
                 desc="Counting Posts",
                 total=len(files)))
    # Merge the per-file bounters into a single counter.
    big_bounter = bounter.bounter(size_mb=200)
    for b in bounters:
        big_bounter.update(b)
    df = pd.Series(dict(big_bounter.items()))
    df.to_csv("Subreddit Post Counts.csv")
    return big_bounter
for e in v:
    e = e.split()
    for wd in e:
        nodes.append(wd)

months = sorted(months)
nodes = Counter(nodes)
nodes = {
    k: v
    for k, v in sorted(nodes.items(), key=lambda item: item[1])
    if v > 500 and (len(k) > 1 or k == "a") and k not in stoplist
}
vocabulary = set(nodes.keys())
print(len(vocabulary))

wd_times = {}
edges = bounter(size_mb=1024)
i = len(vocabulary)
for wd in vocabulary:
    print(f"{i} words remaining")
    times = []
    for k, v in month_utterances.items():
        for e in v:
            e = e.split()
            e = [w for w in e if w in vocabulary]
            if len(e) > 1:
                if wd in set(e):
                    times.append(int(k))
                skips = list(skipgrams(e, 2, 3))
                skips = [skip[0] + "_" + skip[1] for skip in skips]
                edges.update(skips)
    if len(times) > 0:
def __init__(self, filename=None, vocab_size=None, max_len=None,
             chunk_size=10**5, delimiter=None, size_mb=4024,
             pad_symbol='<pad>', start_symbol='<s>', end_symbol='</s>',
             unknown_symbol='<unk>', default_pad_start=False,
             default_pad_end=True, filter_on=None, prune_at=10**10,
             encoding='utf8', **kwargs):
    """
    This is the object to store text and read it into vocabulary
    indices. The object is an iterable that yields the vocabulary
    indices of the tokens in the sentences.

    :param filename: Text file that contains the source sentences.
    :type filename: str
    :param vocab_size: Max no. of words to keep in the source vocab.
    :type vocab_size: int
    :param chunk_size: Limits the no. of sentences loaded at a time
        when populating the vocabulary.
    :type chunk_size: int
    :param delimiter: Delimiter to split on when "tokenizing".
    :type delimiter: str
    :param size_mb: Memory footprint of the bounter object used to
        count the vocab.
    :type size_mb: int
    :param start_symbol: Start symbol used for padding.
    :type start_symbol: str
    :param end_symbol: End symbol used for padding.
    :type end_symbol: str
    :param unknown_symbol: Unknown symbol for OOV words.
    :type unknown_symbol: str
    :param default_pad_start: By default, pad <s> to the sentence when
        vectorizing.
    :type default_pad_start: bool
    :param default_pad_end: By default, pad </s> to the sentence when
        vectorizing.
    :type default_pad_end: bool
    :param filter_on: Option to filter on term frequency ('tf') or
        document frequency ('df').
    :type filter_on: str
    :param prune_at: *prune_at* parameter used by gensim.Dictionary.
    :type prune_at: int
    """
    if 'loadfrom' not in kwargs:  # Creating.
        self.filename = absolute_path(filename)
        # Check that the input file exists.
        assert Path(self.filename).exists(), \
            "File {filename} does not exist".format(filename=filename)
        # Initialize encoding.
        self.encoding = encoding
        # Initialize the pad, start, end and unknown symbols.
        self.PAD, self.PAD_IDX = pad_symbol, 0
        self.START, self.START_IDX = start_symbol, 1
        self.END, self.END_IDX = end_symbol, 2
        self.UNK, self.UNK_IDX = unknown_symbol, 3
        self.default_pad_start = default_pad_start
        self.default_pad_end = default_pad_end
        # Save the user-specified delimiter.
        self.delimiter = delimiter
        # Gensim-related attribute that keeps the pruning cap.
        self.prune_at = prune_at
        # Populate the source vocabulary.
        print('Creating Vocabulary...', end='\n', file=sys.stderr)
        self.vocab = Dictionary(
            [[pad_symbol], [start_symbol], [end_symbol], [unknown_symbol]],
            prune_at=self.prune_at)
        self.counter = bounter(size_mb=size_mb)
        print('Building source vocab and counter...', end=' ',
              file=sys.stderr)
        self.populate_dictionary(self.filename, self.vocab, self.counter,
                                 chunk_size)
        # Use the user-specified vocab size if set, else the full
        # vocab size.
        self.vocab_size = min(len(self.vocab),
                              vocab_size) if vocab_size else len(self.vocab)
        # Cap the vocabulary at the size set by the user.
        if filter_on and self.vocab_size < len(self.vocab):
            print('Filtering least frequent words in vocab.', end='\n',
                  file=sys.stderr)
            if filter_on == 'tf':
                self.filter_n_least_frequent(
                    self.vocab, self.counter, self.vocab_size,
                    keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
            elif filter_on == 'df':
                self.vocab.filter_extremes(
                    no_below=1, no_above=self.prune_at,
                    keep_n=self.vocab_size,
                    keep_tokens=['<pad>', '<s>', '</s>', '<unk>'])
        self.iterable = self._iterate()
    else:  # Loading.
        self.load(kwargs['loadfrom'], filename,
                  kwargs.get('load_counter', False))
        self.iterable = self._iterate()
def test_sanity_default(self):
    counter = bounter(size_mb=16)
    counter.update([u'foo', u'bar', u'foo'])
    self.assertEqual(counter[u'foo'], 2)
    self.assertEqual(counter[u'bar'], 1)
    self.assertEqual(counter.cardinality(), 2)

def test_nocounts_init(self):
    counter = bounter(need_counts=False)
    self.assertTrue(issubclass(type(counter), CountMinSketch))
    self.assertEqual(counter.size(), 4)

def test_ht_log_init(self):
    with self.assertRaises(ValueError):
        bounter(size_mb=4, log_counting=8)

def test_cms_init_log8(self):
    counter = bounter(size_mb=1, need_iteration=False, log_counting=8)
    self.assertEqual(type(counter), CountMinSketch)
    self.assertEqual(type(counter.cms), cmsc.CMS_Log8)
    self.assertEqual(counter.size(), 2**20)

def test_cms_init_default(self):
    counter = bounter(size_mb=64, need_iteration=False)
    self.assertEqual(type(counter), CountMinSketch)
    self.assertEqual(type(counter.cms), cmsc.CMS_Conservative)
    self.assertEqual(counter.size(), 2**26)

def test_explicit_init(self):
    counter = bounter(size_mb=2, need_iteration=True)
    self.assertEqual(type(counter), HashTable)
    self.assertEqual(counter.buckets(), 2**16)

def test_no_size_init(self):
    with self.assertRaises(ValueError):
        counter = bounter()

def test_default_init(self):
    counter = bounter(7)
    self.assertEqual(type(counter), HashTable)

def test_contains(self):
    counter = bounter(size_mb=16)
    counter.update([u'foo', u'bar', u'foo'])
    self.assertTrue('foo' in counter)
    self.assertFalse('foobar' in counter)
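# The tests above exercise bounter's three backing structures. A compact
# usage sketch of the same public API (sizes are illustrative):
from bounter import bounter

counts = bounter(size_mb=64)  # HashTable: iteration and per-key counts
counts.update([u'foo', u'bar', u'foo'])
assert counts[u'foo'] == 2 and u'foo' in counts

# CountMinSketch: counts without iteration; log_counting trades
# accuracy for smaller (8-bit) counters.
sketch = bounter(size_mb=64, need_iteration=False, log_counting=8)
sketch.update([u'foo', u'bar', u'foo'])

# HyperLogLog only: cardinality and total, no per-key counts.
card = bounter(need_counts=False)
card.update([u'foo', u'bar', u'foo'])
assert card.cardinality() == 2 and card.total() == 3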
def process_pubtator(sample_ratio=0.03):
    """Sample PubTator documents and batch them for multiprocessing."""
    logger.info('Processing PubTator documents...')
    global docs
    docs = []
    total_meshes = 0
    total_docs_read = 0
    bsz = 10000  # batch size
    word_freq = bounter(size_mb=1024 * 4)

    def cb_proc_pubtator(res):
        nonlocal total_meshes
        docs_, words_, num_meshes = res
        docs.extend(docs_)
        word_freq.update(words_)
        total_meshes += num_meshes
        print('total_meshes {}, num_meshes {}\r'
              ''.format(total_meshes, num_meshes), end='')

    p = mp.Pool()
    # Read the PubTator data file.
    with pubtator_file.open('rt') as f:
        batch = []
        aDoc = []
        flgSample = rnd.random() < sample_ratio
        while True:
            line = f.readline()
            if not line:  # End of file
                break
            if not flgSample:
                # Not sampling this document; just read on.
                if line == '\n':  # End of document
                    flgSample = rnd.random() < sample_ratio
                    total_docs_read += 1
                else:
                    continue
            else:
                if line == '\n':  # End of document
                    total_docs_read += 1
                    batch.append(aDoc)  # Add the current document
                    flgSample = rnd.random() < sample_ratio
                    aDoc = []
                    if len(batch) == bsz:
                        # The batch is full; dispatch a job.
                        p.apply_async(mp_proc_pubtator, (batch,),
                                      callback=cb_proc_pubtator)
                        batch = []
                else:
                    aDoc.append(line.rstrip())
        if len(batch) > 0:
            p.apply_async(mp_proc_pubtator, (batch,),
                          callback=cb_proc_pubtator)
    p.close()
    p.join()
    # Move from bounter to Counter (bounter doesn't have most_common()).
    global words
    words = Counter({k: v for k, v in word_freq.items()})
    logger.info('{}/{} documents processed, {} mesh terms found ({} meshes '
                'per doc)'.format(len(docs), total_docs_read, total_meshes,
                                  total_meshes / len(docs)))
import pickle

import networkx as nx
import numpy as np
from bounter import bounter

with open("data/langmods/seedonly.p", "rb") as f:
    vocab = pickle.load(f)
m = np.load("data/langmods/seedonly.npy")

G = nx.Graph()
counts = bounter(size_mb=1024)
for i in range(len(vocab)):
    source = vocab[i]
    dists = np.dot(m, m[i])
    closests = sorted(list(dists), reverse=True)[:6]
    edges = []
    for c in closests:
        wordid = list(dists).index(c)
        target = vocab[wordid]
        if source != target:
            t = "@".join(sorted((source, target)))
            edges.append(t)
    counts.update(edges)

G = nx.Graph()
i = 0
for skip, freq in counts.iteritems():
    if freq > 0:
def __init__(self, bounter_size_mb=1024, *args, **kwargs):
    # Global counter for term frequency.
    self.TF = bounter(size_mb=bounter_size_mb)