def _flush_batch(batch, sketch):
    """Add every (key, count) entry of ``batch`` to ``sketch``, then empty ``batch``."""
    for key, count in batch.items():
        sketch.add(key, count)
    batch.clear()


def worker(index, path):
    """Count (date, word) pairs from one dump file into a CountMinSketch.

    Reads the file ``<path>/wiki_0<index>`` line by line. Lines starting with
    ``<doc`` are treated as headers: if ``TIMESTEMP_RE`` matches, its first
    group is parsed as ``%Y-%m-%dT%H:%M:%SZ`` and its date becomes the current
    date for the following lines. Lines starting with ``</doc>`` are skipped.
    Every other line is split with ``WORD_RE.findall`` and each lower-cased
    word is counted under the current date.

    Counts are accumulated in a local ``Counter`` and pushed into the sketch
    whenever it holds more than 10000 distinct keys, and once more at the end.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` dump files.
    :return: the result of ``sketch.get_matrix()`` for the populated sketch.
    """
    # ``counter`` is a module-level object that is only mutated through its
    # ``.value`` attribute, so no ``global`` declaration is required here.
    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        # Until the first header carrying a timestamp is seen, words are
        # attributed to today's date.
        current = datetime.now().date()
        for line in f:
            # Extract the timestamp from the document header.
            if line.startswith("<doc"):
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(
                        m.group(1), "%Y-%m-%dT%H:%M:%SZ"
                    ).date()
                continue
            # The original compared line[:5] (5 chars) with "</doc>" (6 chars),
            # which never matched, so closing tags were tokenised as text.
            if line.startswith("</doc>"):
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1

            # Bound memory use by flushing the local counts periodically.
            if len(batch) > batch_limit:
                _flush_batch(batch, sketch)

            # NOTE(review): presumably ``counter`` is a shared
            # multiprocessing.Value; if so this read-modify-write is not
            # atomic and the progress count may drift. Confirm and consider
            # wrapping it in ``with counter.get_lock():``.
            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

    # Push whatever is left in the final, partially filled batch.
    _flush_batch(batch, sketch)
    print("Process %d finished" % index)
    return sketch.get_matrix()