Ejemplo n.º 1
0
def worker(index, path):
    """Build a count-min sketch of (date, word) counts for one dump file.

    Reads ``<path>/wiki_0<index>`` line by line. Lines starting with
    ``<doc`` are headers: if ``TIMESTEMP_RE`` matches, its first group is
    parsed as ``%Y-%m-%dT%H:%M:%SZ`` and becomes the date attached to the
    words that follow. Lines starting with ``</doc>`` are skipped. Every
    other line is tokenized with ``WORD_RE`` and each lower-cased word is
    counted under the current date. Counts are accumulated in a local
    ``Counter`` and pushed into the sketch in batches.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` file.
    :return: the value of ``sketch.get_matrix()`` once the file is consumed.
    """
    global counter
    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        # Words seen before any header with a timestamp are filed under today.
        current = datetime.now().date()
        for line in f:
            if line.startswith("<doc"):
                # Extract timestamp from header
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1),
                                                "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            if line.startswith("</doc>"):
                # The original compared line[:5] (5 chars) with the 6-char
                # tag, so this branch never fired and the closing tag was
                # counted as a word.
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1

            # Flush once the batch holds more than batch_limit distinct keys
            # so the local Counter stays bounded.
            if len(batch) > batch_limit:
                for key, count in batch.items():
                    sketch.add(key, count)
                batch.clear()

            # NOTE(review): counter looks like a shared multiprocessing value;
            # if so, this read-modify-write is not atomic across processes and
            # the progress figure is approximate -- confirm against the caller.
            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

        # Push whatever is left from the last, partially filled batch.
        for key, count in batch.items():
            sketch.add(key, count)
        batch.clear()

    print("Process %d finished" % index)
    return sketch.get_matrix()
Ejemplo n.º 2
0
def worker(index, path):
    """Build a count-min sketch of (date, word) counts for one dump file.

    Reads ``<path>/wiki_0<index>`` line by line. Lines starting with
    ``<doc`` are headers: if ``TIMESTEMP_RE`` matches, its first group is
    parsed as ``%Y-%m-%dT%H:%M:%SZ`` and becomes the date attached to the
    words that follow. Lines starting with ``</doc>`` are skipped. Every
    other line is tokenized with ``WORD_RE`` and each lower-cased word is
    counted under the current date. Counts are accumulated in a local
    ``Counter`` and pushed into the sketch in batches.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` file.
    :return: the value of ``sketch.get_matrix()`` once the file is consumed.
    """
    global counter
    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        # Words seen before any header with a timestamp are filed under today.
        current = datetime.now().date()
        for line in f:
            if line.startswith("<doc"):
                # Extract timestamp from header
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            if line.startswith("</doc>"):
                # The original compared line[:5] (5 chars) with the 6-char
                # tag, so this branch never fired and the closing tag was
                # counted as a word.
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1

            # Flush once the batch holds more than batch_limit distinct keys
            # so the local Counter stays bounded.
            if len(batch) > batch_limit:
                for key, count in batch.items():
                    sketch.add(key, count)
                batch.clear()

            # NOTE(review): counter looks like a shared multiprocessing value;
            # if so, this read-modify-write is not atomic across processes and
            # the progress figure is approximate -- confirm against the caller.
            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

        # Push whatever is left from the last, partially filled batch.
        for key, count in batch.items():
            sketch.add(key, count)
        batch.clear()

    print("Process %d finished" % index)
    return sketch.get_matrix()