Beispiel #1
0
def test_cluster():
    """Two groups come out of clustering, each with exactly one similar doc."""
    corpus = Corpus(similarity=0.1)
    for document in docs:
        corpus.add(document)

    clusters = corpus.cluster()

    eq_(len(clusters), 2)
    eq_(len(clusters[0].similars), 1)
    eq_(len(clusters[1].similars), 1)
Beispiel #2
0
def cluster_queryset(qs):
    """Cluster the items of *qs* by the similarity of their descriptions.

    Items whose description duplicates one already added, or is shorter
    than 15 characters, are skipped.  Returns the result of
    ``Corpus.cluster()`` over the surviving items.
    """
    # A set is the idiomatic structure for membership tests
    # (the original used a dict with dummy ``1`` values).
    seen = set()
    c = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)

    for op in qs:

        # skip duplicate descriptions
        if op.description in seen:
            continue

        # filter short descriptions
        if len(op.description) < 15:
            continue

        seen.add(op.description)
        c.add(op, str=op.description, key=op.id)

    return c.cluster()
Beispiel #3
0
def process(inStream,
            outStream,
            fields=None,
            limits=None):
    """Cluster JSON documents from *inStream*, dump the result to *outStream*.

    Each input line carries a tab-separated prefix followed by a UTF-8
    encoded JSON object.  Documents are clustered on ``fields["text"]``
    and keyed by ``fields["id"]``.  At most ``limits["clusters"]``
    clusters are emitted, each listing up to ``limits["top_documents"]``
    documents (the cluster primary plus its top similar hits).

    ``fields`` defaults to ``{"id": "id", "text": "text"}`` and ``limits``
    to ``{"clusters": 10, "top_documents": 10}``.
    """
    # None-sentinel defaults: the original used mutable dict defaults,
    # which are shared across calls (classic Python pitfall).
    if fields is None:
        fields = {"id": "id", "text": "text"}
    if limits is None:
        limits = {"clusters": 10, "top_documents": 10}

    # Renamed from ``all``, which shadowed the builtin.
    docs_by_key = {}

    text_field = fields["text"]
    key_field = fields["id"]
    max_clusters = limits["clusters"]
    max_top_docs = limits["top_documents"]

    corpus = Corpus()
    for line in inStream:
        # Drop the tab-separated prefix; keep the JSON payload.
        data = line.split('\t', 1)[1]
        # NOTE(review): assumes the payload is UTF-8 bytes — confirm the
        # stream's encoding against the caller.
        doc = json.loads(data.decode("utf8"))
        key = doc[key_field]
        docs_by_key[key] = doc
        # The return value of add() was assigned but never used.
        corpus.add((key, doc[text_field]), key=key)

    # Distinct names for the corpus and the loop variable — the original
    # reused ``c`` for both, shadowing the corpus inside the loop.
    clusters = corpus.cluster()
    results = []
    for cluster in clusters[:max_clusters]:
        # Primary document first, then the top similar hits.
        tophits = [cluster.primary]
        tophits += [hit["object"] for hit in cluster.similars[:max_top_docs - 1]]
        topdocs = [docs_by_key[key] for (key, _text) in tophits]
        results.append({"top_documents": topdocs})

    json.dump({"clusters": results}, outStream)