Example #1
import json

from textcluster import Corpus  # assumed import; these snippets use textcluster's API


def process(inStream, outStream,
            fields={"id": "id", "text": "text"},
            limits={"clusters": 10, "top_documents": 10}):
    # Maps document key -> full document, so clustered keys can be
    # resolved back to their original records.
    docs_by_key = {}

    text_field = fields["text"]
    key_field = fields["id"]
    max_clusters = limits["clusters"]
    max_top_docs = limits["top_documents"]

    corpus = Corpus()
    for line in inStream:
        # Each input line is "<prefix>\t<json document>"; decode first if
        # the stream yields bytes.
        if isinstance(line, bytes):
            line = line.decode("utf8")
        data = line.split('\t', 1)[1]
        doc = json.loads(data)
        key = doc[key_field]
        docs_by_key[key] = doc
        corpus.add((key, doc[text_field]), key=key)

    clusters = corpus.cluster()
    results = []
    for cluster in clusters[:max_clusters]:
        # The cluster's pivot document plus its closest neighbours.
        tophits = [cluster.primary]
        tophits += [hit["object"] for hit in cluster.similars[:max_top_docs - 1]]
        topdocs = [docs_by_key[key] for (key, text) in tophits]
        results.append({"top_documents": topdocs})

    json.dump({"clusters": results}, outStream)
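A minimal sketch of driving process() above, under the assumption that the stream yields "<id>\t<json>" lines; the record contents here are hypothetical:

import io
import json

# Hypothetical input: two near-duplicate documents, one "<id>\t<json>"
# record per line, which is exactly the layout process() splits on.
records = [{"id": "1", "text": "the app crashes on startup"},
           {"id": "2", "text": "app crashes when starting up"}]
lines = ["%s\t%s" % (r["id"], json.dumps(r)) for r in records]

out = io.StringIO()
process(iter(lines), out)
print(out.getvalue())  # e.g. {"clusters": [{"top_documents": [...]}]}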
Example #2
from nose.tools import eq_
from textcluster import Corpus  # assumed import, as in Example #1

# Assumed fixture: two near-duplicate pairs, so clustering should yield
# two groups, each with one similar document beside its primary.
docs = ('the quick brown fox jumps over the lazy dog',
        'the quick brown fox jumped over a lazy dog',
        'python is a great programming language',
        'python is a great language for programming')

def test_cluster():
    c = Corpus(similarity=0.1)
    for doc in docs:
        c.add(doc)

    groups = c.cluster()

    eq_(len(groups), 2)
    eq_(len(groups[0].similars), 1)
    eq_(len(groups[1].similars), 1)
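For reference, a minimal sketch of the Corpus surface this test exercises, assuming the textcluster API as used throughout these examples: cluster() returns groups whose primary is the pivot document and whose similars is a list of {'object', 'similarity'} dicts.

from textcluster import Corpus  # assumed import

c = Corpus(similarity=0.1)
c.add('the quick brown fox jumps over the lazy dog')
c.add('the quick brown fox jumped over a lazy dog')

for group in c.cluster():
    print(group.primary)  # the group's pivot document
    for hit in group.similars:
        print(hit['similarity'], hit['object'])  # score and neighbour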
Example #3
# Cluster, ClusterItem, ClusterType, OS_USAGE, SIM_THRESHOLD and STOPWORDS
# are defined elsewhere in the surrounding Django app.
def cluster_by_platform(qs, feeling, version):
    qs = qs.filter(version=version)
    # We need to create corpora for each platform and manually inspect
    # each opinion and put it in the right platform bucket.

    for platform in OS_USAGE:
        c = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)
        seen = set()  # descriptions already added, to skip duplicates

        for op in qs.filter(os=platform.short):

            if op.description in seen:
                continue

            # Filter out short descriptions; they carry too little signal.
            if len(op.description) < 15:
                continue

            seen.add(op.description)
            c.add(op, str=op.description, key=op.id)

        result = c.cluster()

        if result:
            cluster_type, created = ClusterType.objects.get_or_create(
                feeling=feeling,
                version=version,
                platform=platform.short,
                frequency='weekly')

            # Remove the old cluster groups.
            Cluster.objects.filter(type=cluster_type).delete()

            # Store the clusters into groups.
            for group in result:
                cluster = Cluster(type=cluster_type)
                cluster.num_opinions = len(group.similars) + 1
                cluster.pivot = group.primary
                cluster.save()

                for s in group.similars:
                    ClusterItem(
                        cluster=cluster,
                        opinion=s['object'],
                        score=s['similarity']).save()
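The Django models this function writes to are not shown in the snippet; the following is a hypothetical sketch inferred purely from the usage above (field names come from the code, all types are assumptions):

from django.db import models

# Hypothetical model shapes implied by cluster_by_platform(); only the
# fields the function touches are sketched, and 'Opinion' is assumed.
class ClusterType(models.Model):
    feeling = models.CharField(max_length=20)
    version = models.CharField(max_length=30)
    platform = models.CharField(max_length=30)
    frequency = models.CharField(max_length=20)

class Cluster(models.Model):
    type = models.ForeignKey(ClusterType, on_delete=models.CASCADE)
    pivot = models.ForeignKey('Opinion', on_delete=models.CASCADE)
    num_opinions = models.IntegerField(default=0)

class ClusterItem(models.Model):
    cluster = models.ForeignKey(Cluster, on_delete=models.CASCADE)
    opinion = models.ForeignKey('Opinion', on_delete=models.CASCADE)
    score = models.FloatField()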
Example #4
def cluster_queryset(qs):
    seen = set()  # descriptions already added, to skip duplicates
    c = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)

    for op in qs:

        if op.description in seen:
            continue

        # Filter out short descriptions; they carry too little signal.
        if len(op.description) < 15:
            continue

        seen.add(op.description)
        c.add(op, str=op.description, key=op.id)

    return c.cluster()
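A sketch of consuming cluster_queryset()'s return value, following the group structure used in Example #3; the Opinion queryset and last_week cutoff are hypothetical:

# Hypothetical caller: cluster recent opinions and print a summary.
groups = cluster_queryset(Opinion.objects.filter(created__gte=last_week))
for group in groups:
    print('pivot: %s' % group.primary.description)
    for s in group.similars:
        print('  %.2f  %s' % (s['similarity'], s['object'].description))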