def process(inStream, outStream, fields={"id": "id", "text": "text"}, limits={"clusters": 10, "top_documents": 10}): all = {} text_field = fields["text"] key_field = fields["id"] max_clusters = limits["clusters"] max_top_docs = limits["top_documents"] c = Corpus() for line in inStream: data = line.split('\t', 1)[1] doc = json.loads(data.decode("utf8")) key = doc[key_field] all[key] = doc text = c.add((key, doc[text_field]), key=key) clusters = c.cluster() results = [] for c in clusters[:max_clusters]: tophits = [c.primary] tophits += [hit["object"] for hit in c.similars[:max_top_docs-1]] topdocs = [] for (key, text) in tophits: topdocs.append(all[key]) results.append({"top_documents": topdocs}) json.dump({"clusters": results}, outStream)
from nose.tools import eq_

# Corpus is assumed to be importable from the clustering library under
# test, e.g. `from textcluster import Corpus`.


def test_cluster():
    c = Corpus(similarity=0.1)
    for doc in docs:
        c.add(doc)
    groups = c.cluster()
    # Expect two groups, each with one document similar to its pivot
    # (see the sample `docs` fixture below).
    eq_(len(groups), 2)
    eq_(len(groups[0].similars), 1)
    eq_(len(groups[1].similars), 1)
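# A hypothetical `docs` fixture for the test above: two pairs of
# near-duplicate sentences, so clustering at a low threshold (0.1) should
# produce two groups with one similar document each. The exact grouping
# depends on the library's tokenizer and stopword handling.
docs = [
    'The new tab page is great, I love it.',
    'I really love the new tab page.',
    'The download manager crashes every time.',
    'My download manager crashes all the time.',
]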
def cluster_by_platform(qs, feeling, version):
    qs = qs.filter(version=version)
    # We need to create a corpus for each platform and manually inspect each
    # opinion and put it in the right platform bucket.
    for platform in OS_USAGE:
        c = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)
        seen = {}
        for op in qs.filter(os=platform.short):
            if op.description in seen:
                continue
            # filter short descriptions
            if len(op.description) < 15:
                continue
            seen[op.description] = 1
            c.add(op, str=op.description, key=op.id)
        result = c.cluster()
        if result:
            cluster_type, created = ClusterType.objects.get_or_create(
                feeling=feeling, version=version, platform=platform.short,
                frequency='weekly')
            # Remove the old cluster groups.
            Cluster.objects.filter(type=cluster_type).delete()
            # Store the clusters into groups.
            for group in result:
                cluster = Cluster(type=cluster_type)
                # The pivot opinion counts toward the total, hence the +1.
                cluster.num_opinions = len(group.similars) + 1
                cluster.pivot = group.primary
                cluster.save()
                for s in group.similars:
                    ClusterItem(cluster=cluster,
                                opinion=s['object'],
                                score=s['similarity']).save()
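# A hypothetical invocation of cluster_by_platform(), assuming a Django
# Opinion queryset and the module-level OS_USAGE, SIM_THRESHOLD, and
# STOPWORDS constants referenced above:
#
#   qs = Opinion.objects.filter(feeling='sad')
#   cluster_by_platform(qs, feeling='sad', version='4.0b1')
#
# Deleting the old Cluster rows before saving new ones keeps exactly one
# weekly set of clusters per (feeling, version, platform) ClusterType.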
def cluster_queryset(qs):
    seen = {}
    c = Corpus(similarity=SIM_THRESHOLD, stopwords=STOPWORDS)
    for op in qs:
        if op.description in seen:
            continue
        # filter short descriptions
        if len(op.description) < 15:
            continue
        seen[op.description] = 1
        c.add(op, str=op.description, key=op.id)
    return c.cluster()
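# A minimal sketch of consuming the groups returned by cluster_queryset().
# It assumes the cluster result shape used above: each group exposes a
# .primary pivot and a .similars list of {'object', 'similarity'} dicts.
# The queryset argument is hypothetical.
def print_clusters(qs):
    for group in cluster_queryset(qs):
        print(group.primary.description)
        for hit in group.similars:
            print('  %.2f  %s' % (hit['similarity'],
                                  hit['object'].description))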