from database import getPageCollection, getLikesCollection, getPagesClusterInfoCollection, getClusterCollection from pprint import pprint import dateutil.parser as dateparser allpages = getPageCollection() alllikes = getLikesCollection() fbpagesinfo = getPagesClusterInfoCollection() clusterinfo = getClusterCollection() fbpagesinfo.drop() counter = 0 for pageId in allpages.find(): cursor = alllikes.find({'data': {'$elemMatch': {'id': pageId['_id']}}}) cluster = clusterinfo.find_one({'pages': pageId['_id']}) cluster = cluster["cluster"] document = {'_id': pageId['_id'], 'people': [], 'count': cursor.count(), 'cluster': cluster} for c in cursor: dd = {'id': c['id']} for pages in c['data']: if pages['id'] == pageId['_id']: if 'created_time' in pages: dd['created_time'] = dateparser.parse(pages['created_time']) break document['people'].append(dd) counter += 1 print 'document', counter, 'done' fbpagesinfo.insert(document)
# NOTE(review): this chunk starts mid-script — the print/exit pair below is
# almost certainly the body of a guard clause whose condition precedes this
# chunk; the top-level indentation here is reconstructed and must be confirmed
# against the full file.
print(" The number of documents must be >= k.")
sys.exit(1)
try:
    k = KVAL
except ValueError():
    # NOTE(review): `except ValueError():` matches an exception *instance*,
    # not the class, and `k = KVAL` cannot raise ValueError anyway — this was
    # probably meant to be `except ValueError:` around an int() conversion.
    # Flagged only; code left untouched.
    usage()
vocab = {}  # word -> integer feature index; filled in by the tokenizing pass
xs = []
args = []
fid = ''
fetchTestpage(PID)
clusterCollection = getClusterCollection()
clusters = clusterCollection.distinct("cluster")
fbdataCollection = getPageDataCollection()
for cluster in clusterCollection.find():
    # Keep roughly the first 10% of each cluster's pages
    # (Python 2 integer division).
    pageIds = cluster['pages'][:len(cluster['pages']) / 10]
    for p in pageIds:
        data = fbdataCollection.find_one({'_id': p})
        _id = data['_id']
        string = data['data']
        # Dump each selected page's raw text to temp/<page id>.txt so the
        # later vectorizing step can re-read it from disk.
        f = open('temp/' + _id + '.txt', 'w')
        f.write(string)
        f.close()
# NOTE(review): the body of the loop below continues beyond this chunk.
for name in glob.glob('./temp/*.txt'):
# Vectorize each page text under ./data/ as a bag-of-words over the shared
# `vocab` index, run k-means on the vectors, and persist the resulting page
# groupings (the cluster collection is rebuilt from scratch).
xs = []
args = []
for path in glob.glob('./data/*.txt'):
    args.append(path)

for path in args:
    # Sparse term-frequency vector: feature index -> count.
    counts = defaultdict(float)
    with open(path) as fh:
        for token in re.findall(r"\w+", fh.read()):
            vocab.setdefault(token, len(vocab))
            counts[vocab[token]] += 1
    xs.append(counts.items())

cluster_ind = kmeans(k, xs, len(vocab))
clusters = [set() for _ in xrange(k)]
for doc_idx, cluster_idx in enumerate(cluster_ind):
    clusters[cluster_idx].add(doc_idx)


def cleanName(string):
    # './data/<name>.txt' -> '<name>': drop the 7-char prefix and 4-char suffix.
    return string[7:-4]


collection = getClusterCollection()
collection.drop()
for cluster_idx, members in enumerate(clusters):
    print("cluster %d:" % cluster_idx)
    names = []
    for member in members:
        print("\t%s" % args[member])
        names.append(args[member])
    names = map(cleanName, names)
    collection.insert({'cluster': cluster_idx, 'pages': names})
# For every page, build a summary document holding its cluster id and the
# people who liked it (with parsed like timestamps when present). The target
# collection is dropped first so it is rebuilt from scratch each run.
from database import getPageCollection, getLikesCollection, getPagesClusterInfoCollection, getClusterCollection
from pprint import pprint
import dateutil.parser as dateparser

allpages = getPageCollection()
alllikes = getLikesCollection()
fbpagesinfo = getPagesClusterInfoCollection()
clusterinfo = getClusterCollection()
# Rebuild the summary collection from scratch.
fbpagesinfo.drop()
counter = 0
for pageId in allpages.find():
    # Every "likes" document whose data array mentions this page.
    cursor = alllikes.find({'data': {'$elemMatch': {'id': pageId['_id']}}})
    cluster = clusterinfo.find_one({'pages': pageId['_id']})
    # NOTE(review): find_one returns None when the page is in no cluster,
    # which would make the next line raise — confirm every page is clustered.
    cluster = cluster["cluster"]
    document = {
        '_id': pageId['_id'],
        'people': [],
        'count': cursor.count(),
        'cluster': cluster
    }
    for c in cursor:
        dd = {'id': c['id']}
        for pages in c['data']:
            if pages['id'] == pageId['_id']:
                # Not every like entry carries a timestamp.
                if 'created_time' in pages:
                    dd['created_time'] = dateparser.parse(
                        pages['created_time'])
                break
        document['people'].append(dd)
    # NOTE(review): chunk appears truncated here — the counter increment,
    # progress print, and fbpagesinfo.insert(document) presumably follow;
    # confirm against the full file.