from database import getPageCollection, getLikesCollection, getPagesClusterInfoCollection, getClusterCollection
from pprint import pprint
import dateutil.parser as dateparser
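
# Build one summary document per Facebook page: the users who liked it
# (with like timestamps where available), the total like count, and the
# cluster the page was assigned to.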

allpages = getPageCollection()
alllikes = getLikesCollection()
fbpagesinfo = getPagesClusterInfoCollection()
clusterinfo = getClusterCollection()

fbpagesinfo.drop()
counter = 0
for pageId in allpages.find():  # each result is a full page document
    query = {'data': {'$elemMatch': {'id': pageId['_id']}}}
    cursor = alllikes.find(query)
    # Skip pages that never made it into a cluster instead of crashing on None.
    cluster = clusterinfo.find_one({'pages': pageId['_id']})
    if cluster is None:
        continue
    cluster = cluster['cluster']

    # cursor.count() is gone from modern pymongo; count_documents() replaces it.
    document = {'_id': pageId['_id'], 'people': [],
                'count': alllikes.count_documents(query), 'cluster': cluster}
    for c in cursor:
        dd = {'id': c['id']}
        for liked in c['data']:
            if liked['id'] == pageId['_id']:
                if 'created_time' in liked:
                    dd['created_time'] = dateparser.parse(liked['created_time'])
                break  # this page's entry was found; no need to keep scanning
        document['people'].append(dd)
    counter += 1
    print('document', counter, 'done')
    fbpagesinfo.insert_one(document)  # insert() is deprecated in pymongo
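
# The `database` module itself is not part of this listing. A minimal sketch
# of what these examples appear to rely on -- thin wrappers around a single
# pymongo client. The database and collection names below are assumptions,
# not the original ones:
from pymongo import MongoClient

_client = MongoClient('localhost', 27017)
_db = _client['fbdata']  # assumed database name

def getPageCollection():
    return _db['pages']

def getLikesCollection():
    return _db['likes']

def getPageDataCollection():
    return _db['pagedata']

def getPagesClusterInfoCollection():
    return _db['pagesclusterinfo']

def getClusterCollection():
    return _db['clusters']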
        print("    The number of documents must be >= k.")
        sys.exit(1)


    try:
        k = KVAL
    except ValueError():
        usage()

    vocab = {}
    xs = []
    args = []
    fid = ''
    fetchTestpage(PID)

    clusterCollection = getClusterCollection()

    clusters = clusterCollection.distinct("cluster")
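    # distinct("cluster") yields only the distinct cluster ids; the find()
    # below iterates the full cluster documents.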

    fbdataCollection = getPageDataCollection()
    for cluster in clusterCollection.find():
        # Dump the first tenth of each cluster's pages to temp files.
        # Integer division: a float slice index is a TypeError in Python 3.
        pageIds = cluster['pages'][:len(cluster['pages']) // 10]
        for p in pageIds:
            data = fbdataCollection.find_one({'_id': p})
            if data is None:
                continue
            with open('temp/' + data['_id'] + '.txt', 'w') as f:
                f.write(data['data'])

    # The example breaks off in this loop; judging by the next example, it
    # goes on to collect the temp files for clustering:
    for name in glob.glob('./temp/*.txt'):
        args.append(name)
Example #3
    # This example also starts mid-function: k and kmeans() are defined in
    # the part of the script not shown.
    import glob
    import re
    from collections import defaultdict
    from database import getClusterCollection

    vocab = {}  # word -> feature index; this init was lost in the listing
    xs = []
    args = []
    for name in glob.glob('./data/*.txt'):
        args.append(name)
    for a in args:
        x = defaultdict(float)  # sparse bag-of-words: feature index -> count
        with open(a) as f:
            for w in re.findall(r"\w+", f.read()):
                vocab.setdefault(w, len(vocab))
                x[vocab[w]] += 1
        xs.append(list(x.items()))  # list(): dict views are lazy in Python 3

    cluster_ind = kmeans(k, xs, len(vocab))
    clusters = [set() for _ in range(k)]  # range, not Python 2's xrange
    for i, j in enumerate(cluster_ind):
        clusters[j].add(i)

    def cleanName(string):
        # './data/NAME.txt' -> 'NAME': drop the 7-char prefix, 4-char suffix.
        return string[7:-4]

    collection = getClusterCollection()
    collection.drop()
    for j, c in enumerate(clusters):
        print("cluster %d:" % j)
        array = []
        for i in c:
            print("\t%s" % args[i])
            array.append(args[i])
        # list(map(...)): Python 3's map is lazy, and Mongo needs a real array.
        # Each stored document, {'cluster': j, 'pages': [...]}, is the shape
        # the first example later queries with find_one({'pages': ...}).
        doc = {'cluster': j, 'pages': list(map(cleanName, array))}
        collection.insert_one(doc)  # insert() is deprecated in pymongo
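
# kmeans() itself never appears in this listing. Below is a minimal sketch of
# the interface the call above assumes -- cluster count, sparse
# (feature, count) vectors, and vocabulary size in; one cluster index per
# document out. The body (plain Lloyd's algorithm) and the `iters` parameter
# are assumptions, not the original implementation.
import random

def kmeans(k, xs, vocab_size, iters=20):
    random.seed(0)  # deterministic toy initialization
    centroids = [[random.random() for _ in range(vocab_size)]
                 for _ in range(k)]
    assign = [0] * len(xs)
    for _ in range(iters):
        # Assignment step: nearest centroid by squared Euclidean distance.
        for i, x in enumerate(xs):
            sparse = dict(x)
            def dist(c):
                return sum((c[f] - sparse.get(f, 0.0)) ** 2
                           for f in range(vocab_size))
            assign[i] = min(range(k), key=lambda j: dist(centroids[j]))
        # Update step: each centroid moves to the mean of its members.
        for j in range(k):
            members = [dict(xs[i]) for i, a in enumerate(assign) if a == j]
            if not members:
                continue  # an empty cluster keeps its old centroid
            centroids[j] = [
                sum(m.get(f, 0.0) for m in members) / len(members)
                for f in range(vocab_size)]
    return assign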