def main2():
    """Group per-item word-frequency documents by screen_name and dump JSON lines.

    Reads every document from the collection named by sys.argv[1] in the
    "sanal" database, builds a word->count dict for each item's text, groups
    the resulting docs by the item's screen_name, and writes one JSON object
    per user ({screen_name: [docs...]}) to the file named by sys.argv[2].
    """
    corpus = {}
    db = Corpus(database="sanal", collection=sys.argv[1])
    query = {}
    for i, item in enumerate(db.find(query)):
        text = item["text"]
        # NOTE(review): `unicode` is Python-2-only; it decodes `text` with the
        # default codec before tokenizing. Replace with `str`/explicit decode
        # when porting to Python 3.
        words = getwords(unicode(text))
        wordsd = {}
        for w in words:
            countup(wordsd, w)
        doc = {"text": wordsd, "id": item["id"]}
        u = item["screen_name"]
        # setdefault replaces the original try/except-KeyError append idiom.
        corpus.setdefault(u, []).append(doc)
        print(i)  # progress indicator: one line per processed item
    # `open`, not the Python-2-only `file` builtin — identical behavior in
    # py2, and still valid in py3.
    with open(sys.argv[2], "w") as opened:
        for k, v in corpus.items():
            opened.write("%s\n" % json.dumps({k: v}))
# NOTE(review): fragment — the leading assignments are the tail of a loop body
# inside an enclosing function (presumably make_graph) whose header begins
# before this chunk; left byte-identical rather than guessing its structure.
# Flow as visible here: map node names u/v to integer ids via extractd.getid,
# add the edge, and bump the weight counter in both directions; the conditional
# expression writes the smaller of the two directed weights per edge to
# "<collection>.wpairs", and the name->id table to "<collection>.n2i".
# The __main__ block selects items from a 10-day window (24*60*60*10 seconds
# before now) by "created_at" and feeds them to make_graph.
# `file(...)` is the Python-2-only builtin — replace with open() when porting.
uid = extractd.getid(n2i, u) vid = extractd.getid(n2i, v) graph.add_edge(uid, vid) extractd.countup(weights, (uid, vid)) extractd.countup(weights, (vid, uid)) with file('%s.wpairs' % sys.argv[1], 'w') as opened: for e in graph.edges(): w = weights[(e[0], e[1])] if weights[(e[0], e[1])] <= weights[(e[1], e[0])] else weights[(e[1], e[0])] opened.write( '%d\t%d\t%d\n' % (e[0], e[1], w) ) with file('%s.n2i' % sys.argv[1], 'w') as opened: for u in n2i: opened.write('%s\t%d\n' % (u, n2i[u])) if __name__ == '__main__': dbinfo = Pit.get("says") db = Corpus(database=dbinfo["db"], collection=dbinfo["items"]) t_end = time.mktime( datetime.today().timetuple() ) t_begin = t_end - (24 * 60 * 60 * 10) items = [ item for item in db.find({'created_at': { '$gt': t_begin, '$lt': t_end }}) ] make_graph(items)
def parse_args(argv=None):
    """Parse command-line options for the collector.

    Args:
        argv: optional list of argument strings; None (the default) parses
              sys.argv[1:], matching the original behavior. Passing a list
              makes the function usable from tests.

    Returns:
        argparse.Namespace with `interval` (float, seconds, default 1.0)
        and `log` (log-file path, default ".log/log").
    """
    # The original built a `usage` string but never passed it to the parser;
    # wire it in so --help actually shows it.
    parser = argparse.ArgumentParser(
        description="says",
        usage="[--interval] [interval] [-l] [path-to-log]",
    )
    parser.add_argument("--interval", type=float, default=1.0)
    parser.add_argument("-l", "--log", default=".log/log")
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_args()
    dbinfo = Pit.get("says")
    users_db = Corpus(database=dbinfo["db"], collection=dbinfo["users"])
    # Materialize the screen-name list up front instead of streaming the cursor.
    users = [item["screen_name"] for item in users_db.find({})]
    api = activate_api()
    items_db = Corpus(database=dbinfo["db"], collection=dbinfo["items"])
    getitems(users, api, items_db)
# NOTE(review): fragment — the leading `return args` closes an argument-parsing
# function whose body begins before this chunk; left byte-identical rather than
# guessing its structure. The __main__ block resumes from the newest already-
# processed id: findsorted({}, key="id")[0]["id"] presumably yields the latest
# stored stat id (verify the sort direction in Corpus.findsorted), falling back
# to 0 when the stats collection is empty (IndexError on [0]). For every newer
# item it derives words/messages/hashtags/urls via extractd and appends one
# stats document per item. `0L` is Python-2-only long syntax — use plain 0
# when porting to Python 3.
return args if __name__ == "__main__": args = parse_args() db = Corpus(database=args.database, collection=args.items) db_stats = Corpus(database=args.database, collection=args.itemstats) try: latstats = db_stats.findsorted({}, key="id")[0]["id"] except IndexError: latstats = 0L for i, item in enumerate(db.find({ "id": { "$gt": latstats }})): words = extractd.getwords(item) messages = extractd.getmessages(item) tags = extractd.gethashtags(item) urls = extractd.geturls(item) db_stats.append({ "screen_name": item["screen_name"] , "words": words , "messages": messages , "hashtags": tags , "urls": urls , "created_at": item["created_at"] , "id": item["id"] })
# -*- coding: utf-8 -*-
"""Compute document frequencies (DF) for every word in a corpus collection.

Usage: script.py <database> <corpus-collection> <output-path>

Writes "word<TAB>frequency" lines to <output-path>, sorted by descending
frequency. (The original declared the encoding twice; only the first such
line is honored, so the duplicate was dropped.)
"""
import sys

import extractor
from Corpus import Corpus

if __name__ == "__main__":
    dbname, collname = sys.argv[1], sys.argv[2]
    corpus_db = Corpus(database=dbname, collection=collname)
    out_path = sys.argv[3]  # was df_collname; the unused df_dbname was dropped
    df = {}
    for item in corpus_db.find({}):
        # set() so each document contributes at most once per word — this is
        # document frequency, not raw term frequency.
        for word in set(extractor.getwords(item["text"])):
            extractor.countup(df, word)
    # `open`, not the Python-2-only `file` builtin (identical behavior in py2).
    with open(out_path, "w") as opened:
        for word, freq in sorted(df.items(), key=lambda x: x[1], reverse=True):
            opened.write("%s\t%d\n" % (word, freq))