for k, v in items.items(): if k == 'text' or k == 'sentence' or k == 'w': result[k] = v elif len(v) == 1: result[k] = v[0] else: result[k] = v return result #return { k: v[0] if len(v) == 1 else v for k, v in items.items() } # saving a parsed top level element given in python dictionary to a mongodb in json format. Changing top level # attribute 'id' to mongo specific '_id'. If connection fails auto reconnects 5 times. def save_to_db(user): user[0]["_id"] = user[0]["id"] for i in range(5): try: db.collection.save(user[0]) break except pymongo.errors.AutoReconnect: time.sleep(pow(2, i)) if __name__ == "__main__": #db = thtdb.ThtConnection(collectionName='pldebatt_october_multi') #db = thtdb.ThtConnection(host='squib.de', dbName='karinas_twitter_db', collectionName='twitter-pldebatt-131006-new') db = thtdb.ThtConnection(dbName='local', collectionName='twitter-pldebatt-130612') parse_xml(input_path + "twitter-pldebatt-130612.xml") db.client.disconnect()
# NOTE(review): this line is a whitespace-collapsed excerpt; its original line
# breaks and indentation are lost and it starts and ends mid-definition.
# What the visible statements do:
#   * total_term_frequency is transposed with zip(*...) and each row is turned
#     into a list;
#   * containing_doc_number counts, per row, the entries that are > 0, and
#     is_more counts the entries that are > 1;
#   * the per-row sums, containing_doc_number, and
#     math.log(doc_number/(x+1), 10) (printed under the label "idf") are
#     printed, followed by the row lengths and is_more.
# NOTE(review): doc_number/(x+1) is integer division under Python 2 if
# doc_number is an int, which would truncate the ratio before the log -
# confirm the type of doc_number.
# NOTE(review): it cannot be told from this excerpt whether the ''' pair
# encloses the `if __name__ == "__main__":` guard or closes/opens strings that
# begin or end outside the view - confirm against the full file.
# Of the createTopicDocuments calls visible here only the one for 'tax' is not
# commented out.
total_term_frequency = map(list, zip(*total_term_frequency)) containing_doc_number = [sum([1 for x in y if x > 0]) for y in total_term_frequency] is_more = [sum([1 for x in y if x > 1]) for y in total_term_frequency] print "Number of times the terms appear: " PrettyPrintList([sum(x) for x in total_term_frequency]) print "Number of docs the term appears in: " PrettyPrintList(containing_doc_number) print "idf: " PrettyPrintListFloat([math.log(doc_number/(x+1),10) for x in containing_doc_number]) print "Number of tweets having the term frequency calculated: ", [len(x) for x in total_term_frequency] print "This many tweets have the term more than once: ", is_more ''' if __name__ == "__main__": db = thtdb.ThtConnection(host='squib.de', dbName='karinas_twitter_db', collectionName='twitter-pldebatt-131006') ''' print 'saving crime tweets...' #createTopicDocuments('crime') print 'crime tweets done.' print 'saving school tweets...' #createTopicDocuments('school') print 'school tweets done.' print 'saving climate tweets...' #createTopicDocuments('climate') print 'climate tweets done.' print 'saving tax tweets...' createTopicDocuments('tax') print 'tax tweets done.' print 'saving immigration tweets...'
print db.collection.find({ "created": { "$gt": d } }).sort("username").explain()["nscanned"] print "find indexed users created..." d = "2008-01-01" db.collection.create_index([("created", DESCENDING), ("username", ASCENDING)]) print db.collection.find({ "created": { "$lt": d } }).sort("username").explain()["cursor"] print db.collection.find({ "created": { "$lt": d } }).sort("username").explain()["nscanned"] for user in db.collection.find({"created": {"$lt": d}}).sort("username"): print user["username"] print "aggregation example..." if __name__ == "__main__": #client = Connection('localhost', 27017) #db = client.local #collection = db.test_3 db = thtdb.ThtConnection(collectionName='test_pldebatt_june') querries()
max_key = max(user_stat.iterkeys(), key=(lambda key: user_stat[key])) max_val = user_stat[max_key] user_stat_file.write( max_key.encode('utf8') + ',' + str(max_val) + '\n') del user_stat[max_key] while hash_stat != {}: max_key = max(hash_stat.iterkeys(), key=(lambda key: hash_stat[key])) max_val = hash_stat[max_key] hash_stat_file.write( max_key.encode('utf8') + ',' + str(max_val) + '\n') del hash_stat[max_key] while word_stat != {}: max_key = max(word_stat.iterkeys(), key=(lambda key: word_stat[key])) max_val = word_stat[max_key] word_stat_file.write( max_key.encode('utf8') + ',' + str(max_val) + '\n') del word_stat[max_key] if __name__ == "__main__": #db = thtdb.ThtConnection(collectionName='test_short_1') db = thtdb.ThtConnection(host='squib.de', dbName='karinas_twitter_db', collectionName='pldebatt_october') querries("word_stat_october_test", "user_stat_october_test", "hash_stat_october_test") #querries_aggregated() #querries_mapreduce_test()
items[elem.tag].append(temp) else: temp_dict = pt(context, elem) items[elem.tag].append(temp_dict) elif action == "end": text = elem.text.strip() if elem.text else "" break if elem.tag == 'user': save_to_db(items[elem.tag]) del items[elem.tag] elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] del context if len(items) == 0: return text return {k: v[0] if len(v) == 1 else v for k, v in items.items()} def save_to_db(user): user[0]["_id"] = user[0]["id"] db.collection.save(user[0]) if __name__ == "__main__": db = thtdb.ThtConnection(collectionName='test_1b') #db = thtdb.ThtConnection(host='squib.de') parse_xml("/Users/karinabunyik/BTSync/Data/twitter-pldebatt.xml") db.client.disconnect()