# module-level imports assumed by the snippets below
# (decode_json_file, write_json_to_file, and default_collections are helpers
# defined elsewhere in this repo)
from os import listdir, mkdir, path
import pandas as pd
from pymongo.errors import DuplicateKeyError
import mongohq


def restore_db(dir_name='data', db_params=mongohq.ideagenstest):
    files = listdir(dir_name)
    db = mongohq.get_db(db_params)
    existing_cols = db.collection_names()
    for file_name in files:
        file_path = path.join(dir_name, file_name)
        col = file_name.split('.json')[0]
        print "writing data to collection " + col + \
            " in db: " + db_params['dbName']
        if col != 'users':
            data = decode_json_file(file_path)
            if col not in existing_cols:
                print "creating collection: " + col
                db.create_collection(col)
            else:
                print "inserting into existing collection"
            try:
                if data:
                    db[col].insert(data, continue_on_error=True)
            except DuplicateKeyError:
                print "Attempted insert of document with duplicate key"
            else:
                print "success"
        else:
            print "not writing users to db"
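# Example usage (a sketch; assumes 'data/' holds *.json dumps like those
# produced by dump_db below, and that mongohq.ideagenstest points at a
# disposable test database):
# restore_db(dir_name='data', db_params=mongohq.ideagenstest)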
def clear_db(db_params=mongohq.ideagenstest):
    db = mongohq.get_db(db_params)
    cols = db.collection_names()
    clear_cols = [col for col in cols if col not in default_collections]
    for col in clear_cols:
        # Remove all docs from collection
        db[col].remove()
def __init__(self, db_params=mongohq.ideagens):
    """
    Constructor to instantiate references to an Ideagens instance
    and its corresponding database
    """
    self.db_params = db_params
    self.db = mongohq.get_db(db_params)
def dump_db(dir_name='data', db_params=mongohq.ideagenstest):
    # set up the connection
    db = mongohq.get_db(db_params)
    allCollections = [col for col in db.collection_names()
                      if col not in default_collections]
    print "list of collections: "
    for col in allCollections:
        print "collection name: " + col
        docs = db[col].find()
        data = [doc for doc in docs]
        write_json_to_file(data, dir_name, col)
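# Example backup round trip using the three functions above (a sketch;
# assumes mongohq.ideagenstest points at a disposable test database):
# dump_db(dir_name='backup', db_params=mongohq.ideagenstest)     # dump non-default collections to backup/*.json
# clear_db(db_params=mongohq.ideagenstest)                       # wipe the non-default collections
# restore_db(dir_name='backup', db_params=mongohq.ideagenstest)  # reload them from the dump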
def set_db(self, db_params=mongohq.ideagenstest):
    """
    Set the db where ideagens data sits
    """
    self.db = mongohq.get_db(db_params)
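# Example usage (a sketch; assumes __init__ and set_db above belong to a
# Data_Utility-style wrapper class like the one referenced in the import
# script below -- the exact class name and constructor signature are an
# assumption here):
# util = Data_Utility(db_params=mongohq.ideagens)
# util.set_db(mongohq.ideagenstest)  # repoint the wrapper at the test db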
def get_data_output(dir_path='data', db_params=mongohq.ideagenstest):
    if not path.exists(dir_path):
        mkdir(dir_path, 0774)
    db = mongohq.get_db(db_params)

    # just grab the ideas and clusters
    clusters = {}
    for cluster in db.clusters.find():
        clusters[cluster[u'_id']] = cluster[u'name']

    ideas = []
    for idea in db.ideas.find():
        rowDict = {}
        rowDict["idea"] = idea[u'content']
        if len(idea[u'clusterIDs']) > 0:
            clusterID = idea[u'clusterIDs'][0]
            rowDict["theme"] = clusters[clusterID]
        else:
            rowDict["theme"] = "-"
        if idea[u'isGamechanger']:
            rowDict["starred"] = idea[u'isGamechanger']
        else:
            rowDict["starred"] = "-"
        rowDict["userID"] = idea[u'userID']
        rowDict["promptID"] = idea[u'promptID']
        ideas.append(rowDict)
    ideasDF = pd.DataFrame(ideas)
    file_path = path.join(dir_path, "ideas.csv")
    ideasDF.to_csv(file_path)

    users = {}
    for user in db.myUsers.find():
        users[user[u'_id']] = user[u'name']
    with open(path.join(dir_path, "users.csv"), 'w') as usersOutFile:
        for userID, userName in users.items():
            usersOutFile.write(userID + "," + userName + "\n")

    ideasByPrompt = []
    for prompt in db.prompts.find():
        thisPromptID = prompt[u'_id']
        promptQuestion = prompt[u'question']
        for idea in db.ideas.find({u'promptID': thisPromptID}):
            rowDict = {}
            rowDict['promptID'] = thisPromptID
            rowDict['promptQuestion'] = promptQuestion
            rowDict['ideaID'] = idea[u'_id']
            rowDict['idea'] = idea[u'content']
            rowDict['likes'] = len(idea[u'votes'])
            rowDict['userID'] = idea[u'userID']
            rowDict['userName'] = idea[u'userName']
            rowDict['submissionTime'] = idea[u'time']
            if len(idea[u'clusterIDs']) > 0:
                clusterID = idea[u'clusterIDs'][0]
                rowDict["theme"] = clusters[clusterID]
            ideasByPrompt.append(rowDict)
    ideasByPromptDF = pd.DataFrame(ideasByPrompt)
    file_path = path.join(dir_path, "ideasByPrompt.csv")
    ideasByPromptDF.to_csv(file_path)

    notifications = []
    for notification in db.notifications.find():
        rowDict = {}
        if u'message' in notification:
            rowDict["message"] = notification[u'message']
        elif u'examples' in notification:
            examples = [ex[u'content'] for ex in notification[u'examples']]
            rowDict["message"] = "Sent Examples: %s" % (', '.join(examples))
        elif u'theme' in notification:
            themeID = notification[u'theme']
            rowDict["message"] = "Sent theme: %s" % clusters[themeID]
        elif u'prompt' in notification:
            rowDict["message"] = "Sent message: %s" % notification[u'prompt']
        else:
            # skip notifications with an unrecognized shape rather than
            # aborting the whole loop
            continue
        # get author info
        # get time?
        notifications.append(rowDict)
    notificationsDF = pd.DataFrame(notifications)
    file_path = path.join(dir_path, "notifications.csv")
    notificationsDF.to_csv(file_path)

    participants = []
    for participant in db.participants.find():
        print participant
        rowDict = {}
        rowDict['participantID'] = participant[u'_id']
        rowDict['userName'] = participant[u'userName']
        expID = participant[u'experimentID']
        exp = db.experiments.find_one({u'_id': expID})
        print exp
        rowDict['experimentID'] = expID
        rowDict['experimentName'] = exp[u'description']
        rowDict['promptID'] = exp[u'promptID']
        cond = db['exp-conditions'].find_one({u'_id': participant[u'conditionID']})
        print cond
        rowDict['condition'] = cond[u'description']
        participants.append(rowDict)
    participantsDF = pd.DataFrame(participants)
    file_path = path.join(dir_path, "participants.csv")
    participantsDF.to_csv(file_path)
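# Example usage (a sketch): writes ideas.csv, users.csv, ideasByPrompt.csv,
# notifications.csv, and participants.csv under out/, creating the directory
# if needed:
# get_data_output(dir_path='out', db_params=mongohq.ideagenstest)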
# remember: we expect the following fields: {extID, title, content}
# note also: by using pandas read, we automatically get the first half of the
# unicode sandwich, i.e., all text is in unicode (the equivalent of calling
# theString.decode('utf-8'))
# source: https://stackoverflow.com/questions/23594878/pandas-dataframe-and-character-encoding-when-reading-excel-file
from nltk.tokenize import sent_tokenize  # needed for the sentence split below

source_docs = pd.read_csv(options.source)
print source_docs.count()

# break the content into sentences; the content is already unicode (see the
# note above), so only the ascii down-conversion is needed before tokenizing
source_docs['sentences'] = [
    sent_tokenize(t.encode("ascii", "ignore"))
    for t in source_docs['content']
]

# create db connection
db_name = options.db
db = mongohq.get_db(db_params.ALL_DBs[db_name])
db_util = mongohq.Data_Utility('data', db_params.ALL_DBs[db_name])

# TODO: add a better dupe check. here we skip if the title is literally
# duplicated by another doc already in the db; won't work for slight
# inconsistencies due to encoding, etc.
inserted_docs = []
for index, doc in source_docs.iterrows():
    # print "Inserting doc", doc['extID'], doc['title']
    # docID = create_document(doc)
    # if docID is not None:
    #     docData = doc
    #     docData['db_id'] = docID
    #     inserted_docs.append(docData)
    pass  # loop body is still stubbed out; see the sketch below
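# A minimal sketch of the title-based dupe check the TODO above describes,
# left commented out so it does not change the script's behavior. The target
# collection name ('documents') is an assumption for illustration, and
# create_document is the commented-out helper referenced above:
# for index, doc in source_docs.iterrows():
#     if db.documents.find_one({u'title': doc['title']}) is not None:
#         print "skipping doc with duplicate title:", doc['title']
#         continue
#     docID = create_document(doc)
#     if docID is not None:
#         docData = doc
#         docData['db_id'] = docID
#         inserted_docs.append(docData)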