def dump_user_summary(usernames, db, COLL_USER_SUMMARY, COLL_SAVE_STATUS): collection = db[COLL_USER_SUMMARY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): print "user summary already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/summary.json" json_user = requests.get(user_url).json() user = json_user ## dump to mongodb collection.insert_one(user) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_SUMMARY, "id":username}) print "user summary saved: " + username
def dump_user_description(usernames, db, COLL_USER_DESCRIPTION, COLL_SAVE_STATUS): collection = db[COLL_USER_DESCRIPTION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_DESCRIPTION, username): print "user description already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/activity.json" json_desc = requests.get(user_url).json() # json_desc = json_desc["user"] try: json_desc["user"] except KeyError: continue else: json_desc = json_desc["user"] ## dump to mongodb collection.insert_one(json_desc) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_DESCRIPTION, "id":username}) print "user summary saved: " + username
def dump_user_summary(usernames, db, COLL_USER_SUMMARY, COLL_SAVE_STATUS): collection = db[COLL_USER_SUMMARY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): print "user summary already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/summary.json" json_user = requests.get(user_url).json() user = json_user ## dump to mongodb collection.insert_one(user) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_SUMMARY, "id": username }) print "user summary saved: " + username
def dump_user_replies(usernames, db, COLL_USER_REPLY, COLL_SAVE_STATUS): collection = db[COLL_USER_REPLY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] # for username in usernames: # # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): # unsaved_usernames.append(username) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): print "user reply already saved : " + username else: epoch = int(round(time.time() * 1000)) offset = 0 # starts from 0 replies_complete = [] first = True while True: reply_url = "http://www.tudiabetes.org/forum/user_actions.json?offset=" + str( offset) + "&username="******"&filter=5&_=" + str( epoch) json_reply = requests.get(reply_url).json() # reply = json_reply["user_actions"] try: json_reply["user_actions"] except KeyError: reply = "" else: reply = json_reply["user_actions"] if (len(reply) == 0): break else: if (first): replies_complete = reply first = False else: offset += 30 for r in reply: ### save to mongodb replies_complete.append(r) reply_dict = {"username": username, "reply": replies_complete} #dump the list of relies to the mongodb collection.insert(reply_dict) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_REPLY, "id": username }) print "user reply saved: " + username
def dump_user_replies(usernames, db, COLL_USER_REPLY, COLL_SAVE_STATUS): collection = db[COLL_USER_REPLY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] # for username in usernames: # # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): # unsaved_usernames.append(username) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): print "user reply already saved : " + username else: epoch = int(round(time.time() * 1000)) offset = 0 # starts from 0 replies_complete = [] first = True while True: reply_url = "http://www.tudiabetes.org/forum/user_actions.json?offset=" + str( offset) + "&username="******"&filter=5&_=" + str(epoch) json_reply = requests.get(reply_url).json() # reply = json_reply["user_actions"] try: json_reply["user_actions"] except KeyError: reply = "" else: reply = json_reply["user_actions"] if (len(reply) == 0): break else: if(first): replies_complete = reply first = False else: offset += 30 for r in reply: ### save to mongodb replies_complete.append(r) reply_dict = {"username": username, "reply": replies_complete} #dump the list of relies to the mongodb collection.insert(reply_dict) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_REPLY, "id":username}) print "user reply saved: " + username
def dump_user_description(usernames, db, COLL_USER_DESCRIPTION, COLL_SAVE_STATUS): collection = db[COLL_USER_DESCRIPTION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_DESCRIPTION, username): print "user description already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/activity.json" json_desc = requests.get(user_url).json() # json_desc = json_desc["user"] try: json_desc["user"] except KeyError: continue else: json_desc = json_desc["user"] ## dump to mongodb collection.insert_one(json_desc) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_DESCRIPTION, "id": username }) print "user summary saved: " + username
def discussions(db, COLL_DISCUSSION, COLL_SAVE_STATUS):
    # Export every stored discussion reply to discussions.csv and, for posts in
    # the "Diabetes Technology" categories, also append the cleaned post text
    # to per-category text files (presumably for later topic modelling — the
    # downstream consumer is not visible here).
    cursor = db[COLL_DISCUSSION].find()
    coll_status = db[COLL_SAVE_STATUS]
    # Category ids (as strings) treated as technology sub-forums; only posts
    # from these categories are parsed and exported below.
    Tech_Cat_ID = ['28', '29', '30', '31', '33', '43', '53', '54']
    # Category ids whose cleaned text is additionally written to the
    # Diabetes_Text files.  NOTE(review): '53'/'54' are in Tech_Cat_ID but not
    # here, so those posts reach the CSV but not the text files — confirm.
    Cat_ID = [
        '1', '3', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14',
        '15', '16', '17', '20', '21', '22', '23', '24', '25', '27', '28',
        '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39',
        '40', '41', '42', '43', '44', '45'
    ]
    # Category id -> human-readable category name.
    # NOTE(review): keys 53 and 54 appear twice with identical values; the
    # duplicates are harmless (later entry wins) but redundant.
    c_dic = {
        -1: "",
        1: "General",
        3: 'TuDiabetes Website',
        5: 'Type 1 and LADA / none',
        6: 'New to Type 1 Diabetes',
        7: 'Parents of Children with Type 1 Diabetes',
        8: 'Teens and Young Adults',
        9: 'Type 2',
        10: 'New to Type 2 diabetes',
        11: 'Teens and Young Adults',
        12: 'Diabetes and Pregnancy',
        13: 'gestational diabetes',
        14: 'Trying to Get Pregnant',
        15: 'Managing Pregnancy with Diabetes',
        16: 'Community',
        17: 'Share Your Stories',
        18: 'Fun and Games',
        19: 'Arts and Poetry',
        52: 'Giveaways',
        20: 'Treatment',
        22: 'Oral Medications and non-insulin injectables',
        23: 'Insulin',
        24: 'Research/Cure',
        25: 'Food / none',
        26: 'Recipes',
        27: 'Nutrition',
        28: 'Diabetes Technology / none',
        29: 'Insulin Pumps',
        30: 'Glucose Monitoring',
        31: 'Diabetes Apps',
        53: 'DIY Closed Loop Systems',
        54: 'Commercial Closed Loop Systems',
        32: 'Healthy Living',
        33: 'Physical Activity',
        34: 'Weight',
        35: 'Mental and Emotional Wellness',
        36: 'Diabetes Complications and other Conditions',
        37: 'Eyes',
        38: 'Kidneys',
        39: 'Feet',
        40: 'Digestion',
        41: 'Other Conditions',
        42: 'Diabetes Advocacy',
        43: 'Self Advocacy',
        44: 'Public Advocacy',
        53: 'DIY Closed Loop Systems',
        54: 'Commercial Closed Loop Systems'
    }
    # CSV column layout: discussion-level columns followed by per-reply columns.
    header_disucssion = "like_count,highest_post_number,discuss_id,user_id,category_id,category_name,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies"
    header_replies = ",post_number,quote_count,updated_at,moderator,reads,reply_count,id,avg_time,cooked,topic_id,username,user_created_at,user_id,incoming_link_count,reply_to_post_number"
    header = header_disucssion + header_replies
    # Write the header row only on the very first export run (no reply has
    # been marked "discussion_export" yet).
    if (coll_status.find({"collection": "discussion_export"}).count() == 0):
        data = [header.split(",")]
        csv_writer(data, "discussions.csv")
    cnt = 0
    for d in cursor:
        # Sentinel defaults; in practice every missing key below skips the
        # whole document via "continue", so these mostly document the types.
        like_count = -1
        highest_post_number = -1
        discuss_id = -1
        user_id = -1
        category_id = -1
        category_name = -1
        title = ""
        last_posted_at = -1
        participant_count = -1
        views = -1
        reply_count = -1
        links = -1
        sum_of_clicks = -1
        replies = -1
        try:
            d['like_count']
        except KeyError:
            continue
        else:
            like_count = d['like_count']
        try:
            d['highest_post_number']
        except KeyError:
            continue
        else:
            highest_post_number = d['highest_post_number']
        try:
            d['id']
        except KeyError:
            continue
        else:
            discuss_id = d['id']
        try:
            d['user_id']
        except KeyError:
            continue
        else:
            user_id = d['user_id']
        try:
            d['category_id']
        except KeyError:
            continue
        else:
            category_id = d['category_id']
            # Resolve the numeric id to its display name via the table above.
            category_name = c_dic[category_id]
        try:
            d['title']
        except KeyError:
            continue
        else:
            # Commas are stripped because the CSV is assembled by joining on
            # ","; non-ASCII characters are dropped.
            title = d['title'].replace(",", "").encode('ascii', 'ignore')
        try:
            d['last_posted_at']
        except KeyError:
            continue
        else:
            last_posted_at = d['last_posted_at']
        try:
            d['participant_count']
        except KeyError:
            continue
        else:
            participant_count = d['participant_count']
        try:
            d['views']
        except KeyError:
            continue
        else:
            views = d['views']
        try:
            d['reply_count']
        except KeyError:
            continue
        else:
            reply_count = d['reply_count']
        try:
            d["details"]["links"]
        except KeyError:
            continue
        else:
            l_dict = d["details"]["links"]
            links = len(l_dict)
            for l in l_dict:
                # NOTE(review): sum_of_clicks starts at -1, so the exported
                # total is one less than the real click sum — confirm intended.
                sum_of_clicks = sum_of_clicks + l["clicks"]
        try:
            d["post_stream"]["posts"]
        except KeyError:
            continue
        else:
            # The first post is the discussion body itself, hence the -1.
            replies = len(d["post_stream"]["posts"]) - 1
        line_discuss = [
            like_count, highest_post_number, discuss_id, user_id, category_id,
            category_name, title, last_posted_at, participant_count, views,
            reply_count, links, sum_of_clicks, replies
        ]
        line_discuss_str = [str(i) for i in line_discuss]
        line_discuss_str = ",".join(line_discuss_str)
        for p in d["post_stream"]["posts"]:
            # Per-post sentinel defaults.
            post_number = -1
            quote_count = -1
            updated_at = ""
            moderator = ""
            reads = -1
            reply_count = -1
            id = -1
            avg_time = -1
            cooked = ""
            topic_id = -1
            username = ""
            user_created_at = ""
            user_id = -1
            incoming_link_count = -1
            reply_to_post_number = -1
            post_number = p["post_number"]
            quote_count = p["quote_count"]
            updated_at = p["updated_at"]
            moderator = p["moderator"]
            reads = p["reads"]
            reply_count = p["reply_count"]
            id = p["id"]
            avg_time = p["avg_time"]
            # "cooked" is the rendered HTML body of the post.
            cooked = p["cooked"].replace(",", "").encode('ascii', 'ignore')
            topic_id = p["topic_id"]
            username = p["username"]
            user_created_at = p["created_at"]
            user_id = p["user_id"]
            incoming_link_count = p["incoming_link_count"]
            reply_to_post_number = p["reply_to_post_number"]
            if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id):
                print "Reply ID : " + str(id) + " already exported"
            elif str(category_id) in Tech_Cat_ID:
                # Only technology-category posts are parsed and exported.
                soup = BeautifulSoup(cooked)
                cooked_parsed = ""
                try:
                    soup.blockquote.text
                except AttributeError:
                    # No quoted block in this post.
                    blockquote_parsed = ""
                else:
                    blockquote_parsed = soup.blockquote.text
                try:
                    soup.findAll("p")
                except AttributeError:
                    print "p tag skipped"
                    print soup
                    continue
                else:
                    # Concatenate the text of every <p> tag.
                    # NOTE(review): the loop variable "p" shadows the outer
                    # post dict; all reads from the post happened above, so
                    # this is safe but fragile.
                    for p in soup.findAll("p"):
                        cooked_parsed = cooked_parsed + ''.join(
                            p.findAll(text=True))
                # Keep the raw extracted text before cleaning, for the debug
                # print below.
                Doc = cooked_parsed
                # Normalise: lowercase, flatten newlines, drop periods/hyphens.
                cooked_parsed = cooked_parsed.lower().strip().replace(
                    "\n", " ").replace(".", " ").replace("-", ' ')
                # Strip URLs and @mentions ("result" is an unused leftover
                # alias of the same value).
                cooked_parsed = result = re.sub(r"http\S+", "", cooked_parsed)
                cooked_parsed = result = re.sub(r"@\S+", "", cooked_parsed)
                print user_id
                print Doc
                print cooked_parsed + "\n==========================================================\n"
                # Category name stripped of punctuation, used as a file name.
                C_name = re.sub(r'[^\w\s]', '', str(category_name)).replace(" ", " ").strip()
                print C_name
                if str(category_id) in Cat_ID:
                    # Append the cleaned text to the combined corpus and to
                    # the per-category corpus file.
                    with open(
                            "C:/TuDiabetes_Code - Final/Diabetes_Text/All.txt",
                            "a") as f:
                        f.write(cooked_parsed)
                        f.close()
                    with open(
                            "C:/TuDiabetes_Code - Final/Diabetes_Text/" +
                            C_name + ".txt", "a") as f:
                        f.write(cooked_parsed)
                        f.close()
                    print "\n ***** " + str(
                        category_id) + " ***** " + C_name + " ******"
                line_post = [
                    post_number, quote_count, updated_at, moderator, reads,
                    reply_count, id, avg_time, cooked_parsed, topic_id,
                    username, user_created_at, user_id, incoming_link_count,
                    reply_to_post_number
                ]
                line_post = [str(i) for i in line_post]
                line_post = ",".join(line_post).replace("\n", "")
                final_line = line_discuss_str + "," + line_post + "\n"
                final_line = final_line.encode('ascii', 'ignore')
                data = [final_line.split(",")]
                csv_writer(data, "discussions.csv")
                # Mark this reply as exported so re-runs skip it.
                coll_status.insert_one({
                    "collection": "discussion_export",
                    "id": id
                })
                print "Reply ID : " + str(
                    id
                ) + " Exported To CSV File" + "----- Discussion_ID : " + str(
                    discuss_id)
def users(db, COLL_USER_DES, COLL_SAVE_USTATUS): cursor = db[COLL_USER_DES].find() coll_status = db[COLL_SAVE_USTATUS] header_user = "******" header_user_fields = ",user_gender,diabetes_type,account_owner,meds_tools,DOB" header = header_user + header_user_fields if (coll_status.find({"collection": "user_export"}).count() == 0): data = [header.split(",")] csv_writer(data, "final_users.csv") cnt = 0 for d in cursor: user_id = -1 user_name = "" gender = "" diabetes_type = "" account_owner = "" meds_tools = "" DOB = "" try: d['id'] except KeyError: continue else: user_id = d['id'] try: d['username'] except KeyError: continue else: user_name = d['username'].replace(",", "").encode('ascii', 'ignore') # try: # d["custom_fields"]["date_of_birth"] # except KeyError: # continue # else: # date_of_birth = d["custom_fields"]["date_of_birth"].replace(",", "").encode('ascii', 'ignore') try: d["user_fields"]["10"] except KeyError: continue else: if d["user_fields"]["10"] == None: gender = "" else: gender = d["user_fields"]["10"].encode('ascii', 'ignore') try: d["user_fields"]["3"] except KeyError: continue else: if type(d["user_fields"]["3"]) == type([]): diabetes_type = d["user_fields"]["3"][0].replace( ",", "").encode('ascii', 'ignore') else: if d["user_fields"]["3"] == None: diabetes_type = "" else: diabetes_type = d["user_fields"]["3"].replace( ",", "").encode('ascii', 'ignore') try: d["user_fields"]["2"] except KeyError: continue else: if d["user_fields"]["2"] == None: account_owner = "" else: account_owner = d["user_fields"]["2"].replace(",", "").encode( 'ascii', 'ignore') try: d["user_fields"]["5"] except KeyError: continue else: if d["user_fields"]["5"] == None: meds_tools = "" else: meds_tools = d["user_fields"]["5"].replace(",", "").encode( 'ascii', 'ignore') try: d["user_fields"]["9"] except KeyError: continue else: if d["user_fields"]["9"] == None: DOB = "" else: DOB = d["user_fields"]["9"].replace(",", "").encode( 'ascii', 'ignore') line_user = [ user_id, user_name, 
gender, diabetes_type, account_owner, meds_tools, DOB ] line_user_str = [str(i) for i in line_user] line_user_str = ",".join(line_user_str) # #------------------------------------------ if diabetes_type.strip( ) == "another type of diabetes" or diabetes_type.strip( ) == "Another type of diabetes": Another_type_of_diabetes.append(user_id) elif diabetes_type.strip() == "Gestational diabetes": Gestational_diabetes.append(user_id) elif diabetes_type.strip( ) == "I do not know what type of diabetes" or diabetes_type.strip( ) == "I don't know what type of diabetes": Idk_type_of_diabetes.append(user_id) elif diabetes_type.strip() == "No diabetes": No_diabetes.append(user_id) elif diabetes_type.strip() == "Pre-diabetes": Pre_diabetes.append(user_id) elif diabetes_type.strip( ) == "Type 1 or type 1.5 (LADA) diabetes" or diabetes_type.strip( ) == "Type 1 or Type 1.5 (LADA) diabetes" or diabetes_type.strip( ) == "Diabetes Tipo 1 o 1.5 (LADA)": Type1.append(user_id) elif diabetes_type.strip( ) == "Diabetes de Tipo 2" or diabetes_type.strip( ) == "Type 2 diabetes": Type2.append(user_id) #------------------------------------------ # print line_user_str # print line_discuss_str # pprint.pprint( d["post_stream"]["posts"]) if diabetes_type != "": if fm.check_if_saved(db, COLL_SAVE_USTATUS, "user_export", user_id): print "Reply ID : " + str(user_id) + " already exported" # if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id): # print "Reply ID : " + str(id) + " already exported" else: final_line = line_user_str + "\n" final_line = final_line.encode('ascii', 'ignore') data = [final_line.split(",")] csv_writer(data, "final_users.csv") coll_status.insert_one({ "collection": "user_export", "id": user_id }) print "Reply ID : " + str( user_id) + " User Info Exported To CSV File" print len(Another_type_of_diabetes), len(Gestational_diabetes), len( Idk_type_of_diabetes), len(No_diabetes), len(Pre_diabetes), len( Type1), len(Type2) print " Number of Users Extracted ---> ", 
len( Another_type_of_diabetes) + len(Gestational_diabetes) + len( Idk_type_of_diabetes) + len(No_diabetes) + len(Pre_diabetes) + len( Type1) + len(Type2) with open('Another_type_of_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Another_type_of_diabetes)) with open('Gestational_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Gestational_diabetes)) with open('Idk_type_of_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Idk_type_of_diabetes)) with open('No_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(No_diabetes)) with open('Pre_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Pre_diabetes)) with open('Type1.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Type1)) with open('Type2.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Type2))
def discussions(db, COLL_DISCUSSION, COLL_SAVE_STATUS): cursor = db[COLL_DISCUSSION].find() coll_status = db[COLL_SAVE_STATUS] header_disucssion = "like_count,highest_post_number,discuss_id,user_id,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies" header_replies = ",post_number,quote_count,updated_at,moderator,reads,reply_count,id,avg_time,cooked,topic_id,username,user_created_at,user_id,incoming_link_count,reply_to_post_number" header = header_disucssion + header_replies + "\n" f = open("discussions.csv", "a") if(coll_status.find({"collection":"discussion_export"}).count() == 0): f.write(header) cnt = 0 for d in cursor: like_count = -1 highest_post_number = -1 discuss_id = -1 user_id = -1 title = "" last_posted_at = -1 participant_count = -1 views = -1 reply_count = -1 links = -1 sum_of_clicks = -1 replies = -1 try: d['like_count'] except KeyError: continue else: like_count = d['like_count'] try: d['highest_post_number'] except KeyError: continue else: highest_post_number = d['highest_post_number'] try: d['id'] except KeyError: continue else: discuss_id = d['id'] try: d['user_id'] except KeyError: continue else: user_id = d['user_id'] try: d['title'] except KeyError: continue else: title = d['title'].replace(",", "").encode('ascii', 'ignore') try: d['last_posted_at'] except KeyError: continue else: last_posted_at = d['last_posted_at'] try: d['participant_count'] except KeyError: continue else: participant_count = d['participant_count'] try: d['views'] except KeyError: continue else: views = d['views'] try: d['reply_count'] except KeyError: continue else: reply_count = d['reply_count'] try: d["details"]["links"] except KeyError: continue else: l_dict = d["details"]["links"] links = len(l_dict) for l in l_dict: sum_of_clicks = sum_of_clicks + l["clicks"] try: d["post_stream"]["posts"] except KeyError: continue else: replies = len(d["post_stream"]["posts"]) - 1 line_discuss = 
[like_count,highest_post_number,discuss_id,user_id,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies] line_discuss_str = [str(i) for i in line_discuss] line_discuss_str = ",".join(line_discuss_str) for p in d["post_stream"]["posts"]: post_number = -1 quote_count = -1 updated_at = "" moderator = "" reads = -1 reply_count = -1 id = -1 avg_time = -1 cooked = "" topic_id = -1 username = "" user_created_at = "" user_id = -1 incoming_link_count = -1 reply_to_post_number = -1 post_number = p["post_number"] quote_count = p["quote_count"] updated_at = p["updated_at"] moderator = p["moderator"] reads = p["reads"] reply_count = p["reply_count"] id = p["id"] avg_time = p["avg_time"] cooked = p["cooked"].replace(",", "").encode('ascii', 'ignore') topic_id = p["topic_id"] username = p["username"] user_created_at = p["user_created_at"] user_id = p["user_id"] incoming_link_count = p["incoming_link_count"] reply_to_post_number = p["reply_to_post_number"] # print cooked # cooked = re.search('<p>(.*)</p>', cooked).group(1) # if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id): print "Reply ID : " + str(id) + " already exported" else: soup = BeautifulSoup(cooked) cooked_parsed = "" try: soup.blockquote.text except AttributeError: # print "blockquote skipped" blockquote_parsed = "" # continue else: blockquote_parsed = soup.blockquote.text # blockquote_parsed = soup.blockquote.text try: soup.findAll("p") except AttributeError: print "p tag skipped" print soup continue else: for p in soup.findAll("p"): cooked_parsed = cooked_parsed + ''.join(p.findAll(text=True)) blockquote_parsed = re.sub(r'[^a-zA-Z0-9\s]+', '', blockquote_parsed).lower().strip().replace("\n", " ") cooked_parsed = re.sub(r'[^a-zA-Z0-9\s]+', '', cooked_parsed).lower().strip().replace("\n", " ") # remove the blockquote section from cooked cooked_parsed = re.sub(blockquote_parsed, "", cooked_parsed) # print ">"*20 # print "cooked " + str(len(cooked)) # print cooked # print 
"-"*20 # print "blockquote parsed " + str(len(blockquote_parsed)) # print blockquote_parsed # print "-"*20 # print "cooked parsed "+ str(len(cooked_parsed)) # print cooked_parsed # print ">"*20 # cooked_parsed = "" line_post = [post_number, quote_count, updated_at, moderator, reads, reply_count, id, avg_time, cooked_parsed, topic_id, username, user_created_at, user_id, incoming_link_count, reply_to_post_number] # print line_discuss # print len(line_post) line_post = [str(i) for i in line_post] line_post = ",".join(line_post).replace("\n", "") final_line = line_discuss_str+","+line_post+"\n" final_line = final_line.encode('ascii', 'ignore') f.write(final_line) coll_status.insert_one({"collection":"discussion_export", "id":id}) # cnt += 1 # print "Count: ", cnt, " Reply ID: ", id print "Reply ID : " + str(id) + " exported" f.close()
def dump_discussion(ids, db, COLL_DISCUSSION, COLL_SAVE_STATUS): coll = db[COLL_DISCUSSION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion id which have not been saved in mongodb # unsaved_ids = [] # print "-------------------------" # print len(ids) # cn = 0 # for id in ids: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): # unsaved_ids.append(id) count = 0 print "-------------------------" for id in ids: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): print "discussion aready saved : " + str(id[0]) else: discussion_id = id[0] slug = id[1] ##find all replies to this discussion page = 1 discussion_complete = {} first = True while True: discussion_url = "http://www.tudiabetes.org/forum/t/" + slug + "/" + str( discussion_id) + ".json?page=" + str(page) # if this url doesnt return a json response stop the json request, by breaking this while loop try: json_discussion = requests.get(discussion_url).json() # pprint.pprint(json_discussion) except ValueError: break else: page += 1 if (first): discussion_complete = json_discussion first = False else: for p in json_discussion["post_stream"]["posts"]: discussion_complete["post_stream"]["posts"].append(p) # print "post_count:" + str(index) +" >> post_id: " + str(p["id"]) + " >> "+ p["cooked"] ## save to mongodb res = coll.insert_one(discussion_complete) # save this id in "save" collection coll_status.insert_one({"collection":COLL_DISCUSSION, "id":id[0]}) try: discussion_complete["post_stream"]["posts"] except KeyError: replies = 0 else: replies = str(len(discussion_complete["post_stream"]["posts"])) print "saved discussion: count = %s replies = %s [id, slug] = %s):" % (str(count), replies, str(id)) count += 1
def dump_discussion(ids, db, COLL_DISCUSSION, COLL_SAVE_STATUS): coll = db[COLL_DISCUSSION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion id which have not been saved in mongodb # unsaved_ids = [] # print "-------------------------" # print len(ids) # cn = 0 # for id in ids: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): # unsaved_ids.append(id) count = 0 print "-------------------------" for id in ids: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): print "discussion aready saved : " + str(id[0]) else: discussion_id = id[0] slug = id[1] ##find all replies to this discussion page = 1 discussion_complete = {} first = True while True: discussion_url = "http://www.tudiabetes.org/forum/t/" + slug + "/" + str( discussion_id) + ".json?page=" + str(page) # if this url doesnt return a json response stop the json request, by breaking this while loop try: json_discussion = requests.get(discussion_url).json() # pprint.pprint(json_discussion) except ValueError: break else: page += 1 if (first): discussion_complete = json_discussion first = False else: for p in json_discussion["post_stream"]["posts"]: discussion_complete["post_stream"]["posts"].append( p) # print "post_count:" + str(index) +" >> post_id: " + str(p["id"]) + " >> "+ p["cooked"] ## save to mongodb res = coll.insert_one(discussion_complete) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_DISCUSSION, "id": id[0] }) try: discussion_complete["post_stream"]["posts"] except KeyError: replies = 0 else: replies = str(len(discussion_complete["post_stream"]["posts"])) print "saved discussion: count = %s replies = %s [id, slug] = %s):" % ( str(count), replies, str(id)) count += 1