def dump_user_summary(usernames, db, COLL_USER_SUMMARY, COLL_SAVE_STATUS): collection = db[COLL_USER_SUMMARY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): print "user summary already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/summary.json" json_user = requests.get(user_url).json() user = json_user ## dump to mongodb collection.insert_one(user) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_SUMMARY, "id":username}) print "user summary saved: " + username
def dump_user_description(usernames, db, COLL_USER_DESCRIPTION, COLL_SAVE_STATUS): collection = db[COLL_USER_DESCRIPTION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_DESCRIPTION, username): print "user description already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/activity.json" json_desc = requests.get(user_url).json() # json_desc = json_desc["user"] try: json_desc["user"] except KeyError: continue else: json_desc = json_desc["user"] ## dump to mongodb collection.insert_one(json_desc) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_DESCRIPTION, "id":username}) print "user summary saved: " + username
def dump_user_summary(usernames, db, COLL_USER_SUMMARY, COLL_SAVE_STATUS): collection = db[COLL_USER_SUMMARY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): print "user summary already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/summary.json" json_user = requests.get(user_url).json() user = json_user ## dump to mongodb collection.insert_one(user) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_SUMMARY, "id": username }) print "user summary saved: " + username
def dump_user_replies(usernames, db, COLL_USER_REPLY, COLL_SAVE_STATUS): collection = db[COLL_USER_REPLY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] # for username in usernames: # # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): # unsaved_usernames.append(username) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): print "user reply already saved : " + username else: epoch = int(round(time.time() * 1000)) offset = 0 # starts from 0 replies_complete = [] first = True while True: reply_url = "http://www.tudiabetes.org/forum/user_actions.json?offset=" + str( offset) + "&username="******"&filter=5&_=" + str( epoch) json_reply = requests.get(reply_url).json() # reply = json_reply["user_actions"] try: json_reply["user_actions"] except KeyError: reply = "" else: reply = json_reply["user_actions"] if (len(reply) == 0): break else: if (first): replies_complete = reply first = False else: offset += 30 for r in reply: ### save to mongodb replies_complete.append(r) reply_dict = {"username": username, "reply": replies_complete} #dump the list of relies to the mongodb collection.insert(reply_dict) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_REPLY, "id": username }) print "user reply saved: " + username
def dump_user_replies(usernames, db, COLL_USER_REPLY, COLL_SAVE_STATUS): collection = db[COLL_USER_REPLY] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] # for username in usernames: # # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): # unsaved_usernames.append(username) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_REPLY, username): print "user reply already saved : " + username else: epoch = int(round(time.time() * 1000)) offset = 0 # starts from 0 replies_complete = [] first = True while True: reply_url = "http://www.tudiabetes.org/forum/user_actions.json?offset=" + str( offset) + "&username="******"&filter=5&_=" + str(epoch) json_reply = requests.get(reply_url).json() # reply = json_reply["user_actions"] try: json_reply["user_actions"] except KeyError: reply = "" else: reply = json_reply["user_actions"] if (len(reply) == 0): break else: if(first): replies_complete = reply first = False else: offset += 30 for r in reply: ### save to mongodb replies_complete.append(r) reply_dict = {"username": username, "reply": replies_complete} #dump the list of relies to the mongodb collection.insert(reply_dict) # save this id in "save" collection coll_status.insert_one({"collection":COLL_USER_REPLY, "id":username}) print "user reply saved: " + username
def dump_user_description(usernames, db, COLL_USER_DESCRIPTION, COLL_SAVE_STATUS): collection = db[COLL_USER_DESCRIPTION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion usernames which have not been saved in mongodb # unsaved_usernames = [] print "-------------" # cn = 0 # for username in usernames: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_SUMMARY, username): # unsaved_usernames.append(username) # print len(unsaved_usernames) for username in usernames: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_USER_DESCRIPTION, username): print "user description already saved : " + username else: user_url = "http://www.tudiabetes.org/forum/users/" + username + "/activity.json" json_desc = requests.get(user_url).json() # json_desc = json_desc["user"] try: json_desc["user"] except KeyError: continue else: json_desc = json_desc["user"] ## dump to mongodb collection.insert_one(json_desc) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_USER_DESCRIPTION, "id": username }) print "user summary saved: " + username
def discussions(db, COLL_DISCUSSION, COLL_SAVE_STATUS):
    # Export every stored discussion reply to discussions.csv and, for posts in
    # the "Diabetes Technology" categories, also append the cleaned post text
    # to per-category text files (presumably for later topic modelling — the
    # downstream consumer is not visible here).
    cursor = db[COLL_DISCUSSION].find()
    coll_status = db[COLL_SAVE_STATUS]
    # Category ids (as strings) treated as technology sub-forums; only posts
    # from these categories are parsed and exported below.
    Tech_Cat_ID = ['28', '29', '30', '31', '33', '43', '53', '54']
    # Category ids whose cleaned text is additionally written to the
    # Diabetes_Text files.  NOTE(review): '53'/'54' are in Tech_Cat_ID but not
    # here, so those posts reach the CSV but not the text files — confirm.
    Cat_ID = [
        '1', '3', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14',
        '15', '16', '17', '20', '21', '22', '23', '24', '25', '27', '28',
        '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39',
        '40', '41', '42', '43', '44', '45'
    ]
    # Category id -> human-readable category name.
    # NOTE(review): keys 53 and 54 appear twice with identical values; the
    # duplicates are harmless (later entry wins) but redundant.
    c_dic = {
        -1: "",
        1: "General",
        3: 'TuDiabetes Website',
        5: 'Type 1 and LADA / none',
        6: 'New to Type 1 Diabetes',
        7: 'Parents of Children with Type 1 Diabetes',
        8: 'Teens and Young Adults',
        9: 'Type 2',
        10: 'New to Type 2 diabetes',
        11: 'Teens and Young Adults',
        12: 'Diabetes and Pregnancy',
        13: 'gestational diabetes',
        14: 'Trying to Get Pregnant',
        15: 'Managing Pregnancy with Diabetes',
        16: 'Community',
        17: 'Share Your Stories',
        18: 'Fun and Games',
        19: 'Arts and Poetry',
        52: 'Giveaways',
        20: 'Treatment',
        22: 'Oral Medications and non-insulin injectables',
        23: 'Insulin',
        24: 'Research/Cure',
        25: 'Food / none',
        26: 'Recipes',
        27: 'Nutrition',
        28: 'Diabetes Technology / none',
        29: 'Insulin Pumps',
        30: 'Glucose Monitoring',
        31: 'Diabetes Apps',
        53: 'DIY Closed Loop Systems',
        54: 'Commercial Closed Loop Systems',
        32: 'Healthy Living',
        33: 'Physical Activity',
        34: 'Weight',
        35: 'Mental and Emotional Wellness',
        36: 'Diabetes Complications and other Conditions',
        37: 'Eyes',
        38: 'Kidneys',
        39: 'Feet',
        40: 'Digestion',
        41: 'Other Conditions',
        42: 'Diabetes Advocacy',
        43: 'Self Advocacy',
        44: 'Public Advocacy',
        53: 'DIY Closed Loop Systems',
        54: 'Commercial Closed Loop Systems'
    }
    # CSV column layout: discussion-level columns followed by per-reply columns.
    header_disucssion = "like_count,highest_post_number,discuss_id,user_id,category_id,category_name,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies"
    header_replies = ",post_number,quote_count,updated_at,moderator,reads,reply_count,id,avg_time,cooked,topic_id,username,user_created_at,user_id,incoming_link_count,reply_to_post_number"
    header = header_disucssion + header_replies
    # Write the header row only on the very first export run (no reply has
    # been marked "discussion_export" yet).
    if (coll_status.find({"collection": "discussion_export"}).count() == 0):
        data = [header.split(",")]
        csv_writer(data, "discussions.csv")
    cnt = 0
    for d in cursor:
        # Sentinel defaults; in practice every missing key below skips the
        # whole document via "continue", so these mostly document the types.
        like_count = -1
        highest_post_number = -1
        discuss_id = -1
        user_id = -1
        category_id = -1
        category_name = -1
        title = ""
        last_posted_at = -1
        participant_count = -1
        views = -1
        reply_count = -1
        links = -1
        sum_of_clicks = -1
        replies = -1
        try:
            d['like_count']
        except KeyError:
            continue
        else:
            like_count = d['like_count']
        try:
            d['highest_post_number']
        except KeyError:
            continue
        else:
            highest_post_number = d['highest_post_number']
        try:
            d['id']
        except KeyError:
            continue
        else:
            discuss_id = d['id']
        try:
            d['user_id']
        except KeyError:
            continue
        else:
            user_id = d['user_id']
        try:
            d['category_id']
        except KeyError:
            continue
        else:
            category_id = d['category_id']
            # Resolve the numeric id to its display name via the table above.
            category_name = c_dic[category_id]
        try:
            d['title']
        except KeyError:
            continue
        else:
            # Commas are stripped because the CSV is assembled by joining on
            # ","; non-ASCII characters are dropped.
            title = d['title'].replace(",", "").encode('ascii', 'ignore')
        try:
            d['last_posted_at']
        except KeyError:
            continue
        else:
            last_posted_at = d['last_posted_at']
        try:
            d['participant_count']
        except KeyError:
            continue
        else:
            participant_count = d['participant_count']
        try:
            d['views']
        except KeyError:
            continue
        else:
            views = d['views']
        try:
            d['reply_count']
        except KeyError:
            continue
        else:
            reply_count = d['reply_count']
        try:
            d["details"]["links"]
        except KeyError:
            continue
        else:
            l_dict = d["details"]["links"]
            links = len(l_dict)
            for l in l_dict:
                # NOTE(review): sum_of_clicks starts at -1, so the exported
                # total is one less than the real click sum — confirm intended.
                sum_of_clicks = sum_of_clicks + l["clicks"]
        try:
            d["post_stream"]["posts"]
        except KeyError:
            continue
        else:
            # The first post is the discussion body itself, hence the -1.
            replies = len(d["post_stream"]["posts"]) - 1
        line_discuss = [
            like_count, highest_post_number, discuss_id, user_id, category_id,
            category_name, title, last_posted_at, participant_count, views,
            reply_count, links, sum_of_clicks, replies
        ]
        line_discuss_str = [str(i) for i in line_discuss]
        line_discuss_str = ",".join(line_discuss_str)
        for p in d["post_stream"]["posts"]:
            # Per-post sentinel defaults.
            post_number = -1
            quote_count = -1
            updated_at = ""
            moderator = ""
            reads = -1
            reply_count = -1
            id = -1
            avg_time = -1
            cooked = ""
            topic_id = -1
            username = ""
            user_created_at = ""
            user_id = -1
            incoming_link_count = -1
            reply_to_post_number = -1
            post_number = p["post_number"]
            quote_count = p["quote_count"]
            updated_at = p["updated_at"]
            moderator = p["moderator"]
            reads = p["reads"]
            reply_count = p["reply_count"]
            id = p["id"]
            avg_time = p["avg_time"]
            # "cooked" is the rendered HTML body of the post.
            cooked = p["cooked"].replace(",", "").encode('ascii', 'ignore')
            topic_id = p["topic_id"]
            username = p["username"]
            user_created_at = p["created_at"]
            user_id = p["user_id"]
            incoming_link_count = p["incoming_link_count"]
            reply_to_post_number = p["reply_to_post_number"]
            if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id):
                print "Reply ID : " + str(id) + " already exported"
            elif str(category_id) in Tech_Cat_ID:
                # Only technology-category posts are parsed and exported.
                soup = BeautifulSoup(cooked)
                cooked_parsed = ""
                try:
                    soup.blockquote.text
                except AttributeError:
                    # No quoted block in this post.
                    blockquote_parsed = ""
                else:
                    blockquote_parsed = soup.blockquote.text
                try:
                    soup.findAll("p")
                except AttributeError:
                    print "p tag skipped"
                    print soup
                    continue
                else:
                    # Concatenate the text of every <p> tag.
                    # NOTE(review): the loop variable "p" shadows the outer
                    # post dict; all reads from the post happened above, so
                    # this is safe but fragile.
                    for p in soup.findAll("p"):
                        cooked_parsed = cooked_parsed + ''.join(
                            p.findAll(text=True))
                # Keep the raw extracted text before cleaning, for the debug
                # print below.
                Doc = cooked_parsed
                # Normalise: lowercase, flatten newlines, drop periods/hyphens.
                cooked_parsed = cooked_parsed.lower().strip().replace(
                    "\n", " ").replace(".", " ").replace("-", ' ')
                # Strip URLs and @mentions ("result" is an unused leftover
                # alias of the same value).
                cooked_parsed = result = re.sub(r"http\S+", "", cooked_parsed)
                cooked_parsed = result = re.sub(r"@\S+", "", cooked_parsed)
                print user_id
                print Doc
                print cooked_parsed + "\n==========================================================\n"
                # Category name stripped of punctuation, used as a file name.
                C_name = re.sub(r'[^\w\s]', '', str(category_name)).replace(" ", " ").strip()
                print C_name
                if str(category_id) in Cat_ID:
                    # Append the cleaned text to the combined corpus and to
                    # the per-category corpus file.
                    with open(
                            "C:/TuDiabetes_Code - Final/Diabetes_Text/All.txt",
                            "a") as f:
                        f.write(cooked_parsed)
                        f.close()
                    with open(
                            "C:/TuDiabetes_Code - Final/Diabetes_Text/" +
                            C_name + ".txt", "a") as f:
                        f.write(cooked_parsed)
                        f.close()
                    print "\n ***** " + str(
                        category_id) + " ***** " + C_name + " ******"
                line_post = [
                    post_number, quote_count, updated_at, moderator, reads,
                    reply_count, id, avg_time, cooked_parsed, topic_id,
                    username, user_created_at, user_id, incoming_link_count,
                    reply_to_post_number
                ]
                line_post = [str(i) for i in line_post]
                line_post = ",".join(line_post).replace("\n", "")
                final_line = line_discuss_str + "," + line_post + "\n"
                final_line = final_line.encode('ascii', 'ignore')
                data = [final_line.split(",")]
                csv_writer(data, "discussions.csv")
                # Mark this reply as exported so re-runs skip it.
                coll_status.insert_one({
                    "collection": "discussion_export",
                    "id": id
                })
                print "Reply ID : " + str(
                    id
                ) + " Exported To CSV File" + "----- Discussion_ID : " + str(
                    discuss_id)
def users(db, COLL_USER_DES, COLL_SAVE_USTATUS): cursor = db[COLL_USER_DES].find() coll_status = db[COLL_SAVE_USTATUS] header_user = "******" header_user_fields = ",user_gender,diabetes_type,account_owner,meds_tools,DOB" header = header_user + header_user_fields if (coll_status.find({"collection": "user_export"}).count() == 0): data = [header.split(",")] csv_writer(data, "final_users.csv") cnt = 0 for d in cursor: user_id = -1 user_name = "" gender = "" diabetes_type = "" account_owner = "" meds_tools = "" DOB = "" try: d['id'] except KeyError: continue else: user_id = d['id'] try: d['username'] except KeyError: continue else: user_name = d['username'].replace(",", "").encode('ascii', 'ignore') # try: # d["custom_fields"]["date_of_birth"] # except KeyError: # continue # else: # date_of_birth = d["custom_fields"]["date_of_birth"].replace(",", "").encode('ascii', 'ignore') try: d["user_fields"]["10"] except KeyError: continue else: if d["user_fields"]["10"] == None: gender = "" else: gender = d["user_fields"]["10"].encode('ascii', 'ignore') try: d["user_fields"]["3"] except KeyError: continue else: if type(d["user_fields"]["3"]) == type([]): diabetes_type = d["user_fields"]["3"][0].replace( ",", "").encode('ascii', 'ignore') else: if d["user_fields"]["3"] == None: diabetes_type = "" else: diabetes_type = d["user_fields"]["3"].replace( ",", "").encode('ascii', 'ignore') try: d["user_fields"]["2"] except KeyError: continue else: if d["user_fields"]["2"] == None: account_owner = "" else: account_owner = d["user_fields"]["2"].replace(",", "").encode( 'ascii', 'ignore') try: d["user_fields"]["5"] except KeyError: continue else: if d["user_fields"]["5"] == None: meds_tools = "" else: meds_tools = d["user_fields"]["5"].replace(",", "").encode( 'ascii', 'ignore') try: d["user_fields"]["9"] except KeyError: continue else: if d["user_fields"]["9"] == None: DOB = "" else: DOB = d["user_fields"]["9"].replace(",", "").encode( 'ascii', 'ignore') line_user = [ user_id, user_name, 
gender, diabetes_type, account_owner, meds_tools, DOB ] line_user_str = [str(i) for i in line_user] line_user_str = ",".join(line_user_str) # #------------------------------------------ if diabetes_type.strip( ) == "another type of diabetes" or diabetes_type.strip( ) == "Another type of diabetes": Another_type_of_diabetes.append(user_id) elif diabetes_type.strip() == "Gestational diabetes": Gestational_diabetes.append(user_id) elif diabetes_type.strip( ) == "I do not know what type of diabetes" or diabetes_type.strip( ) == "I don't know what type of diabetes": Idk_type_of_diabetes.append(user_id) elif diabetes_type.strip() == "No diabetes": No_diabetes.append(user_id) elif diabetes_type.strip() == "Pre-diabetes": Pre_diabetes.append(user_id) elif diabetes_type.strip( ) == "Type 1 or type 1.5 (LADA) diabetes" or diabetes_type.strip( ) == "Type 1 or Type 1.5 (LADA) diabetes" or diabetes_type.strip( ) == "Diabetes Tipo 1 o 1.5 (LADA)": Type1.append(user_id) elif diabetes_type.strip( ) == "Diabetes de Tipo 2" or diabetes_type.strip( ) == "Type 2 diabetes": Type2.append(user_id) #------------------------------------------ # print line_user_str # print line_discuss_str # pprint.pprint( d["post_stream"]["posts"]) if diabetes_type != "": if fm.check_if_saved(db, COLL_SAVE_USTATUS, "user_export", user_id): print "Reply ID : " + str(user_id) + " already exported" # if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id): # print "Reply ID : " + str(id) + " already exported" else: final_line = line_user_str + "\n" final_line = final_line.encode('ascii', 'ignore') data = [final_line.split(",")] csv_writer(data, "final_users.csv") coll_status.insert_one({ "collection": "user_export", "id": user_id }) print "Reply ID : " + str( user_id) + " User Info Exported To CSV File" print len(Another_type_of_diabetes), len(Gestational_diabetes), len( Idk_type_of_diabetes), len(No_diabetes), len(Pre_diabetes), len( Type1), len(Type2) print " Number of Users Extracted ---> ", 
len( Another_type_of_diabetes) + len(Gestational_diabetes) + len( Idk_type_of_diabetes) + len(No_diabetes) + len(Pre_diabetes) + len( Type1) + len(Type2) with open('Another_type_of_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Another_type_of_diabetes)) with open('Gestational_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Gestational_diabetes)) with open('Idk_type_of_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Idk_type_of_diabetes)) with open('No_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(No_diabetes)) with open('Pre_diabetes.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Pre_diabetes)) with open('Type1.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Type1)) with open('Type2.csv', 'ab') as f: writer = csv.writer(f) writer.writerows(zip(Type2))
def discussions(db, COLL_DISCUSSION, COLL_SAVE_STATUS): cursor = db[COLL_DISCUSSION].find() coll_status = db[COLL_SAVE_STATUS] header_disucssion = "like_count,highest_post_number,discuss_id,user_id,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies" header_replies = ",post_number,quote_count,updated_at,moderator,reads,reply_count,id,avg_time,cooked,topic_id,username,user_created_at,user_id,incoming_link_count,reply_to_post_number" header = header_disucssion + header_replies + "\n" f = open("discussions.csv", "a") if(coll_status.find({"collection":"discussion_export"}).count() == 0): f.write(header) cnt = 0 for d in cursor: like_count = -1 highest_post_number = -1 discuss_id = -1 user_id = -1 title = "" last_posted_at = -1 participant_count = -1 views = -1 reply_count = -1 links = -1 sum_of_clicks = -1 replies = -1 try: d['like_count'] except KeyError: continue else: like_count = d['like_count'] try: d['highest_post_number'] except KeyError: continue else: highest_post_number = d['highest_post_number'] try: d['id'] except KeyError: continue else: discuss_id = d['id'] try: d['user_id'] except KeyError: continue else: user_id = d['user_id'] try: d['title'] except KeyError: continue else: title = d['title'].replace(",", "").encode('ascii', 'ignore') try: d['last_posted_at'] except KeyError: continue else: last_posted_at = d['last_posted_at'] try: d['participant_count'] except KeyError: continue else: participant_count = d['participant_count'] try: d['views'] except KeyError: continue else: views = d['views'] try: d['reply_count'] except KeyError: continue else: reply_count = d['reply_count'] try: d["details"]["links"] except KeyError: continue else: l_dict = d["details"]["links"] links = len(l_dict) for l in l_dict: sum_of_clicks = sum_of_clicks + l["clicks"] try: d["post_stream"]["posts"] except KeyError: continue else: replies = len(d["post_stream"]["posts"]) - 1 line_discuss = 
[like_count,highest_post_number,discuss_id,user_id,title,last_posted_at,participant_count,views,reply_count,links,sum_of_clicks,replies] line_discuss_str = [str(i) for i in line_discuss] line_discuss_str = ",".join(line_discuss_str) for p in d["post_stream"]["posts"]: post_number = -1 quote_count = -1 updated_at = "" moderator = "" reads = -1 reply_count = -1 id = -1 avg_time = -1 cooked = "" topic_id = -1 username = "" user_created_at = "" user_id = -1 incoming_link_count = -1 reply_to_post_number = -1 post_number = p["post_number"] quote_count = p["quote_count"] updated_at = p["updated_at"] moderator = p["moderator"] reads = p["reads"] reply_count = p["reply_count"] id = p["id"] avg_time = p["avg_time"] cooked = p["cooked"].replace(",", "").encode('ascii', 'ignore') topic_id = p["topic_id"] username = p["username"] user_created_at = p["user_created_at"] user_id = p["user_id"] incoming_link_count = p["incoming_link_count"] reply_to_post_number = p["reply_to_post_number"] # print cooked # cooked = re.search('<p>(.*)</p>', cooked).group(1) # if fm.check_if_saved(db, COLL_SAVE_STATUS, "discussion_export", id): print "Reply ID : " + str(id) + " already exported" else: soup = BeautifulSoup(cooked) cooked_parsed = "" try: soup.blockquote.text except AttributeError: # print "blockquote skipped" blockquote_parsed = "" # continue else: blockquote_parsed = soup.blockquote.text # blockquote_parsed = soup.blockquote.text try: soup.findAll("p") except AttributeError: print "p tag skipped" print soup continue else: for p in soup.findAll("p"): cooked_parsed = cooked_parsed + ''.join(p.findAll(text=True)) blockquote_parsed = re.sub(r'[^a-zA-Z0-9\s]+', '', blockquote_parsed).lower().strip().replace("\n", " ") cooked_parsed = re.sub(r'[^a-zA-Z0-9\s]+', '', cooked_parsed).lower().strip().replace("\n", " ") # remove the blockquote section from cooked cooked_parsed = re.sub(blockquote_parsed, "", cooked_parsed) # print ">"*20 # print "cooked " + str(len(cooked)) # print cooked # print 
"-"*20 # print "blockquote parsed " + str(len(blockquote_parsed)) # print blockquote_parsed # print "-"*20 # print "cooked parsed "+ str(len(cooked_parsed)) # print cooked_parsed # print ">"*20 # cooked_parsed = "" line_post = [post_number, quote_count, updated_at, moderator, reads, reply_count, id, avg_time, cooked_parsed, topic_id, username, user_created_at, user_id, incoming_link_count, reply_to_post_number] # print line_discuss # print len(line_post) line_post = [str(i) for i in line_post] line_post = ",".join(line_post).replace("\n", "") final_line = line_discuss_str+","+line_post+"\n" final_line = final_line.encode('ascii', 'ignore') f.write(final_line) coll_status.insert_one({"collection":"discussion_export", "id":id}) # cnt += 1 # print "Count: ", cnt, " Reply ID: ", id print "Reply ID : " + str(id) + " exported" f.close()
def dump_discussion(ids, db, COLL_DISCUSSION, COLL_SAVE_STATUS): coll = db[COLL_DISCUSSION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion id which have not been saved in mongodb # unsaved_ids = [] # print "-------------------------" # print len(ids) # cn = 0 # for id in ids: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): # unsaved_ids.append(id) count = 0 print "-------------------------" for id in ids: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): print "discussion aready saved : " + str(id[0]) else: discussion_id = id[0] slug = id[1] ##find all replies to this discussion page = 1 discussion_complete = {} first = True while True: discussion_url = "http://www.tudiabetes.org/forum/t/" + slug + "/" + str( discussion_id) + ".json?page=" + str(page) # if this url doesnt return a json response stop the json request, by breaking this while loop try: json_discussion = requests.get(discussion_url).json() # pprint.pprint(json_discussion) except ValueError: break else: page += 1 if (first): discussion_complete = json_discussion first = False else: for p in json_discussion["post_stream"]["posts"]: discussion_complete["post_stream"]["posts"].append(p) # print "post_count:" + str(index) +" >> post_id: " + str(p["id"]) + " >> "+ p["cooked"] ## save to mongodb res = coll.insert_one(discussion_complete) # save this id in "save" collection coll_status.insert_one({"collection":COLL_DISCUSSION, "id":id[0]}) try: discussion_complete["post_stream"]["posts"] except KeyError: replies = 0 else: replies = str(len(discussion_complete["post_stream"]["posts"])) print "saved discussion: count = %s replies = %s [id, slug] = %s):" % (str(count), replies, str(id)) count += 1
def dump_discussion(ids, db, COLL_DISCUSSION, COLL_SAVE_STATUS): coll = db[COLL_DISCUSSION] coll_status = db[COLL_SAVE_STATUS] # find the list discussion id which have not been saved in mongodb # unsaved_ids = [] # print "-------------------------" # print len(ids) # cn = 0 # for id in ids: # cn = cn + 1 # print cn # if not fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): # unsaved_ids.append(id) count = 0 print "-------------------------" for id in ids: if fm.check_if_saved(db, COLL_SAVE_STATUS, COLL_DISCUSSION, id[0]): print "discussion aready saved : " + str(id[0]) else: discussion_id = id[0] slug = id[1] ##find all replies to this discussion page = 1 discussion_complete = {} first = True while True: discussion_url = "http://www.tudiabetes.org/forum/t/" + slug + "/" + str( discussion_id) + ".json?page=" + str(page) # if this url doesnt return a json response stop the json request, by breaking this while loop try: json_discussion = requests.get(discussion_url).json() # pprint.pprint(json_discussion) except ValueError: break else: page += 1 if (first): discussion_complete = json_discussion first = False else: for p in json_discussion["post_stream"]["posts"]: discussion_complete["post_stream"]["posts"].append( p) # print "post_count:" + str(index) +" >> post_id: " + str(p["id"]) + " >> "+ p["cooked"] ## save to mongodb res = coll.insert_one(discussion_complete) # save this id in "save" collection coll_status.insert_one({ "collection": COLL_DISCUSSION, "id": id[0] }) try: discussion_complete["post_stream"]["posts"] except KeyError: replies = 0 else: replies = str(len(discussion_complete["post_stream"]["posts"])) print "saved discussion: count = %s replies = %s [id, slug] = %s):" % ( str(count), replies, str(id)) count += 1