Exemple #1
0
    def scrape_posts(self, postElement):
        """Scrape a single post element and store it if it is not yet in the DB.

        Records the generated post id on ``self.postId`` and the leading
        ``-``-separated field of the ``posted_at`` value on ``self.postYear``,
        whether or not the post was already stored.

        Args:
            postElement: The web element that wraps one post.
        """
        logging.info("Post Content: {}".format(postElement.text))
        # Dismiss any open overlay before interacting with the post.
        webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
        self.scroll_to_element(postElement)
        self.click_see_more(postElement)
        postIds = scraped_post_ids()
        postHeaderElement = self.find_elem_by_xpath_with_wait(
            "." + postHeaderXpath, postElement)
        timestamp = self.post_timestamp(postHeaderElement)
        # Named ``posted_at`` (not ``time``) so the local never shadows the
        # standard-library module name.
        posted_at = self.posted_at(postHeaderElement)
        by = self.posted_by(postHeaderElement)
        content = self.post_content(postElement)
        # NOTE(review): this is "" when fbGroupLink ends with "/", unlike the
        # group_id computed further down. Left unchanged because ids that are
        # already stored were generated with this value.
        groupId = fbGroupLink.split("/")[-1]
        Id = generate_id(timestamp, by, content, groupId)

        self.postId = Id
        self.postYear = posted_at.split("-")[0]

        if Id in postIds:
            logging.info("Post already scraped")
            return

        memberIds = scraped_member_ids()
        mydb = db()
        try:
            if by not in memberIds:
                name = self.posted_by_name(postHeaderElement)
                mydb.insert([by, name], "fb_group_name")
            typePost = self.post_type(postHeaderElement)
            group_id = (fbGroupLink.split("/groups/")[1]).strip('/')
            post_param = [Id, content, posted_at, by, typePost, group_id]
            mydb.insert(post_param, "fb_group_posts")
            logging.info(post_param)
        finally:
            # Release the cursor even when an insert fails.
            mydb.closeCursor()
Exemple #2
0
    def scrape_post_reactions(self, postElement, postId):
        """Store the reactions of one post that are not yet in the DB.

        Any failure is reported with a "No reaction" message instead of being
        raised, so one problematic post does not stop the run.

        Args:
            postElement: The web element that wraps the post.
            postId: Identifier of the post the reactions belong to.
        """
        mydb = None
        try:
            try:
                mydb = db()
                # Dismiss any open overlay before interacting with the post.
                webdriver.ActionChains(self.driver).send_keys(
                    Keys.ESCAPE).perform()
                self.scroll_to_element(postElement)

                scrapedDateTime = self.curr_date_time()
                # The result is not used; the lookup is kept because it
                # presumably fails when the post has no reaction pane, which
                # routes the post to the "No reaction" branch.
                self.find_elem_by_xpath_with_wait(
                    "." + postReactionPaneXpath, postElement)
                likedBy = self.load_post_reactions(postElement)
                scrapedId = scraped_reaction_id()
                memberIds = scraped_member_ids()
                print("Scraping Reactions")
                for profile in tqdm(likedBy):
                    reactionId = generate_id(profile[0], postId)
                    if reactionId in scrapedId:
                        continue
                    if profile[0] not in memberIds:
                        memberIds.append(profile[0])
                        mydb.insert(profile, "fb_group_name")
                    reactionParam = [
                        reactionId, postId, scrapedDateTime, profile[0]
                    ]
                    mydb.insert(reactionParam, "fb_group_posts_reactions")
            finally:
                # Close the cursor on success and on failure alike.
                if mydb is not None:
                    mydb.closeCursor()
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the scraper.
            print("No reaction ", postId)
Exemple #3
0
def scrapedMembersId():
    """Return the ``User ID`` values stored in ``fb_group_name``.

    Returns:
        A list with the first column of every selected row, or an empty
        list when the lookup fails (for example when the result cannot be
        indexed as a 2-D array).
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_name", "User ID", "1")
        return list(np.array(rows)[:, 0])
    except Exception:
        # Best-effort lookup: report "nothing scraped yet" on failure, but
        # let KeyboardInterrupt/SystemExit propagate.
        return []
def totalCommentsScraped(postId):
    """Return how many stored comments belong to ``postId``.

    Args:
        postId: Post identifier (a string) matched against the
            ``Comment Post ID`` column.

    Returns:
        The size reported by the select call, or -1 when the lookup fails.
    """
    try:
        mydb = db()
        # NOTE(review): the condition is built by string concatenation; a
        # postId containing a quote would break (or alter) the statement.
        whereCondn = " `Comment Post ID` = '" + postId + "'"
        _, size = mydb.select("fb_group_post_comments", "Comment ID",
                              whereCondn)
        return size
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return -1
def updateRow(seen_id, Status="SEEN"):
    """Write ``Status`` into the ``Seen Status`` column of one seen-row.

    Args:
        seen_id: Value (a string) matched against the ``Seen ID`` column.
        Status: New value for ``Seen Status``; defaults to ``"SEEN"``.
    """
    connection = db()
    # The id is quoted and spliced into the WHERE clause as plain text.
    quoted_id = "'" + seen_id + "'"
    connection.update(
        "fb_group_posts_seen",
        "Seen Status",
        Status,
        "`Seen ID` = " + quoted_id,
    )
def scrapedCommentsId():
    """Return the ``Comment ID`` values stored in ``fb_group_post_comments``.

    Returns:
        A NumPy array with the first column of every selected row, or an
        empty list when the lookup fails.
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_post_comments", "Comment ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
Exemple #7
0
def totalReactionsScraped(postId):
    """Return how many stored reactions belong to ``postId``.

    Args:
        postId: Post identifier (a string) matched against the
            ``Facebook Post ID`` column.

    Returns:
        The size reported by the select call, or -1 when the lookup fails.
    """
    try:
        mydb = db()
        # NOTE(review): the condition is built by string concatenation; a
        # postId containing a quote would break (or alter) the statement.
        whereCondn = " `Facebook Post ID` = '" + postId + "'"
        _, size = mydb.select("fb_group_posts_reactions", "Reaction ID",
                              whereCondn)
        return size
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return -1
Exemple #8
0
def scrapedReactionId():
    """Return the ``Reaction ID`` values stored in ``fb_group_posts_reactions``.

    Returns:
        A NumPy array with the first column of every selected row, or an
        empty list when the lookup fails.
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_posts_reactions", "Reaction ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
def scraped_post_ids():
    """Return the ``Facebook Post ID`` values stored in ``fb_group_posts``.

    Returns:
        A NumPy array with the first column of every selected row, or an
        empty list when the lookup fails.
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_posts", "Facebook Post ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
Exemple #10
0
def scraped_member_ids():
    """Return the ``User ID`` values stored in ``fb_group_name``.

    The cursor is closed once the rows have been fetched. The original
    ``closeCursor()`` call sat after the ``return`` and never ran.

    Returns:
        A list with the first column of every selected row, or an empty
        list when any step of the lookup fails.
    """
    try:
        mydb = db()
        try:
            rows, _ = mydb.select("fb_group_name", "User ID", "1")
        finally:
            mydb.closeCursor()
        return list(np.array(rows)[:, 0])
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
def scrapedSeenId():
    """Return the ``Seen ID`` values stored in ``fb_group_posts_seen``.

    Returns:
        A NumPy array with the first column of every selected row, or an
        empty list when the lookup fails.
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_posts_seen", "Seen ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
Exemple #12
0
def scraped_comment_ids():
    """Return the ``Comment ID`` values stored in ``fb_group_post_comments``.

    The cursor is closed once the rows have been fetched. The original
    ``closeCursor()`` call sat after the ``return`` and never ran.

    Returns:
        A NumPy array with the first column of every selected row, or an
        empty list when any step of the lookup fails.
    """
    try:
        mydb = db()
        try:
            rows, _ = mydb.select(
                "fb_group_post_comments", "Comment ID", "1")
        finally:
            mydb.closeCursor()
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return []
Exemple #13
0
    def load_posts(self, postElement):
        """Expand a post's "see more" links and store the post if it is new.

        Args:
            postElement: The web element that wraps one post.
        """
        self.MoveToElement(postElement)
        postIds = scrapedPostsId()
        Id = self.PostId(postElement)
        if Id in postIds:
            print("Post already scraped")
            return

        seeMore = self.find_elems_by_class_name("see_more_link", postElement)
        count = 0  # consecutive rounds in which the first link did not change
        idx = 0    # which link to click next
        while seeMore:
            try:
                self.MoveToElement(seeMore[idx])
                seeMore[idx].click()
            except Exception:
                # Covers click failures and idx running past the last link.
                break
            currSeeMore = seeMore[0]
            seeMore = self.find_elems_by_class_name(
                "see_more_link", postElement)
            if not seeMore:
                # Every link is gone; indexing seeMore[0] below would raise.
                break
            if currSeeMore == seeMore[0]:
                print("Loading")
                count += 1
                sleep(2)
            else:
                count = 0

            if count > 2:
                # The first link is stuck; move on to the next one.
                print("Skipping load")
                idx += 1
            else:
                idx = 0

        memberIds = scrapedMembersId()
        mydb = db()

        by = self.PostedBy(postElement)
        if by not in memberIds:
            memberIds.append(by)
            name = self.PostedByName(postElement)
            mydb.insert([by, name], "fb_group_name")
        # Named ``posted_at`` (not ``time``) so the local never shadows the
        # standard-library module name.
        posted_at = self.PostTimestamp(postElement)
        content = self.PostContent(postElement)
        typePost = self.PostType(postElement)
        post_param = [Id, content, posted_at, by, typePost, fbGroupId]
        mydb.insert(post_param, "fb_group_posts")
        print(post_param)
Exemple #14
0
    def load_post_reactions(self, postElement):
        """Store every reaction of a post that is not yet in the DB.

        A reaction id is ``<profile[0]>&<post id>``. Profiles whose first
        field is not a known member id are inserted into ``fb_group_name``
        before their reaction row is written.

        Args:
            postElement: The web element that wraps the post.
        """
        post_id = self.post_id(postElement)
        mydb = db()
        self.MoveToElement(postElement)

        scraped_date_time = self.curr_date_time()
        liked_by = self.scrape_post_reactions(postElement)
        # Sets give constant-time membership checks instead of a scan of the
        # stored ids for every profile.
        known_reactions = set(scrapedReactionId())
        known_members = set(scrapedMembersId())
        for profile in liked_by:
            member_id = profile[0]
            reaction_id = member_id + "&" + post_id
            if reaction_id in known_reactions:
                continue
            if member_id not in known_members:
                known_members.add(member_id)
                mydb.insert(profile, "fb_group_name")
            reaction_param = [
                reaction_id, post_id, scraped_date_time, member_id
            ]
            print(reaction_param)
            mydb.insert(reaction_param, "fb_group_posts_reactions")
            # Remember the id so a profile listed twice is stored only once.
            known_reactions.add(reaction_id)
Exemple #15
0
    def scrape_comments(self, postElement, postId, year=2020):
        """Store the comments of one post that are not yet in the DB.

        Nothing is loaded when the post reports no comments (``-1``) or when
        the number of comments on the page does not exceed the number already
        stored for ``postId``. The DB cursor is closed on every exit path.

        Args:
            postElement: The web element that wraps the post.
            postId: Identifier of the post the comments belong to.
            year: Passed through to the comment timestamp helpers.
        """
        # Dismiss any open overlay before interacting with the post.
        webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
        mydb = db()
        try:
            self.scroll_to_element(postElement)
            total_comments_in_post = self.total_comments_in_post(postElement)
            if total_comments_in_post == -1:
                print("No comments present !!!")
                return

            self.select_most_recent_element(postElement)
            scrapedCommentsTotal = total_comments_scraped(postId)
            print("Total Comments scraped :", scrapedCommentsTotal)
            if not (total_comments_in_post > scrapedCommentsTotal):
                print("No new comments !!")
                return

            self.load_all_comments(postElement)
            commentElements = self.find_elems_by_xpath_with_wait(
                "." + commentElementXpath, postElement)
            commentIds = scraped_comment_ids()
            memberIds = list(scraped_member_ids())
            print("Scraping comments")
            # Cache: parent element index -> generated parent comment id.
            parentCommentDict = {}
            for cIdx, comment in tqdm(enumerate(commentElements),
                                      total=len(commentElements)):
                commentBy = self.comment_by(comment)
                commentTimestamp = self.comment_timestamp(comment, year)
                commentContent = self.comment_content(comment)
                commentId = generate_id(commentTimestamp, commentBy,
                                        commentContent)
                if commentId in commentIds:
                    continue

                commentLabel = comment.get_attribute("aria-label").lower()
                if "reply" not in commentLabel:
                    isReply = "No"
                    parentCommentId = "None"
                else:
                    isReply = "Yes"
                    pCIdx = self.find_parent_comment_element(
                        commentElements, cIdx)
                    if pCIdx in parentCommentDict:
                        parentCommentId = parentCommentDict[pCIdx]
                    else:
                        try:
                            parentElement = commentElements[pCIdx]
                            parentCommentBy = self.comment_by(parentElement)
                            # NOTE(review): the parent timestamp is passed as
                            # str() here but raw for the comment's own id
                            # above; confirm generate_id treats both alike.
                            parentCommentTimestamp = str(
                                self.comment_timestamp(parentElement, year))
                            parentCommentContent = self.comment_content(
                                parentElement)
                            parentCommentId = generate_id(
                                parentCommentTimestamp, parentCommentBy,
                                parentCommentContent)
                        except Exception:
                            parentCommentId = "NotFound" + "#" + postId
                        parentCommentDict[pCIdx] = parentCommentId

                commentDateTime = self.comment_at(comment, year)
                if commentBy not in memberIds:
                    memberIds.append(commentBy)
                    name = self.comment_by_name(comment)
                    mem_data = [str(commentBy), str(name)]
                    mydb.insert(mem_data, "fb_group_name")

                commentParam = [
                    commentId, postId, commentDateTime, isReply,
                    parentCommentId, commentBy, commentContent
                ]
                mydb.insert(commentParam, "fb_group_post_comments")
        finally:
            # Previously the cursor was closed only after a full scrape.
            mydb.closeCursor()
    def loadComments(self,postElement):
        """Expand and store the comments of one post that are not yet in the DB.

        Steps, in order: compare the comment count on the page with the count
        already stored for the post, try to switch the comment ordering to
        "Newest", click every "load more" / "see more" element until no
        "load more" element is found, then insert each comment whose generated
        id is not among the stored comment ids.

        Args:
            postElement: The web element that wraps the post.
        """
        mydb = db()
        self.ScrollToElement(postElement)
        
        total_comments_in_post = self.totalCommentsInPost(postElement)
        # -1 is the value for which "No comments present !!!" is printed below.
        if(total_comments_in_post != -1):
            
            postId = self.post_id(postElement)
            total_comments_scraped = totalCommentsScraped(postId)
            print("Total Comments scraped :",total_comments_scraped)
            print("Total Comments :",total_comments_in_post)
            # Only do the expensive expansion when the page shows more
            # comments than are stored for this post.
            if(total_comments_in_post > total_comments_scraped):
                try:
                    # Ordering selector; clicked repeatedly until its text
                    # reads "Newest".
                    mostRelevantElement = self.find_elem_by_class_name_with_wait("_6w1v",postElement)
                    sleep_time = 2
                    while(mostRelevantElement.text != "Newest"):
                        self.ScrollToElement(mostRelevantElement)
                        mostRelevantElement.click()
                        sleep(sleep_time)
                        # The wait doubles on every retry.
                        sleep_time *= 2

                        # The last "_54ni" element is clicked as the "Newest"
                        # option, then the selector is looked up again.
                        NewestElement = self.find_elems_by_class_name_with_wait("_54ni")[-1]
                        self.ScrollToElement(NewestElement)
                        NewestElement.click()
                        mostRelevantElement = self.find_elem_by_class_name_with_wait("_6w1v",postElement)
                    # NOTE(review): rebinds the name to a plain string; the
                    # value is not read again in this method.
                    mostRelevantElement = "Most Relevant"

                except:
                    # NOTE(review): bare except - any failure above (including
                    # Ctrl-C) is reported as a missing ordering selector.
                    print("No Relevancy Factor !!!")
                loadMoreElement = self.find_elems_by_class_name_with_wait("_4sxc",postElement)
                SeeMoreElement = self.find_elems_by_class_name("_5v47",postElement)
                # loadMoreButtonElement = self.find_elem_by_class_name("_4sxc",loadMoreElement[0])

                # NOTE(review): the loop ends only when no "_4sxc" element is
                # found; failed clicks are skipped silently, so it may not
                # terminate if such elements stay on the page - confirm.
                while(len(loadMoreElement) > 0):
                    
                        # print(loadMoreElement)
                        # loadMoreButtonElement = self.find_elem_by_class_name("_4sxc",loadMoreElement[0])
                        # if(loadMoreButtonElement == None):
                        #     break
                        for load in loadMoreElement:
                            try: 
                                self.ScrollToElement(load)
                                load.click()
                            except:
                                continue

                        for more in SeeMoreElement:
                            try: 
                                self.ScrollToElement(more)
                                more.click()
                            except:
                                continue

                        # Re-query both lists after each round of clicks.
                        loadMoreElement = self.find_elems_by_class_name("_4sxc",postElement)
                        SeeMoreElement = self.find_elems_by_class_name("_5v47",postElement)
        
                commentElements = self.find_elems_by_class_name("_4eek",postElement)
                commentIds = scrapedCommentsId()
                memberIds = list(scrapedMembersId())
                for comment in commentElements:
                    #print(comment.text)
                    commentBy = self.comment_by(comment)
                    # Comment id format: <author>&<timestamp>&<post id>.
                    commentId = commentBy+"&"+self.comment_timestamp(comment)+"&"+postId
                    
                    #print("CommentId :",commentId)
                    if commentId not in commentIds:
                        commentLabel = comment.get_attribute("aria-label")
                        # An aria-label of exactly "Comment" is treated as a
                        # top-level comment; anything else as a reply.
                        if(commentLabel == "Comment"):
                            isReply = "No"
                            ParentCommentId = "None"
                            # Assigned but not read in this method (only the
                            # commented-out code below refers to it).
                            LastParentComment = comment
                        else:
                            isReply = "Yes"
                            
                            # try:
                            #     ParentCommentId = self.comment_by(LastParentComment)
                            # except:
                            ParentCommentElement = self.find_parent_comment_element(commentElements,comment)
                            try:
                                # Parent id is built in the same format as
                                # the comment id above.
                                ParentCommentId = self.comment_by(ParentCommentElement)
                                ParentCommentId +="&"+self.comment_timestamp(ParentCommentElement)+"&"+postId 
                            except:
                                # Placeholder used when the parent id cannot
                                # be built.
                                ParentCommentId = "NotFound"+"#"+postId



                        
                        commentDateTime = self.comment_at(comment)
                        # Author is looked up a second time here.
                        commentBy = self.comment_by(comment)
                        # Authors not yet known are stored as members first.
                        if(commentBy not in memberIds):
                            memberIds.append(commentBy)
                            name = self.comment_by_name(comment)
                            mem_data = [str(commentBy),str(name)]
                            mydb.insert(mem_data,"fb_group_name")
                        commentContent = self.comment_content(comment)

                        commentParam = [commentId,postId,commentDateTime,isReply,ParentCommentId,commentBy,commentContent]
                        print(commentParam)
                        mydb.insert(commentParam,"fb_group_post_comments")
            
            else:
                print("No new comments !!")
        else:
            print("No comments present !!!")
Exemple #17
0
        return []


def scrapedMembersId():
    """Return the ``User ID`` values stored in ``fb_group_name``.

    Returns:
        A list with the first column of every selected row, or an empty
        list when the lookup fails (for example when the result cannot be
        indexed as a 2-D array).
    """
    try:
        mydb = db()
        rows, _ = mydb.select("fb_group_name", "User ID", "1")
        return list(np.array(rows)[:, 0])
    except Exception:
        # Best-effort lookup: report "nothing scraped yet" on failure, but
        # let KeyboardInterrupt/SystemExit propagate.
        return []


if __name__ == "__main__":
    # Script entry point: log in, wrap the session in the group scraper, and
    # store every post whose id is not already in the DB.
    fb = fb_login()
    fbGrps = fb_group_posts(fb)
    # fbGrps.LoadGroup(50)
    postIds = scrapedPostsId()
    mydb = db()
    idx = 0
    # NOTE(review): the elements are read from ``fb`` while every accessor is
    # called on ``fbGrps`` - confirm which object holds ``postElements``.
    for postElem in fb.postElements:
        # Progress output: zero-based position of the current element.
        print(idx)
        idx += 1
        Id = fbGrps.PostId(postElem)
        if Id in postIds:
            continue
        by = fbGrps.PostedBy(postElem)
        time = fbGrps.PostTimestamp(postElem)
        content = fbGrps.PostContent(postElem)
        typePost = fbGrps.PostType(postElem)
        row = [Id, content, time, by, typePost]
        mydb.insert(row, "fb_group_posts")