def scrape_posts(self, postElement):
    """Scrape one post element and store it (and its author) if it is new.

    Sends ESCAPE to the browser, scrolls to the post and expands it with
    ``click_see_more``.  The post id is built with
    ``generate_id(timestamp, author, content, group)`` and is always written
    to ``self.postId``; ``self.postYear`` is set to the part of the
    ``posted_at`` value before the first ``-``.

    Rows are inserted into ``fb_group_name`` (author not yet in
    ``scraped_member_ids()``) and ``fb_group_posts`` only when the id is not
    already present in ``scraped_post_ids()``.

    Args:
        postElement: element of the post; read via ``.text`` and passed to
            the lookup helpers on ``self``.
    """
    logging.info("Post Content: {}".format(postElement.text))
    webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
    self.scroll_to_element(postElement)
    self.click_see_more(postElement)

    known_post_ids = scraped_post_ids()
    header = self.find_elem_by_xpath_with_wait(
        "." + postHeaderXpath, postElement)
    timestamp = self.post_timestamp(header)
    posted_at = self.posted_at(header)
    author = self.posted_by(header)
    content = self.post_content(postElement)
    post_id = generate_id(
        timestamp, author, content, fbGroupLink.split("/")[-1])
    self.postId = post_id
    self.postYear = posted_at.split("-")[0]

    if post_id in known_post_ids:
        logging.info("Post already scraped")
        return

    member_ids = scraped_member_ids()
    mydb = db()
    try:
        if author not in member_ids:
            member_ids.append(author)
            author_name = self.posted_by_name(header)
            mydb.insert([author, author_name], "fb_group_name")
        post_type = self.post_type(header)
        group_id = (fbGroupLink.split("/groups/")[1]).strip('/')
        post_param = [post_id, content, posted_at, author, post_type, group_id]
        mydb.insert(post_param, "fb_group_posts")
        logging.info(post_param)
    finally:
        # Release the cursor even when an insert fails, as the sibling
        # scrapers do once they are done with their db handle.
        mydb.closeCursor()
def scrape_post_reactions(self, postElement, postId):
    """Store the reactions of one post, skipping those already recorded.

    Sends ESCAPE to the browser, scrolls to the post, looks up the reaction
    pane and obtains the reacting profiles from
    ``self.load_post_reactions``.  For each profile whose
    ``generate_id(profile[0], postId)`` is not in ``scraped_reaction_id()``
    a row is inserted into ``fb_group_posts_reactions``; a profile whose
    first field is not in ``scraped_member_ids()`` is inserted into
    ``fb_group_name`` first.

    Any ``Exception`` is reported by printing ``"No reaction "`` and the
    post id instead of being raised.

    Args:
        postElement: element of the post whose reactions are scraped.
        postId: id of that post, stored with every reaction row.
    """
    try:
        mydb = db()
        try:
            webdriver.ActionChains(self.driver).send_keys(
                Keys.ESCAPE).perform()
            self.scroll_to_element(postElement)
            scraped_at = self.curr_date_time()
            # The returned element is not used; the lookup is kept so that
            # a failure here still ends in the handler below.
            # NOTE(review): presumably it raises when the pane is absent.
            self.find_elem_by_xpath_with_wait(
                "." + postReactionPaneXpath, postElement)
            liked_by = self.load_post_reactions(postElement)
            known_reaction_ids = scraped_reaction_id()
            member_ids = scraped_member_ids()
            print("Scraping Reactions")
            for profile in tqdm(liked_by):
                reaction_id = generate_id(profile[0], postId)
                if reaction_id in known_reaction_ids:
                    continue
                if profile[0] not in member_ids:
                    member_ids.append(profile[0])
                    mydb.insert(profile, "fb_group_name")
                mydb.insert(
                    [reaction_id, postId, scraped_at, profile[0]],
                    "fb_group_posts_reactions")
        finally:
            # Close the cursor on failure too, not only on success.
            mydb.closeCursor()
    except Exception:
        # Deliberate best-effort: one failing post must not stop the run.
        # Unlike a bare ``except``, Ctrl-C and SystemExit still propagate.
        print("No reaction ", postId)
def scrapedMembersId():
    """Return the ``User ID`` column of the ``fb_group_name`` table.

    Returns:
        A list with the first field of every selected row, or an empty
        list when the query or the conversion raises.
    """
    try:
        rows, _ = db().select("fb_group_name", "User ID", "1")
        return list(np.array(rows)[:, 0])
    except Exception:
        # Best-effort lookup: callers treat [] as "nothing scraped yet".
        # ``Exception`` (not a bare except) lets Ctrl-C/SystemExit through.
        return []
def totalCommentsScraped(postId):
    """Return the size reported for the stored comments of ``postId``.

    Selects ``Comment ID`` from ``fb_group_post_comments`` where
    ``Comment Post ID`` equals ``postId``.

    Args:
        postId: post id; concatenated into the condition, so it must be a
            string.

    Returns:
        The size value returned by ``select`` (presumably the row count),
        or ``-1`` when anything raises.
    """
    try:
        mydb = db()
        # NOTE(review): the condition is built by string concatenation; a
        # quote inside postId would break (or alter) the query.
        whereCondn = " `Comment Post ID` = " + "'" + postId + "'"
        _, size = mydb.select(
            "fb_group_post_comments", "Comment ID", whereCondn)
        return size
    except Exception:
        # -1 is the failure sentinel; Ctrl-C/SystemExit are not swallowed.
        return -1
def updateRow(seen_id, Status="SEEN"):
    """Write ``Status`` into ``Seen Status`` for one ``fb_group_posts_seen`` row.

    Args:
        seen_id: value matched against the ``Seen ID`` column; it is
            concatenated into the condition, so it must be a string.
        Status: value stored in ``Seen Status`` (defaults to ``"SEEN"``).
    """
    connection = db()
    # Resulting condition has the form: `Seen ID` = '<seen_id>'
    quoted_id = "'" + seen_id + "'"
    condition = "`Seen ID` = " + quoted_id
    connection.update("fb_group_posts_seen", "Seen Status", Status, condition)
def scrapedCommentsId():
    """Return the ``Comment ID`` column of ``fb_group_post_comments``.

    Returns:
        A NumPy array with the first field of every selected row, or an
        empty list when the query or the conversion raises.
    """
    try:
        rows, _ = db().select("fb_group_post_comments", "Comment ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def totalReactionsScraped(postId):
    """Return the size reported for the stored reactions of ``postId``.

    Selects ``Reaction ID`` from ``fb_group_posts_reactions`` where
    ``Facebook Post ID`` equals ``postId``.

    Args:
        postId: post id; concatenated into the condition, so it must be a
            string.

    Returns:
        The size value returned by ``select`` (presumably the row count),
        or ``-1`` when anything raises.
    """
    try:
        mydb = db()
        # NOTE(review): string-built condition; a quote inside postId would
        # break (or alter) the query.
        whereCondn = " `Facebook Post ID` = " + "'" + postId + "'"
        _, size = mydb.select(
            "fb_group_posts_reactions", "Reaction ID", whereCondn)
        return size
    except Exception:
        # -1 is the failure sentinel; Ctrl-C/SystemExit are not swallowed.
        return -1
def scrapedReactionId():
    """Return the ``Reaction ID`` column of ``fb_group_posts_reactions``.

    Returns:
        A NumPy array with the first field of every selected row, or an
        empty list when the query or the conversion raises.
    """
    try:
        rows, _ = db().select("fb_group_posts_reactions", "Reaction ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def scraped_post_ids():
    """Return the ``Facebook Post ID`` column of ``fb_group_posts``.

    Returns:
        A NumPy array with the first field of every selected row, or an
        empty list when the query or the conversion raises.
    """
    try:
        rows, _ = db().select("fb_group_posts", "Facebook Post ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def scraped_member_ids():
    """Return the ``User ID`` column of the ``fb_group_name`` table.

    The cursor is closed once the select has finished.  The original
    ``closeCursor()`` call was placed after the ``return`` and therefore
    never executed.

    Returns:
        A list with the first field of every selected row, or an empty
        list when the query, the cursor close or the conversion raises.
    """
    try:
        mydb = db()
        try:
            rows, _ = mydb.select("fb_group_name", "User ID", "1")
        finally:
            mydb.closeCursor()
        return list(np.array(rows)[:, 0])
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def scrapedSeenId():
    """Return the ``Seen ID`` column of the ``fb_group_posts_seen`` table.

    Returns:
        A NumPy array with the first field of every selected row, or an
        empty list when the query or the conversion raises.
    """
    try:
        rows, _ = db().select("fb_group_posts_seen", "Seen ID", "1")
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def scraped_comment_ids():
    """Return the ``Comment ID`` column of ``fb_group_post_comments``.

    The cursor is closed once the select has finished.  The original
    ``closeCursor()`` call was placed after the ``return`` and therefore
    never executed.

    Returns:
        A NumPy array with the first field of every selected row, or an
        empty list when the query, the cursor close or the conversion
        raises.
    """
    try:
        mydb = db()
        try:
            rows, _ = mydb.select(
                "fb_group_post_comments", "Comment ID", "1")
        finally:
            mydb.closeCursor()
        return np.array(rows)[:, 0]
    except Exception:
        # Best-effort lookup; Ctrl-C/SystemExit are no longer swallowed.
        return []
def load_posts(self, postElement):
    """Expand one post and store it (and its author) if it is new.

    Moves to the post and skips it with "Post already scraped" when
    ``self.PostId(postElement)`` is already in ``scrapedPostsId()``.
    Otherwise every ``see_more_link`` element inside the post is clicked
    until none remain or a click fails, and the post is inserted into
    ``fb_group_posts``.  The author is inserted into ``fb_group_name``
    first when not yet in ``scrapedMembersId()``.

    Args:
        postElement: element of the post to load.
    """
    self.MoveToElement(postElement)
    postIds = scrapedPostsId()
    Id = self.PostId(postElement)
    if Id in postIds:
        print("Post already scraped")
        return

    seeMore = self.find_elems_by_class_name("see_more_link", postElement)
    stalled = 0  # consecutive rounds in which the first link did not change
    idx = 0      # which link to click; advanced once the first one is stuck
    while seeMore:
        try:
            self.MoveToElement(seeMore[idx])
            seeMore[idx].click()
        except Exception:
            # Includes idx running past the end of the list.
            break
        previous_first = seeMore[0]
        seeMore = self.find_elems_by_class_name("see_more_link", postElement)
        if not seeMore:
            # The last link disappeared after the click; indexing [0] on
            # the empty result used to raise IndexError here.
            break
        if previous_first == seeMore[0]:
            print("Loading")
            stalled += 1
            sleep(2)
        else:
            stalled = 0
        if stalled > 2:
            print("Skipping load")
            idx += 1
        else:
            idx = 0

    memberIds = scrapedMembersId()
    mydb = db()
    by = self.PostedBy(postElement)
    if by not in memberIds:
        memberIds.append(by)
        name = self.PostedByName(postElement)
        mydb.insert([by, name], "fb_group_name")
    time = self.PostTimestamp(postElement)
    content = self.PostContent(postElement)
    typePost = self.PostType(postElement)
    post_param = [Id, content, time, by, typePost, fbGroupId]
    mydb.insert(post_param, "fb_group_posts")
    print(post_param)
def load_post_reactions(self, postElement):
    """Insert the not-yet-stored reactions of one post.

    The reaction id is ``<profile[0]>&<post id>``.  Profiles whose first
    field is missing from ``scrapedMembersId()`` are inserted into
    ``fb_group_name`` before their reaction row goes into
    ``fb_group_posts_reactions``.  Every inserted reaction row is printed.

    Args:
        postElement: element of the post whose reactions are loaded.
    """
    post_id = self.post_id(postElement)
    connection = db()
    self.MoveToElement(postElement)
    scraped_at = self.curr_date_time()
    profiles = self.scrape_post_reactions(postElement)
    known_reaction_ids = scrapedReactionId()
    known_member_ids = scrapedMembersId()

    for profile in profiles:
        reaction_id = profile[0] + "&" + post_id
        if reaction_id in known_reaction_ids:
            # Already stored on an earlier run.
            continue
        if profile[0] not in known_member_ids:
            known_member_ids.append(profile[0])
            connection.insert(profile, "fb_group_name")
        row = [reaction_id, post_id, scraped_at, profile[0]]
        print(row)
        connection.insert(row, "fb_group_posts_reactions")
def scrape_comments(self, postElement, postId, year=2020):
    """Store the comments of one post that are not in the database yet.

    Sends ESCAPE to the browser, scrolls to the post and compares
    ``self.total_comments_in_post`` with ``total_comments_scraped(postId)``.
    Only when the post shows more comments than are stored are all comments
    loaded and each comment element turned into a row of
    ``fb_group_post_comments``.  The comment id is
    ``generate_id(timestamp, author, content)``; ids already returned by
    ``scraped_comment_ids()`` are skipped.  A comment whose lower-cased
    ``aria-label`` contains "reply" is stored as a reply together with its
    parent's id, or ``"NotFound#<postId>"`` when the parent cannot be
    resolved.  Unknown authors are inserted into ``fb_group_name`` first.

    The cursor is closed on every exit path; the original closed it only
    after a successful scrape.

    Args:
        postElement: element of the post whose comments are scraped.
        postId: id of that post, stored with every comment row.
        year: forwarded to ``comment_timestamp`` and ``comment_at``.
    """
    webdriver.ActionChains(self.driver).send_keys(Keys.ESCAPE).perform()
    mydb = db()
    try:
        self.scroll_to_element(postElement)
        total_comments_in_post = self.total_comments_in_post(postElement)
        if total_comments_in_post == -1:
            print("No comments present !!!")
            return

        self.select_most_recent_element(postElement)
        scrapedCommentsTotal = total_comments_scraped(postId)
        print("Total Comments scraped :", scrapedCommentsTotal)
        if not total_comments_in_post > scrapedCommentsTotal:
            print("No new comments !!")
            return

        self.load_all_comments(postElement)
        commentElements = self.find_elems_by_xpath_with_wait(
            "." + commentElementXpath, postElement)
        commentIds = scraped_comment_ids()
        memberIds = list(scraped_member_ids())
        print("Scraping comments")

        # Parent element index -> parent comment id, so several replies to
        # the same parent resolve it only once.
        parentCommentDict = {}
        for cIdx in tqdm(range(len(commentElements))):
            comment = commentElements[cIdx]
            commentBy = self.comment_by(comment)
            commentTimestamp = self.comment_timestamp(comment, year)
            commentContent = self.comment_content(comment)
            commentId = generate_id(
                commentTimestamp, commentBy, commentContent)
            if commentId in commentIds:
                continue

            commentLabel = comment.get_attribute("aria-label").lower()
            if "reply" not in commentLabel:
                isReply = "No"
                parentCommentId = "None"
            else:
                isReply = "Yes"
                pCIdx = self.find_parent_comment_element(
                    commentElements, cIdx)
                if pCIdx not in parentCommentDict:
                    try:
                        parent = commentElements[pCIdx]
                        parentCommentBy = self.comment_by(parent)
                        parentCommentTimestamp = str(
                            self.comment_timestamp(parent, year))
                        parentCommentContent = self.comment_content(parent)
                        parentCommentId = generate_id(
                            parentCommentTimestamp, parentCommentBy,
                            parentCommentContent)
                    except Exception:
                        # Parent could not be resolved; keep a placeholder
                        # so the reply is still stored.
                        parentCommentId = "NotFound" + "#" + postId
                    parentCommentDict[pCIdx] = parentCommentId
                else:
                    parentCommentId = parentCommentDict[pCIdx]

            commentDateTime = self.comment_at(comment, year)
            if commentBy not in memberIds:
                memberIds.append(commentBy)
                name = self.comment_by_name(comment)
                mydb.insert([str(commentBy), str(name)], "fb_group_name")
            commentParam = [
                commentId, postId, commentDateTime, isReply,
                parentCommentId, commentBy, commentContent
            ]
            mydb.insert(commentParam, "fb_group_post_comments")
    finally:
        # Runs for the early returns and for exceptions as well.
        mydb.closeCursor()
def loadComments(self,postElement):
    """Load every comment of one post and store the ones not yet saved.

    Work is done only when ``self.totalCommentsInPost`` is not ``-1`` and
    is larger than ``totalCommentsScraped(postId)``.  In that case the
    method clicks the ``_6w1v`` control until its text reads "Newest",
    clicks all ``_4sxc`` and ``_5v47`` elements until no ``_4sxc`` element
    is left, and inserts every ``_4eek`` element whose id is not in
    ``scrapedCommentsId()`` into ``fb_group_post_comments``.

    The comment id has the form ``<author>&<timestamp>&<postId>``.

    Args:
        postElement: element of the post whose comments are loaded.
    """
    mydb = db()
    self.ScrollToElement(postElement)
    total_comments_in_post = self.totalCommentsInPost(postElement)
    if(total_comments_in_post != -1):
        postId = self.post_id(postElement)
        total_comments_scraped = totalCommentsScraped(postId)
        print("Total Comments scraped :",total_comments_scraped)
        print("Total Comments :",total_comments_in_post)
        if(total_comments_in_post > total_comments_scraped):
            try:
                mostRelevantElement = self.find_elem_by_class_name_with_wait("_6w1v",postElement)
                # Wait time doubles after every attempt.
                sleep_time = 2
                while(mostRelevantElement.text != "Newest"):
                    self.ScrollToElement(mostRelevantElement)
                    mostRelevantElement.click()
                    sleep(sleep_time)
                    sleep_time *= 2
                    # Click the last "_54ni" element, then re-read the control.
                    NewestElement = self.find_elems_by_class_name_with_wait("_54ni")[-1]
                    self.ScrollToElement(NewestElement)
                    NewestElement.click()
                    mostRelevantElement = self.find_elem_by_class_name_with_wait("_6w1v",postElement)
                # NOTE(review): the source layout was collapsed, so it is unclear
                # whether this assignment sat inside the loop above or after it.
                # It is placed after the loop here, where the value is never
                # read again -- confirm against the original file.
                mostRelevantElement = "Most Relevant"
            except:
                print("No Relevancy Factor !!!")
            loadMoreElement = self.find_elems_by_class_name_with_wait("_4sxc",postElement)
            SeeMoreElement = self.find_elems_by_class_name("_5v47",postElement)
            # Keep clicking until a re-query finds no "_4sxc" element.
            # NOTE(review): clicks that fail are skipped silently, so this
            # loop does not end if such an element never disappears.
            while(len(loadMoreElement) > 0):
                for load in loadMoreElement:
                    try:
                        self.ScrollToElement(load)
                        load.click()
                    except:
                        continue
                for more in SeeMoreElement:
                    try:
                        self.ScrollToElement(more)
                        more.click()
                    except:
                        continue
                loadMoreElement = self.find_elems_by_class_name("_4sxc",postElement)
                SeeMoreElement = self.find_elems_by_class_name("_5v47",postElement)
            commentElements = self.find_elems_by_class_name("_4eek",postElement)
            commentIds = scrapedCommentsId()
            memberIds = list(scrapedMembersId())
            for comment in commentElements:
                commentBy = self.comment_by(comment)
                commentId = commentBy+"&"+self.comment_timestamp(comment)+"&"+postId
                if commentId not in commentIds:
                    # An aria-label of exactly "Comment" marks a top-level
                    # comment; anything else is stored as a reply.
                    commentLabel = comment.get_attribute("aria-label")
                    if(commentLabel == "Comment"):
                        isReply = "No"
                        ParentCommentId = "None"
                        LastParentComment = comment
                    else:
                        isReply = "Yes"
                        ParentCommentElement = self.find_parent_comment_element(commentElements,comment)
                        try:
                            ParentCommentId = self.comment_by(ParentCommentElement)
                            ParentCommentId +="&"+self.comment_timestamp(ParentCommentElement)+"&"+postId
                        except:
                            # Parent could not be resolved; store a placeholder.
                            ParentCommentId = "NotFound"+"#"+postId
                    commentDateTime = self.comment_at(comment)
                    commentBy = self.comment_by(comment)
                    if(commentBy not in memberIds):
                        memberIds.append(commentBy)
                        name = self.comment_by_name(comment)
                        mem_data = [str(commentBy),str(name)]
                        mydb.insert(mem_data,"fb_group_name")
                    commentContent = self.comment_content(comment)
                    commentParam = [commentId,postId,commentDateTime,isReply,ParentCommentId,commentBy,commentContent]
                    print(commentParam)
                    mydb.insert(commentParam,"fb_group_post_comments")
        else:
            print("No new comments !!")
    else:
        print("No comments present !!!")
return [] def scrapedMembersId(): try: mydb = db() whereCondn = "1" postIds, size = mydb.select("fb_group_name", "User ID", whereCondn) return list(np.array(postIds)[:, 0]) except: return [] if __name__ == "__main__": fb = fb_login() fbGrps = fb_group_posts(fb) # fbGrps.LoadGroup(50) postIds = scrapedPostsId() mydb = db() idx = 0 for postElem in fb.postElements: print(idx) idx += 1 Id = fbGrps.PostId(postElem) if (Id not in postIds): by = fbGrps.PostedBy(postElem) time = fbGrps.PostTimestamp(postElem) content = fbGrps.PostContent(postElem) typePost = fbGrps.PostType(postElem) mydb.insert([Id, content, time, by, typePost], "fb_group_posts")