def get_posts_with_links_and_time(self, html):
    """Split a page's HTML into post fragments and extract data from each.

    The HTML is cut on the literal marker ``userContentWrapper _5pcr``;
    whatever precedes the first marker is discarded, and every remaining
    fragment is treated as one post.

    Args:
        html: Page source to scan.

    Returns:
        A dict holding three lists with one entry per fragment, in page
        order:
            "link"           -- result of ``utils.get_data_from_pattern``
                                with ``self.patterns["link_pattern"]``.
            "results"        -- all ``re.findall`` matches of
                                ``self.patterns["result_pattern"]`` joined
                                into a single string.
            "all_epoch_time" -- result of ``utils.get_data_from_pattern``
                                with ``self.patterns["epoch_time_pattern"]``.
        All three lists are empty when the marker does not occur in *html*.
    """
    links = []
    texts = []
    epoch_times = []

    # Element 0 of the split is the page content before the first post.
    for fragment in html.split("userContentWrapper _5pcr")[1:]:
        links.append(
            utils.get_data_from_pattern(self.patterns["link_pattern"], fragment)
        )
        texts.append(
            ''.join(re.findall(self.patterns["result_pattern"], fragment))
        )
        epoch_times.append(
            utils.get_data_from_pattern(self.patterns["epoch_time_pattern"], fragment)
        )

    return {
        "link": links,
        "results": texts,
        "all_epoch_time": epoch_times,
    }
def collect_data(self, list_of_groups):
    """Search each group for its keyword and save the posts that are found.

    For every group the driver opens the group page, types the group's
    keyword into the page's search box, submits the search, parses the
    resulting page with ``get_posts_with_links_and_time`` and saves one
    ``Posts`` record per post that has a link.

    Args:
        list_of_groups: Iterable of group objects. Each one is read for
            ``group_link``, ``keyword``, ``user_own_id`` and ``id``.

    Returns:
        None. Results are persisted through ``Posts(...).save()``.
    """
    for group in list_of_groups:
        self.driver.get(self.patterns["group_pattern_url"] + group.group_link)
        page_html = self.driver.page_source.encode('utf-8')

        group_name = utils.get_data_from_pattern(
            self.patterns["group_name_pattern"], page_html
        )
        # The helper's result is checked against None for links below, so
        # guard here as well: concatenating None would raise TypeError.
        if group_name is None:
            group_name = "<unknown>"
        print("we now in group: " + group_name)

        # NOTE: joining the group (self.join_group) is currently disabled.

        query_appearences = self.driver.find_elements_by_name("query")
        # The second "query" element is the one used as the search box
        # (presumably the first is a different search field -- TODO confirm).
        # Skip pages that do not have it instead of raising IndexError and
        # aborting the remaining groups.
        if len(query_appearences) < 2:
            continue

        search_box = query_appearences[1]
        search_box.clear()
        search_box.send_keys(group.keyword)
        self.driver.find_element_by_xpath(
            '//button[@type="submit" and @title="Search this group"]'
        ).click()

        html = self.driver.page_source.encode('utf-8')
        post_data = self.get_posts_with_links_and_time(html)

        # The three lists are parallel: one entry per post fragment.
        found_posts = zip(
            post_data["link"],
            post_data["results"],
            post_data["all_epoch_time"],
        )
        for link, text, epoch_time in found_posts:
            if link is None:
                # Only posts with a link are stored.
                continue
            new_post = Posts(
                user_who_post_id=group.user_own_id,
                group_that_was_post_id=group.id,
                keyword=group.keyword,
                post_text=text,
                link=link,
                pub_date=epoch_time,
            )
            new_post.save()