def _prepare_list_of_works(self, lst):
    """Turn a batch of work containers into DB-ready tuples.

    Returns two lists: one page tuple per container, and
    (page_id, processed_har) pairs for containers that carried HAR data.
    Newly discovered in-links are inserted into the 'pages' table.
    """
    result = list()
    result_2 = list()
    scraped_links = list()
    for work_data_container in lst:
        tup = None
        scheme, url = utils.split_url_and_scheme(work_data_container.url)
        identifier = self._get_page_identifier_(url)
        if identifier == -1:
            # Unknown page: insert it first, then look its identifier up again.
            self._insert_links([work_data_container.url])
            identifier = self._get_page_identifier_(url)
        pagecontent = work_data_container.page_content_container
        if pagecontent:
            if pagecontent.in_links:
                scraped_links.extend(pagecontent.in_links)
            if pagecontent.har:
                processed_har = self._process_har(pagecontent.har)
                # self._associate_page_har_url(identifier, processed_har)
                result_2.append((identifier, processed_har))
            if pagecontent.article_c:
                tup = self._prepare_tuple_with_article(work_data_container)
            else:
                tup = self._prepare_tuple_without_article(work_data_container)
        else:
            tup = self._prepare_tuple_failed_work(work_data_container)
        result.append(tup)
    self._insert_links(scraped_links)
    return result, result_2
def _add_works(self, urls, work_status=WorkStatus.ProcessingInQueue):
    """Register URLs that are not yet tracked; return True if any were new."""
    result = False
    with self.urls_dict_lock:
        for url in urls:
            scheme, cleaned_url = utils.split_url_and_scheme(url)
            if cleaned_url not in self._jobs_info:
                self._jobs_info[cleaned_url] = WorkInfo(cleaned_url,
                                                        protocol=scheme)
                self._jobs_info[cleaned_url].work_status = work_status
                result = True
    return result
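# Note (assumption): the WorkInfo objects kept in self._jobs_info are only used
# here through the interface visible in _add_works and the _release_* methods
# below -- constructed as WorkInfo(url, protocol=scheme) and carrying
# work_status, failed_attempts and error_text attributes. The class itself is
# defined elsewhere and is not shown in this snippet.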
def _prepare_tuple_failed_work(self, work_data_container):
    """Tuple for a work item that produced no page content; only the request
    metadata and the error text are populated."""
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    error_text = work_data_container.error_text
    return (scraped_flag, attempts_count, mime_type, response_code, None,
            url_to_refer, None, False, None, None, None, None, None, None,
            None, None, error_text, url, 0)
def _release_accomplished_work(self, urls):
    """Mark the given URLs as processed in _jobs_info."""
    self.num_of_processed_urls = self.num_of_processed_urls + 1
    with self.urls_dict_lock:
        for url in urls:
            scheme, cleaned_url = utils.split_url_and_scheme(url)
            if cleaned_url not in self._jobs_info:
                self._add_work_unsafe(cleaned_url, scheme,
                                      work_status=WorkStatus.Processed)
            else:
                self._jobs_info[cleaned_url].work_status = WorkStatus.Processed
def _release_failed_work(self, url, error_text):
    """Record a failed attempt for url and put it back in the queue."""
    self.num_of_failed_urls = self.num_of_failed_urls + 1
    with self.urls_dict_lock:
        scheme, cleaned_url = utils.split_url_and_scheme(url)
        if cleaned_url not in self._jobs_info:
            self._add_work_unsafe(cleaned_url, scheme,
                                  work_status=WorkStatus.UnderProcessing)
        self._jobs_info[cleaned_url].failed_attempts += 1
        self._jobs_info[cleaned_url].error_text = error_text
        self._jobs_info[cleaned_url].work_status = WorkStatus.ProcessingInQueue
def _prepare_tuple_without_article(self, work_data_container):
    """Tuple for a scraped page whose content has no extracted article."""
    har = None
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    pagecontent = work_data_container.page_content_container
    return (scraped_flag, attempts_count, mime_type, response_code,
            pagecontent.language, url_to_refer, pagecontent.text, False,
            None, None, None, None, None, None, None, har, None, url, 0)
def _insert_links(self, urls):
    """Insert the deduplicated URLs into the 'pages' table."""
    urls = list(set(urls))
    tps = list()
    for url in urls:
        is_webnews = Article.is_valid_url(url)
        scheme, cleaned_url = utils.split_url_and_scheme(url)
        tps.append((cleaned_url, scheme, False, 0, is_webnews))
    try:
        self.insert_data(
            'pages',
            ['url', 'protocol', 'scraped', 'attempts_count', 'is_webnews'],
            tps)
    except Exception as ex:
        self.last_exception = 'insert_links: ' + str(ex)
def _prepare_tuple_with_article(self, work_data_container):
    """Tuple for a scraped page with an extracted article."""
    har = None
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    pagecontent = work_data_container.page_content_container
    art_container = pagecontent.article_c
    videos = ','.join(art_container.videos)
    authors = ','.join(art_container.authors)
    sections = ','.join(art_container.sections)
    publish_date = art_container.publish_date
    if publish_date and isinstance(publish_date, datetime.datetime):
        publish_date = utils.convert_datetime_to_format_str(publish_date)
    return (scraped_flag, attempts_count, mime_type, response_code,
            pagecontent.language, url_to_refer, pagecontent.text, True,
            art_container.title, art_container.text, publish_date,
            art_container.top_img, videos, authors, sections, har,
            None, url, 0)
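# Note: _prepare_tuple_failed_work, _prepare_tuple_without_article and
# _prepare_tuple_with_article all return the same 19-field tuple, inferred
# from their return statements as:
#   (scraped, attempts_count, mime_type, http_response_code, language,
#    url_to_refer, page_text, has_article, article_title, article_text,
#    publish_date, top_img, videos, authors, sections, har, error_text,
#    url, 0)
# Fields that do not apply are filled with None. The names above are
# descriptive labels only; the actual column names used by the insert are
# not shown in this snippet.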
def _get_canonical_url(self):
    """Return the page's canonical URL, if one can be resolved.

    Prefers <link rel="canonical">, falls back to the og:url / twitter:url
    meta tags, and makes protocol-relative or root-relative values absolute
    using the scheme and domain of the current URL.
    """
    result = None
    try:
        tmp_res = self.driver.find_element_by_xpath(
            '//link[@rel="canonical" and @href]')
        if tmp_res:
            href = tmp_res.get_attribute("href")
            if href:
                # domain = utils.get_principal_domain(self.current_url)
                result = href
    except NoSuchElementException:
        pass
    except TimeoutException:
        pass
    except Exception:
        pass
    if result is None:
        try:
            tmp_res = self.driver.find_element_by_xpath(
                '//meta[@property="og:url"]|//meta[@name="twitter:url"]')
            result = tmp_res.get_attribute('content')
        except NoSuchElementException:
            pass
        except TimeoutException:
            pass
        except Exception:
            pass
    if result:
        result = utils.clean_url(result, False)
        tmp = utils.clean_url(self.current_url, False)
        scheme, u = utils.split_url_and_scheme(tmp)
        if result.startswith('//'):
            # Protocol-relative URL: prepend the current scheme.
            result = '{}:{}'.format(scheme, result)
        elif result.startswith('/'):
            # Root-relative URL: prepend scheme and principal domain.
            domain = '{}://{}'.format(
                scheme, utils.get_principal_domain_www(tmp))
            result = '{}{}'.format(domain, result)
        if not utils.is_valid_url_to_navigate(result):
            result = None
    return result
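# Illustrative sketch of the relative-URL handling above, with hypothetical
# values (example.com is made up, and the exact outputs of
# utils.split_url_and_scheme and utils.get_principal_domain_www are assumed):
# with scheme 'https' and a current principal domain of 'www.example.com',
# a canonical href of '//example.com/story' becomes 'https://example.com/story',
# and a canonical href of '/story' becomes 'https://www.example.com/story'.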