Example 1
 def _prepare_list_of_works(self, lst):
     """Build one result tuple per work container.

     Collects (page identifier, processed HAR) pairs separately, queues any
     links discovered during scraping via _insert_links, and returns the
     pair (row tuples, HAR pairs).
     """
     result = list()
     result_2 = list()
     scraped_links = list()
     for work_data_container in lst:
         tup = None
         scheme, url = utils.split_url_and_scheme(work_data_container.url)
         identifier = self._get_page_identifier_(url)
         if identifier == -1:
             self._insert_links([work_data_container.url])
             identifier = self._get_page_identifier_(url)
         pagecontent = work_data_container.page_content_container
         if pagecontent:
             if pagecontent.in_links:
                 scraped_links.extend(pagecontent.in_links)
             if pagecontent.har:
                 processed_har = self._process_har(pagecontent.har)
                 # self._associate_page_har_url(identifier, processed_har)
                 result_2.append((identifier, processed_har))
             if pagecontent.article_c:
                 tup = self._prepare_tuple_with_article(work_data_container)
             else:
                 tup = self._prepare_tuple_without_article(
                     work_data_container)
         else:
             tup = self._prepare_tuple_failed_work(work_data_container)
         result.append(tup)
     self._insert_links(scraped_links)
     return result, result_2
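All of these snippets rely on a utils.split_url_and_scheme helper that separates the protocol from the rest of the URL. Its implementation is not shown here; a minimal sketch of what it presumably does (the name is reused from the snippets, the exact behavior is an assumption) is:

# Hypothetical sketch of the utils.split_url_and_scheme helper assumed by
# these examples; the real implementation is not part of the snippets.
from urllib.parse import urlsplit

def split_url_and_scheme(url):
    # "https://example.com/a" -> ("https", "example.com/a")
    scheme = urlsplit(url).scheme
    rest = url[len(scheme) + 3:] if scheme else url  # strip "scheme://"
    return scheme, rest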
Example 2
 def _add_works(self, urls, work_status=WorkStatus.ProcessingInQueue):
     """Register unknown URLs under the lock; return True if any were added."""
     result = False
     with self.urls_dict_lock:
         for url in urls:
             scheme, cleaned_url = utils.split_url_and_scheme(url)
             if cleaned_url not in self._jobs_info:
                 self._jobs_info[cleaned_url] = WorkInfo(cleaned_url,
                                                         protocol=scheme)
                 self._jobs_info[cleaned_url].work_status = work_status
                 result = True
     return result
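The job-tracking methods reference WorkStatus and WorkInfo, whose definitions are not included in these examples. Minimal hypothetical stand-ins, consistent with how they are used above (only the attribute and member names come from the snippets, everything else is assumed), could look like:

# Hypothetical stand-ins for WorkStatus / WorkInfo; only the names used in
# the snippets are grounded, the rest is assumed.
import enum

class WorkStatus(enum.Enum):
    ProcessingInQueue = 1
    UnderProcessing = 2
    Processed = 3

class WorkInfo:
    def __init__(self, url, protocol=None):
        self.url = url
        self.protocol = protocol
        self.work_status = None
        self.failed_attempts = 0
        self.error_text = None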
Example 3
 def _prepare_tuple_failed_work(self, work_data_container):
     """Build the row tuple for a work item whose scraping failed."""
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     error_text = work_data_container.error_text
     return (scraped_flag, attempts_count, mime_type, response_code, None,
             url_to_refer, None, False, None, None, None, None, None, None,
             None, None, error_text, url, 0)
Example 4
 def _release_accomplished_work(self, urls):
     """Mark the given URLs as Processed, adding entries for unknown ones."""
     self.num_of_processed_urls += 1
     with self.urls_dict_lock:
         for url in urls:
             scheme, cleaned_url = utils.split_url_and_scheme(url)
             if cleaned_url not in self._jobs_info:
                 self._add_work_unsafe(cleaned_url,
                                       scheme,
                                       work_status=WorkStatus.Processed)
             else:
                 self._jobs_info[
                     cleaned_url].work_status = WorkStatus.Processed
Example 5
 def _release_failed_work(self, url, error_text):
     """Record a failed attempt for url and put it back in the queue."""
     self.num_of_failed_urls += 1
     with self.urls_dict_lock:
         scheme, cleaned_url = utils.split_url_and_scheme(url)
         if cleaned_url not in self._jobs_info:
             self._add_work_unsafe(cleaned_url,
                                   scheme,
                                   work_status=WorkStatus.UnderProcessing)
         self._jobs_info[cleaned_url].failed_attempts += 1
         self._jobs_info[cleaned_url].error_text = error_text
         self._jobs_info[
             cleaned_url].work_status = WorkStatus.ProcessingInQueue
Example 6
 def _prepare_tuple_without_article(self, work_data_container):
     """Build the row tuple for a scraped page without an extracted article."""
     har = None
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     pagecontent = work_data_container.page_content_container
     return (scraped_flag, attempts_count, mime_type, response_code,
             pagecontent.language, url_to_refer, pagecontent.text, False,
             None, None, None, None, None, None, None, har, None, url, 0)
Example 7
    def _insert_links(self, urls):
        """Insert the de-duplicated URLs into the 'pages' table."""
        urls = list(set(urls))
        tps = list()
        for url in urls:
            is_webnews = Article.is_valid_url(url)
            scheme, cleaned_url = utils.split_url_and_scheme(url)
            tps.append((cleaned_url, scheme, False, 0, is_webnews))
        try:
            self.insert_data(
                'pages',
                ['url', 'protocol', 'scraped', 'attempts_count', 'is_webnews'],
                tps)
        except Exception as ex:
            self.last_exception = 'insert_links: ' + str(ex)
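insert_data receives a table name, a column list, and the list of tuples. Its implementation is not shown in these examples; assuming it wraps a parameterized bulk insert (sqlite3 is used here purely for illustration), a rough sketch might be:

# Rough sketch of a bulk-insert helper compatible with the call above;
# the project's real insert_data (and its conflict handling) is not shown.
import sqlite3

def insert_data(conn, table, columns, rows):
    # Build "INSERT OR IGNORE INTO <table> (col, ...) VALUES (?, ...)" and
    # execute it once per tuple via executemany.
    placeholders = ', '.join('?' for _ in columns)
    sql = 'INSERT OR IGNORE INTO {} ({}) VALUES ({})'.format(
        table, ', '.join(columns), placeholders)
    with conn:  # commit on success, roll back on error
        conn.executemany(sql, rows)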
Example 8
 def _prepare_tuple_with_article(self, work_data_container):
     """Build the row tuple for a scraped page with an extracted article."""
     har = None
     url = utils.clean_url(work_data_container.url, False)
     scheme, url = utils.split_url_and_scheme(url)
     scraped_flag = work_data_container.scraped
     attempts_count = work_data_container.attempts_count
     mime_type = work_data_container.mime_type
     response_code = work_data_container.http_response_code
     url_to_refer = work_data_container.url_to_refer
     pagecontent = work_data_container.page_content_container
     art_container = pagecontent.article_c
     videos = ','.join(art_container.videos)
     authors = ','.join(art_container.authors)
     sections = ','.join(art_container.sections)
     publish_date = art_container.publish_date
     if publish_date and isinstance(publish_date, datetime.datetime):
         publish_date = utils.convert_datetime_to_format_str(publish_date)
     return (scraped_flag, attempts_count, mime_type, response_code,
             pagecontent.language, url_to_refer, pagecontent.text, True,
             art_container.title, art_container.text, publish_date,
             art_container.top_img, videos, authors, sections, har, None,
             url, 0)
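Comparing the three _prepare_tuple_* helpers shows they all produce a 19-element tuple with the same positional layout. The field names below are only inferred from the variables used above; the last, always-0 field cannot be identified from these snippets alone:

# Field names are guesses inferred from the three helpers above; the real
# column order / schema is not shown in these examples.
from collections import namedtuple

PageRow = namedtuple('PageRow', [
    'scraped', 'attempts_count', 'mime_type', 'response_code', 'language',
    'url_to_refer', 'text', 'has_article', 'article_title', 'article_text',
    'publish_date', 'top_img', 'videos', 'authors', 'sections', 'har',
    'error_text', 'url', 'unknown_flag'])  # last value is always 0 above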
Example 9
 def _get_canonical_url(self):
     """Return the page's canonical URL, or None if it cannot be resolved.

     Reads <link rel="canonical">, falling back to the og:url / twitter:url
     meta tags, and normalizes protocol-relative and root-relative values.
     """
     result = None
     try:
         tmp_res = self.driver.find_element_by_xpath(
             '//link[@rel="canonical" and @href]')
         if tmp_res:
             href = tmp_res.get_attribute("href")
             if href:
                 # domain = utils.get_principal_domain(self.current_url)
                 result = href
     except NoSuchElementException:
         pass
     except TimeoutException:
         pass
     except Exception:
         pass
     if result is None:
         try:
             tmp_res = self.driver.find_element_by_xpath(
                 '//meta[@property="og:url"]|//meta[@name="twitter:url"]')
             result = tmp_res.get_attribute('content')
         except NoSuchElementException:
             pass
         except TimeoutException:
             pass
         except Exception:
             pass
     if result:
         result = utils.clean_url(result, False)
         tmp = utils.clean_url(self.current_url, False)
         scheme, u = utils.split_url_and_scheme(tmp)
         if result.startswith(r'//'):
             result = '{}:{}'.format(scheme, result)
         elif result.startswith(r'/'):
             domain = '{}://{}'.format(scheme,
                                       utils.get_principal_domain_www(tmp))
             result = '{}{}'.format(domain, result)
         if not utils.is_valid_url_to_navigate(result):
             result = None
     return result
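find_element_by_xpath belongs to the old Selenium API; it was deprecated in Selenium 4 and removed in later 4.x releases. On a current driver the same canonical-link lookup would use find_element with By.XPATH, for example:

# Selenium 4 equivalent of the first lookup above (driver is assumed to be
# an already-initialized WebDriver instance).
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

try:
    link = driver.find_element(By.XPATH, '//link[@rel="canonical" and @href]')
    href = link.get_attribute('href')
except NoSuchElementException:
    href = None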