Example #1
def __init__(self, db_record=None):
    Crawler.__init__(self,
                     db_record,
                     self.origin,
                     self.base_url,
                     self.domain,
                     nested_scrape=False)
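
Every __init__ in this collection (Examples 1, 2, 10, 11, 12, and 14) delegates to the same Crawler base class; a minimal sketch of the signature those calls appear to assume (the parameter defaults are guesses, not from the source):

class Crawler:
    def __init__(self, db_record, origin, base_url, domain,
                 nested_scrape=True, first_page_url=None):
        # origin, base_url, and domain are attributes each subclass
        # defines for its own site before delegating here
        self.db_record = db_record
        self.origin = origin
        self.base_url = base_url
        self.domain = domain
        self.nested_scrape = nested_scrape
        self.first_page_url = first_page_url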
Example #2
def __init__(self, db_record=None):
    Crawler.__init__(self,
                     db_record,
                     self.origin,
                     self.base_url,
                     self.domain,
                     first_page_url=self.first_page_url)
Example #3
def retry_on_error(func):
    # outer decorator added so the snippet is self-contained; the name is illustrative
    def try_except_function(self, *args, **kwargs):
        for i in range(10):
            try:
                return func(self, *args, **kwargs)
            except Exception as e:  # ClientError is an Exception subclass, so one handler suffices
                Crawler.log_error(e)
                if 'Not Found' in str(e):
                    raise ValueError('Profile not found')
                time.sleep(5)
        # after ten failed attempts the wrapper falls through and returns None
    return try_except_function
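
A minimal usage sketch for the wrapper above, assuming a Crawler subclass whose network call can fail transiently (the class name, method body, and URL shape here are hypothetical):

class ProfileCrawler(Crawler):
    @retry_on_error
    def get_raw_info(self, screen_name):
        # any exception raised here is logged and retried by the decorator
        response = requests.get(f'{self.base_url}/{screen_name}')
        response.raise_for_status()
        return response.json()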
Example #4
def main():
    # set both the crawler's and the root logger's level to INFO
    configure_logging(settings={
        'LOG_LEVEL': 'INFO'
    })
    logging.basicConfig(level=logging.INFO)

    # scrape new items, then mail them to every stored subscriber
    Crawler().run_newsletter()
    subscriber_list = Subscriber.get_contacts()
    MailSender().send(subscriber_list)
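
This main() assumes several imports the listing omits; a plausible header, assuming configure_logging comes from Scrapy and the remaining names are project-local modules (the module paths are guesses):

import logging

from scrapy.utils.log import configure_logging

from crawler import Crawler            # hypothetical project modules
from subscribers import Subscriber
from mailer import MailSender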
Example #5
def get_subscribers_count(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    subscribers = {
        'updated_at': datetime.now().astimezone().strftime('%Y-%m-%dT%H:%M:%S%z'),
        # 'edge_followed_by' holds the follower count in Instagram's web JSON
        'count_subscribers': info['edge_followed_by']['count'],
    }
    return subscribers
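
A hedged usage sketch for this method (the InstagramCrawler class name is hypothetical; the profile URL is only an example):

crawler = InstagramCrawler()
stats = crawler.get_subscribers_count('https://www.instagram.com/instagram/')
print(f"{stats['count_subscribers']} followers as of {stats['updated_at']}")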
Example #6
def __request_json(self, url):
    for i in range(5):
        try:
            query = requests.get(url, headers=self.get_headers)
            if query.status_code == 200:
                return ujson.loads(query.text)
            elif query.status_code in (403, 429):
                logging.warning(
                    f'Error {query.status_code}. The token may be invalid')
                # refresh the guest token and retry on the next pass
                self.guest_token = self.get_tokens()
            else:
                raise RequestError(
                    f'Error {query.status_code} while requesting JSON')
        except RequestError as e:
            time.sleep(self.delay_after_request_error)
            Crawler.log_error(e)
        except Exception as e:  # URLError, ConnectionError, and anything else
            time.sleep(self.delay_after_request_error)
            Crawler.log_error(e)
    raise GetInfoError('Unable to fetch information about the source')
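
Several snippets in this collection raise RequestError and GetInfoError without defining them; a minimal sketch of what those classes might look like (an assumption, not shown in the source):

class RequestError(Exception):
    """An HTTP request returned an unexpected status code."""


class GetInfoError(Exception):
    """Profile information could not be retrieved from the source."""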
Example #7
def get_subscribers_count(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    user_info = self.get_raw_info(screen_name=screen_name)
    if 'legacy' not in user_info['data']['user']:
        raise GetInfoError('Failed to fetch the subscriber count')
    info = user_info['data']['user']['legacy']
    subscribers = {
        'updated_at': datetime.now().astimezone().strftime('%Y-%m-%dT%H:%M:%S%z'),
        'count_subscribers': info['followers_count'],
    }
    return subscribers
Example #8
def get_info(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    parsed_info = {
        'name': info['full_name'],
        'link': f'https://www.instagram.com/{info["username"]}',
        'internal_id': info['id'],
        'avatar': info['profile_pic_url'],
        'type_social': 'IN',
    }
    # fall back to the username when the display name is empty
    if info['full_name'] == '':
        parsed_info['name'] = info['username']
    # fall back to the HD picture when the standard avatar is missing
    if not parsed_info['avatar']:
        parsed_info['avatar'] = info['profile_pic_url_hd']
    return parsed_info
Example #9
def get_info(self, link, internal_id=None):
    screen_name = Crawler.get_screen_name(link)
    user_info = self.get_raw_info(screen_name=screen_name)
    user_id = user_info['data']['user']['rest_id']
    if 'legacy' not in user_info['data']['user']:
        raise GetInfoError('Failed to fetch account information')
    info = user_info['data']['user']['legacy']
    parsed_info = {
        'name': info['name'],
        'link': f'https://twitter.com/{info["screen_name"]}',
        'internal_id': str(user_id),
        # dropping the '_normal' suffix yields the full-size avatar
        'avatar': info['profile_image_url_https'].replace('_normal', ''),
        'type_social': 'TW',
    }
    # fall back to the screen name when the display name is empty
    if info['name'] == '':
        parsed_info['name'] = info['screen_name']
    return parsed_info
Example #10
def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain, nested_scrape=False)
Example #11
def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain)
Example #12
def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain, first_page_url=self.first_page_url)
Example #13
def main():
    # scrape the Douban Top 250 movie list and persist it to SQLite
    Crawler(baseurl="https://movie.douban.com/top250?start=",
            save_path="database/douban_top250.db",
            max_page=10,
            max_per_page=25,
            patterns=DoubanPatterns).get_data().savedata()
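
The chained get_data().savedata() call implies that get_data returns the crawler instance; a minimal sketch of that fluent pattern (assumed, not from the source):

class Crawler:
    def get_data(self):
        # a real implementation would fetch and parse every page here
        self.items = []
        return self  # returning self is what enables method chaining

    def savedata(self):
        # a real implementation would write self.items to the SQLite file
        return self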
Example #14
def __init__(self, db_record=None):
    Crawler.__init__(self, db_record, self.origin, self.base_url, self.domain)
Example #15
def get_internal_id(self, link):
    screen_name = Crawler.get_screen_name(link)
    info = self.get_raw_info(screen_name=screen_name)
    return info['id']