Esempio n. 1
0
 def get_last_log_info(parser_name):
     """Return a summary of the most recent ``log`` row for a parser.

     Selects rows of the ``log`` table whose ``parser_name`` column equals
     ``parser_name``, newest ``cr_tm`` first, and reads the first one.

     :param parser_name: value matched against the ``parser_name`` column.
     :return: dict with the keys ``log_name``, ``new_items``,
         ``total_channels``, ``total_programs``, ``log_cr_tm``,
         ``log_status`` and ``execution_time``. When no row is found the
         text fields hold "Parser not started yet" and the counters are 0.
     """
     # The value is passed as a bound parameter instead of being interpolated
     # into the SQL text, so quotes inside ``parser_name`` can neither break
     # the statement nor inject SQL.
     # NOTE(review): assumes ``db`` is a DB-API cursor that accepts the
     # "pyformat" paramstyle (e.g. psycopg2 / pymysql) - confirm.
     db.execute(
         """ SELECT * FROM log WHERE parser_name=%(parser_name)s
                    ORDER BY cr_tm DESC; """,
         {"parser_name": parser_name},
     )
     result = db.fetchone()
     if not result:
         not_started = "Parser not started yet"
         return dict(
             log_name=not_started,
             new_items=0,
             total_channels=0,
             total_programs=0,
             log_cr_tm=not_started,
             log_status=not_started,
             execution_time=0,
         )
     return dict(
         log_name=result.get("parser_name"),
         new_items=result.get("count_new_items"),
         total_channels=result.get("total_channels"),
         total_programs=result.get("total_programs"),
         log_cr_tm=result.get("cr_tm"),
         log_status=result.get("success"),
         # Stored value is converted to a display string by the helper.
         execution_time=hours_minutes_seconds_from_seconds(result.get("execution_time")),
     )
Esempio n. 2
0
 def parse_url_channels(self):
     """Scrape the channel list and store the channels that were found.

     Steps performed:

     1. Start the driver and scroll the page to the bottom repeatedly until
        the scroll position stops changing.
     2. For every element matched by ``self.get_channel_css_selector()``
        collect its title, ``href`` and (if present) icon background image.
        Channels are keyed by ``href``; the first occurrence wins.
     3. Save the channels through ``SaveRecordsToDb.save_channels_to_db``,
        then e-mail and log a summary and record the run with
        ``SaveRecordsToDb.insert_log_info``.

     The driver window is closed in all cases, including when an exception
     is raised part-way through; the exception still propagates.
     """
     # run_phantomjs()
     time.sleep(20)
     self.driver_start()
     try:
         write_to_log('Start channels parsing')
         func_tm = time.time()
         page_height = 0
         elements = {}
         scroll_height_script = """ return window.innerHeight + window.scrollY """
         # Keep scrolling until the reported position no longer changes
         # between two consecutive passes.
         while page_height != self.driver.execute_script(scroll_height_script):
             page_height = self.driver.execute_script(scroll_height_script)
             self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
             time.sleep(5)
         channels = self.driver.find_elements_by_css_selector(self.get_channel_css_selector())
         write_to_log('Found %s channels' % len(channels))
         for channel in channels:
             time.sleep(1)
             name = channel.find_element_by_css_selector(
                 'span.tv-channel-title__text').text.encode('utf-8')
             href = channel.get_attribute('href')
             # Check for a missing href *before* encoding it; calling
             # .encode() on None would raise AttributeError.
             if href is None:
                 continue
             href = href.encode('utf-8')
             icon = channel.find_elements_by_css_selector(
                 'div.tv-channel-title__icon > span[class$="image_type_channel"] > span')
             # When no icon element exists, ``icon`` stays an empty list.
             if icon:
                 icon = self.get_background_image(icon[0]).encode('utf-8')
             if href not in elements:
                 elements[href] = {'name': name, 'icon': icon}
         save_records = SaveRecordsToDb()
         new_elements_count = save_records.save_channels_to_db(elements)
         func_tm = int(time.time() - func_tm)
         text_for_log = 'Channels parsed successfully.{elements_count} new channels.' \
                        'Execution time: {func_tm}'.\
             format(elements_count=new_elements_count,
                    func_tm=hours_minutes_seconds_from_seconds(func_tm))
         send_email(subject='Parser notification',
                    text=text_for_log)
         write_to_log(text_for_log)
         SaveRecordsToDb.insert_log_info(execution_time=func_tm, new_items=new_elements_count)
     finally:
         # Always release the browser window, even when scraping failed.
         self.driver.close()
Esempio n. 3
0
 def parse_tv_programs(self):
     """Scrape the programme schedule of every stored channel.

     For each ``{'id': ..., 'link': ...}`` record returned by
     ``GetRecordsFromDb().get_channels_id_and_link()``:

     * records without a link are logged and skipped;
     * pages whose title contains ``'404'`` are logged, reported by e-mail
       and skipped;
     * if the channel has no description or no web site yet, both are read
       from the page, truncated to the lengths declared on ``Channel`` and
       saved;
     * up to seven day tabs dated today or later are visited and the
       programmes listed for each day are saved with
       ``SaveRecordsToDb.save_programs``.

     Afterwards a summary is e-mailed and logged and the run is recorded
     with ``SaveRecordsToDb.insert_log_info``. The driver window is closed
     in all cases, including when an exception is raised; the exception
     still propagates.
     """
     # run_phantomjs()
     time.sleep(20)
     self.driver_start()
     try:
         write_to_log('Start programs parsing')
         func_tm = time.time()
         ids_and_links = GetRecordsFromDb().get_channels_id_and_link()
         # Parsed once: the value does not change while the loop runs.
         today = datetime.datetime.strptime(get_date_and_time_with_timezone(), '%Y-%m-%d')
         count_programs = 0
         for id_and_link in ids_and_links:
             channel = Channel(channel_id=id_and_link['id'])
             channel.update()
             link = id_and_link.get('link')
             if not link:
                 write_to_log('Wrong channel link %s. Channel id %s' %
                              (link, id_and_link.get('id')))
                 continue
             self.driver.get(link)
             time.sleep(4)
             if '404' in self.driver.title:
                 write_to_log('Error. Page {page} not found'.format(
                     page=self.driver.current_url))
                 send_email(subject='Page not found',
                            text='Page {page} not found'.format(page=self.driver.current_url))
                 continue
             if not channel.description or not channel.web_site:
                 description_tags = self.driver.find_elements_by_css_selector(
                     "tr.b-row div.b-tv-channel-content__text")
                 channel_description = description_tags[0].text.encode('utf-8') \
                     if description_tags else "This channel does not have description"
                 web_site_tags = self.driver.find_elements_by_css_selector(
                     "div.b-tv-channel-content__channel-info > "
                     "div.b-tv-channel-content__links > a")
                 channel_web_site = web_site_tags[0].get_attribute('href').encode('utf-8') \
                     if web_site_tags else "This channel does not have web site"
                 # Slicing leaves values that already fit unchanged.
                 # NOTE(review): truncation happens after UTF-8 encoding, so
                 # a multi-byte character may be cut in half - confirm
                 # whether the limit is meant in bytes or characters.
                 channel.description = channel_description[:Channel.description['length']]
                 channel.web_site = channel_web_site[:Channel.web_site['length']]
                 channel.update()
             dates_of_week = []
             for date_tab in self.driver.find_elements_by_css_selector(
                     'div.tv-filter-days__viewport > div.tv-filter-days__items > '
                     'div.tv-filter-days__item'):
                 # Skip tabs whose data-bem attribute is missing or carries
                 # no date instead of failing on them.
                 found = re.search(r'(\d{4}-\d{2}-\d{2})T',
                                   date_tab.get_attribute('data-bem') or '')
                 if not found:
                     continue
                 date_of_week = found.group(1)
                 if today <= datetime.datetime.strptime(date_of_week, '%Y-%m-%d'):
                     dates_of_week.append(date_of_week)
             # Only the first seven upcoming days are processed.
             for day in dates_of_week[:7]:
                 self.driver.get("%(channel_link)s?date=%(date)s" %
                                 {'channel_link': link, 'date': day})
                 time.sleep(1)
                 program_tags = self.driver.find_elements_by_css_selector(
                     'div.b-tv-channel-schedule__items > '
                     'div.b-tv-channel-schedule__item > a')
                 show_date = datetime.datetime.strptime(day, '%Y-%m-%d')
                 tv_programs = []
                 # A distinct loop name keeps the Channel object above from
                 # being shadowed by page elements.
                 for program_tag in program_tags:
                     program_name = program_tag.find_element_by_class_name(
                         'tv-event__title-inner').text
                     show_time = program_tag.find_element_by_class_name(
                         'tv-event__time-text').text + ':00'
                     genre = json.loads(program_tag.get_attribute(
                         'data-bem'))['tv-event']['genre']
                     tv_programs.append(TvProgram(name=program_name, genre=genre,
                                                  show_date=show_date, show_time=show_time))
                     count_programs += 1
                 SaveRecordsToDb.save_programs(id_and_link['id'], tv_programs)
         # Whole seconds, matching what parse_url_channels records.
         func_tm = int(time.time() - func_tm)
         text_for_log = 'Tv programs parsed successfully.' \
                        'Execution time: %s' % hours_minutes_seconds_from_seconds(func_tm)
         send_email(subject='Parser notification',
                    text=text_for_log)
         write_to_log(text_for_log)
         SaveRecordsToDb.insert_log_info(parser_name='tv_programs', new_items=count_programs,
                                         execution_time=func_tm)
     finally:
         # Always release the browser window, even when scraping failed.
         self.driver.close()