def _parse_calendars(self, input_calendars: List[tuple]) -> dict:
    print("Parsing input calendars...")

    timestamp = datetime.now()
    input_tuples = []
    for index, calendar_tuple in enumerate(input_calendars):
        _, calendar_url, _ = calendar_tuple
        website_base = utils.get_base_by_url(calendar_url)
        input_tuples.append((index + 1, len(input_calendars), calendar_tuple,
                             timestamp, website_base))

    with multiprocessing.Pool(32) as p:
        events_lists = p.map(FillCountsForCalendars._parse_calendars_process,
                             input_tuples)

    events_to_insert = {
        calendar_id: {
            'all': len(events_list),
            'new': 0
        }
        for element in events_lists
        for calendar_id, events_list in element.items()
    }

    return self._get_new_counts(events_to_insert)
def _get_events_count_per_parser(events_per_calendar: dict) -> dict:
    # Aggregate per-calendar event counts by the parser configured for each calendar's base.
    events_per_parser_dict = defaultdict(int)

    for calendar_url, events_count in events_per_calendar.items():
        calendar_base = utils.get_base_by_url(calendar_url)
        calendar_parser = calendar_base['parser']
        events_per_parser_dict[calendar_parser] += events_count

    return events_per_parser_dict
def _parse_events(self, input_events: List[tuple]) -> List[tuple]:
    self.logger.info("Parsing events...")
    logger.set_up_simple_logger(SIMPLE_LOGGER_PREFIX + __file__,
                                log_file=self.args.log_file,
                                log_level=self.args.log_level)

    timestamp = datetime.now()
    input_tuples = []
    for index, event_tuple in enumerate(input_events):
        _, _, _, calendar_url = event_tuple
        website_base = utils.get_base_by_url(calendar_url)
        input_tuples.append((index + 1, len(input_events), event_tuple,
                             timestamp, website_base))

    with multiprocessing.Pool(32) as p:
        return p.map(ParseEvents._parse_events_process, input_tuples)
def _download_events_process(input_tuple: (int, int, List[tuple], datetime,
                                            bool)) -> List[tuple]:
    # Returns a list of (event_id, html_file_path, timestamp, result) tuples.
    # Spread out worker start-up so the target site is not hit by all processes at once.
    time.sleep(round(random.uniform(0, 5), 2))
    simple_logger = logging.getLogger(SIMPLE_LOGGER_PREFIX + __file__)

    calendar_index, total_length, events_list, timestamp, dry_run = input_tuple
    _, _, calendar_url = events_list[0]
    website_base = utils.get_base_by_url(calendar_url)

    current_dir = os.path.join(DATA_DIR_PATH, website_base["domain"])
    event_file_dir = os.path.join(current_dir,
                                  DownloadEvents.EVENTS_FOLDER_NAME)
    os.makedirs(event_file_dir, exist_ok=True)

    result_list = []
    for event_index, event in enumerate(events_list):
        event_id, event_url, _ = event

        event_file_name = timestamp.strftime(
            "%Y-%m-%d_%H-%M-%S") + "_" + str(event_id)
        html_file_path = os.path.join(event_file_dir,
                                      event_file_name + ".html")

        result = utils.download_html_content(
            event_url, html_file_path,
            encoding=website_base.get("encoding", None),
            verify=website_base.get("verify", None),
            dry_run=dry_run)
        # A non-200 result means no usable HTML file was stored for this event.
        if result != "200":
            html_file_path = None

        simple_logger.info(
            "{}/{} ({}/{}) | Downloading URL: {} | {}".format(
                calendar_index, total_length, event_index + 1,
                len(events_list), str(event_url), str(result)))

        result_list.append((event_id, html_file_path, timestamp, result))

    return result_list
def _parse_calendars(self, input_calendars: List[tuple]) -> dict:
    self.logger.info("Parsing calendars...")
    logger.set_up_simple_logger(SIMPLE_LOGGER_PREFIX + __file__,
                                log_file=self.args.log_file,
                                log_level=self.args.log_level)

    timestamp = datetime.now()
    input_tuples = []
    for index, calendar_tuple in enumerate(input_calendars):
        _, calendar_url, _ = calendar_tuple
        website_base = utils.get_base_by_url(calendar_url)
        input_tuples.append((index + 1, len(input_calendars), calendar_tuple,
                             timestamp, website_base))

    with multiprocessing.Pool(32) as p:
        events_lists = p.map(ParseCalendars._parse_calendars_process,
                             input_tuples)

    events_to_insert = {
        calendar_id: events_list
        for element in events_lists
        for calendar_id, events_list in element.items()
    }

    return events_to_insert