Exemple #1
0
    def _parse_calendars(self, input_calendars: List[tuple]) -> dict:
        """Parse all input calendars in parallel and compute event counts.

        Every calendar tuple is unpacked as three fields with the calendar
        URL in the middle; the URL is resolved to a website base through
        ``utils.get_base_by_url``. One work item per calendar is mapped over
        a pool of 32 worker processes. The dictionaries returned by the
        workers are flattened into
        ``{calendar_id: {'all': <number of events>, 'new': 0}}`` and passed
        to ``self._get_new_counts``.

        :param input_calendars: calendar tuples of the form ``(_, url, _)``
        :return: the result of ``self._get_new_counts`` for the built counts
        """
        print("Parsing input calendars...")

        started_at = datetime.now()
        total = len(input_calendars)
        jobs = [
            (position, total, calendar, started_at,
             utils.get_base_by_url(url))
            for position, calendar in enumerate(input_calendars, start=1)
            # Single-item inner loop: strict three-way unpack of the tuple.
            for _, url, _ in (calendar,)
        ]

        with multiprocessing.Pool(32) as pool:
            worker_results = pool.map(
                FillCountsForCalendars._parse_calendars_process, jobs)

        # Later workers overwrite earlier ones when a calendar id repeats.
        counts = {}
        for per_worker in worker_results:
            for calendar_id, events in per_worker.items():
                counts[calendar_id] = {'all': len(events), 'new': 0}

        return self._get_new_counts(counts)
    def _get_events_count_per_parser(events_per_calendar: dict) -> dict:
        """Sum event counts by the parser responsible for each calendar.

        For every calendar URL the website base is looked up with
        ``utils.get_base_by_url`` and its ``'parser'`` entry is used as the
        aggregation key.

        :param events_per_calendar: mapping of calendar URL to event count
        :return: ``defaultdict(int)`` mapping parser to summed event count
        """
        totals = defaultdict(int)

        for url, count in events_per_calendar.items():
            parser_name = utils.get_base_by_url(url)['parser']
            totals[parser_name] += count

        return totals
Exemple #3
0
    def _parse_events(self, input_events: List[tuple]) -> List[tuple]:
        """Parse all input events in parallel.

        Sets up the simple logger from ``self.args`` first, then builds one
        work item per event. Every event tuple is unpacked as four fields
        with the calendar URL last; the URL is resolved to a website base
        through ``utils.get_base_by_url``. The work items are mapped over a
        pool of 32 worker processes.

        :param input_events: event tuples of the form ``(_, _, _, url)``
        :return: list of results from ``ParseEvents._parse_events_process``,
            in input order
        """
        self.logger.info("Parsing events...")

        logger.set_up_simple_logger(
            SIMPLE_LOGGER_PREFIX + __file__,
            log_file=self.args.log_file,
            log_level=self.args.log_level,
        )

        started_at = datetime.now()
        total = len(input_events)
        jobs = [
            (position, total, event, started_at, utils.get_base_by_url(url))
            for position, event in enumerate(input_events, start=1)
            # Single-item inner loop: strict four-way unpack of the tuple.
            for _, _, _, url in (event,)
        ]

        with multiprocessing.Pool(32) as pool:
            parsed = pool.map(ParseEvents._parse_events_process, jobs)
        return parsed
Exemple #4
0
    def _download_events_process(input_tuple: (
        int, int, List[tuple], datetime, bool)) -> (int, str, datetime, str):
        time.sleep(round(random.uniform(0, 5), 2))
        simple_logger = logging.getLogger(SIMPLE_LOGGER_PREFIX + __file__)

        calendar_index, total_length, events_list, timestamp, dry_run = input_tuple
        _, _, calendar_url = events_list[0]
        website_base = utils.get_base_by_url(calendar_url)

        current_dir = os.path.join(DATA_DIR_PATH, website_base["domain"])
        event_file_dir = os.path.join(current_dir,
                                      DownloadEvents.EVENTS_FOLDER_NAME)
        os.makedirs(event_file_dir, exist_ok=True)

        result_list = []
        for event_index, event in enumerate(events_list):
            event_id, event_url, _ = event
            event_file_name = timestamp.strftime(
                "%Y-%m-%d_%H-%M-%S") + "_" + str(event_id)
            html_file_path = os.path.join(event_file_dir,
                                          event_file_name + ".html")

            result = utils.download_html_content(
                event_url,
                html_file_path,
                encoding=website_base.get("encoding", None),
                verify=website_base.get("verify", None),
                dry_run=dry_run)
            if result != "200":
                html_file_path = None

            simple_logger.info(
                "{}/{} ({}/{}) | Downloading URL: {} | {}".format(
                    calendar_index, total_length, event_index + 1,
                    len(events_list), str(event_url), str(result)))

            result_list.append((event_id, html_file_path, timestamp, result))
        return result_list
Exemple #5
0
    def _parse_calendars(self, input_calendars: List[tuple]) -> dict:
        """Parse all input calendars in parallel and merge their events.

        Sets up the simple logger from ``self.args`` first, then builds one
        work item per calendar. Every calendar tuple is unpacked as three
        fields with the calendar URL in the middle; the URL is resolved to a
        website base through ``utils.get_base_by_url``. The work items are
        mapped over a pool of 32 worker processes and the dictionaries the
        workers return are merged into a single one.

        :param input_calendars: calendar tuples of the form ``(_, url, _)``
        :return: mapping of calendar id to the events list produced for it
        """
        self.logger.info("Parsing calendars...")

        logger.set_up_simple_logger(
            SIMPLE_LOGGER_PREFIX + __file__,
            log_file=self.args.log_file,
            log_level=self.args.log_level,
        )

        started_at = datetime.now()
        total = len(input_calendars)
        jobs = [
            (position, total, calendar, started_at,
             utils.get_base_by_url(url))
            for position, calendar in enumerate(input_calendars, start=1)
            # Single-item inner loop: strict three-way unpack of the tuple.
            for _, url, _ in (calendar,)
        ]

        with multiprocessing.Pool(32) as pool:
            worker_results = pool.map(
                ParseCalendars._parse_calendars_process, jobs)

        # Later workers overwrite earlier ones when a calendar id repeats.
        merged = {}
        for per_worker in worker_results:
            for calendar_id, events in per_worker.items():
                merged[calendar_id] = events
        return merged