def __new__(cls, end_date):
    """Parse and validate ``end_date`` and construct the date-like instance.

    The string is parsed with ``settings.FILE_DATE_FORMAT`` and must fall
    between the foundation date and today (inclusive); otherwise
    ``exceptions.ArgumentTypeError`` is raised with ``settings.END_DATE_ERROR``.
    """
    # Bounds of the accepted range, computed before parsing the input.
    earliest = pendulum.from_format(
        settings.FOUNDATION_DATE, settings.FILE_DATE_FORMAT)
    latest = pendulum.today()
    try:
        parsed = pendulum.from_format(
            end_date, settings.FILE_DATE_FORMAT).date()
    except (ValueError, TypeError):
        # Unparseable or non-string input — surface a uniform argument error.
        raise exceptions.ArgumentTypeError(settings.END_DATE_ERROR)
    if not (earliest.date() <= parsed <= latest.date()):
        raise exceptions.ArgumentTypeError(settings.END_DATE_ERROR)
    return super().__new__(cls, parsed.year, parsed.month, parsed.day)
def __new__(cls, cpu_count):
    """Validate ``cpu_count`` and construct the instance.

    A falsy value (``None`` or ``0``) defaults to all available CPUs.
    Otherwise the value must be an integer in ``1..mp.cpu_count()``;
    anything else raises ``exceptions.ArgumentTypeError`` with
    ``settings.CPU_COUNT_ERROR``.
    """
    if not cpu_count:
        # No explicit request — use every core the machine reports.
        return super().__new__(cls, mp.cpu_count())
    # Fix: membership on range() directly instead of list(range(...)) —
    # avoids building a throwaway list; range.__contains__ is O(1) for ints
    # and falls back to the same ==-based scan for other types, so behavior
    # is unchanged.
    if cpu_count in range(1, mp.cpu_count() + 1):
        return super().__new__(cls, cpu_count)
    raise exceptions.ArgumentTypeError(settings.CPU_COUNT_ERROR)
def __new__(cls, directory_path):
    """Normalize ``directory_path`` into a ``pathlib.Path`` and construct.

    A falsy value defaults to the current working directory. A value that
    ``pathlib.Path`` cannot accept raises ``exceptions.ArgumentTypeError``
    with ``settings.DIRECTORY_PATH_ERROR``.
    """
    if not directory_path:
        # Nothing supplied — fall back to the process's working directory.
        return super().__new__(cls, pathlib.Path.cwd())
    try:
        resolved = pathlib.Path(directory_path)
    except TypeError:
        raise exceptions.ArgumentTypeError(settings.DIRECTORY_PATH_ERROR)
    return super().__new__(cls, resolved)
def __init__(self, name, region_names):
    """Validate ``region_names`` against the regions available for ``name``.

    Every requested region must exist in ``sc.download_regions(name)``;
    otherwise ``exceptions.ArgumentTypeError`` is raised with
    ``settings.REGION_NAME_ERROR``. Stores the unique names, sorted.
    """
    available = sc.download_regions(name)
    requested = set(region_names)
    if not requested.issubset(available.keys()):
        raise exceptions.ArgumentTypeError(settings.REGION_NAME_ERROR)
    # De-duplicated and sorted for a deterministic iteration order.
    super().__init__(sorted(requested))
def __new__(cls, periodicity):
    """Construct the instance if ``periodicity`` is a known code.

    Raises ``exceptions.ArgumentTypeError`` with ``settings.PERIODICITY_ERROR``
    when the value is not a key of ``settings.PERIODICITY_CODES``.
    """
    if periodicity not in settings.PERIODICITY_CODES:
        raise exceptions.ArgumentTypeError(settings.PERIODICITY_ERROR)
    return super().__new__(cls, periodicity)
def __new__(cls, name):
    """Construct the instance if ``name`` is a known chart code.

    Raises ``exceptions.ArgumentTypeError`` with ``settings.NAME_ERROR``
    when the value is not a key of ``settings.NAME_CODES``.
    """
    if name not in settings.NAME_CODES:
        raise exceptions.ArgumentTypeError(settings.NAME_ERROR)
    return super().__new__(cls, name)
def __init__(self, name, periodicity, region_names, begin_date, end_date, cpu_count=None, directory_path=None):
    """Validate all arguments, then download and persist charts per region.

    For every requested region this fetches the chart pages for each date in
    ``[begin_date, end_date]`` that is not already present in the region's CSV
    file, and appends the new rows to that file.

    Raises ``exceptions.ArgumentTypeError`` (via the wrapper classes, or
    directly with ``settings.DATE_RANGE_ERROR`` when begin > end).
    """
    # Each wrapper class validates its argument and raises on bad input.
    self._name = classes.Name(name)
    self._periodicity = classes.Periodicity(periodicity)
    self._region_names = classes.RegionNames(name, region_names)
    self._begin_date = classes.BeginDate(begin_date)
    self._end_date = classes.EndDate(end_date)
    if self._begin_date > self._end_date:
        raise exceptions.ArgumentTypeError(settings.DATE_RANGE_ERROR)
    self._cpu_count = classes.CpuCount(cpu_count)
    self._directory_path = classes.DirectoryPath(directory_path)
    # URL path segments corresponding to the chart name and periodicity.
    name_code = settings.NAME_CODES[self.name]
    periodicity_code = settings.PERIODICITY_CODES[self.periodicity]
    # Keep only the regions the caller asked for.
    all_regions = download_regions(self.name)
    regions = {}
    for region_name in all_regions:
        if region_name in self.region_names:
            regions[region_name] = all_regions[region_name]
    # Deterministic region order, wrapped in a tqdm progress bar.
    regions_items = sorted(regions.items(), key=lambda region: region[0])
    regions_items = auto.tqdm(regions_items)
    # deepcopy: the settings lists are shared module state and are extended below.
    if self.name == 'top200':
        column_names = copy.deepcopy(settings.TOP200_CHART_COLUMN_NAMES)
    else:
        column_names = copy.deepcopy(settings.VIRAL50_CHART_COLUMN_NAMES)
    column_names.extend(['region_name', 'date'])
    begin_date = self.begin_date.format(settings.FILE_DATE_FORMAT)
    end_date = self.end_date.format(settings.FILE_DATE_FORMAT)
    extension = settings.FILE_EXTENSION
    for region_name, region_code in regions_items:
        # One CSV per (name, periodicity, region, date-range).
        file_name = f'{self.name}_{self.periodicity}_charts_from_{begin_date}_to_{end_date}.{extension}'
        directory_path = pathlib.Path(self.directory_path).joinpath(
            self.name, self.periodicity, region_name)
        directory_path.mkdir(parents=True, exist_ok=True)
        file_path = directory_path.joinpath(file_name)
        # Resume support: load what was already downloaded, if anything.
        if file_path.exists():
            region_charts = pd.read_csv(file_path, sep=settings.FILE_DELIMITER,
                                        encoding=settings.FILE_ENCODING)
        else:
            region_charts = pd.DataFrame(columns=column_names)
        # Dates already on disk, normalized to plain date objects for comparison.
        file_dates = []
        for file_date in region_charts['date'].unique():
            file_date = pendulum.instance(
                pd.Timestamp(file_date).to_pydatetime()).date()
            file_dates.append(file_date)
        # Show "<time> | <region>" on the progress bar for the current region.
        current_time = pendulum.now().format(
            settings.PROGRESS_BAR_TIME_FORMAT)
        description = f'{current_time} | {region_name}'
        regions_items.set_description(description)
        all_dates = download_dates(self.name, self.periodicity, region_name)
        # Build the URL list for dates in range that are not yet on disk.
        urls = []
        dates = []
        for date, date_code in all_dates.items():
            if self.begin_date <= date <= self.end_date and date not in file_dates:
                url = f'{settings.SPOTIFY_CHARTS_URL}/{name_code}/{region_code}/{periodicity_code}/{date_code}'
                urls.append(url)
                dates.append(date)
        logger.info(f'{region_name}:{len(urls)}')
        # Fetch charts in parallel; classes.Chart presumably downloads one URL
        # into a DataFrame — confirm against its definition.
        with mp.Pool(self.cpu_count) as pool:
            downloaded_charts = pool.map(classes.Chart, urls)
        # Tag each non-empty chart with its region and (datetime-typed) date.
        for chart, date in zip(downloaded_charts, dates):
            if not chart.empty:
                chart['region_name'] = region_name
                chart['date'] = date
                chart['date'] = pd.to_datetime(chart['date'])
        if downloaded_charts:
            # NOTE(review): DataFrame.append was removed in pandas 2.0 —
            # this requires pandas < 2; pd.concat is the modern equivalent.
            data = region_charts.append(downloaded_charts, sort=True)
            data.reset_index(drop=True, inplace=True)
            # Newest date first, positions ascending within a date.
            data.sort_values(by=['date', 'track_position'],
                             ascending=[False, True], inplace=True)
            data = data[column_names]
            data.to_csv(file_path, sep=settings.FILE_DELIMITER,
                        encoding=settings.FILE_ENCODING, index=False)
    regions_items.close()