def chunk_date_range(start_date: DateTime, interval=1) -> Iterable[Mapping[str, any]]:
    """
    Return the beginning and ending timestamps of each `interval`-day window
    between the start date and now.

    The return value is a list of dicts {'oldest': float, 'latest': float}
    which can be used directly with the Slack API.

    Args:
        start_date: inclusive start of the first window.
        interval: width of each window in days (default 1).

    Returns:
        List of {'oldest': <unix ts>, 'latest': <unix ts>} dicts covering
        consecutive, non-overlapping windows up to the present.
    """
    intervals = []
    now = pendulum.now()
    # Each stream_slice contains the beginning and ending timestamp of one window
    while start_date <= now:
        end = start_date.add(days=interval)
        intervals.append({"oldest": start_date.timestamp(), "latest": end.timestamp()})
        # Bug fix: advance by the full window, not a single day. Stepping one
        # day at a time produced overlapping windows (and duplicated records)
        # whenever interval > 1. Behavior is unchanged for the default interval=1.
        start_date = end
    return intervals
def round_to_closest_hour(time_data: "pendulum.DateTime") -> "pendulum.DateTime":
    """
    Round `time_data` to the nearest whole hour.

    Times at or before the half hour round down; times after it round up
    (minute == 30 rounds down, matching the original tie-break).

    Bug fix: the original only adjusted the minutes, so seconds and
    microseconds leaked into the "rounded" result (e.g. 10:15:45 became
    10:00:45 rather than 10:00:00). They are now zeroed.
    """
    from datetime import timedelta

    # Floor to the hour; replace() preserves the concrete datetime subclass.
    floored = time_data.replace(minute=0, second=0, microsecond=0)
    if time_data.minute > 30:
        # datetime arithmetic also preserves the subclass (pendulum or stdlib).
        return floored + timedelta(hours=1)
    return floored
async def populate_paths(self, manifest_cache, entry_id: EntryID, early: DateTime, late: DateTime):
    """Concurrently resolve the source, destination and current paths for `entry_id`.

    The three lookups are spawned in one trio service nursery, so they run in
    parallel and a failure in any of them cancels the siblings.

    Args:
        manifest_cache: cache consulted by the populate_* helpers (opaque here).
        entry_id: workspace entry whose paths are being resolved.
        early: timestamp at the start of the window of interest.
        late: timestamp at the end of the window of interest.
    """
    # TODO : Use future manifest source field to follow files and directories
    async with trio.open_service_nursery() as child_nursery:
        # NOTE(review): the source path is resolved 1 microsecond *before*
        # `early` — presumably to capture the state just prior to the change;
        # confirm against the populate_source_path contract.
        child_nursery.start_soon(self.populate_source_path, manifest_cache, entry_id, early.add(microseconds=-1))
        child_nursery.start_soon(self.populate_destination_path, manifest_cache, entry_id, late)
        child_nursery.start_soon(self.populate_current_path, manifest_cache, entry_id, early)
def _getIntradayPage(self, requestedDate: pendulum.DateTime, siteId: str):
    """Fetch the raw intraday page for `requestedDate` at site `siteId`.

    Returns the `requests` response object, unparsed.
    """
    # Weirdly, to get a day's data, you request the next day in the API... Yep
    nextDay = requestedDate.add(days=1).format('YYYYMMDD')
    url = f'{_host}/intraday.jsp?id=&sid={siteId}&dt={nextDay}&gs=0&m=0'
    # self._delay()
    # Prefer the authenticated session when one exists; otherwise fall back
    # to a bare one-shot request.
    client = self.session if self.session else requests
    return client.get(url)
def chunk_date_range(start_date: DateTime) -> Iterable[Mapping[str, any]]:
    """
    Return a list of each weekday between the start date and now.

    Weekends are skipped since exchanges don't run on weekends.
    The return value is a list of dicts {'date': date_string}.
    """
    days = []
    now = pendulum.now()
    while start_date < now:
        day_of_week = start_date.day_of_week
        # Bug fix: the original condition used the bitwise `&` operator, which
        # binds tighter than `!=`, turning the test into a chained comparison
        # (`day_of_week != (SATURDAY & day_of_week) != SUNDAY`) that did not
        # filter weekends as intended. Use an explicit membership test instead.
        if day_of_week not in (pendulum.SATURDAY, pendulum.SUNDAY):
            days.append({"date": start_date.to_date_string()})
        start_date = start_date.add(days=1)
    return days
def get_offsets(
    subreddit: str,
    after: pendulum.DateTime,
    before: pendulum.DateTime,
    sample_size: int,
    PUSHSHIFT_LIMIT: int,
) -> list[pendulum.DateTime]:
    """For sampling, return a set of hourly offsets, beginning near after, that should not overlap.

    Estimates the average Pushshift result density per hour over the
    [after, before] window, then repeatedly draws random hourly offsets
    (seeded from `after`'s timestamp for reproducibility) until a draw is
    found whose queries — each returning up to PUSHSHIFT_LIMIT results —
    should not overlap one another.

    Raises:
        RuntimeError: if no non-overlapping set is found after SEEDS_TO_TRY draws.
    """
    duration = before - after
    info(f"{duration.in_days()=}")
    info(f"{duration.in_hours()=}")
    info(f"{duration.in_weeks()=}")
    # Total results available in the window, used to estimate hourly density.
    results_total = get_pushshift_total(subreddit, after, before)
    results_per_hour = math.ceil(results_total / duration.in_hours())
    info(f"{results_per_hour=} on average")
    info(f"{PUSHSHIFT_LIMIT=}")
    info(f"{sample_size=}")
    # Number of API queries needed to reach sample_size, given the per-query cap.
    queries_total = math.ceil(sample_size / PUSHSHIFT_LIMIT)
    info(f"{queries_total=}")
    info(f"{range(duration.in_hours())=}")
    SEEDS_TO_TRY = 300
    # Deterministic seed derived from the window start, so reruns over the
    # same window draw the same offsets (and the cache in
    # get_cacheable_randos can hit — TODO confirm that is its purpose).
    seed = int(after.timestamp())
    for seed_counter in range(SEEDS_TO_TRY):
        seed += seed_counter  # increment seed
        warning(f"attempt {seed_counter} to find non-overlapping offsets")
        # One candidate set of `queries_total` random hour offsets within the window.
        offsets = get_cacheable_randos(duration.in_hours(), queries_total, seed)
        if is_overlapping(offsets, PUSHSHIFT_LIMIT, results_per_hour):
            critical(f" seed attempt {seed_counter} failed")
            continue
        else:
            break
    else:
        # for/else: reached only when the loop exhausts without `break`,
        # i.e. every seed attempt produced overlapping offsets.
        # NOTE(review): the two adjacent f-strings concatenate with no space
        # or newline between "...SEEDS_TO_TRY=" and "Quitting..." — the
        # printed message runs together; likely a missing separator.
        print(
            f"I exhausted random sets of offsets at {SEEDS_TO_TRY=}"
            f"Quitting because I'm too likely to pull overlapping results"
        )
        raise RuntimeError
    # Convert the accepted hour offsets into concrete datetimes from `after`.
    offsets_as_datetime = []
    for offset_as_hour in offsets:
        offset_as_datetime = after.add(hours=offset_as_hour)
        offsets_as_datetime.append(offset_as_datetime)
    info(f"{len(offsets)=}")
    return offsets_as_datetime
from pathlib import Path

# Download one intraday CSV per day for the configured site.
siteId = '57775'
directory = Path('directory' + siteId)
if not directory.exists():
    directory.mkdir()

# if not logged in then this will only work for the last 14 days
testDate = DateTime(2019, 2, 1)
pvo = PVOutput()
pvo.login('username', 'password')

for idx in range(1, 140):
    dateString = testDate.to_date_string()
    print('creating file ', dateString)
    try:
        data = pvo.getIntradayData(testDate, siteId)
    except NameError as e:
        # A NameError from the API wrapper signals a day with no data:
        # report it and fall through to the next day.
        print(e)
        print("missing data for " + dateString)
    else:
        # Only write a CSV when the fetch succeeded.
        with (directory / f'{dateString}.csv').open('w', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow(data.headers)
            writer.writerows(data.data)
    testDate = testDate.add(days=1)