def tracking_calendar(request, year=None, week=None):
    """Render a tracking table for the reminders of one ISO week.

    Args:
        request: The Django request object.
        year: Year from the URL; defaults to the current year.
        week: ISO week number from the URL; defaults to the current week.

    Returns:
        An HttpResponse rendering ``calendar.html`` with the week's days,
        the reminder-table column names, and one row of values per day.

    Raises:
        PermissionDenied: If the requested week number cannot exist.
    """
    # Fall back to "today" for any component missing from the URL.
    ryear = date.today().year if year is None else int(year)
    rweek = date.today().isocalendar()[1] if week is None else int(week)
    # BUG FIX: the original compared the int week against the STRING '53'
    # (always-True on Py2, TypeError on Py3). ISO years have at most 53 weeks;
    # reject anything beyond that (a 403 for now, as the original intended).
    if rweek > 53:
        raise PermissionDenied()
    isoweek = Week(ryear, rweek)
    # Monday..Friday of the requested week (weekends are not tracked).
    sqlweek = [isoweek.day(i) for i in range(5)]
    # Lazily seed the week's rows the first time this week is viewed.
    try:
        RemindersTable.objects.get(rday=isoweek.day(0))
    except RemindersTable.DoesNotExist:
        for i in range(5):
            RemindersTable.objects.create(rday=isoweek.day(i))
    # One row of raw column values per weekday.
    tableresults = []
    for day in sqlweek:
        weekresults = RemindersTable.objects.filter(rday=day)
        tableresults.append(list(weekresults.values_list()[0]))
    # Column names, taken from the last queryset (same model for every day).
    headers = weekresults.values().field_names
    return render_to_response("calendar.html",
                              {'sqlweek': sqlweek,
                               'headers': headers,
                               'tableresults': tableresults},
                              RequestContext(request))
def scrapper(self) -> "str | None":
    """Scrape the Norfolk Southern investor site for the weekly-carloads PDF.

    Returns:
        The absolute URL of the weekly performance report for this
        instance's ``year_no``/``week_no``, or ``None`` when no link is
        found on the page.
    """
    # BUG FIX: the original annotation was `-> str or None`, which evaluates
    # to plain `str` — rewritten as the union it actually meant.
    # Month of the Monday of the requested ISO week; used below to pick the
    # link (assumes one report link per month in page order — TODO confirm).
    week_start = Week(self.year_no, self.week_no)
    month = week_start.day(0).month
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")
    browser = webdriver.Chrome(
        options=options, executable_path=conf.CHROME_DRIVER_PATH
    )
    try:
        log(log.INFO, "Start get url Norfolk Southern")
        browser.get(self.URL)
        log(log.INFO, "Get url Norfolk Southern")
        generated_html = browser.page_source
    finally:
        # BUG FIX: the original never closed the driver, leaking one
        # headless Chrome process per call.
        browser.quit()
    soup = BeautifulSoup(generated_html, "html.parser")
    tags = soup.find_all("a")
    log(log.INFO, "Get all links Norfolk Southern")
    wanted = (
        f"weekly-performance-reports/{self.year_no}/investor-weekly-carloads"
    )
    link = [
        tag.attrs["href"]
        for tag in tags
        # Guard: <a> tags without an href raised KeyError in the original.
        if "href" in tag.attrs and wanted in tag.attrs["href"]
    ]
    if not link:
        log(log.WARNING, "Links not found")
        return None
    log(log.INFO, "Get link with pdf for Norfolk Southern")
    # BUG FIX: month - 1 can exceed the number of links actually present
    # (e.g. early in the year); the original raised IndexError here.
    if month - 1 >= len(link):
        log(log.WARNING, "Links not found")
        return None
    link = "http://www.nscorp.com" + link[month - 1]
    log(log.INFO, "Found pdf link: [%s]", link)
    return link
def test_days(self):
    """Week(2011, 20): named accessors, day() offsets, days(), contains()."""
    w = Week(2011, 20)
    # The named accessors walk consecutive dates from Monday 2011-05-16.
    names = ["monday", "tuesday", "wednesday", "thursday",
             "friday", "saturday", "sunday"]
    for offset, name in enumerate(names):
        self.assertEqual(getattr(w, name)().isoformat(),
                         "2011-05-%d" % (16 + offset))
    # day(n) is Monday plus n days; n may be negative or beyond the week.
    self.assertEqual(w.day(0).isoformat(), "2011-05-16")
    self.assertEqual(w.day(-1).isoformat(), "2011-05-15")
    self.assertEqual(w.day(10).isoformat(), "2011-05-26")
    # days() returns the seven dates of the week, Monday first.
    days = w.days()
    self.assertEqual(len(days), 7)
    self.assertEqual(days[0].isoformat(), "2011-05-16")
    self.assertEqual(days[-1].isoformat(), "2011-05-22")
    from datetime import date
    # contains() covers exactly Monday through Sunday of the week.
    for inside in (date(2011, 5, 16), date(2011, 5, 22)):
        self.assertTrue(w.contains(inside))
    for outside in (date(2011, 5, 15), date(2011, 5, 23)):
        self.assertFalse(w.contains(outside))
def test_days(self):
    """Check day lookup, days() and contains() for ISO week 2011-W20."""
    w = Week(2011, 20)
    expected_week = ["2011-05-16", "2011-05-17", "2011-05-18", "2011-05-19",
                     "2011-05-20", "2011-05-21", "2011-05-22"]
    accessors = (w.monday, w.tuesday, w.wednesday, w.thursday,
                 w.friday, w.saturday, w.sunday)
    # Every named accessor maps onto its date within the week.
    for accessor, iso in zip(accessors, expected_week):
        self.assertEqual(accessor().isoformat(), iso)
    # day(n) accepts arbitrary offsets from Monday, including out-of-week.
    for offset, iso in ((0, "2011-05-16"), (-1, "2011-05-15"),
                        (10, "2011-05-26")):
        self.assertEqual(w.day(offset).isoformat(), iso)
    # days() is the ordered list of the week's seven dates.
    days = w.days()
    self.assertEqual(len(days), 7)
    self.assertEqual(days[0].isoformat(), "2011-05-16")
    self.assertEqual(days[-1].isoformat(), "2011-05-22")
    from datetime import date
    # Membership is inclusive of the week's Monday and Sunday only.
    self.assertFalse(w.contains(date(2011, 5, 15)))
    self.assertTrue(w.contains(date(2011, 5, 16)))
    self.assertTrue(w.contains(date(2011, 5, 22)))
    self.assertFalse(w.contains(date(2011, 5, 23)))
def get_year_ago(self, dia=None, yearsago=None):
    """Return the ISO-week-aligned date ``years_ago`` years before ``dia``.

    Rather than subtracting calendar years, this keeps the same ISO week
    number and weekday in the earlier year, then lets
    ``ensure_same_day_scenario()`` apply any correction.

    Args:
        dia: Reference date (defaults to ``self.day_present``).
        yearsago: Years to go back (defaults to ``self.years_ago``).

    Returns:
        datetime: The computed past day at midnight; also stored on
        ``self.day_year_ago``.
    """
    if not dia:
        dia = self.day_present
    years_ago = self.years_ago
    if yearsago:
        years_ago = int(yearsago)
    logger.debug("Calculating {} year ago for {}".format(years_ago, dia))
    current = dia.isocalendar()  # (ISO year, ISO week, ISO weekday)
    year = current[0]
    week = current[1]
    # isocalendar() weekdays are 1-based (Monday=1); Week.day() wants 0-based.
    weekday = current[2] - 1
    week_current = Week(year, week)
    # Same ISO week number, `years_ago` ISO years earlier.
    week_past = week_current.replace(year - years_ago, week)
    # NOTE(review): ensure_same_day_scenario() appears to return a corrected
    # date, or a falsy value when no correction is needed — confirm.
    past_new = self.ensure_same_day_scenario(
        week_current, week_past, weekday)
    if past_new:
        self.day_year_ago = datetime.combine(past_new, datetime.min.time())
    else:
        self.day_year_ago = datetime.combine(
            week_past.day(weekday), datetime.min.time())
    logger.debug("SUMMARY")
    logger.debug(" - Present day: {}".format(week_current.day(weekday)))
    logger.debug(" - Past day ini: {}".format(week_past.day(weekday)))
    logger.debug(
        " - Past day correction: {}".format(self.day_year_ago.strftime("%Y-%m-%d")))
    logger.info("{} year ago from {} was {}"
                .format(years_ago, self.day_present.strftime(
                    "%Y-%m-%d"), self.day_year_ago.strftime("%Y-%m-%d")))
    return self.day_year_ago
def rentrakHandler():
    """Audit rentrak/HPE S3 drops against their processed ("clean") output.

    For every datasource in the module-level ``vendors`` config: list the
    raw and clean S3 prefixes, diff the sets of available dates, and write
    two Excel reports — unprocessed dates (present in raw but not clean)
    and missing dates (expected by cadence but absent from both).

    Relies on module-level globals: ``vendors``, ``keywords``, ``client``,
    ``unprocessedWB``/``unprocessedWB_out``, ``missingWB``/``missingWB_out``
    and the helper functions used below — confirm against the full module.
    """
    startUnPRow = 1   # next free row in the "unprocessed" workbook sheet
    startMissRow = 1  # next free row in the "missing" workbook sheet
    for datasource in vendors['data']:
        keywords['datasource'] = datasource.get('datasource')
        keywords['cadence'] = datasource.get('cadence')
        for args in datasource.get('metadata'):
            # Earliest date to audit from; falls back to 2016-01-01.
            if args.get('arrival_start_date'):
                start_date = getStartDate(args.get('arrival_start_date'))
            else:
                start_date = date(2016, 1, 1)
            missingHispDatesSet = []
            missingDatesSet = []
            keywords['type'] = args.get('type')
            keywords['country'] = args.get('country')
            # Namespaces each holding a `general` and a `hispanic` date set.
            RawAvailableDatesSet = ComscoreIntlTransform.namedtuple_with_defaults(
                'RawAvailableDatesSet', 'general hispanic')
            CleanAvailableDatesSet = ComscoreIntlTransform.namedtuple_with_defaults(
                'CleanAvailableDatesSet', 'general hispanic')
            AvailableDatesSet = ComscoreIntlTransform.namedtuple_with_defaults(
                'AvailableDatesSet', 'general hispanic')
            UnprocessedDatesSet = ComscoreIntlTransform.namedtuple_with_defaults(
                'UnprocessedDatesSet', 'general hispanic')
            MissedDatesSet = ComscoreIntlTransform.namedtuple_with_defaults(
                'MissedDatesSet', 'general hispanic')
            RawAvailableDatesSet.general = set()
            RawAvailableDatesSet.hispanic = set()
            CleanAvailableDatesSet.general = set()
            CleanAvailableDatesSet.hispanic = set()
            args['cleanFlag'] = False
            args['vendor'] = keywords.get('vendor')
            response = dict()
            cumulativeResponse = []
            # --- Raw side: list every raw S3 prefix and collect contents. ---
            for rawInfo in args.get('raw'):
                rawBucket = rawInfo.split('/')[0]
                # Everything after the first '/' is the key prefix.
                rawPrefix = rawInfo.replace('/', '%', 1).split('%')[1]
                response = client.list_objects_v2(Bucket=rawBucket,
                                                  Prefix=rawPrefix,
                                                  Delimiter='/')
                cumulativeResponse.append(
                    getFinalContentFromResponse(response, rawBucket))
            finalContentList = []
            for response in cumulativeResponse:
                AvailableDatesSet = ComscoreIntlTransform.getDictListForHPEOriginal(
                    response, **args)
                RawAvailableDatesSet.general.update(AvailableDatesSet.general)
                # Only rentrak delivers a separate hispanic feed.
                if keywords.get('vendor') == 'rentrak':
                    RawAvailableDatesSet.hispanic.update(
                        AvailableDatesSet.hispanic)
            # Keep cleanFlag false for rentrak and hpe.
            args['cleanFlag'] = False
            cumulativeResponse = []
            # --- Clean side: same listing over the processed prefixes. ---
            for cleanInfo in args.get('clean'):
                cleanBucket = cleanInfo.split('/')[0]
                cleanPrefix = cleanInfo.replace('/', '%', 1).split('%')[1]
                response = client.list_objects_v2(Bucket=cleanBucket,
                                                  Prefix=cleanPrefix,
                                                  Delimiter='/')
                cumulativeResponse.append(
                    getFinalContentFromResponse(response, cleanBucket))
            finalContentList = []
            for response in cumulativeResponse:
                AvailableDatesSet = ComscoreIntlTransform.getDictListForHPEOriginal(
                    response, **args)
                CleanAvailableDatesSet.general.update(
                    AvailableDatesSet.general)
                # NOTE(review): this guard tests the (initially empty) clean
                # hispanic set itself, so it can never become non-empty here.
                # The raw branch above keys on vendor == 'rentrak' instead —
                # this looks like a bug; confirm intended condition.
                if CleanAvailableDatesSet.hispanic:
                    CleanAvailableDatesSet.hispanic.update(
                        AvailableDatesSet.hispanic)
            # Unprocessed = arrived in raw but never produced in clean.
            UnprocessedDatesSet.general = RawAvailableDatesSet.general - CleanAvailableDatesSet.general
            if RawAvailableDatesSet.hispanic or CleanAvailableDatesSet.hispanic:
                UnprocessedDatesSet.hispanic = RawAvailableDatesSet.hispanic - CleanAvailableDatesSet.hispanic
            currentSheet = unprocessedWB.get_sheet_by_name(
                keywords.get('vendor'))
            # check if unprocessed dates set has only 1 element
            keywords['type'] = args.get('type')
            startUnPRow = rowWriter(currentSheet,
                                    sorted(UnprocessedDatesSet.general),
                                    startUnPRow, **keywords)
            # unprocessedWB.save(unprocessedWB_out)
            if RawAvailableDatesSet.hispanic or CleanAvailableDatesSet.hispanic:
                if UnprocessedDatesSet.hispanic:
                    keywords['type'] = 'hispanic'
                    startUnPRow = rowWriter(
                        currentSheet, sorted(UnprocessedDatesSet.hispanic),
                        startUnPRow, **keywords)
            unprocessedWB.save(unprocessedWB_out)
            # --- Missing dates: expected range (by cadence) minus anything
            # --- seen in either raw or clean.
            if datasource.get('cadence') == "daily":
                missingDatesSet = []
                missingDatesSet = set(
                    d_range(start_date, date.today(), datasource.get(
                        'cadence'))) - (RawAvailableDatesSet.general.union(
                            CleanAvailableDatesSet.general))
                if RawAvailableDatesSet.hispanic or CleanAvailableDatesSet.hispanic:
                    # Hispanic feed is audited from its fixed 2016-01-01 start.
                    missingHispDatesSet = set(
                        d_range(date(2016, 1, 1), date.today(),
                                datasource.get('cadence'))) - (
                                    RawAvailableDatesSet.hispanic.union(
                                        CleanAvailableDatesSet.hispanic))
            if datasource.get('cadence') == 'weekly':
                missingDatesSet = []
                weeksRange = []
                rawAvailableWeeksSet = getWeeksSet(
                    RawAvailableDatesSet.general)
                cleanAvailableWeeksSet = getWeeksSet(
                    CleanAvailableDatesSet.general)
                weeks_set = set(w_range(start_date))
                missingWeeksSet = weeks_set - (
                    rawAvailableWeeksSet.union(cleanAvailableWeeksSet))
                # Report each missing week by its Monday.
                # (Week.day(w, 0) calls the instance method unbound.)
                for missingWeeks in missingWeeksSet:
                    missingDatesSet.append(Week.day(missingWeeks, 0))
                if RawAvailableDatesSet.hispanic or CleanAvailableDatesSet.hispanic:
                    rawAvailableWeeksSet = getWeeksSet(
                        RawAvailableDatesSet.hispanic)
                    cleanAvailableWeeksSet = getWeeksSet(
                        CleanAvailableDatesSet.hispanic)
                    missingWeeksSet = weeks_set - (
                        rawAvailableWeeksSet.union(cleanAvailableWeeksSet))
                    for missingWeeks in missingWeeksSet:
                        missingHispDatesSet.append(Week.day(missingWeeks, 0))
            if datasource.get('cadence') == 'monthly':
                missingDatesSet = []
                availableMonthsList = getMonthsSet(
                    RawAvailableDatesSet.general.union(
                        CleanAvailableDatesSet.general))
                monthsRange = getMonthsRange(start_date=start_date)
                missingMonthsSet = set(monthsRange) - set(availableMonthsList)
                # Report each missing 'YYYY-MM' as the first of that month.
                for yearMonth in missingMonthsSet:
                    missingDate = yearMonth + '-01'
                    missingDate = datetime.date(
                        datetime.strptime(missingDate, '%Y-%m-%d'))
                    missingDatesSet.append(missingDate)
                if RawAvailableDatesSet.hispanic or CleanAvailableDatesSet.hispanic:
                    availableMonthsList = getMonthsSet(
                        RawAvailableDatesSet.hispanic.union(
                            CleanAvailableDatesSet.hispanic))
                    missingMonthsSet = set(monthsRange) - set(
                        availableMonthsList)
                    for yearMonth in missingMonthsSet:
                        missingDate = yearMonth + '-01'
                        missingDate = datetime.date(
                            datetime.strptime(missingDate, '%Y-%m-%d'))
                        missingHispDatesSet.append(missingDate)
            currentSheet = missingWB.get_sheet_by_name(keywords.get('vendor'))
            keywords['type'] = args.get('type')
            startMissRow = rowWriter(currentSheet, sorted(missingDatesSet),
                                     startMissRow, **keywords)
            missingWB.save(missingWB_out)
            if missingHispDatesSet:
                keywords['type'] = 'hispanic'
                startMissRow = rowWriter(currentSheet,
                                         sorted(missingHispDatesSet),
                                         startMissRow, **keywords)
                missingWB.save(missingWB_out)
        print("Done ......." + datasource.get('datasource'))
def scrapper(company: str, week: int, year: int, url: str) -> "str | None":
    """Scrape a railroad investor page for the weekly-carloads PDF link.

    Args:
        company: One of "csx", "union", "kansas_city_southern",
            "canadian_national", "bnsf", "norfolk_southern",
            "canadian_pacific" — selects the site-specific parsing branch.
        week: ISO week number of the wanted report.
        year: Year of the wanted report.
        url: Page to load with the headless browser.

    Returns:
        The PDF URL (absolute for most companies), or None when no match
        is found (always None for "canadian_national").
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options,
                               executable_path=conf.CHROME_DRIVER_PATH)
    browser.get(url)
    generated_html = browser.page_source
    soup = BeautifulSoup(generated_html, "html.parser")
    if company == "csx":
        links = soup.find_all("a", class_="module_link")
        # Page is JS-rendered; reload until all 53 weekly links appear.
        # NOTE(review): no retry cap — loops forever if the page changes.
        while len(links) < 53:
            browser.get(url)
            generated_html = browser.page_source
            soup = BeautifulSoup(generated_html, "html.parser")
            links = soup.find_all("a", class_="module_link")
            sleep(1)
        for i in links:
            # Link text looks like "<year> Week <week>" — confirm format.
            scrap_data = i.span.text.split()
            scrap_year = scrap_data[0]
            scrap_week = scrap_data[2]
            if scrap_year == str(year) and scrap_week == str(week):
                link = i["href"]
                log(log.INFO, "Found pdf link: [%s]", link)
                return link
        log(log.WARNING, "Links not found")
        return None
    elif company == "union":
        links = soup.find_all("a", class_="pdf")
        for i in links:
            # Second token of the link text is the week number.
            scrap_data = i.text.split()
            scrap_week = scrap_data[1]
            if str(week) == scrap_week:
                link = "https://www.up.com" + i["href"]
                log(log.INFO, "Found pdf link: [%s]", link)
                return link
        log(log.WARNING, "Links not found")
        return None
    elif company == "kansas_city_southern":
        links = soup.find_all("a", class_="ext-link")
        for i in links:
            # KCS zero-pads the week in its URLs (rebinds `week` to str).
            if len(str(week)) == 1:
                week = f"0{week}"
            # href path segment 6 encodes "...-<week>-...-<year>".
            scrap_data = i.attrs["href"].split("/")[6]
            scrap_date = scrap_data.split("-")
            scrap_week = scrap_date[1]
            scrap_year = scrap_date[4]
            if str(week) == scrap_week and str(year) == scrap_year:
                link = "https://investors.kcsouthern.com" + i.attrs["href"]
                log(log.INFO, "Found pdf link: [%s]", link)
                return link
        log(log.WARNING, "Links not found")
        return None
    elif company == "canadian_national":
        # No scraping implemented for CN.
        return None
    elif company == "bnsf":
        links = soup.find_all("a", class_="local-link")
        links_pdf = []
        # Keep only .pdf links whose text carries an MM/DD/YYYY date.
        for link in links:
            a = link.attrs["href"].split("/")[-1].split(".")[-1]
            text = link.text
            match = re.search(r"\d{2}\/\d{2}\/\d{4}", text)
            if a == "pdf" and match:
                links_pdf.append(link)
        for i in links_pdf:
            scrap_date = i.text.split("/")
            # `date` here shadows the datetime.date/Week names used elsewhere.
            date = datetime(
                month=int(scrap_date[0]),
                day=int(scrap_date[1]),
                year=int(scrap_date[2]),
            )
            scrap_week = date.isocalendar()[1]
            if week == scrap_week and int(scrap_date[2]) == year:
                link = "http://www.bnsf.com" + i["href"]
                log(log.INFO, "Found pdf link: [%s]", link)
                return link
        log(log.WARNING, "Links not found")
        return None
    elif company == "norfolk_southern":
        # NS publishes one link per month; pick by the month of the week's
        # Monday (assumes page order matches months — TODO confirm).
        date = Week(year, week)
        month = date.day(0).month
        tags = soup.find_all("a")
        log(log.INFO, "Get all links Norfolk Southern")
        link = [
            link.attrs["href"]
            for link in tags
            if f"weekly-performance-reports/{year}/investor-weekly-carloads"
            in link.attrs["href"]
        ]
        if not link:
            log(log.WARNING, "Links not found")
            return None
        log(log.INFO, "Get link with pdf for Norfolk Southern")
        link = "http://www.nscorp.com" + link[month-1]
        log(log.INFO, "Found pdf link: [%s]", link)
        return link
    elif company == 'canadian_pacific':
        tags = soup.find_all('a', class_="button-link")
        # JS-rendered page: reload until exactly the two expected buttons
        # appear. NOTE(review): no retry cap — can loop forever.
        while len(tags) != 2:
            browser.get(url)
            generated_html = browser.page_source
            soup = BeautifulSoup(generated_html, "html.parser")
            tags = soup.find_all('a', class_="button-link")
            sleep(1)
        link = tags[0].attrs["href"]
        # URL path segments 6/7/8 are year/month/day of the report.
        date = link.split("/")
        scrap_week = datetime(year=int(date[6]),
                              month=int(date[7]),
                              day=int(date[8])).isocalendar()[1]
        if week == scrap_week and int(date[6]) == year:
            log(log.INFO, "Found pdf link: [%s]", link)
            return link
        log(log.WARNING, "Links not found")
        return None
def processExecute(vendors, inputStartDate, inputEndDate, **keywords):
    """Audit HPE raw S3 drops against clean output over a date window.

    For each datasource/metadata entry and country: list the raw prefixes
    month by month, group the found dates per source via hpeRawFileHandler,
    diff them against the clean prefixes, and write "unprocessed" and
    "missing" rows into the two Excel workbooks.

    Args:
        vendors: Config dict with a 'data' list of datasource entries.
        inputStartDate: Lower bound for the audit window (date).
        inputEndDate: Upper bound for the audit window (date).
        **keywords: Carries vendor/cadence/type/country/regex context into
            the row writers.

    Relies on module-level globals: ``client``, ``unprocessedWB_out``,
    ``missingWB_out`` and the helper modules used below — confirm against
    the full module. (Nesting below is reconstructed from collapsed
    source; verify loop extents against the original file.)
    """
    startUnPRow = 1   # next free row in the "unprocessed" sheet
    startMissRow = 1  # next free row in the "missing" sheet
    WBs = ExcelUtilities.loadWorbook()
    unprocessedWB = WBs.unprocessed
    missingWB = WBs.missing
    for datasource in vendors['data']:
        keywords['cadence'] = datasource.get('cadence')
        vendor = keywords.get('vendor')
        for args in datasource.get('metadata'):
            # Effective window: never earlier than the feed's arrival date
            # (or 2017-01-01) and never earlier than the requested start.
            if args.get('arrival_start_date'):
                arrival_date = args.get('arrival_start_date')
                arrival_date = datetime.date(
                    datetime.strptime(arrival_date, "%Y-%m-%d"))
                start_date = GeneralUtils.getStartDate(
                    max(arrival_date, inputStartDate))
            else:
                start_date = GeneralUtils.getStartDate(
                    max(inputStartDate, date(2017, 1, 1)))
            end_date = GeneralUtils.getEndDate(inputEndDate)
            missingDatesSet = []
            # 'YYYY-MM' strings covering the window; used to build prefixes.
            yearMonthsRangeList = GeneralUtils.getMonthsRange(
                start_date, end_date)
            keywords['type'] = args.get('type')
            country_list = args.get('country')
            keywords['regex'] = args.get('regex')
            for country in country_list:
                keywords['country'] = country
                # Namespaces holding one `general` date set each.
                RawAvailableDatesSet = GeneralUtils.namedtuple_with_defaults(
                    'RawAvailableDatesSet', 'general')
                CleanAvailableDatesSet = GeneralUtils.namedtuple_with_defaults(
                    'CleanAvailableDatesSet', 'general')
                UnprocessedDatesSet = GeneralUtils.namedtuple_with_defaults(
                    'UnprocessedDatesSet', 'general')
                RawAvailableDatesSet.general = set()
                CleanAvailableDatesSet.general = set()
                args['vendor'] = vendor
                response = dict()
                cumulativeResponse = []
                # --- Raw side: one S3 listing per (prefix, month). ---
                for rawInfo in args.get('raw'):
                    for yearMonth_prefix in yearMonthsRangeList:
                        rawBucket = rawInfo.split('/')[0]
                        # Everything after the first '/' is a key template.
                        raw = rawInfo.replace('/', '%', 1).split('%')[1]
                        subs_value = {
                            'country': country,
                            'year': yearMonth_prefix.split('-')[0],
                            'month': yearMonth_prefix.split('-')[1]
                        }
                        rawPrefix = GeneralUtils.prefixBuilder(
                            raw, **subs_value)
                        response = client.list_objects_v2(Bucket=rawBucket,
                                                          Prefix=rawPrefix,
                                                          Delimiter='/')
                        cumulativeResponse.append(
                            S3Utilities.getFinalContentFromResponse(
                                client, response, rawBucket))
                        # Reset the module-level accumulator between listings.
                        S3Utilities.finalContentList = []
                # finalContentList = []
                flat_cumulativeResponse = [
                    item for sublist in cumulativeResponse for item in sublist
                ]
                # Map each source name to the set of raw dates found for it.
                AvailableDatesAndSourceDict = hpeRawFileHandler(
                    flat_cumulativeResponse, keywords.get('regex'))
                for source, dates in AvailableDatesAndSourceDict.items():
                    keywords['datasource'] = source
                    RawAvailableDatesSet.general.update(dates)
                    cumulativeResponse = []
                    # --- Clean side for this source, month by month. ---
                    for cleanInfo in args.get('clean').get(source):
                        for yearMonth_prefix in yearMonthsRangeList:
                            cleanBucket = cleanInfo.split('/')[0]
                            clean = cleanInfo.replace('/', '%',
                                                      1).split('%')[1]
                            subs_value = {
                                'country': country.lower(),
                                'year': yearMonth_prefix.split('-')[0],
                                'month': yearMonth_prefix.split('-')[1]
                            }
                            cleanPrefix = GeneralUtils.prefixBuilder(
                                clean, **subs_value)
                            response = client.list_objects_v2(
                                Bucket=cleanBucket,
                                Prefix=cleanPrefix,
                                Delimiter='/')
                            # finalContentList = []
                            AvailableDatesSet = hpeCleanFileHandler(response)
                            CleanAvailableDatesSet.general.update(
                                AvailableDatesSet.general)
                    # Unprocessed = raw minus clean, clipped to the window.
                    UnprocessedDatesSet.general = RawAvailableDatesSet.general - CleanAvailableDatesSet.general
                    UnprocessedDatesSet.general = GeneralUtils.getFilteredDates(
                        UnprocessedDatesSet.general, start_date, end_date)
                    currentSheet = unprocessedWB[vendor]
                    # check if unprocessed dates set has only 1 element
                    keywords['type'] = args.get('type')
                    startUnPRow = hpeRowWriter(
                        currentSheet, sorted(UnprocessedDatesSet.general),
                        startUnPRow, **keywords)
                    unprocessedWB.save(unprocessedWB_out)
                    # --- Missing dates: expected range by cadence minus
                    # --- anything seen in either raw or clean.
                    if datasource.get('cadence') == "daily":
                        missingDatesSet = set(
                            GeneralUtils.d_range(start_date, end_date)) - (
                                RawAvailableDatesSet.general.union(
                                    CleanAvailableDatesSet.general))
                    if datasource.get('cadence') == 'weekly':
                        missingDatesSet = []
                        weeks_set = set(
                            GeneralUtils.w_range(start_date=start_date,
                                                 end_date=end_date))
                        rawAvailableWeeksSet = GeneralUtils.getWeeksSet(
                            RawAvailableDatesSet.general)
                        cleanAvailableWeeksSet = GeneralUtils.getWeeksSet(
                            CleanAvailableDatesSet.general)
                        missingWeeksSet = weeks_set - (
                            rawAvailableWeeksSet.union(cleanAvailableWeeksSet))
                        # Report each missing week by its Monday.
                        # (Week.day(w, 0) calls the instance method unbound.)
                        for missingWeeks in missingWeeksSet:
                            missingDatesSet.append(Week.day(missingWeeks, 0))
                    if datasource.get('cadence') == 'monthly':
                        missingDatesSet = []
                        monthsRange = GeneralUtils.getMonthsRange(
                            start_date=start_date, end_date=end_date)
                        availableMonthsList = GeneralUtils.getMonthsSet(
                            RawAvailableDatesSet.general.union(
                                CleanAvailableDatesSet.general))
                        missingMonthsSet = set(monthsRange) - set(
                            availableMonthsList)
                        # Report each missing 'YYYY-MM' as the 1st of month.
                        for yearMonth in missingMonthsSet:
                            missingDate = yearMonth + '-01'
                            missingDate = datetime.date(
                                datetime.strptime(missingDate, '%Y-%m-%d'))
                            missingDatesSet.append(missingDate)
                    currentSheet = missingWB[vendor]
                    keywords['type'] = args.get('type')
                    startMissRow = hpeRowWriter(currentSheet,
                                                sorted(missingDatesSet),
                                                startMissRow, **keywords)
                    missingWB.save(missingWB_out)
        print("Done ......." + datasource.get('datasource'))
date(2017, 1, 1), date.today(), datasource.get('cadence'))) - ( rawAvailableDatesSet.union(cleanAvailableDatesSet)) # compiled_List.append(missingDatesSet) if datasource.get('cadence') == 'weekly': missingDatesSet = [] weeksRange = [] rawAvailableWeeksSet = getWeeksSet(rawAvailableDatesSet) cleanAvailableWeeksSet = getWeeksSet(cleanAvailableDatesSet) weeks_set = set(w_range(date(2017, 1, 1))) missingWeeksSet = weeks_set - ( rawAvailableWeeksSet.union(cleanAvailableWeeksSet)) for missingWeeks in missingWeeksSet: missingDatesSet.append(Week.day(missingWeeks, 0)) if datasource.get('cadence') == 'monthly': missingDatesSet = [] availableMonthsList = getMonthsSet( rawAvailableDatesSet.union(cleanAvailableDatesSet)) monthsRange = getMonthsRange(start_date=date(2017, 1, 1)) missingMonthsSet = set(monthsRange) - set(availableMonthsList) for yearMonth in missingMonthsSet: missingDate = yearMonth + '-01' missingDate = datetime.date( datetime.strptime(missingDate, '%Y-%m-%d')) missingDatesSet.append(missingDate) currentSheet = missingWB.get_sheet_by_name(keywords.get('vendor'))