def convert_site_colorspace():
    """Create a greyscale copy of every RGB screenshot that does not already have one."""
    filename = f"convert_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    session = Session()
    screenshots = session.query(Screenshot).filter_by(type=ScreenshotEnum.RGB)
    for screenshot in screenshots:
        # Skip this screenshot if its site already has a greyscale version.
        site_screenshots = session.query(Screenshot).filter_by(
            site_id=screenshot.site_id)
        already_converted = False
        for sc in site_screenshots:
            if sc.type == ScreenshotEnum.GREYSCALE:
                print('Found greyscale version of screenshot.')
                f.write(
                    f"Found greyscale version of screenshot. Skipping (id={sc.id})\n\n"
                )
                already_converted = True
                break
        if already_converted:
            continue
        site_name = session.query(Site).get(screenshot.site_id).name
        print(f"Converting screenshot of site {site_name} from RGB to GREYSCALE")
        f.write(f"Converting screenshot of site {site_name} from RGB to GREYSCALE \n")
        path = to_greyscale(screenshot.path, site_name)
        greyscale_screenshot = Screenshot(site_id=screenshot.site_id,
                                          type=ScreenshotEnum.GREYSCALE,
                                          path=path)
        session.add(greyscale_screenshot)
        session.commit()
        print(f"Finished conversion of {site_name} from RGB to GREYSCALE")
        f.write(f"Finished conversion of {site_name} from RGB to GREYSCALE \n\n")
    session.close()
    f.close()
def parse_collected_data():
    """Parse every unparsed raw API response into ParsedResponse rows."""
    session = Session()
    responses = session.query(Response).filter_by(parsed=False)
    filename = f"parsed_response_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for response in responses:
        response_id = response.id
        print(f"Beginning response number {response_id}")
        f.write(f"Beginning response number {response_id} \n")
        for entry in parse_response(response.response):
            url = entry['DataUrl']
            print(f"Beginning site {url}")
            f.write(f"Beginning site {url}: ")
            rank = entry['Global']['Rank']
            reach_per_million = entry['Country']['Reach']['PerMillion']
            page_views_per_million = entry['Country']['PageViews']['PerMillion']
            page_views_per_user = entry['Country']['PageViews']['PerUser']
            f.write(
                f"Rank: {rank}, Reach/Million: {reach_per_million}, "
                f"Page Views/Million: {page_views_per_million}, "
                f"Page Views/User: {page_views_per_user} \n"
            )
            parsed_response = ParsedResponse(
                response_id=response_id,
                url=url,
                rank=rank,
                reach_per_million=reach_per_million,
                page_views_per_million=page_views_per_million,
                page_views_per_user=page_views_per_user)
            session.add(parsed_response)
            session.commit()
        # Mark the raw response as parsed so it is not processed again.
        response.parsed = True
        print(f"Finished parsing response number {response_id}")
        f.write(f"Finished parsing response number {response_id} \n\n")
        session.commit()
    session.close()
    f.close()
def convert_parsed_to_site():
    """Create a Site row for every parsed response."""
    session = Session()
    parsed_responses = session.query(ParsedResponse).all()
    filename = f"parsed_to_sites_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for response in parsed_responses:
        print(f"Beginning response {response.id}")
        f.write(f"Beginning response {response.id}: \n")
        # Derive the site name from the host, e.g. "example.com" -> "example_com".
        site_name = "_".join(response.url.split("."))
        site = Site(name=site_name, host=response.url)
        print(f"Parsing site {site_name} at host {response.url}")
        f.write(f"Parsing site {site_name} at host {response.url} \n\n")
        session.add(site)
        session.commit()
        print(f"Finished parsing response number {response.id}")
        f.write(f"Finished parsing response number {response.id} \n\n")
    session.close()
    f.close()
def collect_aws_data():
    """Query the API in batches of `interval` sites and store each raw response."""
    session = Session()
    limit = 1001
    lower = 1
    interval = 100
    for _ in range(lower, limit, interval):
        # The next start position is persisted on disk so collection can resume.
        with open(QUERY_START_PATH, 'r') as start_file:
            query_start = int(start_file.readline())
        print(f"Querying sites at count {query_start}")
        if query_start >= limit:
            print("Exceeded limit")
            break
        aws_request_url = query_api_url(start=query_start, count=interval)
        aws_res = make_api_request(aws_request_url)
        log_query_response(aws_res.json())
        response = Response(query=aws_request_url, response=aws_res.json())
        session.add(response)
        session.commit()
        with open(QUERY_START_PATH, 'w') as start_file:
            start_file.write(str(query_start + interval))
    session.close()
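

# One possible end-to-end run of the pipeline (a sketch, not a prescribed entry point):
# collect raw responses, parse them, materialise Site rows, then convert screenshots.
# It assumes screenshots are captured by a separate step before convert_site_colorspace()
# has anything to convert.
if __name__ == "__main__":
    collect_aws_data()
    parse_collected_data()
    convert_parsed_to_site()
    convert_site_colorspace()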