def copy_unique_screenshots():
    filename = 'unique_domains.log'
    session = Session()
    f = open(f'{STORAGE_LOGS_PATH}/{filename}', 'r')
    log_filename = f"copy_unique_screenshots_{file_safe_timestamp()}.log"
    log = open(f"{STORAGE_LOGS_PATH}/{log_filename}", "x")
    log.write(str(datetime.datetime.now()) + "\n\n")
    unique_domains = f.readlines()
    for domain in unique_domains:
        # Strip the trailing newline left by readlines()
        domain = domain.rstrip('\n')
        print(f"Domain {domain}")
        log.write(f"Domain {domain}\n")
        site = session.query(Site).filter_by(host=domain).first()
        screenshot = session.query(Screenshot).filter_by(
            type=ScreenshotEnum.GREYSCALE, site_id=site.id).first()
        copyfile(screenshot.path, f"{CLUSTER_DATA_PATH}/{site.name}.png")
        print(f"Copied greyscale screenshot to {CLUSTER_DATA_PATH}/{site.name}.png")
        log.write(f"Copied greyscale screenshot to {CLUSTER_DATA_PATH}/{site.name}.png\n")
    f.close()
    print("Finished copying screenshots")
    log.write("Finished copying screenshots\n\n")
    log.close()
    session.close()
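# `file_safe_timestamp()` is used throughout this module to name log files but
# is defined elsewhere. The sketch below (hypothetical name
# `_file_safe_timestamp_sketch`) shows one plausible implementation: a
# timestamp containing no characters that are invalid in filenames. The
# project's real helper may format the string differently.
def _file_safe_timestamp_sketch():
    # e.g. "2024-01-31_15-04-05" -- avoids ':' and spaces in filenames
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")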
def process_sites():
    session = Session()
    sites = session.query(Site).filter_by(processed=False)
    for site in sites:
        log_filename = f"screenshot_{file_safe_timestamp()}.log"
        driver = Driver(log_filename)
        driver.run(site, session)
        driver.quit()
    session.close()
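# `Driver` is defined elsewhere; from its use here and in
# `reprocess_failed_sites` it is constructed with a log filename, driven via
# `run(site, session)`, and torn down with `quit()`, presumably wrapping a
# headless-browser screenshot run. A hypothetical skeleton of that interface,
# not the project's implementation:
class _DriverSketch:
    def __init__(self, log_filename):
        # The real class presumably opens the log and starts a browser here
        self.log_filename = log_filename

    def run(self, site, session):
        # The real class presumably visits site.host, captures a screenshot,
        # and records it through the given database session
        raise NotImplementedError("illustrative skeleton only")

    def quit(self):
        # The real class presumably closes the browser and the log file
        pass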
def convert_parsed_to_site():
    session = Session()
    data = session.query(ParsedResponse).all()
    filename = f"parsed_to_sites_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for response in data:
        print(f"Beginning response {response.id}")
        f.write(f"Beginning response {response.id}: \n")
        # Build a filesystem-safe site name from the host, e.g. example.com -> example_com
        site_name = '_'.join(response.url.split('.'))
        site = Site(name=site_name, host=response.url)
        print(f"Parsing site {site_name} at host {response.url}")
        f.write(f"Parsing site {site_name} at host {response.url} \n\n")
        session.add(site)
        session.commit()
        print(f"Finished parsing response number {response.id}")
        f.write(f"Finished parsing response number {response.id} \n\n")
    session.close()
    f.close()
def reprocess_failed_sites():
    session = Session()
    failed_screenshots = session.query(Screenshot).filter_by(failed=True)
    for screenshot in failed_screenshots:
        site = session.query(Site).get(screenshot.site_id)
        log_filename = f"screenshot_{file_safe_timestamp()}.log"
        driver = Driver(log_filename)
        driver.run(site, session)
        driver.quit()
    session.close()
def verify_db_integrity():
    session = Session()
    # Every site should have at least one screenshot; print the ids that do not
    sites = session.query(Site).all()
    for site in sites:
        result = session.query(Screenshot).filter_by(site_id=site.id).first()
        if not result:
            print(site.id)
    # Every greyscale screenshot should point at an existing site; print orphans
    screenshots = session.query(Screenshot).filter_by(
        type=ScreenshotEnum.GREYSCALE)
    for screenshot in screenshots:
        result = session.query(Site).filter_by(id=screenshot.site_id).first()
        if not result:
            print(screenshot.id)
    session.close()
def collect_aws_data():
    session = Session()
    limit = 1001
    lower = 1
    interval = 100
    for _ in range(lower, limit, interval):
        query_start = int(open(QUERY_START_PATH, 'r').readlines()[0])
        print(f"Querying sites at count {query_start}")
        if query_start >= limit:
            print("Exceeded limit")
            break
        aws_request_url = query_api_url(start=query_start, count=interval)
        aws_res = make_api_request(aws_request_url)
        log_query_response(aws_res.json())
        response = Response(query=aws_request_url, response=aws_res.json())
        session.add(response)
        session.commit()
        # Persist the next starting offset so an interrupted run can resume
        f = open(QUERY_START_PATH, 'w')
        f.write(str(query_start + interval))
        f.close()
    session.close()
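# `query_api_url()` and `make_api_request()` are defined elsewhere. Judging by
# the use above (the return value exposes `.json()`), `make_api_request` is
# assumed to be a thin wrapper around `requests.get`. A hypothetical sketch,
# not the project's actual implementation:
def _make_api_request_sketch(url):
    import requests  # assumes the `requests` package is installed
    res = requests.get(url)
    # Fail early on HTTP errors so a bad batch is never written to the database
    res.raise_for_status()
    return res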
def __init__(self):
    Base.metadata.create_all(Engine)
    self.session = Session()
def identify_layout_duplicates():
    filename = f"id_layout_duplicates_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    f.write(
        f"Layout duplication identification function, using threshold {SIMILARITY_THRESHOLD} for distance in image similarity"
    )
    session = Session()
    sites = session.query(Site).order_by(Site.host)
    # Bucket hosts by the number of '.' delimiters (e.g. example.com vs www.example.com)
    list_by_delimiters = {1: [], 2: [], 3: []}
    f.write(f"Initialize list by delimiters: {list_by_delimiters}\n\n")
    for site in sites:
        delimiter_count = site.host.count('.')
        list_by_delimiters[delimiter_count].append(site.host)
    f.write(f"Updated list by delimiters: {list_by_delimiters}\n\n")
    domains = __set_domain_keys__(list_by_delimiters)
    f.write(f"Domains by keys: {domains}\n\n")
    domains = __set_domain_values__(domains, list_by_delimiters)
    f.write(f"Domains with values: {domains}\n\n")
    unique_domains = []
    pre_filter_count = 0
    for domain_group in domains.values():
        pre_filter_count += len(domain_group)
        if len(domain_group) == 1:
            print(f"Unique domain {domain_group[0]}")
            f.write(f"Unique domain {domain_group[0]}\n")
            unique_domains.append(domain_group[0])
        elif len(domain_group) == 0:
            print("No domain found, skipping")
            f.write("No domain found, skipping\n")
        else:
            filtered_domains = []
            for domain in domain_group:
                filtered_domains.append(
                    session.query(Site).filter_by(host=domain).first().id)
            # Make sure that the filtered domains are sorted; the lowest site id
            # in the group is treated as the base layout
            filtered_domains = insertion_sort(filtered_domains)
            unique_domains.append(
                session.query(Site).filter_by(
                    id=filtered_domains[0]).first().host)
            base_domain = filtered_domains[0]
            print(f"Base domain {base_domain}\n")
            f.write(f"Base domain {base_domain}\n")
            base_domain_path = session.query(Screenshot).filter_by(
                site_id=base_domain).first().path
            for i in range(1, len(filtered_domains)):
                response = __determine_image_sim__(
                    base_domain_path,
                    session.query(Screenshot).filter_by(
                        site_id=filtered_domains[i]).first().path)
                parsed = json.loads(json.dumps(response))
                distance = parsed['output']['distance']
                print(f"Similarity distance: {distance}\n")
                f.write(f"Similarity distance: {distance}\n")
                # A distance at or above the threshold means the layouts differ,
                # so keep the host as unique
                if int(distance) >= int(SIMILARITY_THRESHOLD):
                    host = session.query(Site).filter_by(
                        id=filtered_domains[i]).first().host
                    print(f"Appending host {host}\n")
                    f.write(f"Appending host {host}\n")
                    unique_domains.append(host)
    post_filter_count = len(unique_domains)
    print(f"Pre filter count: {pre_filter_count}\n")
    f.write(f"Pre filter count: {pre_filter_count}\n")
    print(f"Post filter count: {post_filter_count}\n")
    f.write(f"Post filter count: {post_filter_count}\n")
    session.close()
    f.close()
    # Persist the final list of unique domains to its own log
    filename = f"unique_domains_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for d in unique_domains:
        f.write(f"{d}\n")
    print(unique_domains)
    f.close()
    return unique_domains
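# `insertion_sort()` above is expected to return the group's site ids in
# ascending order so the smallest id becomes the base layout. A standard
# insertion-sort sketch matching that contract; the project's version may sort
# in place or simply wrap `sorted()`:
def _insertion_sort_sketch(values):
    values = list(values)  # work on a copy so the caller's list is untouched
    for i in range(1, len(values)):
        key = values[i]
        j = i - 1
        # Shift larger elements one slot right until key's position is found
        while j >= 0 and values[j] > key:
            values[j + 1] = values[j]
            j -= 1
        values[j + 1] = key
    return values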
def convert_site_colorspace():
    filename = f"convert_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    session = Session()
    screenshots = session.query(Screenshot).filter_by(type=ScreenshotEnum.RGB)
    for screenshot in screenshots:
        # Skip sites that already have a greyscale screenshot
        existing = session.query(Screenshot).filter_by(
            site_id=screenshot.site_id,
            type=ScreenshotEnum.GREYSCALE).first()
        if existing:
            print('Found greyscale version of screenshot.')
            f.write(
                f"Found greyscale version of screenshot. Skipping (id={existing.id})\n\n")
            continue
        site_name = session.query(Site).get(screenshot.site_id).name
        print(f"Converting screenshot of site {site_name} from RGB to GREYSCALE")
        f.write(f"Converting screenshot of site {site_name} from RGB to GREYSCALE \n")
        path = to_greyscale(screenshot.path, site_name)
        greyscale_screenshot = Screenshot(site_id=screenshot.site_id,
                                          type=ScreenshotEnum.GREYSCALE,
                                          path=path)
        session.add(greyscale_screenshot)
        session.commit()
        print(f"Finished conversion of {site_name} from RGB to GREYSCALE")
        f.write(f"Finished conversion of {site_name} from RGB to GREYSCALE \n\n")
    session.close()
    f.close()
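# `to_greyscale(path, site_name)` is defined elsewhere; from its use above it
# takes the RGB screenshot path, writes a greyscale copy, and returns the new
# path. A hypothetical Pillow-based sketch; the output location and naming
# here are illustrative choices, not the project's:
def _to_greyscale_sketch(path, site_name):
    import os
    from PIL import Image  # assumes Pillow is installed
    # Write the greyscale copy next to the RGB original (illustrative choice)
    out_path = os.path.join(os.path.dirname(path), f"{site_name}_greyscale.png")
    Image.open(path).convert('L').save(out_path)  # 'L' = 8-bit greyscale
    return out_path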
def parse_collected_data():
    session = Session()
    data = session.query(Response).filter_by(parsed=False)
    filename = f"parsed_response_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for site in data:
        response_id = site.id
        print(f"Beginning response number {response_id}")
        f.write(f"Beginning response number {response_id} \n")
        for response in parse_response(site.response):
            url = response['DataUrl']
            print(f"Beginning site {url}")
            f.write(f"Beginning site {url}: ")
            rank = response['Global']['Rank']
            reach_per_million = response['Country']['Reach']['PerMillion']
            page_views_per_million = response['Country']['PageViews']['PerMillion']
            page_views_per_user = response['Country']['PageViews']['PerUser']
            f.write(
                f"Rank: {rank}, Reach/Million: {reach_per_million}, "
                f"Page Views/Million: {page_views_per_million}, "
                f"Page Views/User: {page_views_per_user} \n")
            parsed_response = ParsedResponse(
                response_id=response_id,
                url=url,
                rank=rank,
                reach_per_million=reach_per_million,
                page_views_per_million=page_views_per_million,
                page_views_per_user=page_views_per_user)
            session.add(parsed_response)
            session.commit()
        # Mark the raw response as parsed so it is not processed again
        session.query(Response).get(response_id).parsed = True
        print(f"Finished parsing response number {response_id}")
        f.write(f"Finished parsing response number {response_id} \n\n")
        session.commit()
    session.close()
    f.close()
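# Each record yielded by `parse_response()` above is expected to look roughly
# like the dict below. Only the keys actually read by `parse_collected_data`
# are shown; the values are made-up placeholders:
_EXAMPLE_PARSED_RECORD = {
    'DataUrl': 'example.com',
    'Global': {'Rank': 1234},
    'Country': {
        'Reach': {'PerMillion': 56},
        'PageViews': {'PerMillion': 78, 'PerUser': 2.1},
    },
}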