Example #1
def copy_unique_screenshots():
    # Copy the greyscale screenshot of every unique domain into the cluster data directory.
    filename = 'unique_domains.log'
    session = Session()
    f = open(f'{STORAGE_LOGS_PATH}/{filename}', 'r')
    log_filename = f"copy_unique_screenshots_{file_safe_timestamp()}.log"
    log = open(f"{STORAGE_LOGS_PATH}/{log_filename}", "x")
    log.write(str(datetime.datetime.now()) + "\n\n")
    unique_domains = f.readlines()
    for domain in unique_domains:
        domain = domain.rstrip('\n')

        print(f"Domain {domain}")
        log.write(f"Domain {domain}\n")
        site = session.query(Site).filter_by(host=domain).first()
        screenshot = session.query(Screenshot).filter_by(
            type=ScreenshotEnum.GREYSCALE, site_id=site.id).first()
        copyfile(screenshot.path, f"{CLUSTER_DATA_PATH}/{site.name}.png")
        print(
            f"Copied greyscale screenshot to {CLUSTER_DATA_PATH}/{site.name}.png"
        )
        log.write(
            f"Copied greyscale screenshot to {CLUSTER_DATA_PATH}/{site.name}.png\n"
        )
    f.close()
    print("Finished copying screenshots")
    log.write("Finished copying screenshots\n\n")
    log.close()
    session.close()
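These examples name their log files with a file_safe_timestamp() helper that is not shown. A minimal sketch, assuming it only needs to render the current time without characters that are invalid in filenames:

import datetime

def file_safe_timestamp():
    # Hypothetical helper: format the current time with filename-safe separators.
    return datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')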
Example #2
def process_sites():
    # Screenshot every site that has not been processed yet, using a fresh Driver per site.
    session = Session()

    sites = session.query(Site).filter_by(processed=False)

    for site in sites:
        log_filename = f"screenshot_{file_safe_timestamp()}.log"
        driver = Driver(log_filename)
        driver.run(site, session)
        driver.quit()

    session.close()
Example #3
def convert_parsed_to_site():
    # Create a Site row for every ParsedResponse, deriving the site name from its URL.
    session = Session()
    data = session.query(ParsedResponse).all()

    filename = f"parsed_to_sites_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")

    for response in data:
        print(f"Beginning response {response.id}")
        f.write(f"Beginning response {response.id}: \n")
        site_name = '_'.join(response.url.split('.'))
        site = Site(name=site_name, host=response.url)
        print(f"Parsing site {site_name} at host {response.url}")
        f.write(f"Parsing site {site_name} at host {response.url} \n\n")
        session.add(site)
        session.commit()
        print(f"Finished parsing response number {response.id}")
        f.write(f"Finished parsing response number {response.id} \n\n")

    session.close()
    f.close()
Example #4
def reprocess_failed_sites():
    # Re-run the screenshot Driver for every site whose previous attempt failed.
    session = Session()

    failed_screenshots = session.query(Screenshot).filter_by(failed=True)
    for failed_screenshot in failed_screenshots:
        site = session.query(Site).get(failed_screenshot.site_id)
        log_filename = f"screenshot_{file_safe_timestamp()}.log"
        driver = Driver(log_filename)
        driver.run(site, session)
        driver.quit()

    session.close()
Example #5
def verify_db_integrity():
    # Report sites that have no screenshot, then greyscale screenshots that
    # reference a site that no longer exists.
    session = Session()
    sites = session.query(Site).all()
    for site in sites:
        result = session.query(Screenshot).filter_by(site_id=site.id).first()
        if not result:
            print(f"Site {site.id} has no screenshot")

    screenshots = session.query(Screenshot).filter_by(
        type=ScreenshotEnum.GREYSCALE)
    for screenshot in screenshots:
        result = session.query(Site).filter_by(id=screenshot.site_id).first()
        if not result:
            print(f"Screenshot {screenshot.id} has no matching site")

    session.close()
Example #6
def collect_aws_data():
    # Query the AWS API in batches of `interval` sites, persisting the next
    # start index to QUERY_START_PATH so an interrupted run can resume.
    session = Session()
    limit = 1001
    lower = 1
    interval = 100

    for _ in range(lower, limit, interval):
        # The checkpoint file, not the loop counter, decides where each query starts.
        with open(QUERY_START_PATH, 'r') as start_file:
            query_start = int(start_file.readline())
        print(f"Querying sites at count {query_start}")
        if query_start >= limit:
            print("Exceeded limit")
            break
        aws_request_url = query_api_url(start=query_start, count=interval)
        aws_res = make_api_request(aws_request_url)
        log_query_response(aws_res.json())
        response = Response(query=aws_request_url, response=aws_res.json())
        session.add(response)
        session.commit()
        with open(QUERY_START_PATH, 'w') as start_file:
            start_file.write(str(query_start + interval))

    session.close()
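make_api_request() and query_api_url() are project helpers that are not shown here. A minimal sketch of the request side, assuming it is nothing more than a plain GET issued with the requests library:

import requests

def make_api_request(url):
    # Hypothetical sketch: send the GET request and fail loudly on HTTP errors.
    response = requests.get(url)
    response.raise_for_status()
    return response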
Example #7
def __init__(self):
    # Create any missing tables on the shared engine and open a session.
    Base.metadata.create_all(Engine)
    self.session = Session()
Example #8
def identify_layout_duplicates():
    # Group hosts that share a parent domain and keep only one host per set of
    # visually similar layouts, judged by screenshot similarity distance.
    filename = f"id_layout_duplicates_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    f.write(
        f"Layout duplicate identification, using threshold {SIMILARITY_THRESHOLD} for distance in image similarity\n\n"
    )
    session = Session()

    sites = session.query(Site).order_by(Site.host)

    # Bucket hosts by the number of '.' separators they contain (1-3 assumed).
    list_by_delimiters = {1: [], 2: [], 3: []}
    f.write(f"Initialize list by delimiters: {list_by_delimiters}\n\n")

    for site in sites:
        delimiter_count = site.host.count('.')
        list_by_delimiters[delimiter_count].append(site.host)
    f.write(f"Updated list by delimiters: {list_by_delimiters}\n\n")

    domains = __set_domain_keys__(list_by_delimiters)
    f.write(f"Domains by keys: {domains}\n\n")
    domains = __set_domain_values__(domains, list_by_delimiters)
    f.write(f"Domains with values: {domains}\n\n")

    unique_domains = []

    pre_filter_count = 0
    for domain_group in domains.values():
        pre_filter_count += len(domain_group)
        if len(domain_group) == 1:
            print(f"Unique domain {domain_group[0]}")
            f.write(f"Unique domain {domain_group[0]}\n")
            unique_domains.append(domain_group[0])
        elif len(domain_group) == 0:
            print("No domain found, skipping")
            f.write("No domain found, skipping\n")
        else:
            # Several hosts share this domain: compare their screenshots and
            # keep only those whose layout differs from the base host.
            filtered_domains = []
            for domain in domain_group:
                filtered_domains.append(
                    session.query(Site).filter_by(host=domain).first().id)

            # Make sure the filtered site ids are sorted so the lowest id
            # becomes the comparison base.
            filtered_domains = insertion_sort(filtered_domains)
            unique_domains.append(
                session.query(Site).filter_by(
                    id=filtered_domains[0]).first().host)
            base_domain = filtered_domains[0]
            print(f"Base domain {base_domain}\n")
            f.write(f"Base domain {base_domain}\n")
            base_domain_path = session.query(Screenshot).filter_by(
                site_id=base_domain).first().path
            for i in range(1, len(filtered_domains)):
                response = __determine_image_sim__(
                    base_domain_path,
                    session.query(Screenshot).filter_by(
                        site_id=filtered_domains[i]).first().path)
                parsed = json.loads(json.dumps(response))
                distance = parsed['output']['distance']
                print(f"Similarity distance: {distance}\n")
                f.write(f"Similarity distance: {distance}\n")
                if int(distance) >= int(SIMILARITY_THRESHOLD):
                    # Layouts differ enough to treat this host as unique too.
                    host = session.query(Site).filter_by(
                        id=filtered_domains[i]).first().host
                    print(f"Appending host {host}\n")
                    f.write(f"Appending host {host}\n")
                    unique_domains.append(host)

    post_filter_count = len(unique_domains)
    print(f"Pre filter count: {pre_filter_count}\n")
    f.write(f"Pre filter count: {pre_filter_count}\n")
    print(f"Post filter count: {post_filter_count}\n")
    f.write(f"Post filter count: {post_filter_count}\n")
    session.close()
    f.close()

    filename = f"unique_domains_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")
    for d in unique_domains:
        f.write(f"{d}\n")
    print(unique_domains)
    f.close()
    return unique_domains
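The insertion_sort() helper used above is not shown. A minimal sketch of a standard in-place insertion sort that also returns the list, matching how it is called here:

def insertion_sort(values):
    # Shift each element left until the list is ordered ascending.
    for i in range(1, len(values)):
        current = values[i]
        j = i - 1
        while j >= 0 and values[j] > current:
            values[j + 1] = values[j]
            j -= 1
        values[j + 1] = current
    return values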
Example #9
def convert_site_colorspace():
    # Convert every RGB screenshot to greyscale, skipping sites that already
    # have a greyscale version recorded.
    filename = f"convert_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")

    session = Session()
    screenshots = session.query(Screenshot).filter_by(type=ScreenshotEnum.RGB)
    for screenshot in screenshots:
        sc_check = session.query(Screenshot).filter_by(
            site_id=screenshot.site_id)
        has_greyscale = False
        for sc in sc_check:
            if sc.type == ScreenshotEnum.GREYSCALE:
                print('Found greyscale version of screenshot.')
                f.write(
                    f"Found greyscale version of screenshot. Skipping (id={sc.id})\n\n"
                )
                has_greyscale = True
        if not has_greyscale:
            site_name = session.query(Site).get(screenshot.site_id).name

            print(
                f"Converting screenshot of site {site_name} from RGB to GREYSCALE"
            )
            f.write(
                f"Converting screenshot of site {site_name} from RGB to GREYSCALE \n"
            )

            path = to_greyscale(screenshot.path, site_name)
            greyscale_screenshot = Screenshot(site_id=screenshot.site_id,
                                              type=ScreenshotEnum.GREYSCALE,
                                              path=path)
            session.add(greyscale_screenshot)
            session.commit()

            print(f"Finished conversion of {site_name} from RGB to GREYSCALE")
            f.write(
                f"Finished conversion of {site_name} from RGB to GREYSCALE \n\n"
            )

    session.close()
    f.close()
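The to_greyscale() helper is not shown. A minimal sketch with Pillow, assuming it only needs to write a greyscale copy next to the original file and return the new path (the naming scheme below is illustrative):

import os
from PIL import Image

def to_greyscale(rgb_path, site_name):
    # Hypothetical sketch: convert the RGB screenshot to greyscale and save it.
    greyscale_path = os.path.join(os.path.dirname(rgb_path), f"{site_name}_greyscale.png")
    Image.open(rgb_path).convert('L').save(greyscale_path)
    return greyscale_path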
Example #10
def parse_collected_data():
    # Turn every unparsed raw Response into ParsedResponse rows, one per site
    # contained in the API payload.
    session = Session()
    data = session.query(Response).filter_by(parsed=False)

    filename = f"parsed_response_{file_safe_timestamp()}.log"
    f = open(f"{STORAGE_LOGS_PATH}/{filename}", "x")
    f.write(str(datetime.datetime.now()) + "\n\n")

    for response_row in data:
        response_id = response_row.id
        print(f"Beginning response number {response_id}")
        f.write(f"Beginning response number {response_id} \n")

        for response in parse_response(response_row.response):
            url = response['DataUrl']
            print(f"Beginning site {url}")
            f.write(f"Beginning site {url}: ")
            rank = response['Global']['Rank']
            reach_per_million = response['Country']['Reach']['PerMillion']
            page_views_per_million = response['Country']['PageViews'][
                'PerMillion']
            page_views_per_user = response['Country']['PageViews']['PerUser']
            f.write(
                f"Rank: {rank}, Reach/Million: {reach_per_million}, Page Views/Million: {page_views_per_million}, Page Views/User: {page_views_per_user} \n"
            )
            parsed_response = ParsedResponse(
                response_id=response_id,
                url=url,
                rank=rank,
                reach_per_million=reach_per_million,
                page_views_per_million=page_views_per_million,
                page_views_per_user=page_views_per_user)
            session.add(parsed_response)
            session.commit()

        session.query(Response).get(response_id).parsed = True
        print(f"Finished parsing response number {response_id}")
        f.write(f"Finished parsing response number {response_id} \n\n")
        session.commit()

    session.close()
    f.close()