Example #1
def test_parse_url(result_url_str):
    with io.open('./tests/test.zip', 'rb') as fp:
        unzipped = unzip_request_content(fp.read())

    parsed_row = next(parse_row(unzipped)).replace('\r\n', '')
    parsed_result = parse_url(parsed_row)

    assert parsed_result == result_url_str
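
# Hypothetical sketches of the `unzip_request_content` and `parse_row` helpers
# this test assumes; they are not shown on this page. The sketch assumes an
# in-memory ZIP with a single CSV member and rows that keep their '\r\n'
# endings, which is consistent with the usage above.
import io
import zipfile


def unzip_request_content(content):
    """Hypothetical helper: extract the first member of an in-memory ZIP."""
    with zipfile.ZipFile(io.BytesIO(content)) as archive:
        return archive.read(archive.namelist()[0]).decode('utf-8')


def parse_row(text):
    """Hypothetical helper: yield rows one at a time, keeping line endings."""
    yield from text.splitlines(keepends=True)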
Example #2
def test_from_parameter_pagination(app, indexed_records, search_url):
    """Test "from" parameter pagination."""
    with app.test_client() as client:
        res = client.get(search_url, query_string={'size': 1, 'from': 1})
        assert_hits_len(res, 1)
        data = get_json(res)
        assert 'self' in data['links']
        assert 'next' in data['links']
        assert 'prev' not in data['links']

        next_url = data['links']['next']
        parsed_url = parse_url(next_url)

        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['2']
        assert 'page' not in parsed_url['qs']

        self_url = data['links']['self']
        parsed_url = parse_url(self_url)

        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['1']
        assert 'page' not in parsed_url['qs']

        res = client.get(next_url)
        assert_hits_len(res, 1)
        data = get_json(res)

        assert data['links']['self'] == next_url
        assert 'next' in data['links']
        assert 'prev' in data['links']

        next_url = data['links']['next']
        parsed_url = parse_url(next_url)

        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['3']
        assert 'page' not in parsed_url['qs']
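
# The `parse_url` helper is not shown on this page. The assertions above treat
# its result as a dict whose 'qs' entry maps parameter names to lists of
# strings, exactly the shape urllib.parse.parse_qs produces. A minimal sketch
# under that assumption:
from urllib.parse import parse_qs, urlparse


def parse_url(url):
    """Hypothetical helper: split a URL into parts plus its parsed query."""
    parts = urlparse(url)
    return {
        'scheme': parts.scheme,
        'host': parts.netloc,
        'path': parts.path,
        'qs': parse_qs(parts.query),
    }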
Example #3
def test_page_links(app, indexed_records, search_url):
    """Test Link HTTP header on multi-page searches."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)

        def parse_link_header(response):
            """Parse the links from a REST response's HTTP header."""
            return {
                k: v
                for (k, v) in map(
                    lambda s: re.findall(r'<(.*)>; rel="(.*)"', s)[0][::-1],
                    response.headers['Link'].split(', '))
            }

        links = parse_link_header(res)
        data = get_json(res)['links']
        assert ('self' in data and 'self' in links
                and data['self'] == links['self'])
        assert ('next' in data and 'next' in links
                and data['next'] == links['next'])
        assert 'prev' not in data and 'prev' not in links

        # Assert next URL before calling it
        first_url = links['self']
        next_url = links['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        links = parse_link_header(res)
        assert links['self'] == next_url
        assert 'next' in links
        assert 'prev' in links and links['prev'] == first_url
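
# `parse_link_header` splits the RFC 5988 Link header on ', ', captures each
# '<url>; rel="name"' pair, and reverses the (url, rel) tuple so the rel names
# become dict keys. A standalone check against a made-up header value:
import re

link_header = ('<http://localhost/records/?size=1&page=1>; rel="self", '
               '<http://localhost/records/?size=1&page=2>; rel="next"')
links = {
    k: v
    for (k, v) in map(
        lambda s: re.findall(r'<(.*)>; rel="(.*)"', s)[0][::-1],
        link_header.split(', '))
}
assert links == {
    'self': 'http://localhost/records/?size=1&page=1',
    'next': 'http://localhost/records/?size=1&page=2',
}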
Example #4
def test_page_links(app, indexed_records, search_url):
    """Test Link HTTP header on multi-page searches."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)

        def parse_link_header(response):
            """Parse the links from a REST response's HTTP header."""
            return {
                k: v for (k, v) in
                map(lambda s: re.findall(r'<(.*)>; rel="(.*)"', s)[0][::-1],
                    response.headers['Link'].split(', '))
            }

        links = parse_link_header(res)
        data = get_json(res)['links']
        assert ('self' in data and 'self' in links
                and data['self'] == links['self'])
        assert ('next' in data and 'next' in links
                and data['next'] == links['next'])
        assert 'prev' not in data and 'prev' not in links

        # Assert next URL before calling it
        first_url = links['self']
        next_url = links['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        links = parse_link_header(res)
        assert links['self'] == next_url
        assert 'next' in links
        assert 'prev' in links and links['prev'] == first_url
Example #5
def main():
    zipfile_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    crawler = Crawl()
    response = crawler.get(zipfile_url)
    unzipped_file = unzip_request_content(response.content)
    count = 0
    for row in parse_row(unzipped_file):
        # Validate the domain before prefixing the scheme, and count
        # processed domains so the 10,000 cap actually takes effect.
        domain = parse_url(row).strip()
        if not domain:
            continue
        elif count >= 10000:
            break
        count += 1

        url = 'https://' + domain
        js_function = 'pbjs.cbTimeout'
        response = crawler.get(url, tor_proxy=False)
        evaluated = crawler.parse_js(response,
                                     look_up=('pbjs', 'prebid'),
                                     js_script=js_function)
        print('[*]', evaluated)
        crawler.save_two_columns_csv(url, evaluated, 'results.json')
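
# `Crawl` and the other helpers here are project-specific and not shown on
# this page. Rows in the Alexa top-1m CSV have the form 'rank,domain', so a
# `parse_url` consistent with the 'https://' + parse_url(row).strip() usage
# above could be as simple as this sketch:
def parse_url(row):
    """Hypothetical helper: return the domain column of a 'rank,domain' row."""
    return row.split(',', 1)[1]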
Example #6
def test_page_links(app, indexed_records, search_url):
    """Test Link HTTP header on multi-page searches."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)

        def parse_link_header(response):
            """Parses the links from a REST response's HTTP header."""
            return {
                k: v
                for (k, v) in map(
                    lambda s: re.findall(r'<(.*)>; rel="(.*)"', s)[0][::-1],
                    response.headers["Link"].split(", "),
                )
            }

        links = parse_link_header(res)
        data = get_json(res)["links"]
        assert "self" in data and "self" in links and data["self"] == links["self"]
        assert "next" in data and "next" in links and data["next"] == links["next"]
        assert "prev" not in data and "prev" not in links

        # Assert next URL before calling it
        first_url = links["self"]
        next_url = links["next"]
        parsed_url = parse_url(next_url)
        assert parsed_url["qs"]["size"] == ["1"]
        assert parsed_url["qs"]["page"] == ["2"]

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        links = parse_link_header(res)
        assert links["self"] == next_url
        assert "next" in links
        assert "prev" in links and links["prev"] == first_url
Example #7
def test_pagination(app, indexed_records, search_url):
    """Test pagination."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert 'self' in data['links']
        assert 'next' in data['links']
        assert 'prev' not in data['links']

        # Assert next URL before calling it
        next_url = data['links']['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert data['links']['self'] == next_url
        assert 'next' in data['links']
        assert 'prev' in data['links']
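
# `get_json`, `assert_hits_len` and `to_relative_url` come from the test
# suite's shared fixtures, which are not shown here. Minimal sketches
# consistent with how they are used (assumptions, not the suite's actual code):
import json
from urllib.parse import urlparse


def get_json(response):
    """Hypothetical helper: decode a test-client response body as JSON."""
    return json.loads(response.get_data(as_text=True))


def assert_hits_len(res, hit_length):
    """Hypothetical helper: assert the number of search hits returned."""
    assert len(get_json(res)['hits']['hits']) == hit_length


def to_relative_url(url):
    """Hypothetical helper: strip scheme and host so the test client can GET it."""
    parts = urlparse(url)
    return parts.path + ('?' + parts.query if parts.query else '')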
Example #8
def test_pagination(app, indexed_records, search_url):
    """Test pagination."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert 'self' in data['links']
        assert 'next' in data['links']
        assert 'prev' not in data['links']

        # Assert next URL before calling it
        next_url = data['links']['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert data['links']['self'] == next_url
        assert 'next' in data['links']
        assert 'prev' in data['links']
Example #9
def test_pagination(app, indexed_records, search_url):
    """Test pagination."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert "self" in data["links"]
        assert "next" in data["links"]
        assert "prev" not in data["links"]

        # Assert next URL before calling it
        next_url = data["links"]["next"]
        parsed_url = parse_url(next_url)
        assert parsed_url["qs"]["size"] == ["1"]
        assert parsed_url["qs"]["page"] == ["2"]

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert data["links"]["self"] == next_url
        assert "next" in data["links"]
        assert "prev" in data["links"]
Example #10
import json

import pandas as pd

with open('./name_url_data.json') as fp:
    json_data = json.load(fp)
urls = [obj.get('href') for obj in json_data]
parsed_urls = []


class ParsedUrl:
    def __init__(self, n, g):
        self.name = n
        self.group = g

    def return_dict(self):
        return {'name': self.name, 'group': self.group}


for url in urls[:-3]:
    # Parse each URL once and keep entries that have both a group and a name.
    parsed = parse_url(url)
    group = parsed.get('group')
    name = parsed.get('name')
    if group is not None and name is not None:
        parsed_urls.append(ParsedUrl(n=name, g=group))

print(parsed_urls[0].return_dict())
filtered_items = search_by_group(url_items=parsed_urls, target_group="Races")
cd = group_counter(filtered_items)
df = pd.DataFrame(cd.items(), columns=['name', 'count'])

df.to_csv('group_counter.csv')
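
# `search_by_group` and `group_counter` are not defined on this page. Sketches
# matching how they are used above (a filter on the group attribute and a name
# counter whose result supports .items()):
from collections import Counter


def search_by_group(url_items, target_group):
    """Hypothetical helper: keep only the items belonging to target_group."""
    return [item for item in url_items if item.group == target_group]


def group_counter(items):
    """Hypothetical helper: count how often each name occurs."""
    return Counter(item.name for item in items)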