def test_parse_url(result_url_str):
    """Check that parse_url extracts the expected URL from the fixture zip."""
    with io.open('./tests/test.zip', 'rb') as fp:
        unzipped = unzip_request_content(fp.read())
    # Take the first row of the unzipped file and strip its line ending.
    parsed_row = next(parse_row(unzipped)).replace('\r\n', '')
    parsed_result = parse_url(parsed_row)
    assert parsed_result == result_url_str

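# The test above exercises the ``unzip_request_content``, ``parse_row`` and
# ``parse_url`` helpers also used by the crawler's main() further down, where
# ``parse_url`` is assumed to return the domain column of a row as a string.
# Below is a minimal sketch of the first two helpers, assuming the zip holds a
# single "rank,domain" CSV (a hypothetical layout; the real code may differ).
import io
import zipfile


def unzip_request_content(content):
    """Return the text of the first file inside a zipped payload."""
    with zipfile.ZipFile(io.BytesIO(content)) as archive:
        first_name = archive.namelist()[0]
        return archive.read(first_name).decode('utf-8')


def parse_row(unzipped_text):
    """Yield the CSV rows of the unzipped ranking file one at a time."""
    for line in unzipped_text.splitlines(keepends=True):
        if line.strip():
            yield line
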
def test_from_parameter_pagination(app, indexed_records, search_url):
    """Test "from" parameter pagination."""
    with app.test_client() as client:
        res = client.get(search_url, query_string={'size': 1, 'from': 1})
        assert_hits_len(res, 1)

        data = get_json(res)
        assert 'self' in data['links']
        assert 'next' in data['links']
        assert 'prev' not in data['links']

        next_url = get_json(res)['links']['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['2']
        assert 'page' not in parsed_url['qs']

        self_url = get_json(res)['links']['self']
        parsed_url = parse_url(self_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['1']
        assert 'page' not in parsed_url['qs']

        res = client.get(next_url)
        assert_hits_len(res, 1)

        data = get_json(res)
        assert data['links']['self'] == next_url
        assert 'next' in data['links']
        assert 'prev' in data['links']

        next_url = get_json(res)['links']['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['from'] == ['3']
        assert 'page' not in parsed_url['qs']

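# The REST pagination tests here assume a ``parse_url`` test helper that splits
# a URL and exposes its parsed query string under a ``qs`` key. The function
# below is a minimal sketch of such a helper, not the test suite's actual code.
from urllib.parse import parse_qs, urlsplit


def parse_url(url):
    """Split a URL into its parts, with the query string parsed into a dict."""
    parts = urlsplit(url)
    return {
        'scheme': parts.scheme,
        'host': parts.netloc,
        'path': parts.path,
        'qs': parse_qs(parts.query),  # e.g. {'size': ['1'], 'from': ['2']}
    }
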
def test_page_links(app, indexed_records, search_url):
    """Test Link HTTP header on multi-page searches."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)

        def parse_link_header(response):
            """Parse the links from a REST response's HTTP header."""
            return {
                k: v for (k, v) in map(
                    lambda s: re.findall(r'<(.*)>; rel="(.*)"', s)[0][::-1],
                    [x for x in response.headers['Link'].split(', ')])
            }

        links = parse_link_header(res)
        data = get_json(res)['links']
        assert 'self' in data \
            and 'self' in links \
            and data['self'] == links['self']
        assert 'next' in data \
            and 'next' in links \
            and data['next'] == links['next']
        assert 'prev' not in data \
            and 'prev' not in links

        # Assert next URL before calling it
        first_url = links['self']
        next_url = links['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        links = parse_link_header(res)
        assert links['self'] == next_url
        assert 'next' in links
        assert 'prev' in links and links['prev'] == first_url

def main():
    zipfile_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
    crawler = Crawl()
    response = crawler.get(zipfile_url)
    unzipped_file = unzip_request_content(response.content)

    count = 0
    for row in parse_row(unzipped_file):
        # Only crawl the first 10,000 entries of the ranking.
        if count >= 10000:
            break
        parsed = parse_url(row).strip()
        # Skip rows that did not yield a domain.
        if not parsed:
            continue
        url = 'https://' + parsed
        count += 1
        js_function = 'pbjs.cbTimeout'
        response = crawler.get(url, tor_proxy=False)
        evaluated = crawler.parse_js(response, look_up=('pbjs', 'prebid'),
                                     js_script=js_function)
        print('[*]', evaluated)
        crawler.save_two_columns_csv(url, evaluated, 'results.json')

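# Assumed entry point so the crawl only starts when this file is executed
# directly; this guard is not part of the original snippet.
if __name__ == '__main__':
    main()
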
def test_pagination(app, indexed_records, search_url):
    """Test pagination."""
    with app.test_client() as client:
        # Limit records
        res = client.get(search_url, query_string=dict(size=1, page=1))
        assert_hits_len(res, 1)

        data = get_json(res)
        assert 'self' in data['links']
        assert 'next' in data['links']
        assert 'prev' not in data['links']

        # Assert next URL before calling it
        next_url = get_json(res)['links']['next']
        parsed_url = parse_url(next_url)
        assert parsed_url['qs']['size'] == ['1']
        assert parsed_url['qs']['page'] == ['2']

        # Access next URL
        res = client.get(to_relative_url(next_url))
        assert_hits_len(res, 1)
        data = get_json(res)
        assert data['links']['self'] == next_url
        assert 'next' in data['links']
        assert 'prev' in data['links']

import json

import pandas as pd

with open('./name_url_data.json') as fp:
    json_data = json.load(fp)
urls = list(map(lambda obj: obj.get('href'), json_data))
parsed_urls = []


class ParsedUrl:
    """Simple holder for the name/group pair parsed from a URL."""

    def __init__(self, n, g):
        self.name = n
        self.group = g

    def return_dict(self):
        return {'name': self.name, 'group': self.group}


for idx, url in enumerate(urls[:-3]):
    # parse_url is expected to return a dict with 'name' and 'group' keys here.
    parsed = parse_url(url)
    group = parsed.get('group')
    name = parsed.get('name')
    if group is not None and name is not None:
        new_item = ParsedUrl(n=name, g=group)
        parsed_urls.append(new_item)

# races = list(map(lambda a: a.group == 'Races' if a else None, parsed_urls))
print(parsed_urls[0])

filtered_items = search_by_group(url_items=parsed_urls, target_group="Races")
cd = group_counter(filtered_items)
df = pd.DataFrame(cd.items(), columns=['name', 'count'])
df.to_csv('group_counter.csv')

# for idx, url in enumerate(urls):
#     if parse_url(url).get('name'):
#         print(group_counter)

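# The script above relies on ``search_by_group`` and ``group_counter``. Below
# is a minimal sketch of what they might look like, assuming they filter
# ParsedUrl items by group and count occurrences of each name (hypothetical
# implementations; the originals may differ).
from collections import Counter


def search_by_group(url_items, target_group):
    """Return only the items whose group matches target_group."""
    return [item for item in url_items if item.group == target_group]


def group_counter(items):
    """Count how often each name appears among the given items."""
    return Counter(item.name for item in items)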