Example 1
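# reset_sessions() opens the target site in a headless SgChrome browser, primes
# an SgRequests session against the Incapsula resource URL, then replays the
# headers of every request the browser captured until one of them returns real
# HTML (a body that actually contains "div"). base_url is expected at module
# scope; only data_url is passed in.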
def reset_sessions(data_url):

    s = SgRequests()

    driver = SgChrome(is_headless=True).driver()
    driver.get(base_url)

    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = base_url + incap_str

    s.get(incap_url)

    for request in driver.requests:

        headers = request.headers
        try:
            response = s.get(data_url, headers=headers)
            response_text = response.text

            test_html = response_text.split("div")
            if len(test_html) < 2:
                continue
            else:
                return [s, driver, headers, response_text]

        except Exception:
            continue
Example 2
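# get_result() sends a GET through the shared module-level session; on failure
# it swaps in a fresh SgRequests session and re-raises so the caller can retry.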
def get_result(url, headers):
    global session
    try:
        return session.get(url, headers=headers)
    except Exception:
        # Rebuild the shared session so the next call starts clean, then let
        # the caller see the original error.
        session = SgRequests()
        raise
Example 3
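# fetchSinglePage() is a variant of the session-reset pattern above: it makes up
# to ten passes over the headers captured by the module-level driver and can
# optionally follow a JavaScript window.location.replace() redirect instead of
# returning the page body. website, driver, getHoursOfOperation and getPhone
# are expected at module scope.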
def fetchSinglePage(data_url, findRedirect=False):
    session = SgRequests()
    driver.get(data_url)
    incap_str = "/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"
    incap_url = website + incap_str
    session.get(incap_url)

    for x in range(10):
        if findRedirect:
            print("find redirect")
        print("try: " + str(x))
        for request in driver.requests:
            headers = request.headers
            try:
                response = session.get(data_url, headers=headers)
                response_text = response.text

                test_html = response_text.split("div")

                if findRedirect and response_text.find("window.location.replace") > -1:

                    try:
                        return [session, headers, response_text.split("window.location.replace('")[1].split(
                            "')"
                        )[0]]
                    except Exception:
                        continue
                elif len(test_html) < 2:
                    continue
                else:

                    return [
                        session,
                        headers,
                        {
                            "response": response_text,
                            "hours_of_operation": getHoursOfOperation(),
                            "phone": getPhone(session, headers, response_text),
                        },
                    ]

            except Exception:
                continue
Example 4
import csv
from sgrequests import SgRequests
from bs4 import BeautifulSoup
import re
import json
import unicodedata
session = SgRequests()


def write_output(data):
    with open('data.csv', mode='w', newline='',
              encoding="utf-8") as output_file:
        writer = csv.writer(output_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        # Header
        writer.writerow([
            "locator_domain", "location_name", "street_address", "city",
            "state", "zip", "country_code", "store_number", "phone",
            "location_type", "latitude", "longitude", "hours_of_operation",
            "page_url"
        ])
        # Body
        for row in data:
            writer.writerow(row)


def fetch_data():
    addressess = []
    headers = {
Example 5
from tenacity import retry
from tenacity import stop_after_attempt
import time

from sgrequests import SgRequests
from sglogging import SgLogSetup  # assumed import path for SgLogSetup

logger = SgLogSetup().get_logger(logger_name="autozone_com")
locator_domain_url = "https://www.autozone.com"
MISSING = "<MISSING>"

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
}
session = SgRequests()
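
# The tenacity imports above are typically used to retry transient request
# failures. A minimal sketch, assuming a helper name and attempt count that are
# not part of the original snippet:
@retry(stop=stop_after_attempt(3), reraise=True)
def get_with_retry(url):
    return session.get(url, headers=headers)
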

FIELDS = [
    "locator_domain",
    "page_url",
    "location_name",
    "street_address",
    "city",
    "state",
    "zip",
    "country_code",
    "store_number",
    "phone",
    "location_type",
    "latitude",
    "longitude",
Example 6
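# get_brand() downloads the hotel list for one Radisson brand from the
# zimba-api endpoint, then scrapes the individual hotel pages concurrently with
# asyncio.gather() in chunks of ten, logging progress after each chunk.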
async def get_brand(brand_code, brand_name, url):
    url = url + brand_code

    headers = {}
    headers["authority"] = "www.radissonhotels.com"
    headers["method"] = "GET"
    headers["path"] = "/zimba-api/destinations/hotels?brand=" + brand_code
    headers["scheme"] = "https"
    headers["accept"] = "application/json, text/plain, */*"
    headers["accept-encoding"] = "gzip, deflate, br"
    headers["accept-language"] = "en-us"
    headers["referer"] = "https://www.radissonhotels.com/en-us/destination"
    headers["sec-fetch-dest"] = "empty"
    headers["sec-fetch-mode"] = "cors"
    headers["sec-fetch-site"] = "same-origin"
    headers["user-agent"] = "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"

    session = SgRequests()
    son = session.get(url, headers=headers)
    print(son.text)
    son = son.json()
    task_list = []
    results = []
    chunk_size = 10
    last_chunk = 0
    last_tick = time.monotonic()
    total_records = len(son["hotels"])
    global EXPECTED_TOTAL
    EXPECTED_TOTAL += total_records
    for index, record in enumerate(son["hotels"]):
        task_list.append(fetch_data(index, record["overviewPath"]))
        if index % chunk_size == 0 and last_chunk != index:
            last_tick = time.monotonic()
            last_chunk = index
            if len(task_list) > 0:
                z = await asyncio.gather(*task_list)
                for item in z:
                    results.append({
                        "main": son["hotels"][item["index"]],
                        "sub": item,
                        "@type": brand_name,
                    })
                logzilla.info(
                    f"Finished {last_chunk}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
                )
                task_list = []

    last_tick = time.monotonic()
    if len(task_list) > 0:
        z = await asyncio.gather(*task_list)
        for item in z:
            results.append({
                "main": son["hotels"][item["index"]],
                "sub": item,
                "@type": brand_name
            })
        logzilla.info(
            f"Finished {total_records}/{total_records} for brand {brand_name}, last step took {round(time.monotonic()-last_tick,5)} seconds."
        )
    return results
Example 7
import csv
from sgrequests import SgRequests
from sgzip.dynamic import SearchableCountries
from sgzip.static import static_zipcode_list

session = SgRequests(retry_behavior=False)

headers = {
    "user-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36",
    "content-type": "application/json",
    "accept": "application/json, text/plain, */*",
}


def write_output(data):
    with open("data.csv", mode="w") as output_file:
        writer = csv.writer(output_file,
                            delimiter=",",
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)

        # Header
        writer.writerow([
            "locator_domain",
            "page_url",
            "location_name",
            "street_address",
            "city",
            "state",
            "zip",
Example 8
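# Fragment that continues the reset_sessions() pattern from Example 1: each
# collected location URL is fetched with the shared session first, and if the
# response does not look like real HTML, reset_sessions() rebuilds the session,
# driver, headers and response text.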
            except Exception:
                a_tags = div_tag.find_all("a")
                for a_tag in a_tags:
                    try:
                        location_url = a_tag["href"]
                    except Exception:
                        pass

            if location_url in location_urls:
                pass
            else:
                location_urls.append(location_url)
        count = count + 1

x = 0
phone_session = SgRequests()
for location_url in location_urls:
    print(x)
    print(location_url)
    response = s.get(location_url, headers=headers)
    response_text = response.text
    if len(response_text.split("div")) > 2:
        pass
    else:
        new_sess = reset_sessions(location_url)

        s = new_sess[0]
        driver = new_sess[1]
        headers = new_sess[2]
        response_text = new_sess[3]
Example 9
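# Builds parallel result lists, fetches the potatocornerusa.com landing page,
# and attempts to JSON-parse the contents of each <script> tag; tags that do
# not hold JSON are skipped by the surrounding try/except.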
locator_domains = []
page_urls = []
location_names = []
street_addresses = []
citys = []
states = []
zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

session = SgRequests()
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}

url = "https://www.potatocornerusa.com"
response = session.get(url, headers=headers).text
soup = bs(response, "html.parser")
all_script = soup.find_all("script")

goods = []
for script in all_script:
    try:
        stripped = script.get_text().strip()
        logger.info(stripped)
        data = json.loads(stripped)
Example 10
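# fetch_data() walks the picknsave.com store-locator sitemap, opens every
# stores/details page it lists, and reads each store record from the page's
# application/ld+json block.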
def fetch_data():

    base_link = "https://www.picknsave.com/storelocator-sitemap.xml"

    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
    headers = {"User-Agent": user_agent}

    session = SgRequests()

    req = session.get(base_link, headers=headers)
    base = BeautifulSoup(req.text, "lxml")

    items = base.find_all("loc")

    data = []
    locator_domain = "picknsave.com"

    for item in items:
        link = item.text
        if "stores/details" in link:
            req = session.get(link, headers=headers)
            base = BeautifulSoup(req.text, "lxml")

            script = (base.find("script",
                                attrs={
                                    "type": "application/ld+json"
                                }).text.replace("\n", "").strip())
            store = json.loads(script)

            location_name = store["name"]
            street_address = store["address"]["streetAddress"]
            city = store["address"]["addressLocality"]
            state = store["address"]["addressRegion"]
            zip_code = store["address"]["postalCode"]
            country_code = "US"
            store_number = link.split("/")[-1]
            location_type = "<MISSING>"
            phone = store["telephone"]
            hours_of_operation = store["openingHours"][0]
            latitude = store["geo"]["latitude"]
            longitude = store["geo"]["longitude"]

            # Store data
            data.append([
                locator_domain,
                link,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ])

    return data
Example 11
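# Fragment: posts the storeSearch GraphQL query to picknsave.com once for every
# ZIP code produced by DynamicZipSearch, using browser-like headers.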
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

headers = {
    'User-Agent': 'PostmanRuntime/7.19.0',
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate"
}
session = SgRequests(retry_behavior=None)
url = 'https://www.picknsave.com/stores/api/graphql'
search = DynamicZipSearch(country_codes=[SearchableCountries.USA])

for postal in search:
    data = {
        "query":
        "\n      query storeSearch($searchText: String!, $filters: [String]!) {\n        storeSearch(searchText: $searchText, filters: $filters) {\n          stores {\n            ...storeSearchResult\n          }\n          fuel {\n            ...storeSearchResult\n          }\n          shouldShowFuelMessage\n        }\n      }\n      \n  fragment storeSearchResult on Store {\n    banner\n    vanityName\n    divisionNumber\n    storeNumber\n    phoneNumber\n    showWeeklyAd\n    showShopThisStoreAndPreferredStoreButtons\n    storeType\n    distance\n    latitude\n    longitude\n    tz\n    ungroupedFormattedHours {\n      displayName\n      displayHours\n      isToday\n    }\n    address {\n      addressLine1\n      addressLine2\n      city\n      countryCode\n      stateCode\n      zip\n    }\n    pharmacy {\n      phoneNumber\n    }\n    departments {\n      code\n    }\n    fulfillmentMethods{\n      hasPickup\n      hasDelivery\n    }\n  }\n",
        "variables": {
            "searchText": postal,
            "filters": []
        },
        "operationName": "storeSearch"
    }
    response = session.post(url, json=data, headers=headers).json()
    print(response)
Example 12
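# fetch_data() crawls order.citybbq.com in three levels: the list of
# participating states, each state's location list, and finally every store
# page, from which name, address, phone, coordinates and hours are pulled with
# XPath.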
def fetch_data():
    out = []
    locator_domain = "https://www.citybbq.com"
    api_url = "https://order.citybbq.com/locations"
    session = SgRequests()
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "TE": "Trailers",
    }

    r = session.get(api_url, headers=headers)
    tree = html.fromstring(r.text)
    block = tree.xpath('//ul[@id="ParticipatingStates"]/li')
    for i in block:
        url1 = "".join(i.xpath(".//a/@href"))
        url1 = f"https://order.citybbq.com{url1}"
        session = SgRequests()
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
            "Connection": "keep-alive",
            "Referer": "https://order.citybbq.com/locations",
            "Upgrade-Insecure-Requests": "1",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "TE": "Trailers",
        }
        cookies = {
            "_gcl_au": "1.1.1275666536.1616147724",
            "_ga": "GA1.2.1565131436.1616147732",
            "_gid": "GA1.2.169092942.1616147732",
            "_fbp": "fb.1.1616147732783.1672002159",
            "__cfduid": "d51d0f4f8d1b467178bce7dd202af32771616149617",
        }
        r = session.get(url1, headers=headers, cookies=cookies)
        trees = html.fromstring(r.text)
        # Use a distinct name so the outer loop's "block" list is not shadowed.
        store_blocks = trees.xpath("//h2")
        for n in store_blocks:
            page_url = "".join(n.xpath(".//a/@href"))
            session = SgRequests()
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3",
                "Connection": "keep-alive",
                "Referer": "https://order.citybbq.com/locations",
                "Upgrade-Insecure-Requests": "1",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
                "TE": "Trailers",
            }
            r = session.get(page_url, headers=headers)
            tree = html.fromstring(r.text)
            location_name = "".join(tree.xpath("//h1/text()")).replace(
                "\n", "").strip()
            street_address = ("".join(
                tree.xpath('//span[@class="street-address"]/text()')).replace(
                    "\n", "").strip())
            phone = ("".join(
                tree.xpath('//span[@class="tel"]/text()')).replace("\n",
                                                                   "").strip())
            city = ("".join(
                tree.xpath('//span[@class="locality"]/text()')).replace(
                    "\n", "").strip())
            state = ("".join(
                tree.xpath('//span[@class="region"]/text()')).replace(
                    "\n", "").strip())
            country_code = "US"
            store_number = "<MISSING>"
            latitude = "".join(
                tree.xpath('//span[@class="latitude"]/span/@title'))
            longitude = "".join(
                tree.xpath('//span[@class="longitude"]/span/@title'))
            location_type = "<MISSING>"
            hours_of_operation = tree.xpath(
                '//dl[@id="available-business-hours-popover"]//text()')
            hours_of_operation = list(
                filter(None, [a.strip() for a in hours_of_operation]))
            hours_of_operation = " ".join(hours_of_operation)
            postal = ("".join(
                tree.xpath('//span[@class="postal-code"]/text()')).replace(
                    "\n", "").strip())
            row = [
                locator_domain,
                page_url,
                location_name,
                street_address,
                city,
                state,
                postal,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            out.append(row)

    return out
Example 13
citys = []
states = []
zips = []
country_codes = []
store_numbers = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_of_operations = []

search = DynamicZipSearch(country_codes=[SearchableCountries.USA],
                          max_search_results=100)
store_types = {"Pharmacy": "C", "Marketplace": "M", "Healthcare Clinic": "LC"}

session = SgRequests()

x = 0
for code in search:
    url = "https://www.picknsave.com/stores/search?searchText=" + code
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate"
    }
Example 14
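# Pages through the coffeebean.com store locator for the USA (pages 0-100) and
# collects every "view-store" link as the first of two passes; a lat/lng
# boundary search follows.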
websites = []
locations = []
names = []
addresses = []
citys = []
states = []
zips = []
countrys = []
stores = []
phones = []
location_types = []
latitudes = []
longitudes = []
hours_op = []

session = SgRequests()
search = DynamicGeoSearch(country_codes=[SearchableCountries.USA])

base_url = "https://www.coffeebean.com/store-locator"

# Country search
locs = []
for x in range(101):
    params = {"field_country_value": "USA", "page": x}
    r = session.get(base_url, params=params).text
    soup = bs(r, "html.parser")
    view_store = soup.find_all("a", attrs={"class": "view-store"})
    for item in view_store:
        locs.append(item["href"])

# Lat Lng Boundary search
Example 15
    "city",
    "state",
    "zip_postal",
    "country_code",
    "store_number",
    "phone",
    "location_type",
    "latitude",
    "longitude",
    "locator_domain",
    "hours_of_operation",
    "brand_website",
]


session = SgRequests().requests_retry_session()
log = sglog.SgLogSetup().get_logger(logger_name=website)
driver = SgChrome(
    is_headless=True, executable_path=ChromeDriverManager().install()
).driver()


def fetchStores():
    # Collect every store page URL listed in the site's profile.xml sitemap.
    stores = []
    response = session.get(website + "/sitemaps/profile.xml", headers=headers)
    root = ET.fromstring(response.text)
    for elem in root:
        for var in elem:
            if "loc" in var.tag:
                stores.append(var.text)
    return stores
Example 16
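# fetch_data() posts every British postcode from DynamicZipSearch to the Dream
# Doors store-finder AJAX endpoint, then visits each returned showroom page to
# parse address, phone and opening times. Coordinates come from the
# map_initialize() call in the page source, with a headless SgFirefox fallback
# that reads them from a Google Maps link.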
def fetch_data():
    # Your scraper here
    session = SgRequests()

    items = []
    scraped_items = []

    DOMAIN = "dreamdoors.co.uk"
    start_url = "https://www.dreamdoors.co.uk/kitchen-showrooms"

    all_codes = DynamicZipSearch(
        country_codes=[SearchableCountries.BRITAIN],
        max_radius_miles=10,
        max_search_results=None,
    )
    for code in all_codes:
        formdata = {
            "option": "com_ajax",
            "module": "dreamdoors_store_finder",
            "postcode": code,
            "format": "raw",
        }
        headers = {
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "user-agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }
        response = session.post(start_url, data=formdata, headers=headers)
        if response.status_code != 200:
            continue
        data = json.loads(response.text)

        for poi in data:
            if isinstance(poi, str):
                continue
            store_url = poi["url"]
            if store_url in scraped_items:
                continue

            loc_response = session.get(store_url)
            loc_dom = etree.HTML(loc_response.text)
            location_name = poi["name"]
            location_name = location_name if location_name else "<MISSING>"
            raw_address = loc_dom.xpath('//div[@class="address"]//text()')
            raw_address = [
                elem.strip() for elem in raw_address if elem.strip()
            ]
            addr = parse_address_intl(" ".join(raw_address).replace(
                "Address", ""))
            if addr.street_address_2:
                street_address = f"{addr.street_address_2} {addr.street_address_1}"
            else:
                street_address = addr.street_address_1
            street_address = street_address if street_address else "<MISSING>"
            if "Coming Soon" in street_address:
                continue
            city = addr.city
            city = city if city else "<MISSING>"
            if "Tbc" in city:
                street_address = city
                city = "<MISSING>"
            state = "<MISSING>"
            zip_code = addr.postcode
            zip_code = zip_code if zip_code else "<MISSING>"
            country_code = addr.country
            country_code = country_code if country_code else "<MISSING>"
            store_number = poi["id"]
            store_number = store_number if store_number else "<MISSING>"
            phone = loc_dom.xpath('//a[@id="showroom-phone"]/text()')
            phone = phone[0] if phone else "<MISSING>"
            location_type = "<MISSING>"
            hoo = loc_dom.xpath('//div[@class="opening_times"]//text()')
            hoo = [elem.strip() for elem in hoo if elem.strip()]
            hours_of_operation = (" ".join(hoo[2:]).split(" Call ")[0]
                                  if hoo else "<MISSING>")

            geo = re.findall(r'.map_initialize\("map_canvas", ".+", (.+?)\)',
                             loc_response.text)
            latitude = "<MISSING>"
            longitude = "<MISSING>"
            if geo:
                geo = geo[0].split(", ")
                latitude = geo[0]
                longitude = geo[1]
            else:
                with SgFirefox() as driver:
                    driver.get(store_url)
                    sleep(10)
                    loc_dom = etree.HTML(driver.page_source)
                    geo = loc_dom.xpath('//a[contains(@href, "maps/@")]/@href')
                    if geo:
                        geo = geo[0].split("maps/@")[-1].split(",")[:2]
                        latitude = geo[0]
                        longitude = geo[1]

            item = [
                DOMAIN,
                store_url,
                location_name,
                street_address,
                city,
                state,
                zip_code,
                country_code,
                store_number,
                phone,
                location_type,
                latitude,
                longitude,
                hours_of_operation,
            ]
            if store_url not in scraped_items:
                scraped_items.append(store_url)
                items.append(item)

    return items