def open_url(url, method='get', headers=None, cookies=None, params=None, data=None,
             redirects=True, verify=True, timeout=None, files=None, auth=None,
             proxies=None, hooks=None, stream=None, cert=None, json=None):

    if headers is None:
        headers = {'User-Agent': User_Agent}

    link = getattr(scraper, method)(url, headers=headers, cookies=cookies, params=params, data=data,
                                    allow_redirects=redirects, verify=verify, timeout=timeout, files=files,
                                    auth=auth, proxies=proxies, hooks=hooks, stream=stream, cert=cert, json=json)

    try:
        # If the response is a Sucuri challenge page, retry with the solved cookie.
        su = sucuri().get(link.content)
        if su:
            headers['Cookie'] = su

            # Sucuri expects a trailing slash on the retried URL.
            if not url.endswith('/'):
                url = '%s/' % url

            link = getattr(scraper, method)(url, headers=headers, cookies=cookies, params=params, data=data,
                                            allow_redirects=redirects, verify=verify, timeout=timeout,
                                            files=files, auth=auth, proxies=proxies, hooks=hooks,
                                            stream=stream, cert=cert, json=json)
    except Exception:
        pass

    if '_Incapsula_' in link.text:
        # Solve the Incapsula challenge and re-request via incapsula's crack().
        link = crack(scraper, link)

    return link
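
A minimal usage sketch, assuming the module-level globals the snippet relies on: `scraper` as a cloudflare-scrape session and `User_Agent` as a browser UA string (`cfscrape` and the URL are illustrative assumptions, not from the original):

import cfscrape

User_Agent = 'Mozilla/5.0'  # assumed placeholder UA
scraper = cfscrape.create_scraper()  # assumed source of the `scraper` global

resp = open_url('http://example.com/')
print(resp.status_code)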
Example 2
def unblock():
    """
    Unblock the target url/session.
    :return: result of incap_blocked(r), i.e. whether the response is still blocked
    """
    r = session.get(target_url)
    r = crack(session, r)  # solve the Incapsula challenge and re-request
    with open('unblocked.html', 'wb') as f:
        f.write(r.content)
    return incap_blocked(r)
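
The snippet above relies on module-level state. A hedged sketch of the assumed setup (`target_url` is a placeholder; `incap_blocked` is a helper defined elsewhere in the original module):

import requests
from incapsula import crack

session = requests.Session()
target_url = 'http://example.com/'  # placeholder for the Incapsula-protected page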
Example 3
def updatePreviews(previewsOnly):
    """Will update the global array that holds all preview instances (Predictions)"""
    session = requests.Session()
    session.headers.update({
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'
    })
    # get full page
    r = crack(session, session.get('https://www.whoscored.com'))
    soup = BeautifulSoup(r.content, "html.parser")
    # get tomorrow's matches
    r2 = crack(
        session,
        session.get(
            "https://www.whoscored.com/LiveScoresSummary?day=tomorrow"))
    soup2 = BeautifulSoup(r2.content, "html.parser")
    # find the fixture tables on both pages (index 0 = today, 1 = tomorrow)
    tables = [
        soup.find("table", class_="detailed-tournaments"),
        soup2.find("table", class_="detailed-tournaments")
    ]
    for t, table in enumerate(tables):
        for i, row in enumerate(table.select("tr.match")):
            # create new prediction instance
            predictions.append(
                Prediction(
                    len(row.select("td.toolbar.right a.preview")) == 1, t,
                    row.select("td.time")[0].contents[0].strip(),
                    row.select("td.home a")[0].get_text().strip(),
                    row.select("td.away a")[0].get_text().strip()))
    previewCounter = 0
    for p in predictions:
        if previewsOnly and not p.hasPreview:
            continue
        previewCounter += 1
        print("time: ", p.day + " " + p.time)
        print("home: ", p.hometeam)
        print("away: ", p.awayteam)
        print("preview: ", "Yes" if p.hasPreview else "No")
    print(
        str(len(predictions)) + " matches in total, of which " +
        str(previewCounter) + " have a preview available.")
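
A minimal driver sketch for the function above, assuming `predictions` is the module-level list it appends to and `Prediction` is the repo's own class:

predictions = []  # filled by updatePreviews()
updatePreviews(previewsOnly=True)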
Example 4
def open_url(url,
             method='get',
             headers=None,
             cookies=None,
             params=None,
             data=None,
             redirects=True,
             verify=True):

    if headers is None:
        headers = {'User-Agent': User_Agent}

    link = getattr(scraper, method)(url,
                                    headers=headers,
                                    cookies=cookies,
                                    params=params,
                                    data=data,
                                    allow_redirects=redirects,
                                    verify=verify)

    try:
        # Sucuri serves its challenge page with this Server header.
        if link.headers.get('Server') == 'Sucuri/Cloudproxy':

            su = sucuri().get(link.content)
            headers['Cookie'] = su

            # Sucuri expects a trailing slash on the retried URL.
            if not url.endswith('/'):
                url = '%s/' % url

            link = getattr(scraper, method)(url,
                                            headers=headers,
                                            cookies=cookies,
                                            params=params,
                                            data=data,
                                            allow_redirects=redirects,
                                            verify=verify)
    except Exception:
        pass

    if '_Incapsula_' in link.text:

        link = getattr(scraper, method)(url,
                                        headers=headers,
                                        cookies=cookies,
                                        params=params,
                                        data=data,
                                        allow_redirects=redirects,
                                        verify=verify)
        link = crack(scraper, link)

    return link
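
Unlike the first `open_url` above, this variant detects Sucuri from the `Server` response header rather than by scanning the body. A hypothetical helper distilling that check, for illustration only:

def is_sucuri(resp):
    # not part of the original code; mirrors the header check above
    return resp.headers.get('Server') == 'Sucuri/Cloudproxy'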
Example 5
import requests
from incapsula import crack

session = requests.Session()
r = crack(session, session.get('http://www.bjs.com'))
print(r.content)
Example 6
def OPEN_URL(url,
             method='get',
             headers=None,
             params=None,
             data=None,
             redirects=True,
             verify=True,
             mobile=False,
             timeout=None,
             output=None,
             XHR=False):

    if timeout is None:
        timeout = 30

    if headers is None:
        headers = {}

    if mobile:
        # force a mobile User-Agent when requested
        headers['User-Agent'] = 'Apple-iPhone/701.341'
    elif 'User-Agent' not in headers and 'user-agent' not in headers:
        headers['User-Agent'] = random_agent()

    if output == 'geturl':
        # follow redirects and return only the final URL
        link = requests.head(url, allow_redirects=True)
        return str(link.url)

    if output == 'cookie':
        # return the session's cookies as a single "k=v; k=v" header string
        session.get(url, headers=headers)
        cookie_dict = session.cookies.get_dict()
        return '; '.join('%s=%s' % (k, v) for k, v in cookie_dict.items())

    if XHR:
        print("REQUESTING WITH XMLHttpRequest")
        headers['X-Requested-With'] = 'XMLHttpRequest'

    if 'Accept-Language' not in headers:
        headers['Accept-Language'] = 'en-US'

    if 'Referer' not in headers and 'referer' not in headers:
        # urlparse here is the py2 module; on py3 use urllib.parse instead
        parsed = urlparse.urlparse(url)
        headers['Referer'] = '%s://%s/' % (parsed.scheme, parsed.netloc)

    link = requests.get(url,
                        headers=headers,
                        params=params,
                        data=data,
                        allow_redirects=redirects,
                        verify=verify,
                        timeout=int(timeout))
    response_code = link.status_code
    print("RESPONSE CODE", response_code)
    resp_header = link.headers.get('Server', 'none')

    try:
        if (resp_header.lower() == 'cloudflare-nginx' and response_code == 503
                and 'sucuri_cloudproxy_js' not in link.text):
            print("DETECTED CLOUDFLARE", url)
            link = scraper.get(url)
    except Exception:
        pass

    try:
        if 'sucuri_cloudproxy_js' in link.text:
            print("DETECTED SUCURI", url)
            su = sucuri().get(link.content)
            headers['Cookie'] = su
            link = session.get(url,
                               headers=headers,
                               params=params,
                               data=data,
                               allow_redirects=redirects,
                               verify=verify,
                               timeout=int(timeout))
    except Exception:
        pass

    if '_Incapsula_' in link.text:
        print("DETECTED _Incapsula_", url)
        response = session.get(url)  # url is blocked by incapsula
        link = crack(session, response)

    return link
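
Hedged usage sketches for the extra `output` and `XHR` modes above (URLs are placeholders):

final_url = OPEN_URL('http://example.com/short-link', output='geturl')  # final URL after redirects
cookie_header = OPEN_URL('http://example.com/', output='cookie')        # 'k=v; k=v' cookie string
resp = OPEN_URL('http://example.com/api', XHR=True)                     # sends X-Requested-With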
Example 7
def OPEN_URL(url,
             method='get',
             headers=None,
             params=None,
             data=None,
             redirects=True,
             verify=True,
             mobile=False,
             timeout=None):
    if timeout is None:
        timeout = 30

    if headers is None:
        headers = {}

    if mobile:
        # force a mobile User-Agent when requested
        headers['User-Agent'] = 'Apple-iPhone/701.341'
    elif 'User-Agent' not in headers:
        headers['User-Agent'] = random_agent()

    link = requests.get(url,
                        headers=headers,
                        params=params,
                        data=data,
                        allow_redirects=redirects,
                        verify=verify,
                        timeout=int(timeout))
    response_code = link.status_code
    print("RESPONSE CODE", response_code)
    resp_header = link.headers.get('Server', 'none')

    try:
        if (resp_header.lower() == 'cloudflare-nginx' and response_code == 503
                and 'sucuri_cloudproxy_js' not in link.text):
            print("DETECTED CLOUDFLARE", url)
            link = scraper.get(url)
    except Exception:
        pass

    try:
        if 'sucuri_cloudproxy_js' in link.text:
            print("DETECTED SUCURI", url)
            su = sucuri().get(link.content)
            headers['Cookie'] = su

            # Sucuri expects a trailing slash on the retried URL.
            if not url.endswith('/'):
                url = '%s/' % url

            link = requests.get(url,
                                headers=headers,
                                params=params,
                                data=data,
                                allow_redirects=redirects,
                                verify=verify,
                                timeout=int(timeout))
    except Exception:
        pass

    if '_Incapsula_' in link.text:
        print("DETECTED _Incapsula_", url)
        response = session.get(url)  # url is blocked by incapsula
        link = crack(session, response)

    return link