Example no. 1
def get_header():
    header = Headers(
        browser="chrome",  
        os="win", 
        headers=True  
    )
    return header.generate()
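All of these snippets build a Headers generator and call generate(); they appear to rely on the fake-headers package. A minimal sketch of how get_header() above could feed a request follows; the import path and the target URL are assumptions:

# Minimal usage sketch; fake_headers is the assumed package behind Headers,
# and https://example.com/ is only a placeholder target.
from fake_headers import Headers
import requests

headers = Headers(browser="chrome", os="win", headers=True).generate()  # dict of header fields
resp = requests.get("https://example.com/", headers=headers)
print(resp.status_code, headers["User-Agent"])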
Example no. 2
def startsHere():

    header = Headers()

    uheaders = header.generate()

    urls = list(map(lambda x: x["_id"], db["subreddits"].find({})))

    turls = {}
    for url in urls:
        strCollection = url.split("/r/")[-1].split("/")[0]
        collection = db[strCollection]
        try:
            nele = len(list(collection.find({})))
        except Exception as e:
            nele = 0
        turls[url] = nele

    urls = sorted(turls.items(), key=lambda x: x[1])

    for url, _ in urls:
        try:
            print(url)
            ourl = url
            url = url + "new.json?limit=1000"
            scrap(url, uheaders, ourl)
        except Exception as e:
            print(e)
Example no. 3
 def parse(self, response):
     try:
         header = Headers(
             browser="chrome",  # Generate only Chrome UA
             os="win",  # Generate ony Windows platform
             headers=True  # generate misc headers
         )
         header1 = ""
         for i in range(1, 10):
             header1 = header.generate()
         print(len(listing_urls))
         for i in range(0, len(listing_urls)):
             yield scrapy.Request(url=listing_urls[i],
                                  callback=self.parse_data,
                                  meta={
                                      'listing_url': listing_urls[i],
                                      'thumb_urls': thumb_urls[i],
                                      'categories': categories[i],
                                      'buying_format': buying_format[i],
                                      'titles': titles[i]
                                  },
                                  dont_filter=True,
                                  headers=header1)
     except Exception as e:
         print(e)
Example no. 4
    def get_more_suggestions(request_text):
        df = request_text.split(" ")
        finSug = []
        for el in df:
            newurl = 'https://wbxsearch.wildberries.ru/suggests/common?query=REQUEST'.replace(
                "REQUEST", str(el))

            session = requests.Session()
            headers = Headers(browser="chrome", os="win", headers=True)
            session.headers = headers.generate()

            lst_req_text = request_text.split(" ")

            res = session.get(url=newurl)
            res.raise_for_status()
            suggestions = json.loads(res.text)

            for item in suggestions:
                vector_sg = str(item["name"]).split(" ")
                for inItem in vector_sg:
                    for el in lst_req_text:
                        if inItem.find(el) != -1:
                            finSug.append(inItem)

        return finSug
Example no. 5
    def get_links(self):
        """
        returns a list of all chapter links from https://mangareader.cc/
        """
        ua = Headers(headers=False)  # random User-Agent only, no extra headers

        urllib3.disable_warnings(
            urllib3.exceptions.InsecureRequestWarning)  #hiding the warning

        response = requests.get(
            self.URL, headers=ua.generate(), verify=False
        )  #sending a request and storing the response inside response var

        if response.status_code >= 400 and response.status_code < 500:  # client error (4xx)
            print("Server Error\nTry again later")
        if response.status_code >= 200 and response.status_code < 300:
            soup = BeautifulSoup(response.content, "html.parser")

            unorder_list = soup.findAll("ul")[2]
            all_hyperlink_tags = unorder_list.findChildren('a')
            all_hrefs = list(
                reversed([
                    hyperlink.get('href') for hyperlink in all_hyperlink_tags
                ]))

            return all_hrefs
Example no. 6
    def get_chapter_list(self):
        """
        returns a list of all chapters from the given manga
        """
        ua = Headers(headers=False)  # random User-Agent only, no extra headers

        urllib3.disable_warnings(
            urllib3.exceptions.InsecureRequestWarning)  #hiding the warning

        response = requests.get(
            self.URL, headers=ua.generate(), verify=False
        )  #sending a request and storing the response inside response var

        if response.status_code >= 400 and response.status_code < 500:  # client error (4xx)
            print("Server Error!\nTry again later")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            unorder_lists = soup.findAll("ul")
            all_spans = unorder_lists[2].findChildren('span',
                                                      {'class': 'leftoff'})
            all_chapters = list(
                reversed(
                    list(
                        map(self.remove_trails,
                            [span.text for span in all_spans]))))

            return all_chapters
Example no. 7
async def parse_page(redis_client, url: str, session, netloc: str,
                     spell_checker):
    header = Headers()
    assert spell_checker['pinterest'] == True
    print(f'analyzing {url}')
    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        if resp.status in [403, 429]:
            number_of_errors = redis_client.hincrby('4xxerrors', url, 1)
            # TODO: I don't think this is the correct redis location
            if number_of_errors > 3:
                redis_client.srem(f'active:{netloc}')
            return

        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words_with_punctuation = get_text(soup)
        pattern = re.compile(r'[\W_]+', re.UNICODE)
        visible_words_strip_punctuation = {
            pattern.sub('', word)
            for word in visible_words_with_punctuation
        }
        wrong_words_set = spell_checker.unknown(
            visible_words_strip_punctuation)
        wrong_words_set_clean = {word for word in wrong_words_set if word}  # drop empty strings
        add_set_to_redis(netloc, url, visible_words_with_punctuation,
                         wrong_words_set_clean, spell_checker, redis_client)

        redis_client.sadd(f'processed:{netloc}', url)
        # this is essentially a recursive search that recalls parse_page() until all the URL's are done
        await extract_and_queue_local_links(soup, netloc, redis_client,
                                            session, spell_checker)
Example no. 8
 def on_start(self):
     header = Headers(
         browser="firefox",
         os="linux",
         headers=True  # generate misc headers
     )
     headerNow = header.generate()
     self.client.get("/?q=panda&atb=v183-1&ia=web", headers=headerNow)
Example no. 9
def test_get_text():
    header = Headers()
    resp = requests.get("http://example.com/", headers=header.generate())
    soup = BeautifulSoup(resp.text, "html.parser")
    correct_resp = ['Example', 'Domain', 'This', 'domain', 'is', 'for', 'use', 'in', 'illustrative', 'examples', 'in',
                    'documents.', 'You', 'may', 'use', 'this', 'domain', 'in', 'literature', 'without', 'prior',
                    'coordination', 'or', 'asking', 'for', 'permission.', 'More', 'information...']
    assert get_text(soup) == correct_resp
Example no. 10
def test_proxy_connection():
    proxies = config.PROXY_LIST.strip('][').split(', ')
    for proxy in proxies:
        header = Headers()
        proxy_sample = {"http": proxy}
        resp = requests.get("http://example.com/",
                            proxies=proxy_sample,
                            headers=header.generate())
        assert resp.status_code == 200
Example no. 11
 def start_requests(self):
     header = Headers(
         browser="chrome",  # Generate only Chrome UA
         os="win",  # Generate ony Windows platform
         headers=True  # generate misc headers
     )
     header1 = ""
     for i in range(1, 10):
         header1 = header.generate()
     yield scrapy.Request(self.urls, self.parse, headers=header1)
Example no. 12
def sp_headers():
    if __name__ == "__main__":
        header = Headers(
            browser="chrome",  # Generate only Chrome UA
            os="win",  # Generate ony Windows platform
            headers=True  # generate misc headers
        )

        for i in range(10):
            header.generate()
Example no. 13
    def init(self):
        # website infos
        self.base_url = "https://footdistrict.com"
        self.endpoints = ["/zapatillas/f/b/converse/"]

        # create a random headers generator, configured to generate random windows headers
        self.headers_gen = Headers(os="win", headers=True)

        # max links to be monitored
        self.max_links = 5
        self.found_links = []  # type: List[str]
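The generator is stored once so that every later request can draw a fresh random header set. A hedged sketch of how the monitor might consume it; the poll() helper, the requests call, and the pacing are assumptions, not part of the original class:

# Hypothetical polling helper for the class above.
import time
import requests

def poll(monitor):
    for endpoint in monitor.endpoints:
        url = monitor.base_url + endpoint
        # fresh random Windows headers for every request
        resp = requests.get(url, headers=monitor.headers_gen.generate(), timeout=10)
        print(url, resp.status_code)
        time.sleep(1)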
Example no. 14
def gosreestr_parse_new_uids(fpath,
                             existed_uids,
                             timeout,
                             error_timeout,
                             luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)

    _existed_uids = existed_uids

    if os.path.exists(fpath):
        parsed_uids = [u.split(';')[0] for u in read_lines(fpath)]
        page_index = int(read_lines(fpath).pop().split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)

    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
    status = ''
    new_uids_count = 0
    new_uids = list()
    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()
        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                break

        new_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)

        form_data = prepare_request_data(FORM_DATA, page_index)

        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError,
                ReadTimeoutError):
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error',
                0)
            sleep(error_timeout)
        else:
            page_index += 1
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.',
                0)
            sleep(timeout)

    return new_uids
Example no. 15
 def header_generator(self):
     """Генерация header'ов"""
     header = Headers()
     headers = header.generate()
     headers["Accept-Language"] = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"
     headers["Accept"] = "application/json, text/plain, */*"
     headers["Accept-Encoding"] = "gzip, deflate, br"
     headers["Referer"] = "https://ruz.fa.ru/ruz/main"
     headers["Sec-Fetch-Site"] = "same-origin"
     headers["Sec-Fetch-Mode"] = "cors"
     headers["Sec-Fetch-Dest"] = "empty"
     self.headers = headers
Example no. 16
def generate_header(browser='chrome', ops='win', random_args=False, **kwargs):
    """生成随机请求头"""
    header = Headers(
        browser=
        browser,  # str, chrome/firefox/opera. User Agent browser. Default: random
        os=ops,  # str, win/mac/lin. OS of User Agent. Default: random
        headers=
        random_args  # bool, True/False. Generate random headers or no. Default: False
    )
    headers = header.generate()
    for key, value in kwargs.items():
        headers[key] = value
    return headers
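Because generate_header() copies any extra keyword arguments into the result verbatim, a caller can pin individual fields; the values below are illustrative only:

# Hypothetical call: extra kwargs become literal header fields.
custom = generate_header(browser='firefox', ops='mac', random_args=True,
                         Referer='https://example.com/')
print(custom['Referer'], custom['User-Agent'])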
Example no. 17
 def __init__(self,
              request_name,
              New_updated_data=[],
              links_storage=c.Links_storage):
     self.New_updated_data = New_updated_data
     self.redisDB = redis.Redis(db=1)
     self.request_name = request_name
     if len(request_name) == 0:
         return
     self.links_storage = links_storage
     self.session = requests.Session()
     headers = Headers(browser="chrome", os="win", headers=True)
     self.session.headers = headers.generate()
     self.products = {}
Example no. 18
async def parse_page(redis, url: str, session) -> None:
    header = Headers()

    async with session.get(url,
                           headers=header.generate(),
                           ssl=False,
                           allow_redirects=True,
                           proxy=random_proxy()) as resp:
        current_netloc = urlparse(url).netloc
        # Get the url's parent
        try:
            domain = await Domain.query.where(
                Domain.domain == f'http://{current_netloc}').gino.first()
        except Exception as e:
            logging.error(f'Failed at finding {current_netloc}', exc_info=True)

        # Break out 403 errors for multiple tries
        if resp.status in [403, 429]:
            redis.hincrby("403errors", url, 1)
            await redis.srem('domainbeingcrawled:active', current_netloc)
            number_of_errors = await redis.hget('403errors', url)
            number_of_errors = int(number_of_errors.decode('utf8'))
            if number_of_errors >= 5:
                await Page.create(page=url,
                                  errors=[],
                                  page_response=resp.status,
                                  domain=domain.id)
                await redis.srem('pagestobecrawled:queue', url)

            return
        soup = BeautifulSoup(await resp.text(), "html.parser")
        visible_words = get_text(soup)
        wrong_words = await check_if_spelled_right(redis, words=visible_words)

        try:
            await Page.create(page=url,
                              errors=wrong_words,
                              page_response=resp.status,
                              domain=domain.id)
            await extract_and_queue_local_links(soup=soup,
                                                root_domain=resp.host,
                                                redis=redis)
        except Exception as e:
            logging.error(e)
        print(f'successfully processed {url}')
        print(f'About to pop {current_netloc}')
        await redis.srem('pagestobecrawled:queue', url)
        await redis.srem('domainbeingcrawled:active', current_netloc)
        print('popped!')
Example no. 19
 def __init__(self):
     self.session = requests.Session()
     headers = Headers(browser="chrome", os="win", headers=True)
     self.session.headers = headers.generate()
     self.links = [
         [
             'https://wbxcatalog-ru.wildberries.ru/nm-2-card/catalog?spp=0&pricemarginCoeff=1.0&reg=0&appType=1&offlineBonus=0&onlineBonus=0&emp=0&locale=ru&lang=ru&curr=rub&nm=IDS;',
             "W_iD"
         ],
         [
             'https://api.retailrocket.net/api/1.0/partner/5ba1feda97a5252320437f20/items/?itemsIds=IDS&stock=&format=json',
             "E_iD"
         ],
         ['https://my-shop.ru/cgi-bin/shop2.pl?q=product&id=IDS', "M_iD"]
     ]
     self.result = []
Example no. 20
    def init_driver(browser_name):
        def set_properties(browser_option):
            ua = Headers().generate()['User-Agent']      #fake user agent string
            browser_option.add_argument('--headless')
            browser_option.add_argument('--disable-extensions')
            browser_option.add_argument('--incognito')
            browser_option.add_argument('--disable-gpu')
            browser_option.add_argument('--log-level=3')
            browser_option.add_argument(f'user-agent={ua}')
            browser_option.add_argument('--disable-notifications')
            browser_option.add_argument('--disable-popup-blocking')
            return browser_option
        try:
            browser_name = browser_name.strip().title()

            ua = Headers().generate()      #fake user agent
            #automating and opening URL in headless browser
            if browser_name.lower() == "chrome":
                browser_option = ChromeOptions()
                browser_option = set_properties(browser_option)    
                driver = webdriver.Chrome(ChromeDriverManager().install(),options=browser_option) #chromedriver's path in first argument
            elif browser_name.lower() == "firefox":
                browser_option = FirefoxOptions()
                browser_option = set_properties(browser_option)
                driver = webdriver.Firefox(executable_path= GeckoDriverManager().install(),options=browser_option)
            else:
                driver = "Browser Not Supported!"
            return driver
        except Exception as ex:
            print(ex)
Example no. 21
def mainChecker(proxy_type, proxy, position):

    checked[position] = None

    proxyDict = {
        "http": f"{proxy_type}://{proxy}",
        "https": f"{proxy_type}://{proxy}",
    }

    try:

        header = Headers(headers=False).generate()
        agent = header['User-Agent']

        headers = {
            'User-Agent': f'{agent}',
        }

        response = requests.get('https://www.youtube.com/',
                                headers=headers,
                                proxies=proxyDict,
                                timeout=30)
        status = response.status_code

        print(bcolors.OKBLUE + f"Tried {position+1} |" + bcolors.OKGREEN +
              f' {proxy} | GOOD | Type : {proxy_type} | Response : {status}' +
              bcolors.ENDC)

        print(proxy, file=open('GoodProxy.txt', 'a'))

    except:
        print(bcolors.OKBLUE + f"Tried {position+1} |" + bcolors.FAIL +
              f' {proxy} | {proxy_type} |BAD ' + bcolors.ENDC)
        checked[position] = proxy_type
        pass
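mainChecker() reports failures by writing the proxy type into a shared checked list indexed by position, so a driver has to size that list before the workers start. A hedged sketch; the proxy addresses and the threading layout are assumptions:

# Hypothetical driver for mainChecker(); the proxies below are placeholders.
import threading

proxy_list = ["10.0.0.1:8080", "10.0.0.2:3128"]
checked = [None] * len(proxy_list)  # the global list mainChecker() writes into

threads = [threading.Thread(target=mainChecker, args=("http", p, i))
           for i, p in enumerate(proxy_list)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(checked)  # non-None entries mark proxies that failed the check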
Example no. 22
def link_checker(shop):
    shop_name = shop['name']
    fail_url = shop['fail_url']
    for item in shop['items']:
        item_name = item['name']
        headers = Headers(headers=True).generate()
        try:
            response = requests.get(item['link'],
                                    allow_redirects=True,
                                    headers=headers,
                                    timeout=5)
            if response.status_code == 200 and response.url == fail_url:
                status = 'OOS'
                log_result(shop_name, item_name, status)
            elif response.status_code == 200 and response.url != fail_url:
                status = format_hyperlink(item['link'], 'IN STOCK')
                log_result(shop_name, item_name, status)
            elif response.status_code == 404:
                log_result(shop_name, item_name, 'PAGE NOT FOUND')
            else:
                log_result(shop_name, item_name,
                           'ERROR: RESPONSE CODE ' + str(response.status_code))
        except requests.exceptions.Timeout as t:
            log_result(shop_name, item_name, 'TIMEOUT')
        except requests.exceptions.TooManyRedirects as t:
            log_result(shop_name, item_name, str(t))
        except requests.exceptions.RequestException as e:
            log_result(shop_name, item_name, str(e))
Example no. 23
def yandex_parser(m_class):
    global https_p
    res = []
    _https = https_p.copy()
    urls = set()
    flag = True
    count = 0
    for i in range(len(_https)):
        try:
            htps = random.choice(_https)
            print(htps)
            _https.remove(htps)
            proxies = {
                'https': "https://" + htps}
            url = "https://yandex.ru/images/search?text=" + str(m_class) + str(i)
            page = requests.get(url, headers=Headers().generate(), proxies=proxies)
            soup = BeautifulSoup(page.text, "html.parser")
                
            result = soup.find_all("div", {"class":"serp-item"}, limit=50)
            for r in result:
                if count >3:
                    return list(urls)
                jsonify = json.loads(r["data-bem"])
                urls.add(jsonify['serp-item']['preview'][0]['url'])
                if len(urls)>=1:
                    res.append(htps)
                    count+=1
                    continue
                else:
                    continue
            return 
        except:
            pass
    return res
Example no. 24
    def get_image_links(self,chapter_number):
        """returns all image links present for manga chapter"""
        
        URLS = self.get_links()             #all chapters, method from Chapter_list class(base class)
   
        headers = Headers(headers=False).generate()        
        
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) #disable warning
        
        response = requests.get(URLS[int(chapter_number)], headers=headers, verify=False)            #request the selected chapter

        if response.status_code < 500 and response.status_code >= 400:   # client error (4xx)
            print("Server Error!\nTry Again later")
            exit()

        if response.status_code >= 200 and response.status_code < 300:
            #if response is success

            soup = BeautifulSoup(response.content,"html.parser") #make bs4 object with response's content and parser is html.parser   
            
            paragraph = soup.find("p",{"id":"arraydata"})

            all_image_hrefs = paragraph.text.split(',')

            return all_image_hrefs
Example no. 25
 def run(self):
     # start timing
     pure_ip_address = self.proxyip.split(':')[0]
     # verify where the IP is registered
     if not getChinaIP(pure_ip_address):
         pass
         # raise ValueError('not a valid IP')
     #
     start = time.time()
     # suppress the warning for disabled certificate verification
     urllib3.disable_warnings()
     headers = Headers(headers=True).generate()
     headers['Referer'] = 'http://ga.314300.cn/toupiao/user40.html'
     headers['Pragma'] = 'no-cache'
     # headers['Host'] = 'ga.314300.cn'
     # headers['x-forward-for'] = pure_ip_address
     headers['Cookie'] = 'ASPSESSIONIDSAACBBBS=HOPLOAJDCHIIHBFNLIODPLJL'
     # print(headers)
     headers[
         'User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/5.3'
     html = requests.get(headers=headers,
                         url=targetUrl,
                         proxies={
                             "http": 'http://' + self.proxyip
                         },
                         verify=False,
                         timeout=12).content.decode()
     # stop timing
     end = time.time()
     # print the result
     print(threading.current_thread().getName() + " via proxy IP, took " +
           str(end - start) + " seconds, " + self.proxyip + " fetched the following HTML:\n" +
           html + "\n*************")
Example no. 26
 def run(self):
     # start timing
     pure_ip_address = self.proxyip.split(':')[0]
     # verify where the IP is registered
     if not getChinaIP(pure_ip_address):
         # pass
         raise ValueError('not a valid IP')
     #
     start = time.time()
     # suppress the warning for disabled certificate verification
     urllib3.disable_warnings()
     headers = Headers(headers=True).generate()
     headers[
         'Referer'] = 'http://bb.cf08tp.cn/Home/index.php?m=Index&a=index&id=2676'
     headers['Pragma'] = 'no-cache'
     headers['Host'] = 'bb.cf08tp.cn'
     headers['x-forward-for'] = pure_ip_address
     headers['Cookie'] = 'PHPSESSID={}'.format(''.join(
         str(uuid.uuid1()).split('-')))
     print(headers)
     html = requests.get(headers=headers,
                         url=targetUrl,
                         proxies={
                             "http": 'http://' + self.proxyip,
                             "https": 'https://' + self.proxyip
                         },
                         verify=False,
                         timeout=2).content.decode()
     # stop timing
     end = time.time()
     # print the result
     print(threading.current_thread().getName() + " via proxy IP, took " +
           str(end - start) + " seconds, " + self.proxyip + " fetched the following HTML:\n" +
           html + "\n*************")
Example no. 27
def gosreestr_parse_companies(fpath: str, struct=None):

    page_index = 23
    s = requests.Session()
    headers = Headers(headers=True)

    form_data = prepare_request_data(FORM_DATA, page_index)

    table_raw = s.post(LIST_URL, data=form_data).text
    mapping = {
        f.name: f.metadata['label_key']
        for f in attr.fields(GosreestrCompany)
    }

    timeout_error = False

    while not check_empty_table(table_raw):
        ids = parse_ids_from_table(table_raw)
        if not timeout_error:
            for _id in ids:
                url = DETAIL_URL.format(_id)
                try:
                    s.headers = headers.generate()
                    company_raw = s.get(url, timeout=10).text
                except (ReadTimeout, ConnectTimeout, ConnectionError,
                        ReadTimeoutError):
                    print('company request ban')
                    timeout_error = True
                    sleep(90)
                    continue  # company_raw was never set for this id; skip it
                else:
                    timeout_error = False
                d = parse_company_info(company_raw, mapping)
                print(d)
                # sleep(15)
        page_index += 1
        form_data = prepare_request_data(FORM_DATA, page_index)
        sleep(300)
        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=10).text
        except (ReadTimeout, ConnectTimeout, ConnectionError,
                ReadTimeoutError):
            print('table request ban')
            timeout_error = True
            sleep(300)
        else:
            timeout_error = False
Example no. 28
def startsHere():

    header = Headers()

    uheaders = header.generate()

    users = list(map(lambda x: x["_id"], db["users"].find({"viewed": False})))

    url = "https://www.reddit.com/user/"

    for usr in users:
        try:
            nurl = url + usr + "/.json?limit=1000"
            scrap(nurl, uheaders)
        except Exception:
            pass
        db["users"].update_one({"_id": usr}, {"$set": {"viewed": True}})
Example no. 29
 def get_header(self, browser=None, os=None, headers=None):
     if not browser:
         browser = "chrome"
     if not os:
         os = "win"
     if not headers:
         headers = False
     return Headers(browser=browser, os=os, headers=headers)
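Note that get_header() returns the Headers generator itself rather than a finished header dict, so a caller still has to invoke generate(); obj below stands for a hypothetical instance of the surrounding class:

# Hypothetical call chain for the helper above.
headers_dict = obj.get_header(browser="firefox", headers=True).generate()
print(headers_dict["User-Agent"])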
Example no. 30
def get_page(uri):
    """
    Reads a webpage given the URI.
    """

    # make request for uri
    HeadersGenerator = Headers(os='mac', headers=False)
    response = requests.get(uri, headers=HeadersGenerator.generate())

    # check status code
    status_code = response.status_code
    if status_code != 200:
        print(status_code)

    # get and return content as bytes
    content = response.content
    return content
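A trivial hedged usage of get_page(); the URI is a placeholder:

# get_page() returns the raw response body as bytes.
html_bytes = get_page("https://example.com/")
print(len(html_bytes))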