async def cb_sticker(event):
    "Get a list of sticker packs matching the given name."
    split = event.pattern_match.group(1)
    if not split:
        return await edit_delete(event, "`Provide some name to search for pack.`", 5)
    catevent = await edit_or_reply(event, "`Searching sticker packs....`")
    scraper = cloudscraper.create_scraper()
    text = scraper.get(combot_stickers_url + split).text
    soup = bs(text, "lxml")
    results = soup.find_all("div", {"class": "sticker-pack__header"})
    if not results:
        return await edit_delete(catevent, "`No results found :(.`", 5)
    reply = f"**Sticker packs found for {split} are :**"
    for pack in results:
        if pack.button:
            packtitle = (pack.find("div", "sticker-pack__title")).get_text()
            packlink = (pack.a).get("href")
            packid = (pack.button).get("data-popup")
            reply += f"\n **• ID: **`{packid}`\n [{packtitle}]({packlink})"
    await catevent.edit(reply)
def create_scrapper_session(useCloudscraper=True, retries=10, backoff_factor=0.3,
                            status_forcelist=(500, 502, 504, 423)):
    session = None
    if useCloudscraper:
        session = cloudscraper.create_scraper()
    else:
        session = Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
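# A minimal, hypothetical usage sketch for create_scrapper_session above. The
# imports mirror the names the factory references (cloudscraper, Session,
# HTTPAdapter, Retry); the URL and retry values are illustrative assumptions.
import cloudscraper
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = create_scrapper_session(useCloudscraper=True, retries=5, backoff_factor=0.5)
resp = session.get("https://example.com/")  # placeholder URL
print(resp.status_code)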
def test_user_agent(self, **kwargs):
    for browser in ['chrome', 'firefox']:
        scraper = cloudscraper.create_scraper(browser=browser)
        assert browser in scraper.headers['User-Agent'].lower()

    # Check it can't find browsers.json
    with pytest.raises(RuntimeError, match=r".*?User-Agent was not found\."):
        scraper = cloudscraper.create_scraper(browser='bad_match')

    # Check mobile and desktop disabled
    with pytest.raises(
        RuntimeError,
        match=r"Sorry you can't have mobile and desktop disabled at the same time\."
    ):
        scraper = cloudscraper.create_scraper(browser={
            'browser': 'chrome',
            'desktop': False,
            'mobile': False
        })

    # check brotli
    scraper = cloudscraper.create_scraper(browser='chrome', allow_brotli=False)
    assert 'br' not in scraper.headers['Accept-Encoding']

    # test custom User-Agent
    scraper = cloudscraper.create_scraper(browser={'custom': 'test'})
    assert scraper.headers['User-Agent'] == 'test'

    # check it matched chrome and loaded the correct cipherSuite
    scraper = cloudscraper.create_scraper(browser={
        'custom': '50.0.9370.394',
        'tryMatchCustom': True
    })
    assert any('!' not in _ for _ in scraper.user_agent.cipherSuite)

    # check it didn't match anything and loaded the custom cipherSuite
    scraper = cloudscraper.create_scraper(browser={
        'custom': 'aa50.0.9370.394',
        'tryMatchCustom': True
    })
    assert any('!' in _ for _ in scraper.user_agent.cipherSuite)
def query(self):
    """Query the API for subdomains and match them against the target domain."""
    time.sleep(self.delay)
    # Bypass Cloudflare verification
    scraper = cloudscraper.create_scraper()
    scraper.interpreter = 'js2py'
    scraper.proxies = self.get_proxy(self.source)
    url = self.addr + self.domain
    try:
        resp = scraper.get(url, timeout=self.timeout)
    except Exception as e:
        logger.log('ERROR', e.args)
        return
    if not resp:
        return
    subdomains = self.match(self.domain, str(resp.json()))
    # Merge the subdomain search results
    self.subdomains = self.subdomains.union(subdomains)
def __init__(self, profileJson, _found):
    self.profielPropeties = profileJson
    self.found = _found
    if _found:
        self.cSession = cloudscraper.create_scraper()
        self.display_name = profileJson["displayName"]
        self.username = profileJson["username"]
        self.bio = profileJson["aboutMe"]
        self.avatar_url = profileJson["avatarFileName"]
        self.user_id = profileJson["id"]
        self.followersCount = profileJson["followerCount"]
        self.anonymousFollowerCount = profileJson["anonymousFollowerCount"]
        self.isFollowingcount = profileJson["followingCount"]
        self.active = profileJson["isActive"]
        self.tellsCount = profileJson["tellCount"]
        self.tells = []
        self.followers = []
        self.followings = []
    else:
        return
def cb_sticker(update: Update, context: CallbackContext):
    msg = update.effective_message
    split = msg.text.split(" ", 1)
    if len(split) == 1:
        msg.reply_text("Provide some name to search for pack.")
        return
    scraper = cloudscraper.create_scraper()
    text = scraper.get(combot_stickers_url + split[1]).text
    soup = bs(text, "lxml")
    results = soup.find_all("a", {"class": "sticker-pack__btn"})
    titles = soup.find_all("div", "sticker-pack__title")
    if not results:
        msg.reply_text("No results found :(.")
        return
    reply = f"Stickers for *{split[1]}*:"
    for result, title in zip(results, titles):
        link = result["href"]
        reply += f"\n• [{title.get_text()}]({link})"
    msg.reply_text(reply, parse_mode=ParseMode.MARKDOWN)
def linkExtract():
    target = input("[+] Target ( like https://google.com/ ) with http/https \t")
    name = "links"
    scraper = cloudscraper.create_scraper()
    source = scraper.get(target).text
    soup = BeautifulSoup(source, 'html.parser')
    links = []
    for link in soup.find_all(attrs={'href': re.compile("http")}):
        links.append(link.get('href'))
    for link in soup.find_all(attrs={'href': re.compile("https")}):
        links.append(link.get('href'))
    print("\n")
    print(*set(links), sep="\n")
    with open(home + "/webgather/" + name + "-links.txt", 'w+') as f:
        for item in links:
            f.write("%s\n" % item)
    print(Fore.LIGHTMAGENTA_EX +
          "\n [!] Found {0} links (some were duplicates). Results also saved in {1}/webgather/{2}-links.txt \n"
          .format(len(links), home, name))
    input("")
    recreate()
def bgDownloadAssets():
    global config
    scraper = cloudscraper.create_scraper()
    apkPage = BeautifulSoup(
        scraper.get("https://apkpure.com/golf-blitz/com.noodlecake.ssg4/download").text,
        features="html.parser")
    apkLink = apkPage.find("a", id="download_link")['href']
    apkVersion = apkPage.find("span", attrs={"class": "file"}).text
    if "apkVersion" not in config:
        config["apkVersion"] = ""
    if config["apkVersion"] != apkVersion:
        print("downloading new apk for assets")
        apkPath = os.path.join(bot_globals.resources_path, "golfblitz.apk")
        with scraper.get(apkLink, stream=True) as dl:  # requests.get(apkLink, stream=True)
            with open(apkPath, "wb") as f:
                for chunk in dl.iter_content(chunk_size=16384):
                    f.write(chunk)
        with zipfile.ZipFile(apkPath, 'r') as to_unzip:
            to_unzip.extractall(bot_globals.resources_path)
        print("apk has been downloaded and extracted")
        config["apkVersion"] = apkVersion
        json.dump(config, open(os.path.join(confPath, "main-configuration.json"), 'w'))
        bot_globals.update_hats_and_golfers()
def get_lab_urls(date):
    '''Returns all 4 lab urls from poelab.com.
    Returns None for each if the date on poelab doesn't match the provided date
    (i.e. the site has not been updated yet).
    date format: %Y-%m-%d'''
    labpages = []
    ret = []
    scraper = cloudscraper.create_scraper()
    with scraper.get('https://www.poelab.com/') as r:
        etree = lxmlhtml.fromstring(r.text)
        labpages = etree.xpath('//h2/a[@class="redLink"]/@href')
    for url in reversed(labpages[:4]):
        with scraper.get(url) as r:
            etree = lxmlhtml.fromstring(r.text)
            t = etree.xpath('//img[@id="notesImg"]/@src')
            if t:
                t = t[0]
            if date not in t:
                ret.append(None)
            else:
                ret.append(t)
    return ret
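# Hypothetical usage sketch for get_lab_urls above: the date argument is a
# %Y-%m-%d string, so passing today's date asks whether poelab.com has been
# updated yet today. Each entry is an image URL or None.
from datetime import datetime

today = datetime.now().strftime('%Y-%m-%d')
for lab_url in get_lab_urls(today):
    print(lab_url or 'not updated yet')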
def __init__(
    self,
    language="English",
    client_id="KzEZED7aC0vird8jWyHM38mXjNTY",
    client_secret="W9JZoJe00qPvJsiyCGT3CCtC6ZUtdpKpzMbNlUGP",
):
    self.language = language
    self.client_id = client_id
    self.client_secret = client_secret
    self.account = None
    self.access_token = None
    self.refresh_token = None
    # Using cloudscraper over a simple session allows us to
    # get around Cloudflare.
    self.session = cloudscraper.create_scraper()
    self.session.headers.update(HEADERS)
    if self.language:  # pragma: no cover
        self.session.headers.update({"Accept-Language": self.language})
def update_user_agents() -> None:
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    to_store = ua_parser(r.text)
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def scrape(research_network: str):
    ids = get_ids(research_network, 2272481)
    data = get_existing_data(research_network)
    completed_ids = set([d["id"] for d in data])
    scraper = cloudscraper.create_scraper()
    for id in tqdm(ids):
        if id in completed_ids:
            continue
        url = f"https://papers.ssrn.com/sol3/papers.cfm?abstract_id={id}"
        response = scraper.get(url)
        save_abstract_data(research_network, {
            "id": id,
            "url": url,
            "error": response.status_code != 200,
            "status_code": response.status_code,
            "html": remove_blank_space(response.text),
        })
def anifo(URL):
    URL = d_url(URL)
    # URL = 'https://www.oploverz.in/series/one-piece-sub-indo/'
    # scraper = cfscrape.create_scraper()
    scraper = cloudscraper.create_scraper()
    soup = BeautifulSoup(scraper.get(URL).content, 'html.parser')
    desc = soup.find('span', class_='desc')
    listinfo = soup.find('div', class_='listinfo')
    img_des = soup.find('div', class_='imgdesc')
    img = img_des.findChildren("img", recursive=False)
    a = desc.prettify(formatter="html5")
    b = listinfo.prettify(formatter="html5")
    c = a.replace('"', "\"")
    d = b.replace('"', "\"")
    e = html.escape(c)
    f = html.escape(d)
    ret = {'desc': e, 'info': f, 'img': img[0]["src"]}
    jsona = json.dumps(ret)
    return jsona
def get_eps_list(URL):
    URL = d_url(URL)
    scraper = cloudscraper.create_scraper()
    soup = BeautifulSoup(scraper.get(URL).content, 'html.parser')
    episodelist = soup.find(class_='episodelist')
    ret = []
    for li in episodelist.find_all("li"):
        eps = li.find(class_="leftoff")
        judul = li.find(class_="lefttitle")
        dt = li.find(class_="rightoff")
        alink = eps.find("a")
        con = {
            'link': alink.get('href'),
            'eps': eps.get_text().strip(),
            'judul': judul.get_text().strip(),
            'date': dt.get_text().strip()
        }
        ret.append(con)
    return json.dumps(ret)
def get_jav_html(url_list: List[Union[int, str]]) -> str:
    """Fetch a javlibrary page, using cloudscraper to get past the Cloudflare check.

    :param url_list: [0]-errorTimes, [1]-url, [2]-proxy
    :return: scraper.text
    """
    scraper = cloudscraper.create_scraper(browser="chrome")
    while url_list[0] != 6:
        try:
            rqs: Response = Response()
            if len(url_list) == 2:
                rqs = scraper.get(url_list[1])
            elif len(url_list) == 3:
                rqs = scraper.get(url_list[1], proxies=url_list[2])
            rqs.encoding = 'utf-8'
            return rqs.text
        except Exception as e:
            sleep(5)
            if url_list[0] == 5:
                raise e
            url_list[0] += 1
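# Hypothetical usage sketch for get_jav_html above. The list protocol is:
# [0] retry counter (start at 0), [1] target URL, optional [2] proxies dict.
# The URL and proxy address below are placeholder assumptions.
page_html = get_jav_html([0, "https://www.javlibrary.com/en/"])
page_html_proxied = get_jav_html([
    0,
    "https://www.javlibrary.com/en/",
    {"https": "http://127.0.0.1:1080"},  # assumed local proxy, adjust as needed
])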
def pixabay_index(request):
    if not request.COOKIES.get('api'):
        return redirect('pixabay_ask_api')
    if request.method == 'GET':
        query = request.GET.get('q')
        if query:
            return render(
                request, 'stock_photos_explorer/pixabay-query.html', {
                    'title': 'Search Result for {} - Pixabay API Explorer'.format(query).title(),
                    'theme': 'light'
                })
        image = bs(
            cloudscraper.create_scraper().get('https://pixabay.com').content,
            'lxml').find('picture').find('source')['srcset']
        return render(request, 'stock_photos_explorer/pixabay-index.html', {
            'title': 'Pixabay API Explorer',
            'image': image
        })
def scrape():
    BASE_URL = "https://www.booksandcoupons.com/"
    course_list = []
    course_dict = {}
    udemy_url = []
    browser = {}
    # Create a cloudscraper session so requests can bypass Cloudflare protection.
    scraper = cloudscraper.create_scraper(browser)
    try:
        r = scraper.get(BASE_URL)  # base url
    except Exception:
        raise ValueError("Fetching site error")
    try:
        # Use BeautifulSoup to parse the HTML and extract the course information.
        soup = BS(r.text, "html.parser")
        header_courses = soup.find("h3", class_="post-title entry-title")
        for i, tag in enumerate(header_courses):
            course_list.insert(1, (tag.text, tag["href"]))
        courses = soup.find_all("h3", class_="post-title entry-title")
        for no, link in enumerate(courses):
            course_link = link.find("a")
            course_list.append((course_link.text, course_link["href"]))
    except Exception as error:
        print(error)
    for i, j in dict(course_list).items():
        course_dict.setdefault(i, j)
    for name, urls in course_dict.items():
        r2 = scraper.get(urls)
        if r2.status_code == 200:
            soup2 = BS(r2.text, "html.parser")
            tags2 = soup2.find_all("a", string="ENROLL NOW")
            if len(tags2) == 1:
                udemy_url.append((current_date, name, tags2[0]["href"]))
            else:
                udemy_url.append((current_date, name, tags2[1]["href"]))
    return udemy_url
def main():
    addseptag()
    print('Loading links and proxies...')
    addseptag()
    data, links, proxy = importLinkProxy()
    proxyList = []
    try:
        proxyList = setupProxyFromUser(proxy)
    except Exception:
        proxyList = setupProxyFromIp(proxy)
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }
    with open('2captcha.txt') as f:
        apiKey = f.read()
    for i in range(len(links)):
        scraper = cloudscraper.create_scraper(interpreter='nodejs',
                                              recaptcha={
                                                  'provider': '2captcha',
                                                  'api_key': apiKey
                                              })
        try:
            r = scraper.get(links[i], headers=headers, proxies=proxyList[i])
            if r.status_code == 200 or r.status_code == 301:
                print('Activation succeeded')
                status = '0'
                outputToCsvStatus(i, status, data)
            else:
                print('Activation failed')
                status = '-1'
                outputToCsvStatus(i, status, data)
        except Exception:
            print('Activation failed')
            status = '-1'
            outputToCsvStatus(i, status, data)
def __init__(self):
    self._destroyed = False
    self.executor = futures.ThreadPoolExecutor(max_workers=4)

    # Initialize cloudscraper
    try:
        self.scraper = cloudscraper.create_scraper(
            browser={
                'platform': 'linux',
                'mobile': False
            }
        )
    except Exception as err:
        logger.exception('Failed to initialize cloudscraper')
        self.scraper = Session()
    # end try

    # Must resolve these fields inside `read_novel_info`
    self.novel_title = 'N/A'
    self.novel_author = 'N/A'
    self.novel_cover = None
    self.is_rtl = False

    # Each item must contain these keys:
    # `id` - 1 based index of the volume
    # `title` - the volume title (can be ignored)
    self.volumes = []

    # Each item must contain these keys:
    # `id` - 1 based index of the chapter
    # `title` - the title name
    # `volume` - the volume id of this chapter
    # `volume_title` - the volume title (can be ignored)
    # `url` - the link where to download the chapter
    self.chapters = []

    # Other stuff - not necessary to resolve from the crawler instance.
    self.home_url = ''
    self.novel_url = ''
    self.last_visited_url = None
def WaxWalletLogin(login, password, userToken2fa, captcha):
    scraper = cloudscraper.create_scraper()
    data = {
        "password": password,
        "username": login,
        "g-recaptcha-response": captcha,
        "redirectTo": ""
    }
    response = json.loads(
        scraper.post("https://all-access.wax.io/api/session", data).text)
    print(response)
    data = {"code": Get2FA(userToken2fa), "token2fa": response["token2fa"]}
    response = scraper.post("https://all-access.wax.io/api/session/2fa", data)
    print(response.text)
    response = scraper.get("https://all-access.wax.io/api/session")
    print(response.text)
    result = {"token": json.loads(response.text)["token"]}
    print(result)
def xur_weapon():
    scraper = cloudscraper.create_scraper(
        delay=5, recaptcha={'provider': 'return_response'})
    html = get_xur('https://whereisxur.com/')
    soup = BeautifulSoup(html, 'html.parser')

    xur_first_weapon = soup.find('div', class_="et_pb_blurb_0").find(
        'h4', class_='et_pb_module_header').find('span').string
    url_first_weapon = soup.find(
        'div', class_="et_pb_blurb_0").find('noscript').find("img")["src"]
    first_img = scraper.get(url_first_weapon, stream=True)
    with open('xur_img/img_first_weapon.png', 'wb') as f:
        f.write(first_img.content)

    xur_second_weapon = soup.find('div', class_="et_pb_blurb_1").find(
        'h4', class_='et_pb_module_header').find('span').string
    url_second_weapon = soup.find(
        'div', class_="et_pb_blurb_1").find('noscript').find("img")["src"]
    second_img = scraper.get(url_second_weapon, stream=True)
    with open('xur_img/img_second_weapon.png', 'wb') as f:
        f.write(second_img.content)

    xur_third_weapon = soup.find('div', class_="et_pb_blurb_2").find(
        'h4', class_='et_pb_module_header').find('span').string
    url_third_weapon = soup.find(
        'div', class_="et_pb_blurb_2").find('noscript').find("img")["src"]
    third_img = scraper.get(url_third_weapon, stream=True)
    with open('xur_img/img_third_weapon.png', 'wb') as f:
        f.write(third_img.content)

    xur_fourth_weapon = soup.find('div', class_="et_pb_blurb_3").find(
        'h4', class_='et_pb_module_header').find('span').string
    url_fourth_weapon = soup.find(
        'div', class_="et_pb_blurb_3").find('noscript').find("img")["src"]
    fourth_img = scraper.get(url_fourth_weapon, stream=True)
    with open('xur_img/img_fourth_weapon.png', 'wb') as f:
        f.write(fourth_img.content)

    save_xur_weapon(db, xur_first_weapon, xur_second_weapon, xur_third_weapon,
                    xur_fourth_weapon)
def fetch_course_data_by_class_id(self, class_id):
    url = 'https://api.skillshare.com/classes/{}'.format(class_id)
    scraper = cloudscraper.create_scraper(browser={
        'custom': 'Skillshare/4.1.1; Android 5.1.1',
    }, delay=10)
    res = scraper.get(
        url,
        headers={
            'Accept': 'application/vnd.skillshare.class+json;,version=0.8',
            'User-Agent': 'Skillshare/5.3.0; Android 9.0.1',
            'Host': 'api.skillshare.com',
            'Referer': 'https://www.skillshare.com/',
            'cookie': self.cookie,
        })
    if res.status_code != 200:
        raise Exception('Fetch error, code == {}'.format(res.status_code))
    return res.json()
def on_task_input(self, task, config):
    try:
        import cloudscraper
    except ImportError as e:
        logger.debug('Error importing cloudscraper: {}', e)
        raise plugin.DependencyError(
            issued_by='cfscraper',
            missing='cloudscraper',
            message='cloudscraper module required. ImportError: %s' % e,
        )
    scraper = cloudscraper.create_scraper()
    category = config['category']
    persistence = SimplePersistence(plugin='magnetdl')
    last_magnet = persistence.get(category, None)
    logger.debug('last_magnet: {}', last_magnet)
    first_magnet = None
    stop = False
    for page in range(0, config['pages']):
        logger.verbose('Retrieving {} page {}', category, page + 1)
        url = self._url(category, page)
        logger.debug('Url: {}', url)
        try:
            for entry in self.parse_page(scraper, url):
                if first_magnet is None:
                    first_magnet = entry['url']
                    logger.debug('Set first_magnet to {}', first_magnet)
                    persistence[category] = first_magnet
                if last_magnet == entry['url']:
                    logger.debug('Found the page where we left off, stopping')
                    stop = True
                yield entry
        except Page404Error:
            logger.warning('Page {} returned 404, stopping', page)
            return
        if stop:
            return
        time.sleep(random.randint(1, 5))
def json_download(self, chapter_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }
    sess = requests.session()
    sess = cloudscraper.create_scraper(sess)
    search_url = "http://www.mangaeden.com/api/manga/{0}/".format(chapter_id)
    connection = sess.get(search_url, headers=headers)
    if connection.status_code != 200:
        print("Whoops! Seems like I can't connect to website.")
        print("It's showing : %s" % connection)
        print("Run this script with the --verbose argument and report the issue along with log file on Github.")
        sys.exit(1)
    else:
        json_data = connection.content
        return json_data
def _get_category(query):
    url = f"https://api.leboncoin.fr/api/parrot/v1/complete?q={query.replace(' ', '%20')}"
    anti_captcha = create_scraper(browser="chrome")
    res = anti_captcha.get(url).json()
    print(type(res))
    if res:
        assert isinstance(
            res, list), f"Unexpected answer received from API: {res!r}"
        return str(res[0]["cat_id"])
    else:
        # No category returned
        return None
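# Hypothetical usage sketch for _get_category above (assumes
# `from cloudscraper import create_scraper` is in scope, since the function
# calls the bare create_scraper name). The query string is only an example.
category_id = _get_category("velo electrique")
print(category_id)  # category id as a string, or None if nothing matched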
def update_user_agents():
    if not HAS_CF:
        # The website with the UAs is behind Cloudflare's anti-bot page, we need cloudscraper
        return
    today = datetime.now()
    ua_path = get_homedir() / 'user_agents' / str(today.year) / f'{today.month:02}'
    safe_create_dir(ua_path)
    ua_file_name: Path = ua_path / f'{today.date().isoformat()}.json'
    if ua_file_name.exists():
        # Already have a UA for that day.
        return
    try:
        s = cloudscraper.create_scraper()
        r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
    except Exception:
        traceback.print_exc()
        return
    soup = BeautifulSoup(r.text, 'html.parser')
    uas = soup.find_all('textarea')[1].text
    to_store = {'by_frequency': []}
    for ua in json.loads(uas):
        os = ua['system'].split(' ')[-1]
        if os not in to_store:
            to_store[os] = {}
        browser = ' '.join(ua['system'].split(' ')[:-1])
        if browser not in to_store[os]:
            to_store[os][browser] = []
        to_store[os][browser].append(ua['useragent'])
        to_store['by_frequency'].append({
            'os': os,
            'browser': browser,
            'useragent': ua['useragent']
        })
    with open(ua_file_name, 'w') as f:
        json.dump(to_store, f, indent=2)
def update_addon(self, addon_name):
    addon_link = self.config['addons'][addon_name]['link']
    addon_last_update = self.config['addons'][addon_name]['last_update']
    scraper = cloudscraper.create_scraper()
    r = scraper.get(addon_link)
    soup = BeautifulSoup(r.text, features='html.parser')
    addon_name = soup.find("meta", property="og:title")["content"]
    last_update = soup.find('abbr')['title']
    converted_time = self.convert_datetime(last_update.split()[:4])
    if converted_time > addon_last_update:
        self.remove_addon(addon_name)
        download_page = scraper.get(f'{addon_link}/download')
        download_soup = BeautifulSoup(download_page.text, features='html.parser')
        link = download_soup.find('p', {'class': 'text-sm'}).find('a')['href']
        download_link = f'http://www.curseforge.com{link}'
        files = scraper.get(download_link)
        existing_addons = os.listdir(self.addon_path)
        with open(os.path.join(self.addon_path, 'addon.zip'), 'wb') as f:
            f.write(files.content)
        with ZipFile(os.path.join(self.addon_path, 'addon.zip'), 'r') as zipobj:
            zipobj.extractall(self.addon_path)
        os.remove(os.path.join(self.addon_path, 'addon.zip'))
        all_addons = os.listdir(self.addon_path)
        new_files = [x for x in all_addons if x not in existing_addons]
        self.config['addons'][addon_name]['last_update'] = converted_time
        self.config['addons'][addon_name]['files'] = new_files
        self.save_config()
def create_novel(self):
    try:
        print("Initializing...")
        scrapper = cloudscraper.create_scraper()
        page = scrapper.get(self.novel_link)
        soup = BeautifulSoup(page.text, 'html.parser')

        # Get Novel Name
        self.novel_name = soup.find(class_='title').get_text()

        # Get the html that stores links to each chapter
        chapters = soup.find_all(class_='rowChapter')

        # Get all the specified links from the html
        chapter_links = []
        for chapter in chapters:
            chapter_links.append(chapter.find('a').get('href'))
        chapter_links.reverse()  # Reverse the list so the first index will be the first chapter

        print("Starting...")
        book = EpubEngine(self.novel_name, self.storage_path)
        book.addCover(self.storage_path + "/cover.png")
        print("Added Cover")
        current_chapter = 1
        self.download_chapters(current_chapter, scrapper, chapter_links, book)
        book.createEpub()
        self.update_gui('END')
    except Exception as e:
        if 'Missing Node.js' in str(e):
            self.update_gui("NODEJS")
        else:
            print(e)
            self.update_gui('ERROR')
def Scrap_For_TOC(url):
    """scrapes the html for the ToC information"""
    issue = ""
    publication_type = ""
    publication_info = []
    scraper = cloudscraper.create_scraper()
    web_text = scraper.get(url).text
    soup = BeautifulSoup(web_text, features="lxml")
    temp_soup = soup.get_text().replace('\n\n', '')
    temp_soup = ''.join(temp_soup)
    temp_soup = temp_soup.split('\n')
    parsed_soup = []
    start_stop_parsing = 0
    start_stop_pub_info = 0
    # x = 0
    for line in temp_soup:  # soup.get_text():
        # print(line)
        if 'Facebook pageRSS FeedsMost recent' in line:
            start_stop_parsing = 1
            # print("start_stop = 1")
        elif 'ToolsSubmit an Article' in line:
            start_stop_parsing = 0
            # print("start_stop = 0")
        if start_stop_parsing == 1:
            parsed_soup.append(str(line))
            # print(line)
        if 'Select / Deselect allExport Citation(s)Export' in line:
            issue = line
        if 'Open Access' in line:
            start_stop_pub_info = 1
        elif 'Full text' in line:
            start_stop_pub_info = 0
        if start_stop_pub_info == 1 and start_stop_parsing == 1:
            publication_info.append(line)
            # x += 1
    print(issue)
    print(publication_info)
def gethtml(url, req='', headers='', interpreter='nodejs'):
    # session = requests.session()
    # session = cfscrape.create_scraper()
    session = cloudscraper.create_scraper(interpreter=interpreter)
    session.mount('file://', LocalFileAdapter())
    cookies_ = ConfigParser()
    cookies_.read('cookies')
    session.cookies['sess_id'] = cookies_.get('COOKIES', 'sess_id')
    session.cookies['session_id'] = cookies_.get('COOKIES', 'sess_id')
    # lang, lang2, forcesub, forceusa, localizecookies, quality, onlymainsub, connection_n_, proxy_ = config()
    config_ = config()
    if config_['forceusa']:
        session.cookies['sess_id'] = cookies_.get('COOKIES', 'sess_id_usa')
        session.cookies['session_id'] = cookies_.get('COOKIES', 'sess_id_usa')
        del session.cookies['c_visitor']
    if not config_['forceusa'] and config_['localizecookies']:
        session.cookies['c_locale'] = \
            {u'Español (Espana)': 'esES', u'Français (France)': 'frFR',
             u'Português (Brasil)': 'ptBR', u'English': 'enUS',
             u'Español': 'esLA', u'Türkçe': 'enUS', u'Italiano': 'itIT',
             u'العربية': 'arME', u'Deutsch': 'deDE',
             u'Русский': 'ruRU'}[config_['language']]
    if not urlparse(url).scheme and not urlparse(url).netloc:
        print('Apparently not a URL')
        sys.exit()
    if headers == '':
        headers = {
            'Referer': 'http://crunchyroll.com/',
            'Host': 'www.crunchyroll.com',
            'User-Agent': 'Mozilla/5.0 Windows NT 6.1; rv:26.0 Gecko/20100101 Firefox/26.0'
        }
    res = session.get(url, params=req, headers=headers)
    res.encoding = 'UTF-8'
    # print(session.get(url, params=req, headers=headers).url)
    # open('page.html', 'a', encoding='UTF-8').write(res.text)
    return res.text