Example 1
def parse_pagination(html) -> Tuple[List, str]:
    """find all job links and next page link in pagination html"""
    print("Parse Pagination!")
    sel = Selector(text=html)
    jobs = sel.css('div.item h3 a::attr(href)').extract()
    next_page = sel.css('a[aria-label=Next]::attr(href)').extract_first()
    return jobs, next_page
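A minimal usage sketch (not from the original source): since parse_pagination() returns the page's job links together with the next-page href, a short loop can walk the whole listing. The fetch() helper below is an assumption, and a relative next-page href may still need to be joined to the site root.
import requests

def fetch(url):
    # hypothetical downloader; the original project may fetch pages differently
    return requests.get(url, timeout=10).text

def crawl_all_jobs(start_url):
    """Follow the next_page links returned by parse_pagination() and collect every job href."""
    jobs, url = [], start_url
    while url:
        page_jobs, url = parse_pagination(fetch(url))
        jobs.extend(page_jobs)  # note: url may be relative and need urljoin() against the site root
    return jobs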
Example 2
def parse_gallery(html):
    title = html.css("a::text").get()
    url = html.css("a::attr(href)").get()
    slides = []

    gallery_html = download(ROOT + url)
    gallery_page_document = Selector(text=gallery_html)
    for slide_element in gallery_page_document.css(".rsSlideContent"):
        slide = slide_element.css("img::attr(src)").get()
        slides = slides + [slide]

    next_gallery_page_url = gallery_page_document.css(
        ".gall_next_page > a::attr(href)"
    ).get()
    while next_gallery_page_url is not None:
        gallery_html = download(ROOT + url + next_gallery_page_url)
        gallery_page_document = Selector(text=gallery_html)
        for slide_element in gallery_page_document.css(".rsSlideContent"):
            slide = slide_element.css("img::attr(src)").get()
            slides = slides + [slide]
        next_gallery_page_url = gallery_page_document.css(
            ".gall_next_page > a::attr(href)"
        ).get()

    slides = [slide for slide in slides if slide is not None]

    return (title, url, GalleryContent(slides), None)
Example 3
def test_check_server():
    agent = make_connected_agent()
    agent.start(auto_register=False)
    port = random.randint(5000, 9999)
    agent.web.start(hostname="0.0.0.0", port=port)

    # wait for web server to be up
    counter = 0
    while counter < 4:
        if agent.web.server is not None:
            break
        counter += 1
        time.sleep(0.1)
    assert agent.web.server is not None

    response = requests.get(f"http://localhost:{port}/spade")

    sel = Selector(text=response.text)

    assert sel.css("title::text").get() == "fake agent"
    assert sel.css("img::attr(src)").get() == agent.avatar

    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
Example 4
def weibo_spider(url):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)

    # build a Selector from the response text
    css_text = Selector(text=response.text)
    wrap_list = css_text.css('.card')
    # first item in the search results
    weibo_detail_url = 'https:' + wrap_list[0].css(
        '.content .from a[target=_blank]::attr(href)').extract_first()
    print('Top-ranked page for this search --> %s ' % weibo_detail_url)
    # id used to fetch the comments
    comment_id = css_text.css('.card-wrap')[0].css(
        '::attr(mid)').extract_first()
    print('comment_id of this post --> %s ' % comment_id)
    # user name
    user_name = wrap_list[0].css('.info div a[class=name]::attr(nick-name)'
                                 ).extract_first()  # a[class=name]
    print('User who posted this content --> %s ' % user_name)
    # uid
    user_uid = wrap_list[0].css(
        '.info div a[class=s-btn-c]::attr(uid)').extract_first()
    print("This user's UID --> %s " % user_uid)
    # profile page
    personal_page = wrap_list[0].css(
        '.info div a[class=name]::attr(href)').extract_first()
    print("This user's profile page --> %s " % personal_page)

    return comment_id
Example 5
def crawl_detail_page(url, title, news_type):
    resp_text = requests.get(url).text
    sel = Selector(resp_text)
    author = sel.css('#articlewrap > div.article-info > div > a::text').extract_first() or \
             sel.css('#articlewrap > div.article-info > div::text').extract_first() or ''
    author = author.replace("\n", "").strip()
    print(f'Saving data  {news_type}   {title} {author} {url} to the database')  # saving is left up to the user
Example 6
def job():
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [
        ("user-agent",
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
         ), ('authority', 'www.skybet.com'), ('method', 'GET'),
        ('path', '/football/specials/transfer-specials'), ('scheme', 'https'),
        ('accept',
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
         ), ('accept-encoding', 'gzip, deflate, br'),
        ('accept-language', 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2'),
        ('cache-control', 'max-age=0'),
        ('cookie',
         'sbgCFcustrandno01=82.2; sbgCAcid01=3EBFE797C814A6A4B6B182391BA5B43B; _ga=GA1.2.200552123.1496285371; betCEsessid01=3q9ad11ench0v48vqd6o2cvnf3; sbgCEsitevisitor=11; sbgCEsiteactiveBet=7b674d7ff7c810a5bad7b5f11999f7fd; sbgCAtrial01=betCOB,BetNGU,betTrial7,bingoappNativeNavBar,vegasappNativeNavBar; beta_site_stick=YmV0YQ==; sbgCEsiteactiveSSO=cc89f90083ecb66e2cfc2c50a43dc22d; config=%7B%22attemptedSSOLogin%22%3Atrue%7D; SSOSESSID=828d661bfc4a76371081e3fe508ef81f; __SBA_POLLTIME=1499223165446|loggedOut|true; SSO_SESS_ID=828d661bfc4a76371081e3fe508ef81f; betSsoAutoLoginAttempted=1499223668675; s_pers=%20s_vnum%3D1501814810661%2526vn%253D1%7C1501814810661%3B%20s_invisit%3Dtrue%7C1499225410331%3B%20s_getNewRepeat%3D1499223610336-Repeat%7C1501815610336%3B; s_sess=%20s_ctq%3D0%3B%20s_cc%3Dtrue%3B%20c%3DundefinedDirect%2520LoadDirect%2520Load%3B%20s.prop69%3Dvisitnumber_1%3B%20s_sq%3D%3B'
         ), ('upgrade-insecure-requests', '1')
    ]

    br.open(url)
    print(br.response)
    s = requests.session()

    text = s.get(url, headers=header).text
    print(text)
    selector = Selector(text=text)
    print(len(selector.css('div.mktgrp > * >table')))
    print(len(selector.css('div.mktgrp > * >h3')))
    player_odds_list = []
    club_odds = []
Example 7
 def get_authors_list(self, seek_type=1):
     """
     Get all authors from http://qsc.zww.cn/. The seek_type is always 1.
     """
     for i in range(1, 5):   # TODO: the list-page range is hard-coded; there are currently only 90 pages, but the total page count should really be discovered dynamically.
         req = self.poetry_request(1, i)
         content = req.content.decode('gb2312', 'ignore')    # the page is GB2312-encoded (Simplified Chinese)
         selector = Selector(text = content)
         content_in_script = selector.css('script').get()
         for line in content_in_script.splitlines():
             # Iterate each author list page
             if not line.startswith('parent.QTS.filllist'):
                 # This HTML line doesn't contain the values we need.
                 continue
             inner_selector = Selector(text = line)
             anchor_lines = inner_selector.css('a').getall()
             for anchor in anchor_lines:
                 # Iterate over each anchor tag
                 if re.match(r'.*doseek2\((10,.*,.*)\);.*', anchor): # doseek2(10, .*, .*) marks an author who has at least one poem on this site.
                     author = re.search(r'(?<=\);">)[^\…]+', anchor).group(0)
                     seek_type, value, pageno = re.search(r'(?<=onclick="doseek2\()[^\)]+', anchor).group(0).split(',')
                     desc = self.get_author_info(seek_type, value, 1)   # Pageno should always be 1
                     with scopedsession() as session:
                         # Add each author individually: a batch insert may contain primary keys that already
                         # exist in the table, which would trigger a rollback and drop the genuinely new keys too.
                         session.add(
                             CiAuthor(
                                 name = author,
                                 desc = desc
                             )
                         )
Example 8
def get_winelab_price(url, result_list=None, need_print=False):
    try:
        cookies = dict(currentRegion='RU-PRI')
        site_page = requests.get(url, cookies=cookies, timeout=3)

        # write_to_file(site_page, '/root/pricepinger_v2/winelab.txt')

        sel = Selector(site_page.text)
    except Exception:
        if need_print:
            print("Ошибка получения по url " + url)
        return None

    # print(site_page.status_code)
    if site_page.status_code != requests.codes.ok:
        return None

    # title = sel.css('title::text').extract_first()
    # title = sel.css('div.description::text').extract_first()				#.capitalize()
    title = sel.css('h1::text').extract_first()  # .capitalize()

    price_purple = format_price(sel.css('span.purple::text').extract_first())
    price_yellow = format_price(sel.css('span.yellow::text').extract_first())
    price_green = format_price(sel.css('span.green::text').extract_first())

    price = max(price_purple, price_yellow, price_green)
    date_ts = datetime.datetime.now()
    result = [title, float(price), url, 'winelab', date_ts]

    if result_list is not None:
        result_list.append(result)
        if need_print is True:
            echo_list_to_sting(result)

    return result
Example 9
def parse_job(html) -> dict:
    """find job details in job listing page"""
    sel = Selector(text=html)
    # setup some processing helpers
    join = lambda css, sep='': sep.join(sel.css(css).extract()).strip()
    first = lambda css: sel.css(css).extract_first(' ').strip()

    item = {}
    item['title'] = sel.css('h2.title::text').extract_first()
    item['location'] = join('.job-meta a::text', ', ')
    item['job_type'] = join('ul.list-unstyled a::text')
    item['posted_date'] = join(
        'div#affix-box p:contains("Posted:")::text').split(': ')[1]
    item['saved_times'] = join(
        'div#affix-box div:contains("Saved ")>strong::text')
    item['description'] = join('div.box-item-details p ::text')
    item['views'] = first(
        'div#affix-box li:contains("unique views")>strong::text')
    item['unique_views'] = first(
        'div#affix-box li:contains("views")>strong::text')

    bullets = lambda css: [
        ''.join(bullet.css('::text').extract()) for bullet in sel.css(css)
    ]
    h4_bullet = 'div.box-item-details h4:contains("{}")+ul>li'.format
    h3_bullet = 'div.box-item-details h3:contains("{}")+ul>li'.format
    item['about_you'] = bullets(h4_bullet('About You'))
    item['your_role'] = bullets(h4_bullet('Your role'))
    item['requirements'] = bullets(h4_bullet('Requirements'))
    item['nice_to_have'] = bullets(h4_bullet('Nice to have'))
    item['why_work_with_us'] = bullets(h4_bullet('Why work with us'))
    item['desired_skills'] = bullets(h3_bullet('Desired Skills'))
    item['contact'] = bullets(h3_bullet('Contact Info'))

    return item
Example 10
def scrape_noticia(html_content):
    selector = Selector(text=html_content)
    url = selector.css("head link[rel=canonical]::attr(href)").get()
    title = selector.css("h1#js-article-title::text").get()
    timestamp = selector.css("#js-article-date::attr(datetime)").get()
    writer = selector.css(".z--font-bold::text").get()
    shares_count = selector.css(".tec--toolbar__item::text").get()
    comments_count = selector.css(".tec--btn::attr(data-count)").get()
    summary = selector.css(
        ".tec--article__body > p:nth-child(1) ::text").getall()
    sources = selector.css(".z--mb-16 .tec--badge::text").getall()
    categories = selector.css("#js-categories .tec--badge::text").getall()

    if not writer:
        writer = selector.css(".z--font-bold a::text").get()

    if not shares_count:
        shares_count = 0
    else:
        shares_count = str(shares_count).split()[0]

    sources = [str(source).strip() for source in sources]
    categories = [str(category).strip() for category in categories]

    return {
        "url": url,
        "title": title,
        "timestamp": timestamp,
        "writer": str(writer).strip(),
        "shares_count": int(shares_count),
        "comments_count": int(comments_count),
        "summary": "".join(summary),
        "sources": sources,
        "categories": categories,
    }
Example 11
def get_dilan_price(url, result_list=None, need_print=False):
    try:
        site_page = requests.get(url, timeout=20)
        sel = Selector(site_page.text)
    except Exception:
        if need_print:
            print("Ошибка получения по url " + url)
        return None

    if site_page.status_code != requests.codes.ok:
        return None
    # write_to_file(site_page, '/root/pricepinger_v2/dilan.txt')

    # title = sel.css('title::text').extract_first().split(' -')[0]
    title = sel.css('h1::text').extract_first()  # .capitalize()
    # price = sel.css('div.__price::text').extract_first().split()[0]
    price = sel.css('div.price::text').extract_first().split()[0]

    date_ts = datetime.datetime.now()
    result = [title, float(price), url, 'Dilan', date_ts]

    if result_list is not None:
        result_list.append(result)
        if need_print is True:
            echo_list_to_sting(result)

    return result
Example 12
    def detailData(self, response, **kwargs):

        titleName = response.xpath('/html/body/div[2]/div[2]/div/p[2]/text()'
                                   ).extract_first()  # get the title
        select = Selector(response.text)
        publishTime = select.css('.p2 span::text').get()  # get the publish time
        excelData = select.css('.Section1').getall()  # table data
        if excelData:  # if the page is a table, write it to a separate TXT file; otherwise store the data normally
            print('This is a table')
            content = excelData[0]
            try:
                with open('./' + titleName + '.txt', 'a+',
                          encoding='utf-8') as f:
                    f.write(titleName)
                    f.write('\n')
                    f.write('\n')
                    f.write(publishTime)
                    f.write('\n')
                    f.write('\n')
                    f.write(content)
                    f.write('\n')
            except IOError as ex:
                print('Error writing to the target file:', ex)
        else:
            content = select.css('.p3 P span::text').getall()
            self.writeData(titleName=titleName,
                           publishTime=publishTime,
                           content=content)
Example 13
def parse_song(html: Selector):

    lyrics = ''.join(
        html.css(
            'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8)::text'
        ).getall()).strip()
    title = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > b ::text'
    ).get()
    band = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    ).get()[:-len(' lyrics')]
    written_by = html.css(
        'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(17) > small ::text'
    ).get()

    if written_by:
        written_by = [
            author.strip()
            for author in written_by[len('Writer(s): '):].split(',') if author
        ]

    return {
        'lyrics': lyrics,
        'title': title,
        'band': band,
        'written_by': written_by,
        'album': None
    }
Example 14
    def parse_detail(self, detail_page):
        """
		:param detail_page:
		:return:
		"""
        selector = Selector(detail_page)
        title = selector.css('h1 span ::text').get()
        score = selector.xpath('//strong/text()').get()
        for detail in selector.css('.indent #info'):
            author = detail.xpath(
                '//span[@class="pl"][text()=" 作者"]/following-sibling::a/text()'
            ).get()
            publish_house = detail.xpath(
                '//*[text()="出版社:"]/following-sibling::text()').get()
            Producer = detail.xpath(
                '//*[text()="出品方:"]/following-sibling::a/text()').get()
            Producer_year = detail.xpath(
                '//*[text()="出版年:"]/following-sibling::text()').get()
            page_num = detail.xpath(
                '//*[text()="页数:"]/following-sibling::text()').get()
            price = detail.xpath(
                '//*[text()="定价:"]/following-sibling::text()').get()
            Binding = detail.xpath(
                '//*[text()="装帧:"]/following-sibling::text()').get()
            ISBN = detail.xpath(
                '//*[text()="ISBN:"]/following-sibling::text()').get()
Example 15
def song_info(html: Selector):

    author_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    lyrics = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8) ::text'
    author = html.css(author_css).get()
    text = html.css(lyrics).getall()

    return {'author': author, 'lyrics': ''.join(text)}
Example 16
def extract_technique_synergy(sel: parsel.Selector):
    if sel.css('a::attr(title)').get() == 'Temtem Types':
        return None

    try:
        return sel.css('a::attr(title)').get().split()[0]
    except AttributeError:
        return None
Example 17
def song_info(html: Selector):

    author_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.lyricsh > h2 > b ::text'
    lyrics = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div:nth-child(8) ::text'
    album_css = 'body > div.container.main-page > div > div.col-xs-12.col-lg-8.text-center > div.panel.songlist-panel.noprint > div.songinalbum_title > b ::text'
    author = html.css(author_css).get()
    text = html.css(lyrics).getall()
    album = html.css(album_css).get()
Example 18
def crawl_detail_page(url, title, news_type):
    resp_text = requests.get(url).text
    sel = Selector(resp_text)
    author = sel.css('#articlewrap > div.article-info > div > a::text').extract_first() or \
             sel.css('#articlewrap > div.article-info > div::text').extract_first() or ''
    author = author.replace("\n", "").strip()
    print(f'Simulating a database save with print:  {news_type}   {title} {author} {url}'
          )  # in practice this calls the database insert function; there is no need to return the item and save it from another file.
Example 19
def tv(canal):
    response = get(f'https://meuguia.tv/programacao/canal/{canal}').text
    s = Selector(response)
    return {
        'nome': s.css('h2::text').get(),
        'inicio': s.css('div.time::text').get(),
        'tipo': s.css('h3::text').get(),
    }
Example 20
def requestAllNames(url):
    response = requests.get(defaultUrl + url)
    selector = Selector(text=response.text)
    titles = selector.css(".product_pod > h3 > a::text").getall()
    for title in titles:
        print(title)
    next_page = selector.css(".next > a::attr(href)").get()
    if next_page:
        return requestAllNames(next_page)
Example 21
def get_html(title_url):
    result = requests.get(title_url)
    se = Selector(result.text)
    title = se.css('div.content-title p::text').get()
    content = se.css('div.content-title ul span::text').getall()
    view_num = se.css('#views::text').get()
    print(title)
    print(content)
    print(view_num)
Example 22
    def parse_trun(self, response):
        # parse the match round and the teams
        logging.debug(response.url)
        sch_trun = response.css("table.linkblock a.OddsLink").extract()
        # schInfo["schedule_teams"] = schInfo["id"]
        if len(sch_trun) == 0:
            # no round information
            #
            scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
            logging.debug(scheduleInfo)
            #
            schInfo = ScheduleInfo()
            schInfo["id"] = scheduleInfo["id"]
            schInfo["area"] = scheduleInfo["area"]
            schInfo["country"] = scheduleInfo["country"]
            schInfo["match_name"] = scheduleInfo["match_name"]
            schInfo["sch_idx"] = scheduleInfo["sch_idx"]
            schInfo["sch_name"] = scheduleInfo["sch_name"]
            schInfo["sch_type"] = scheduleInfo["sch_type"]
            schInfo["sch_group"] = scheduleInfo["sch_group"]
            schInfo["sch_trun"] = "无"
            schInfo["id"] = response.meta["scheduleInfoObj"]["id"] + "_0"
            schInfo["sch_url"] = response.url.replace(self.base_url, "")
            yield schInfo
        else:
            # rounds are present
            for t in sch_trun:

                odds_sel = Selector(text=t)
                trun_name = odds_sel.css("a::text").extract_first()
                if trun_name is None or trun_name.strip() == "":
                    trun_name = odds_sel.css("a b::text").extract_first()
                    if trun_name is None or trun_name.strip() == "":
                        continue
                trun_name = trun_name.strip()
                tmp_name = trun_name.encode("utf-8")
                if tmp_name == None or tmp_name == "" or tmp_name == "全部":
                    continue
                #
                scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
                logging.debug(scheduleInfo)
                #
                schInfo = ScheduleInfo()
                schInfo["id"] = scheduleInfo["id"]
                schInfo["area"] = scheduleInfo["area"]
                schInfo["country"] = scheduleInfo["country"]
                schInfo["match_name"] = scheduleInfo["match_name"]
                schInfo["sch_idx"] = scheduleInfo["sch_idx"]
                schInfo["sch_name"] = scheduleInfo["sch_name"]
                schInfo["sch_type"] = scheduleInfo["sch_type"]
                schInfo["sch_group"] = scheduleInfo["sch_group"]
                schInfo["sch_trun"] = trun_name
                schInfo["id"] = response.meta["scheduleInfoObj"][
                    "id"] + "_" + trun_name
                schInfo["sch_url"] = odds_sel.xpath(
                    "//@href").extract_first().strip()
                yield schInfo
Example 23
 def parse_article(self, url, html) -> dict:
     """Parse html for data"""
     sel = Selector(text=html)
     data = {
         'url': url,
         'date': sel.css('time::attr(datetime)').extract_first(),
         'title': sel.css('h1 ::text').extract_first(),
     }
     return data
Example 24
 def _get_item(self, entry: Selector):
     video_id = entry.css('videoId::text').get()
     return PodcastItem(
         item_id=video_id,
         title=entry.css('title::text').get(),
         description=entry.css('description::text').get(),
         date=datetime.fromisoformat(entry.css('published::text').get()),
         image=entry.css('group > thumbnail::attr(url)').get(),
         content_type="video/mp4",
     )
Example 25
def scrape_noticia(html_content):
    selector = Selector(text=html_content)

    url = selector.css("meta[property='og:url']::attr(content)").get()
    title = selector.css("h1#js-article-title::text").get()
    timestamp = selector.css("#js-article-date::attr(datetime)").get()
    writer = selector.css("a.tec--author__info__link::text").get()
    shares_count = selector.css("div.tec--toolbar__item::text").re_first(
        r"\d+")
    comments_count = selector.css("#js-comments-btn::text").re_first(r"\d+")
    summary = "".join(
        selector.css(".tec--article__body > p:first-child *::text").getall())
    sources = selector.css("div.z--mb-16 .tec--badge::text").getall()
    categories = selector.css("#js-categories a.tec--badge::text").getall()

    return {
        "url": url,
        "title": title,
        "timestamp": timestamp,
        "writer": writer.strip() if writer else writer,
        "shares_count": int(shares_count) if shares_count else 0,
        "comments_count": int(comments_count) if comments_count else 0,
        "summary": summary,
        "sources": [source.strip() for source in sources] if sources else sources,
        "categories": [category.strip() for category in categories] if categories else categories,
    }
Example 26
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element)
        for element in document.css("main .media-element")
    ]
    memes = [meme for meme in memes if meme is not None]
    title = document.css("title::text").get()
    next_page_url = "/kwejk/page/" + get_last_part_url(
        document.css(".btn-next::attr(href)").get())
    return Page(title, memes, next_page_url)
Example 27
def download_one(url):
    response = requests.get(url, timeout=40)
    response.raise_for_status()
    response.encoding = "gbk"
    sel = Selector(response.text)
    title = sel.css('em::text').get()
    with open(title + '.txt', mode='w', encoding='gbk') as f:
        f.write(title)
        for line in sel.css('#content::text').getall():
            print(line.strip(), file=f)
Example 28
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element) for element in document.css("article.story")
    ]
    memes = [meme for meme in memes if meme is not None]
    title = document.css("title::text").get()
    next_page_url = "/anonimowe/page/" + find_id_in_url(
        document.css("nav.pagination > div.next > a::attr(href)").get()
    )
    return Page(title, memes, next_page_url)
Example 29
 def parse(self, response):
     sel = Selector(text=response.body.decode('utf-8'))
     products = sel.css('.product__title::attr(href)').extract()
     yield \
         merge({'products': products}, \
         pick(['status', '_url'], \
         vars(response)))
     url = sel.css('.js-next::attr(href)').extract_first()
     if url:
         req = sub(r'\?.*', '', response.request.url) + url
         yield Request(req, callback=self.parse)
Example 30
def parse(html):
    document = Selector(text=html)
    memes = [
        catch_errors(parse_meme, element)
        for element in document.css(".demotivator")
    ]
    memes = [meme for meme in memes if meme is not None]

    title = document.css("title::text").get()
    next_page_url = "/demotywatory/page/" + get_last_part_url(
        document.css("a.next-page::attr(href)").get())
    return Page(title, memes, next_page_url)
Example 31
async def test_stop(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/stop")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("div.alert-warning > span::text").get().strip() == "Agent is stopping now."

    with LogCapture() as log:
        try:
            await client.get("/spade/stop/now/", timeout=0.0005)
        except requests.exceptions.ReadTimeout:
            pass

        log.check_present(('spade.Web', 'WARNING', "Stopping agent from web interface."))

    counter = 5
    while agent.is_alive() and counter > 0:
        counter -= 0.5
        time.sleep(0.5)

    assert not agent.is_alive()
Example 32
async def test_request_home(test_client):
    agent = make_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade")
    response = await response.text()

    sel = Selector(text=response)

    assert sel.css("title::text").get() == "jid agent"
    assert sel.css("img::attr(src)").get() == agent.avatar

    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
Example 33
def getheaders():
    'Parse the uuid and Xsrftoken from the page source.'
    z1 = s.get('https://www.zhihu.com/')
    sel = Selector(z1.text)
    jsdata = sel.css('div#data::attr(data-state)').extract_first()
    xudid = json.loads(jsdata)['token']['xUDID']
    xsrf = json.loads(jsdata)['token']['xsrf']
    headers = headers_raw_to_dict(post_headers_raw)
    headers['X-UDID'] = xudid
    headers['X-Xsrftoken'] = xsrf
    return headers
Example 34
def load_chapters(url):
    """
    Loads all chapters from a manga comic and returns a list of dictionaries
    with related data.

    :return: chapter list in asc order
    """
    text = requests.get(url).text
    sel = Selector(text)
    hel_gen = sel.css(".chlist h3, .chlist h4")
    chapter_gen = map(hel_to_chapter, hel_gen)
    available_chapter_gen = filter(lambda v: v['title'], chapter_gen)
    return reversed(list(available_chapter_gen))
Example 35
async def test_add_get(test_client):
    agent = Agent("jid@server", "password")
    agent.web.add_get("/test", lambda request: {"number": 42}, "examples/hello.html")

    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/test")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get().strip() == "42"

    agent.stop()
Example 36
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSS expression, or an XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="The given expression is an XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.

    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()

    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
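A hypothetical driver for the CLI above (not part of the original file): main() reads HTML from a file argument or from stdin, so it can be exercised programmatically by writing a page to a temporary file first. The script name in the comment is assumed.
import tempfile

html = "<html><head><title>Hello parsel</title></head><body><h1>Hi</h1></body></html>"
with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False) as tmp:
    tmp.write(html)

# shell equivalent (script name assumed): python parsel_cli.py 'title::text' page.html
main(['title::text', tmp.name])             # prints: Hello parsel
main(['--xpath', '//h1/text()', tmp.name])  # prints: Hi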
Example 37
def get_alexa_demographics(url, db_session=False):
    if db_session is not False:
        result = list(db_session.query(WebsitesCache).filter_by(link=url))
        if len(result) > 0 and result[0].male_ratio_alexa >= 0:
            return float(result[0].male_ratio_alexa), float(result[0].female_ratio_alexa)
        else:
            return 0.0, 0.0

    orig_url = url
    url = "http://www.alexa.com/siteinfo/" + url
    response = requests.get(url)

    # We need the decode part because Selector expects unicode.
    selector = Selector(response.content.decode('utf-8'))
    bars = selector.css("#demographics-content .demo-col1 .pybar-bg")
    values = []
    for bar in bars:
        value = bar.css("span::attr(style)").extract()[0]
        value = int(re.search(r'\d+', value).group())
        values.append(value)

    male_ratio = 0.0
    female_ratio = 0.0
    if sum(values) == 0:
        print("No alexa rating for " + url)
    else:
        male_ratio = float(values[0] + values[1]) / sum(values)
        female_ratio = float(values[2] + values[3]) / sum(values)
        print(url)
        print(values)
        print(male_ratio, female_ratio)

    # Do we want to cache the result?
    if db_session is not False:
        try:
            db_session.query(WebsitesCache).filter(WebsitesCache.link==orig_url) \
                      .update({
                          'male_ratio_alexa': male_ratio,
                          'female_ratio_alexa': female_ratio
                       })
            db_session.commit()
        except:
            print "Could not update " + url

    return male_ratio, female_ratio
Example 38
async def test_get_messages(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    # add messages to trace
    for i in range(5):
        msg = Message(body=str(i), sender="{}@server".format(i), to="receiver@server")
        agent.traces.append(msg)

    response = await client.get("/spade/messages/")
    response = await response.text()

    sel = Selector(text=response)

    assert len(sel.css("ul.timeline > li").getall()) == 6  # num messages + end clock

    agent.stop()
Example 39
def download_chapter(chapter, folder_name):
    """
    Grabs all images from a chapter and writes them down to filesystem.

    """

    folder_name = werkzeug.utils.secure_filename(folder_name)

    # if the folder does not exist ...
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    text = requests.get(chapter['href']).text
    sel = Selector(text)

    for value in sel.css("select[class='m'] > option::attr(value)").extract():
        value = int(value)
        url = re.sub(r'\d+\.html', '%d.html' % value, chapter['href'])
        download_page(url, folder_name)
Example 40
async def test_get_behaviour(test_client):
    class EmptyOneShotBehaviour(OneShotBehaviour):
        async def run(self):
            self.kill()

    agent = Agent("jid@server", "password")
    behaviour = EmptyOneShotBehaviour()
    agent.add_behaviour(behaviour)
    agent.web.setup_routes()

    client = await test_client(agent.web.app)

    response = await client.get("/spade/behaviour/OneShotBehaviour/EmptyOneShotBehaviour/")
    response = await response.text()

    sel = Selector(text=response)

    assert sel.css("section.content-header > h1::text").get().strip() == "OneShotBehaviour/EmptyOneShotBehaviour"
    agent.stop()
Example 41
async def test_add_post(test_client):
    agent = Agent("jid@server", "password")

    async def handle_post(request):
        form = await request.post()
        number = form["number"]
        return {"number": number}

    agent.web.add_post("/test", handle_post, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.post("/test", data={"number": 1024})
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get() == "1024"

    agent.stop()
Example 42
def download_page(url, folder_name):
    text = requests.get(url).text
    sel = Selector(text)

    for src in sel.css("img[id='image']::attr(src)").extract():
        basename = os.path.basename(src)
        safe_basename = werkzeug.utils.secure_filename(basename)
        filename = os.path.join(folder_name, safe_basename)
        filename = os.path.abspath(filename)

        # file is not there or has an invalid size ...
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            data = requests.get(src).content

            with open(filename, 'wb') as file:
                file.write(data)

            print('{0} written.'.format(filename))
        else:
            print('{0} exists. Skipping.'.format(filename))
Example 43
 def _find_match(self, sel: Selector) -> Match:
     xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
     item = Match()
     item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
     item['id'] = (re.findall(r'matches/(\d+)', item['url']) or [None])[0]
     item['game'] = next((g for g in self.games if g in item['url'].lower()))
     item['time'] = xpath("td[@class='status']/span/text()")
     item['time_secs'] = time_to_seconds(item['time'])
     item['timestamp'] = int((datetime.now() + timedelta(seconds=item['time_secs'])).timestamp())
     item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
     item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
     item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                      "/span[contains(@class,'flag')]/@class").split()[-1]
     item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
     item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
     item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                      "/span[contains(@class,'flag')]/@class").split()[-1]
     scores = sel.css('.score::text').extract()
     item['t1_score'] = scores[0] if scores else None
     item['t2_score'] = scores[1] if len(scores) > 1 else None
     return item
Example 44
async def test_get_agent(test_client):
    agent = make_presence_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()

    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    jid = "friend@server"
    item = Item(jid=JID.fromstr(jid))

    agent.presence.roster._update_entry(item)

    response = await client.get(f"/spade/agent/{jid}/")
    response = await response.text()

    sel = Selector(text=response)

    assert sel.css("section.content-header > h1::text").get().strip() == jid

    agent.stop()
Example 45
def search_command(value):
    url = "{domain}/search.php?".format(domain=domain)
    quote = urllib.parse.quote(value)
    query = {
        'name': quote,
        'name_method': 'cw',
        'author': '',
        'author_method': 'cw',
        'artist': '',
        'artist_method': 'cw',
        'is_complete': '',
        'type': '',
        'advopts': '1',
        'rating': '',
        'rating_method': 'eq',
        'released': '',
        'released_method': 'eq',
        'genres[Sci-fi]': '0',
        'genres[Horror]': '0',
        'genres[Sports]': '0',
        'genres[Action]': '0',
        'genres[Shoujo Ai]#': '0',
        'genres[Drama]': '0',
        'genres[Fantasy]': '0',
        'genres[Mystery]': '0',
        'genres[Gender Bender]': '0',
        'genres[One Shot]': '0',
        'genres[Psychological]': '0',
        'genres[Tragedy]': '0',
        'genres[Historical]': '0',
        'genres[Mecha]': '0',
        'genres[Yuri]': '0',
        'genres[Seinen]': '0',
        'genres[Adult]': '0',
        'genres[Slice of Life]': '0',
        'genres[Doujinshi]': '0',
        'genres[Romance]': '0',
        'genres[School Life]': '0',
        'genres[Comedy]': '0',
        'genres[Shoujo]': '0',
        'genres[Ecchi]': '0',
        '#genres[Harem]': '0',
        'genres[Smut]': '0',
        'genres[Yaoi]': '0',
        'genres[Shounen Ai]': '0',
        'genres[Martial Arts]': '0',
        'genres[Josei]': '0',
        'genres[Shounen]': '0',
        'genres[Mature]': '0',
        'genres[Webtoons]': '0',
        'genres[Supernatural]': '0',
        'genres[Adventure]': '0',
    }
    url += urllib.parse.urlencode(query)

    try:
        data = requests.get(url).text
    except requests.exceptions.RequestException:
        # mangafox requires a 5-second delay between searches
        import time
        time.sleep(5)
        data = requests.get(url).text

    sel = Selector(data)
    results = list()

    for link in sel.css('td:first-child > a:first-child'):
        manga_url = link.css('::attr(href)').extract_first()
        name = manga_url[7:].split('/')[2]
        results.append(odict([
            ('title', link.css('::text').extract_first()),
            ('name', "%s (use for download)" % name),
            ('url', manga_url),
        ]))

    if len(results):
        print("")
        for manga in results:
            for key, value in manga.items():
                print("%s: %s" % (key, value))
            print("")
    else:
        print('No results found')
Example 46

with open('lamberti.json') as jsonfile:
    jsondata = json.load(jsonfile)

with open('lamberti.geojson') as geojsonfile:
    geojsondata = json.load(geojsonfile)

for booth in jsondata:
    booth_no = int(booth['nr'][-2:])
    r = requests.get(booth['url'])
    if r.status_code == 200:
        text = r.text
        selector = Selector(text=text)

        booth_name = selector.css('.booth-title::text').get()
        booth_descr = selector.css('.booth-body > p::text').getall()

        if isinstance(booth_descr, list):
            booth_descr = " ".join(booth_descr)

        booth_owner_company = selector.css(
            '.contactParticle--company::text').get()
        booth_owner_name = selector.css(
            r'.contactParticle--name\:firstname\,lastname::text').get()
        booth_owner_street = selector.css(
            '.contactParticle--street::text').get()
        booth_owner_city = selector.css(
            r'.contactParticle--city\:postal_code\,locality::text').get()
        booth_owner_phone = selector.css('.contactParticle--phone::text').get()
        booth_owner_email = selector.css(