Example #1
async def scrape_page(session, url):
    async with session.get(url) as resp:
        content = await resp.text()

    print('parsing url: {}'.format(url))
    doc = PyQuery(content)
    doc.make_links_absolute(base_url=url)

    table = doc(
        '#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    results = []

    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)

        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }

        results.append(company)

    return results
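The coroutine above leaves its imports and the session object to the surrounding module. A minimal driver sketch, assuming aiohttp, Python 3.7+ and the pyquery import the snippet needs (the first rigzone URL is borrowed from the standalone script near the end of this listing; the second assumes the same alphabetical pattern):

import asyncio
import aiohttp
from pyquery import PyQuery


async def main():
    urls = [
        'https://www.rigzone.com/search/alpha/a/',
        'https://www.rigzone.com/search/alpha/b/',
    ]
    async with aiohttp.ClientSession() as session:
        # run the scrapes concurrently; each call returns a list of company dicts
        pages = await asyncio.gather(*(scrape_page(session, url) for url in urls))
    for companies in pages:
        for company in companies:
            print(company['name'], company['phone'], company['url'])


if __name__ == '__main__':
    asyncio.run(main())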
Example #3
def get_urls(base_url, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)

    doc = PyQuery(base_url + "/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("a"):
        try:
            href = a.attrib["href"]
        except KeyError:
            continue  # anchor without an href attribute
        if not href.startswith(base_url):
            continue
        if href.endswith(".html") or href.endswith(".png"):
            continue
        if href not in urls and href not in exclude:
            urls.append(href)
            urls.append(href)
            urls.append(href)

    url_start = base_url + "/p"
    for i in range(2, 10):
        url = url_start + str(i)
        if url in exclude:
            continue
        urls.append(url)
    return urls
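A minimal way to drive it (the peterbe.com base URL is borrowed from Example #8 below; the snippet itself assumes a module-level from pyquery import PyQuery):

if __name__ == "__main__":
    for url in get_urls("https://www.peterbe.com"):
        print(url)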
Example #4
class SegmentfaultTagSpider(object):
    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (
            tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(
                base_url="http://segmentfault.com/")  # turn relative links into absolute links, nice
        return self._dom

    @property
    def questions(self):
        return [
            question.attr('href')
            for question in self.dom('h2.title > a').items()
        ]

    @property
    def has_next_page(self):  # check whether there is a next page; this is necessary
        return bool(self.dom('ul.pagination > li.next'))  # is there a next page?

    def next_page(self):
        # Kill this spider and re-initialise it to crawl the next page.
        # Since "next_page" is a verb, it is deliberately not a @property.
        if self.has_next_page:
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
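A short usage sketch (an assumption, not from the original project) that walks every page of a tag through the class's own API; it assumes requests and PyQuery imported as Pq, exactly as the snippet does:

spider = SegmentfaultTagSpider('python')
question_urls = []
while True:
    question_urls.extend(spider.questions)
    if not spider.has_next_page:
        break
    spider.next_page()  # re-initialises the same object for the following page
print(len(question_urls))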
Example #5
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)

    data = []
    worker = u'daodao_poi_base_data'

    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps(
                {u'target_url': unicode(href), u'city_id': unicode(city_id), u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
Example #6
 def list_page(self, response):
     result_content = {}
 
     content_iter = re.finditer(r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)", response.content)
     for iter in content_iter:
         ok, content = safe_loads(iter.groupdict()['content'])
         if ok and "pl_weibo_direct" == content.get("pid"):
             result_content = content
             break
     else:
         return {}
     
     pyquery_doc = PyQuery(result_content["html"])
     pyquery_doc.make_links_absolute(response.url)
     
     items = []
     for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items():
         weibo_href = item("DIV.content>DIV.feed_from>A").attr.href
         if weibo_href:
             weibo_pics = []
             for pic in item("DIV.feed_content DIV.media_box IMG").items():
                 weibo_pics.append(pic.attr.src)
                 
             data = {
                 "content": item("DIV.feed_content P.comment_txt").text(),
                 "nickname": item("DIV.feed_content A.W_texta").attr.title,
                 "href": weibo_href,
                 "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title,
                 "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(),
                 "pics": ''.join(weibo_pics)
             }
             self.crawl("data:,%s" % weibo_href, callback = self.detail_page, data_fetch_content=data)
Example #7
class SegmentfaultTagSpider(object):
    def __init__(self, tag_name, page=1):
        self.url = 'https://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url='http://segmentfault.com/')
        return self._dom

    @property
    def questions(self):
        return [question.attr('href') for question in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):
        return bool(self.dom('ul.pagination > li.next'))

    def next_page(self):
        if self.has_next_page:
            print(self.page)
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
Example #8
def get_urls():
    doc = PyQuery('https://www.peterbe.com/plog/')
    doc.make_links_absolute(base_url='https://www.peterbe.com')
    urls = []
    for a in doc('dd a'):
        urls.append(a.attrib['href'])
    return urls
Example #9
def download(threadUrl):
    """
    """
    d = PyQuery(url=threadUrl, parser='soup')
    links = d('a[href^="job.php?action=download&aid="]')

    # extract the value of verify
    tmp = d('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", tmp).group(1)

    total = len(links)
    d.make_links_absolute()
    for i, e in enumerate(links.items(), start=1):
        filename = e.text()
        print('%s/%s %s' % (i, total, filename))

        if not os.path.exists(os.path.join(SAVE_PATH, filename)):
            params = urlencode(
                {'check': 1, 'verify': verify, 'nowtime': int(time.time() * 1000)})
            url = '%s?%s' % (e.attr['href'], params)

            print('  fetch: ' + url)
            downDoc = PyQuery(url, headers=headers)
            # index 0 is the China Telecom download mirror, index 1 is the China Mobile one
            downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
            addToIDM(downUrl, SAVE_PATH, filename)
            time.sleep(1.5)

    wefiler_urls = checkWefiler(d)
    if wefiler_urls:
        print(wefiler_urls)
Example #11
def run():
    """Search session cookies in pastebin.com"""
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--word", help="palabra clave que desea buscar")
    parser.add_argument("-u", "--url", help="sitio en donde se desea buscar")
    args = parser.parse_args()
    if args.url:
        try:

            title = args.word + " " + args.url
            for page in range(10):
                params_1 = urlencode({'as_q': "%s" % title})
                params = params_1.replace("%22", '')
                jq = Pq(
                    url="https://www.google.com/search?%s&source=lnt&tbs=qdr:d&sa=X&ved=0ahUKEwi35d68i-XXAhUDuRQKHVVpB88QpwUIHQ&biw=1366&bih=647" % params,
                    headers={
                        "user-agent": "Mozilla/7.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"
                    })
                jq.make_links_absolute("http://www.google.com")
                for flix in jq("div.rc").children().items():
                    url = flix.find("a").attr("href")
                    if url == "http://www.google.com":
                        url = ""
                    print url
        except:
            print "error de red"
Example #12
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)

    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                                          city_id, part)
    except:
        pass
Example #13
class SegmentfaultTagSpider(object):

    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = PyQuery(document.text)
            self._dom.make_links_absolute(base_url="http://segmentfault.com/")  # turn relative links into absolute links, nice
        return self._dom


    @property
    def questions(self):
        return [question.attr('href') for question in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):  # check whether there is a next page; this is necessary
        return bool(self.dom('ul.pagination > li.next'))  # is there a next page?

    def next_page(self):
        # Kill this spider and re-initialise it to crawl the next page.
        # Since "next_page" is a verb, it is deliberately not a @property.
        if self.has_next_page:
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
Example #14
class Getlist(object):
    # zurl: listing URL to query, e.g. http://www.0731gch.com/paixie/bianmi/index_26_
    # scatid: id of the category to save into
    # getpages: number of pages to crawl
    # page: page number
    def __init__(self, zurl, scatid, getpages, page=1):
        self.zurl = zurl
        self.url = zurl + "%d.html" % page
        self.catid = zurl.split('_')[1]
        self.page = page
        self._dom = None
        self.getpages = getpages
        self.scatid = scatid

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url="http://www.0731gch.com/")  # turn relative links into absolute links, nice
        return self._dom


    @property
    def urls(self):
        return [url.attr('href') for url in self.dom('.case_list dl dd h3 a').items()]

    @property
    def has_next_page(self):  # check whether there is a next page; this is necessary
        return bool(self.dom('.fy ul .nextPage'))  # is there a next page?

    def next_page(self):
        # Kill this spider and re-initialise it to crawl the next page.
        # Since "next_page" is a verb, it is deliberately not a @property.
        if self.has_next_page:
            self.__init__(zurl=self.zurl, scatid=self.scatid, getpages=self.getpages, page=self.page + 1)
        else:
            return None

    def crawl(self):  # crawl the current page
        # sf_ids = [url for url in self.urls]
        con = len(self.urls)
        print('%s articles to collect on this page' % con)
        i = 1
        for url in self.urls:
            print('collecting article %d on this page' % i)
            Getshow(url).mysave(self.scatid)
            i += 1
            time.sleep(1)

    def crawl_all_pages(self):
        while True:
            print(u'crawling listing page: %s%d.html, page: %d, %d pages to crawl in total' % (self.zurl, self.page, self.page, self.getpages))
            self.crawl()
            if int(self.page) >= int(self.getpages) or not self.has_next_page:
                # if not self.has_next_page:
                print('stop')
                break
            else:
                self.next_page()
Example #15
 def take(self, *args, **kwargs):
     base_url = kwargs.pop('base_url', None) or self.base_url
     _doc = PyQuery(*args, **kwargs)
     if base_url:
         _doc.make_links_absolute(base_url)
     rv = {}
     self.node.do(None, rv=rv, value=_doc, last_value=_doc)
     return rv
Example #16
def doc(html, url):
    """Returns a PyQuery object of a request's content"""
    parser = lxml.html.HTMLParser(encoding='utf-8')
    elements = lxml.html.fromstring(html, parser=parser)
    if isinstance(elements, lxml.etree._ElementTree):
        elements = elements.getroot()
    doc = PyQuery(elements)
    doc.make_links_absolute(url)
    return doc
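One way this helper might be fed, assuming requests is available; the URL is only illustrative, borrowed from the peterbe.com examples elsewhere in this listing:

import requests

rsp = requests.get('https://www.peterbe.com/plog/')
d = doc(rsp.content, rsp.url)
print([a.attrib['href'] for a in d('dd a')][:5])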
Example #17
def doc(rsp):
    """Returns a PyQuery object of a request's content"""
    parser = lxml.html.HTMLParser(encoding=encoding(rsp))
    elements = lxml.html.fromstring(rsp.content, parser=parser)
    if isinstance(elements, lxml.etree._ElementTree):
        elements = elements.getroot()
    doc = PyQuery(elements)
    doc.make_links_absolute(rsp.url)
    return doc
Example #18
 def get_hosters_for_episode(url):
     dom = PyQuery(url=url)
     dom.make_links_absolute(base_url=SerienStream.base_url)
     hosters = dom('div.hosterSiteVideo > ul > li > div > a')
     hoster_list = list()
     for h in hosters:
         hoster = PyQuery(h)
         hoster_list.append((hoster.find('h4').text(), hoster.attr.href))
     return hoster_list
Example #19
 def get_episodes(url):
     dom = PyQuery(url=url)
     dom.make_links_absolute(base_url=SerienStream.base_url)
     episodes = dom('table.seasonEpisodesList td.seasonEpisodeTitle > a')
     episode_list = list()
     for e in episodes:
         episode = PyQuery(e)
         episode_list.append((episode.text(), episode.attr.href))
     return episode_list
Example #20
 def get_seasons(url):
     dom = PyQuery(url=url)
     dom.make_links_absolute(base_url=SerienStream.base_url)
     seasons = dom('div#stream ul:first-child a')
     season_list = list()
     for s in seasons:
         season = PyQuery(s)
         season_list.append((season.text(), season.attr.href))
     return season_list
Example #21
 def dom(self):
     if not self._dom:
         d = requests.get(self.url)
         d.encoding = self.encoding
         __dom = Pq(d.text)
         if self.absolute_link:
             try:
                 __dom.make_links_absolute(base_url=self.base_url)
             except ValueError:
                 raise ValueError('When absolute_link is enabled, a base_url must be specified')
         self._dom = __dom
     return self._dom
Example #22
    def process_spider_input(self, response, spider):
        """Returns a PyQuery object of the response's content"""

        if response.meta.has_key('_splash_processed'):
            splash_setting = response.meta['_splash_processed']
            endpoint = splash_setting['endpoint']
            if endpoint in ['render.json', 'execute']:
                splash_key_html = spider._splash_json_key_html
                body = response.body_as_unicode()
                splash_result = demjson.decode(body)
                if splash_result.has_key(splash_key_html):
                    body = splash_result[splash_key_html]
                    setattr(response, 'splash_result', splash_result)
                else:
                    setattr(response, 'pq', None)
                    return
            elif endpoint in ['render.png', 'render.jpeg', 'render.har']:
                # do nothing and pyquery is unavailable
                setattr(response, 'pq', None)
                return
            elif endpoint in ['render.html']:
                # do nothing, continue
                body = response.body
        else:
            body = response.body

        if not body:
            setattr(response, 'pq', None)
            return

        enc = self.encoding(body, response)

        try:
            parser = lxml.html.HTMLParser(encoding=enc)
            elements = lxml.html.fromstring(body, parser=parser)
        except (LookupError, ) as e:
            # lxml would raise LookupError when the encoding is not supported;
            # try fromstring without encoding instead.
            # on Windows, unicode is not available as an encoding for lxml
            elements = lxml.html.fromstring(body)
        if isinstance(elements, lxml.etree._ElementTree):
            elements = elements.getroot()
        pq = PyQuery(elements)
        if response.meta.get('_splash_processed'):
            pq.make_links_absolute(
                response.meta["_splash_processed"]["args"]["url"])
        else:
            pq.make_links_absolute(response.url)

        setattr(response, 'pq', pq)
Example #23
def collect_variable_listing_sources(data_source, output_dir, verbose):
    for letter in string.ascii_uppercase:
        i, url = 0, variable_listing_url(data_source, letter)
        while url:
            if verbose: print("\tFetching: %s" % url)
            src = requests.get(url).text
            save_source(src, output_dir, letter, i)
            doc = PyQuery(src, parser='html')
            doc.make_links_absolute("https://%s.ipums.org/" % data_source)
            next_page = doc('a.next_page')
            if next_page:
                url = next_page.attr['href']
                i += 1
            else:
                url = None
Example #24
def get_urls(base_url, top_urls, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) >= top_urls:
            break

    return urls
Example #25
def get_urls(base_url, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) > 200:
            break

    return urls
Example #26
def get_all_links():

    try:
        return pickle.load(open('.links'))
    except IOError:
        URL_BASE = "http://www.casarosada.gob.ar/informacion/discursos?start={}"
        links = []
        for start in pages:
            url = URL_BASE.format(start)
            logging.info('Downloading links from {}'.format(url))
            pq = PyQuery(url=url, headers=headers)
            pq.make_links_absolute()
            page_links = pq('div.category-item-title a')

            links.extend(list(reversed(page_links)))
        links = [pq(a).attr('href') for a in links]
        pickle.dump(links, open('.links', 'w'))
        return links
Example #28
def get_doc_hyperlinking(doc: PyQuery,
                         base_url: str) -> List[HyperLinkingInPage]:
    """
    获取网页的超链接列表

    Parameters
    ----------
    doc : PyQuery
        整个文档的 pyquery 对象

    base_url : str
        网页的地址信息,用于将相对地址转换成绝对地址
    """
    rlt = []
    doc.make_links_absolute(base_url=base_url)
    all_href = doc("a")
    body_text = get_pq_object_inner_text(doc)
    ls_href_to_query = []
    for link in all_href:
        link_obj = PyQuery(link)
        url = str(link_obj.attr("href"))
        if not url.startswith("http"):
            continue
        ls_href_to_query.append(link_obj)
    ls_start_pos = batch_get_dom_node_start_pos(doc, ls_href_to_query)
    for ui_ele, start_pos in zip(ls_href_to_query, ls_start_pos):
        if start_pos < 0:
            logger.error(f"Can't find ui object '{ui_ele}'")
        text = get_pq_object_inner_text(ui_ele)
        if text != body_text[start_pos:start_pos + len(text)]:
            logger.error(
                f"inner text is not equal with doc body '{text}' ?= '{body_text[start_pos:start_pos+len(text)]}'"
            )
        url = str(ui_ele.attr("href"))
        hyperlinking_in_page = HyperLinkingInPage(start_pos=start_pos,
                                                  end_pos=start_pos +
                                                  len(text),
                                                  text=text,
                                                  url=url,
                                                  query_obj=ui_ele)
        rlt.append(hyperlinking_in_page)
    return rlt
Example #29
def scrape_page(url):
    print('getting url: {}'.format(url))
    doc = PyQuery(url)
    doc.make_links_absolute()

    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)

        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }

        yield company
Example #30
def scrape_page(url):
    doc = PyQuery(url)
    doc.make_links_absolute()

    table = doc(
        '#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)

        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }

        yield company
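Because this version of scrape_page is a generator, it can be consumed lazily. A possible driver, assuming from pyquery import PyQuery is in scope and borrowing the rigzone URL from the standalone script further down:

if __name__ == '__main__':
    for company in scrape_page('https://www.rigzone.com/search/alpha/a/'):
        print(company['name'], company['phone'], company['url'])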
Example #31
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
Example #32
def search_youtube_video(title, pages):
    print("Entramos en la busqueda")
    cont = 0
    lista_url = []
    lista_views = []
    for page in range(pages):
        params = urllib.parse.urlencode({
            'search_query': 'intitle:"%s", video' % title,
            'page': page,
        })
        jq = Pq(
            url="http://www.youtube.com/results?%s" % params,
            headers={
                "user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"
            })
        jq.make_links_absolute("http://www.youtube.com")
        for video in jq("ol.item-section").children().items():
            url = video.find("a.yt-uix-tile-link").attr("href")
            lista_url.append(url)
            views = video.find("ul.yt-lockup-meta-info li").eq(1).html()
            if views is not None:
                res = int(
                    views.split('visualizaciones')[0].strip().replace('.', ''))
            else:
                res = 0
            lista_views.append(res)

            cont = cont + 1
            if cont == 8:
                indice = lista_views.index(max(lista_views))
                print("views: {} ".format(max(lista_views)))
                print("indice: {}".format(indice))
                print("url: " + lista_url[indice])
                return lista_url[indice]

    indice = lista_views.index(max(lista_views))
    return lista_url[indice]
Example #33
def get_urls_from_podcast(url, verbose=False):
    """given the url to a podcast, return the list of urls to each audiocut"""
    pq = PyQuery(url)
    pq.make_links_absolute()
    return [PyQuery(a).attr('href') for a in pq('.cut_brief h4 a')]
Example #35
from pyquery import PyQuery


doc = PyQuery('https://www.rigzone.com/search/alpha/a/')
doc.make_links_absolute()

table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

for row in table.items('tr:gt(0)'):
    company_col = row('td').eq(0)
    phone_col = row('td').eq(1)
    website_col = row('td').eq(2)

    details_url = company_col('a').attr('href')
    company_name = company_col.text()
    company_phone = phone_col.text()
    company_url = website_col('a').attr('href')

    print(company_name, company_phone, company_url, details_url)
    break
Example #36
class Getlist(object):
    # tocatid: id of the category to save into
    # getpages: number of pages to crawl
    # page: page number
    def __init__(self, catid, tocatid, getpages, page=1):
        self.url = "http://www.vccoo.com/category/?id=%d&page=%d" % (catid,
                                                                     page)
        self.catid = catid
        self.getpages = getpages
        self.tocatid = tocatid
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(
                base_url="http://www.vccoo.com/")  # turn relative links into absolute links, nice
        return self._dom

    @property
    def urls(self):
        return [
            url.attr('href') for url in self.dom('.list-con h3 > a').items()
        ]

    @property
    def has_next_page(self):  # check whether there is a next page; this is necessary
        return bool(self.dom('.pages ul li .next-page'))  # is there a next page?

    def next_page(self):
        # Kill this spider and re-initialise it to crawl the next page.
        # Since "next_page" is a verb, it is deliberately not a @property.
        if self.has_next_page:
            self.__init__(catid=self.catid,
                          tocatid=self.tocatid,
                          getpages=self.getpages,
                          page=self.page + 1)
        else:
            return None

    def crawl(self):  # crawl the current page
        sf_ids = [url.split('/')[-1] for url in self.urls]
        con = len(sf_ids)
        print('%s articles to collect on this page' % con)
        i = 1
        for sf_id in sf_ids:
            print('collecting article %d on this page' % i)
            Getshow(sf_id).save(self.tocatid)
            i += 1
            print('resting 3s before article %d' % i)
            time.sleep(3)

    def crawl_all_pages(self):
        while True:
            print(
                u'crawling listing page: http://www.vccoo.com/category/?id=%d&page=%d, page: %d, %d pages to crawl in total'
                % (self.catid, self.page, self.page, self.getpages))
            self.crawl()
            if int(self.page) >= int(self.getpages) or not self.has_next_page:
                print('crawl finished!!!')
                break
            else:
                self.next_page()