def _add_nested(self, k, el):
    """Parse nested element by its children."""
    el = Pq(el)
    tagname = Pq(el)[0].tag
    if tagname in self.invalid_tags:
        return
    id = self._format_id(el.attr('id'))
    classes = self._format_classes(el.attr('class'))
    selector = self._format_selector(el, id, classes)
    children = Pq(el).children()
    if not self._is_root_body_node(el):
        return
    # Add for single nodes only
    if not children:
        self.selectors.add(selector)
    # Build nested css by traversing all child nodes and getting
    # their attributes.
    while children:
        for child in children:
            # 1. Add current
            self.selectors.add(selector)
            # 2. Add child
            child = Pq(child)
            selector += self._add_id_and_classes(child)
            self.selectors.add(selector)
            # 3. Move to next children
            children = child.children()
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        print '// Drop queryString in included src'
        print 'from: ', href
        result = urlparse(href)
        if result.scheme == 'https':
            href = href
        elif result.scheme == '':
            href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
        print 'to: ', href
        new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def parseProductPage(product, need_img_urls=False):
    """Visit the product detail page and scrape four new fields:
    delivery, reviews, star, total_sales."""
    if product['product_url']:
        content = fetchContent(product['product_url'], False)
        doc = PyQuery(content)
        # product['delivery'] = doc("div.cost-entries-type > p > em.value").text()
        # (shipping cost is rendered by JS; could not be extracted)
        product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
        product['star'] = doc('p.star-level > i').attr("class")
        product['total_sales'] = doc('p.bargain-number > a > em.value').text()
        if need_img_urls:
            url_list = get_img_urls(content)
            product['img_urls'] = ', '.join(url_list)
        else:
            product['img_urls'] = ''
        product['color'], product['size'] = '', ''
        for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') == 'de-feature' and tdQ.text().strip() == u'颜色':  # "color"
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index + 1]).text()
            if tdQ.attr('class') == 'de-feature' and tdQ.text().strip() == u'尺寸':  # "size"
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index + 1]).text()
        product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
        if not product['MOQ'] or product['MOQ'] == 0:
            product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
        if product['MOQ'] == 1:
            # print product['product_url']
            product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
            product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
            product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
            product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
            print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            # 'indent': 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery(img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')
        # if src.startswith('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td = no_fonts(td)
    # need to fix links here
    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html() # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create(
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults=dict(sortorder=sortorder),
            ),
        )
def test_calendar_tag_rendering(self, timezone_mock):
    timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
    page_with_apphook = self.create_base_pages()
    other_config = EventsConfig.objects.create(namespace='other')
    self.create_event(
        title='ev1',
        start_date=tz_datetime(2015, 1, 13),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        title='ev2',
        start_date=tz_datetime(2015, 1, 15),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        de=dict(
            title='ev3',
            start_date=tz_datetime(2015, 1, 16),
            publish_at=tz_datetime(2015, 1, 10)
        )
    )
    self.create_event(
        title='ev4',
        start_date=tz_datetime(2015, 1, 18),
        publish_at=tz_datetime(2015, 1, 10),
        app_config=other_config
    )
    self.create_event(
        title='ev5',
        start_date=tz_datetime(2015, 1, 22),
        end_date=tz_datetime(2015, 1, 27),
        publish_at=tz_datetime(2015, 1, 10)
    )
    self.create_event(
        title='ev6',
        start_date=tz_datetime(2015, 1, 25),
    )
    # make use of default tests self.app_config namespace, instead of
    # hard coding it
    template_str = """
    {%% load aldryn_events %%}
    {%% calendar 2015 1 'en' '%s' %%}
    """ % self.app_config.namespace
    t = Template(template_str)
    with override('en'):
        html = t.render(SekizaiContext({}))
        table = PyQuery(html)('table.table-calendar')
    page_url_en = page_with_apphook.get_absolute_url()
    links = table.find('td.events, td.multiday-events').find('a')
    # test if tag rendered important elements
    self.assertEqual('1', table.attr('data-month-numeric'))
    self.assertEqual('2015', table.attr('data-year'))
    self.assertEqual('10', table.find('td.today').text())
    self.assertEqual(8, links.length)  # 13, 15, 22, 23, 24, 25, 26, 27
    expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
    for position, day in enumerate(expected_days):
        event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
        rendered_url = links[position].attrib['href']
        self.assertEqual(event_url, rendered_url)
def getTweets(tweetCriteria):
    refreshCursor = ''
    results = []

    while True:
        json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor)
        refreshCursor = json['min_position']
        try:
            tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
        except Exception, e:
            print e
            # There was either an error in the request or nothing returned
            return results

        if len(tweets) == 0:
            break

        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet = models.Tweet()
            # print tweetPQ("p.js-tweet-text").text()
            usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
            txt = re.sub(r"[^\x00-\x7F]", "", tweetPQ("p.js-tweet-text").text()) \
                .replace('# ', '#') \
                .replace('@ ', '@') \
                .replace('www. ', 'www.') \
                .replace('/ ', '/')
            retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
            favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
            dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
            id = tweetPQ.attr("data-tweet-id")
            permalink = tweetPQ.attr("data-permalink-path")

            geo = ''
            geoSpan = tweetPQ('span.Tweet-geo')
            if len(geoSpan) > 0:
                geo = geoSpan.attr('title')

            tweet.id = id
            tweet.permalink = 'https://twitter.com' + permalink
            tweet.username = usernameTweet
            tweet.text = txt
            tweet.date = datetime.datetime.fromtimestamp(dateSec)
            tweet.retweets = retweets
            tweet.favorites = favorites
            tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
            tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
            tweet.geo = geo

            results.append(tweet)

            if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
                return results

    return results
def _add(self, k, el):
    """Parse element, without considering children."""
    el = Pq(el)
    id, classes = el.attr('id'), el.attr('class')
    if id is not None:
        self.selectors['ids'].add(id)
    if classes is not None:
        for _class in classes.split(' '):
            self.selectors['classes'].add(_class.strip())
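# A standalone sketch of the same id/class harvesting that _add does above,
# assuming pyquery is installed; the markup and variable names here are
# illustrative only.
from pyquery import PyQuery as Pq

el = Pq('<div id="main" class="wide dark"></div>')
ids, classes = set(), set()
if el.attr('id') is not None:
    ids.add(el.attr('id'))
if el.attr('class') is not None:
    classes.update(c.strip() for c in el.attr('class').split(' '))
print(ids, classes)  # e.g. {'main'} {'wide', 'dark'}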
def _absoluteurl(x):
    # `this` is the jQuery-style current element that PyQuery injects
    # into callbacks passed to .each()
    q = PyQuery(this)
    href = q.attr('href')
    if href and (href.startswith('#') or href.startswith('http') or href.startswith('ftp')):
        return
    if href:
        q.attr('href', '/' + href)
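# A minimal wiring sketch for _absoluteurl, relying on PyQuery.each()
# injecting `this` into the callback's globals; markup is illustrative.
from pyquery import PyQuery

d = PyQuery('<div><a href="about/">about</a><a href="#top">top</a></div>')
d('a').each(_absoluteurl)
print(d.html())  # only the relative link gains a leading slash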
def __processInstagramTag(self, i, e):
    obj = PyQuery(e)
    url = obj('a').attr('href')
    shortCode = re.match("http://.*/p/(.*)/", url).group(1)
    imageUrl = self.getInstagramImageUrl(shortCode)
    newObj = PyQuery("<img />")
    newObj.attr('src', imageUrl)
    obj.replaceWith(newObj)
def replace_img(index, node):
    node = PyQuery(node)
    if not node.attr('src'):
        return node
    try:
        node.attr('src', urljoin_rfc(base_url, node.attr('src')))
    except:
        pass
    return node
def fixLinks(text):
    d = PyQuery(text, parser='html')
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    return d.__unicode__().encode('utf8')
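# fixLinks above expects a module-level abs_url_regex; a hedged Python 2
# sketch with a plausible definition (scheme- or protocol-relative URLs):
import re
abs_url_regex = re.compile(r'^(?:[a-z][a-z0-9+.-]*:)?//', re.IGNORECASE)
print fixLinks('<div><a href="blog/index.html">blog</a></div>')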
def scrape_category(url, title):
    category_slug = slugify(title)

    try:
        f = urlopen(url)
    except ValueError:
        if trace:
            print 'Retrying:', url
        url = 'http://eracks.com' + url.replace(' ', '%20')
        if trace:
            print 'As:', url
        f = urlopen(url)

    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    page_title = jQuery('title').text()
    if page_title.startswith("eRacks Open Source Systems: "):
        page_title = page_title.partition("eRacks Open Source Systems: ")[-1]
    if page_title.startswith("eRacks "):
        page_title = page_title.partition("eRacks ")[-1]

    content = jQuery('td#content')
    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')
        skus = find_sku.findall(href)
        if skus:
            sku = skus[0]
            # a.attr('href', '/%s/%s/' % (category_slug, slugify(sku)))
            a.attr('href', '/products/%s/%s/' % (category_slug, sku))
        elif href.startswith('/Legacy'):
            sku = slugify(href.split('/')[-1])
            # a.attr('href', '/%s/%s/' % (category_slug, slugify(sku)))
            a.attr('href', '/products/%s/%s/' % (category_slug, sku))
        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'categories/' + category_slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    description = content.html()

    if trace:
        print description

    if dbteeth:
        cat = Categories.objects.get(name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        cat.description = description
        cat.title = page_title
        cat.save()
        print '..saved.'
def make_possible_feed(link_element):
    """
    Visits each <link rel="alternate" href="http://..." /> element
    """
    link = PyQuery(link_element)
    title = 'Unknown'
    if link.attr('title'):
        title = link.attr('title')
    if link.attr('href'):
        return {'feed_url': link.attr('href'), 'feed_title': title}
    else:
        log.info("Skipping malformed link element for feed, missing href")
        return False
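# Feeding make_possible_feed the <link rel="alternate"> elements of a page;
# the markup and URL here are illustrative.
from pyquery import PyQuery

page = PyQuery('<html><head><link rel="alternate" type="application/rss+xml" '
               'title="Feed" href="https://example.com/rss.xml"/></head></html>')
feeds = [make_possible_feed(el) for el in page('link[rel="alternate"]')]
print(feeds)  # [{'feed_url': 'https://example.com/rss.xml', 'feed_title': 'Feed'}]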
def replace_link(index, node):
    node = PyQuery(node)
    if not node.attr('href'):
        return node
    link = node.attr('href').strip()
    if regex.match(link):
        try:
            node.attr('href', urljoin_rfc(base_url, link))
        except:
            pass
    return node
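# replace_img and replace_link are (index, node) callbacks; a hedged sketch
# of wiring replace_link through PyQuery.each(), with urllib's urljoin
# standing in for scrapy's legacy urljoin_rfc and a guessed `regex` that
# matches relative links only:
import re
from urllib.parse import urljoin as urljoin_rfc
from pyquery import PyQuery

base_url = 'https://example.com/posts/'
regex = re.compile(r'^(?!https?://)')  # hypothetical definition
d = PyQuery('<div><a href="1.html">one</a></div>')
d('a').each(replace_link)
print(d.html())  # <a href="https://example.com/posts/1.html">one</a>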
def _append_contents(struct, par):
    tag = struct['tag']
    _node = PyQuery('<%s />' % tag)
    if 'attributes' in struct:
        for key in struct['attributes'].keys():
            _node.attr(key, struct['attributes'][key])
    if 'text' in struct:
        _node.text(struct['text'])
    elif 'children' in struct:
        for (ugh, child) in struct['children'].iteritems():
            _append_contents(child, _node)
    par.append(_node)
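# Hypothetical input for _append_contents above: a dict describing a link,
# appended into a fresh <div>. Python 2 is assumed (the function uses
# iteritems), though this particular input never reaches that branch.
from pyquery import PyQuery

root = PyQuery('<div />')
_append_contents({'tag': 'a', 'attributes': {'href': '/docs/'}, 'text': 'Docs'}, root)
print root.outer_html()  # <div><a href="/docs/">Docs</a></div>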
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', href)
            new_href = re.sub(r'index\.html', '/', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def parseProductDetails(self, product_page_content, product_info):
    doc = PyQuery(product_page_content)
    product_info['reviews'] = doc('a.Rating_div > b').text()
    # product_info['facebook_likes'] = doc('span#u_0_2 > span.pluginCountTextDisconnected').text().strip()
    # product_info['tweet_share'] = doc('div#c > a#count').text().strip()
    product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url'])

    # each div.otheImg_li holds two (duplicate) <img> tags
    imgNodeList = doc('div.other_Imgs > a > div.otheImg_li > img')
    urls = []
    for imgNode in imgNodeList:
        imgNodeQ = PyQuery(imgNode)
        if imgNodeQ.attr('bigimg'):
            urls.append(imgNodeQ.attr('bigimg'))
    product_info['img_urls'] = ', '.join(urls)
def replace_image(self, target, image_name):
    elements = self.html_obj('*').filter('[dzid="' + target + '"]')
    location = self.location + urllib.quote_plus(image_name)
    for e in elements:
        pq = PyQuery(e)
        if pq.eq(0).is_('img'):
            pq.attr('src', location)
        else:
            pq.css('background-image', 'url("' + location + '");')
        return location
    return None
def process_place(link, marker):
    # print link
    response = urllib2.urlopen(link)
    page = PyQuery(response.read())
    post_body = page('.post_body')
    marker['title'] = post_body('h1').text().encode('utf-8')
    marker['icon'] = post_body('img:first').attr('src')
    addresses = post_body('td:eq(1)').html().split('<br/>')
    marker['address'] = PyQuery(addresses[0]).text().encode('utf-8')[len(ADDRESS):]
    process_position(PyQuery(addresses[1]).text().encode('utf-8'), marker)
    marker['objects'][0]['phone'] = post_body('td:eq(2)').text().encode('utf-8')[len(PHONES):]
    marker['objects'][0]['time'] = post_body('td:eq(3)').text().encode('utf-8')[len(WORK_TIME):]

    try:
        site = post_body('a.inv').attr('href')
        if URL in site:
            site = site[len(URL):]
        marker['site'] = site
    except Exception as ex:
        marker['site'] = ''
        print 'Error on site getting: %s' % link

    coment_details = page('.coment_details')

    beers = []
    for element in coment_details('.coment_content>div:eq(0)')('td:odd')('a'):
        beer = PyQuery(element)
        beer_name = beer.text().encode('utf-8')
        beer_link = beer.attr('href')
        beers.append({'name': beer_name, 'link': beer_link})
    marker['objects'][0]['beers'] = beers

    beer_countries = []
    for element in coment_details('.coment_content>div:eq(1)')('td:odd')('a'):
        beer_country = PyQuery(element)
        beer_country_name = beer_country.text().encode('utf-8')
        beer_country_link = beer_country.attr('href')
        beer_countries.append({'name': beer_country_name, 'link': beer_country_link})
    marker['objects'][0]['beerCountries'] = beer_countries

    beer_sorts = []
    for element in coment_details('.coment_content>p:eq(2)')('a'):
        beer_sort = PyQuery(element)
        beer_sort_name = beer_sort.text().encode('utf-8')
        beer_sort_link = beer_sort.attr('href')
        beer_sorts.append({'name': beer_sort_name, 'link': beer_sort_link})
    marker['objects'][0]['beerSorts'] = beer_sorts

    return marker
def fix_share_links(text, parser):
    td_regex = re.compile(target_domain + '|')
    assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for share_class in ['.icon-twitter', '.icon-facebook', '.icon-google-plus']:
        for element in d(share_class):
            e = PyQuery(element)
            href = e.attr('href')
            new_href = re.sub(domain, target_domain, href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def get_old_fashion_comments(answer_url):
    aid = comment_list_id(answer_url)
    comment_box_link = 'http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22{}%22%2C%22load_all%22%3Atrue%7D'.format(aid)  # | log
    # log('comments: ' + comment_box_link)
    r = old_client._session.get(comment_box_link)
    # print(str(r.content))
    doc = PyQuery(str(r.content, encoding='utf-8'))
    comments = []
    for div in doc.find('div.zm-item-comment'):
        div = PyQuery(div)
        cid = div.attr('data-id')
        vote_count = int(div.find('span.like-num').find('em').text())
        content = div.find('div.zm-comment-content').html()
        author_text = div.find('div.zm-comment-hd').text().replace('\n', ' ')
        if ' 回复 ' in author_text:  # ' 回复 ' means "replies to"
            author, reply_to = author_text.split(' 回复 ')
        else:
            author, reply_to = author_text, None
        comment = OldFashionComment(
            cid=cid, vote_count=vote_count, content=content,
            author=OldFashionAuthor(author),
            reply_to=OldFashionAuthor(reply_to) if reply_to else None)
        comments.append(comment)
    return comments
def parseProductsByCategory(self, category_page_content, category_info):
    self.num_idx = 0
    if self.current_category == category_info:
        self.page_idx = self.page_idx + 1
    else:
        self.current_category = category_info
        self.page_idx = 1
    doc = PyQuery(category_page_content)
    productNodeList = doc('div#productsContent1_goods > div')
    productList = []
    for productNode in productNodeList:
        productNodeQ = PyQuery(productNode)
        self.num_idx = self.num_idx + 1
        productInfo = self.newProduct()
        productInfo['sku_id'] = productNodeQ.attr('alt1')
        productInfo['name'] = productNodeQ('div.goods_mz > a').text().strip()
        productInfo['product_url'] = productNodeQ('div.goods_mz > a').attr('href')
        productInfo['img_url'] = productNodeQ('div.goods_aImg > a > img').attr('src')
        productInfo['price'] = productNodeQ('div#cat-product-list_USD > span.special_price').attr('price')
        productInfo['original_price'] = productNodeQ('div#cat-product-list_USD > span.shop_price').attr('price')
        productInfo['page_idx'] = str(self.page_idx)
        productInfo['num_idx'] = str(self.num_idx)
        productInfo.set_categories(category_info)
        productList.append(productInfo)
    return productList
def next(self):
    if self.i == self.categories_iter.length:
        raise StopIteration
    link = self.categories_iter[self.i]
    py_link = PyQuery(link)
    href = py_link.attr('href')
    html_class = href.split('/')[-1:][0]
    title = py_link.text()
    thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
    url = self.crawler.category_url % href
    category = Category(title, url, html_class, thumbnail_url)
    shows = Shows(self.crawler, url)
    tmp = list()
    tmp.append(shows)
    if title == 'Nyheter':
        news_url = self.crawler.news_url % href
        news_shows = Shows(self.crawler, news_url)
        tmp.append(news_shows)
    category.shows = itertools.chain(*tmp)
    self.i += 1
    return category
def get_subforums_infos(self, html):
    """
    Get information (description, number of topics and posts, ...)
    about the forums listed on a page
    """
    document = PyQuery(html)
    idpattern = re.compile(r"/([fc]\d+)-.*")

    for element in document("a.forumlink"):
        e = PyQuery(element)
        match = idpattern.fullmatch(clean_url(e.attr("href")))
        if not match:
            continue
        oldid = match.group(1)
        row = e.closest("tr")

        # Get forum status ("verrouillé" = locked)
        alt = row("td:nth-of-type(1) img").eq(0).attr("alt")
        self.forums[oldid].status = 1 if "verrouillé" in alt else 0

        # Get subforum description
        self.forums[oldid].description = row("td:nth-of-type(2) span").eq(1).html() or ""

        # TODO : Get subforum icon

        # Get subforum numbers of topics and posts
        self.forums[oldid].num_topics = int(row("td").eq(2).text())
        self.forums[oldid].num_posts = int(row("td").eq(3).text())
def find_external_links(url):
    '''Look for links to files in a web page and return a set.
    '''
    links = set()
    try:
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s '
                               'Error details: %s', url, response.text)
        else:
            if response.content:
                p = PyQuery(response.content)
                for anchor in p("a"):
                    panchor = PyQuery(anchor)
                    href = panchor.attr("href")
                    if url_is_egg_file(href):
                        # href points to a filename
                        href = get_absolute_url(href, url)
                        links.add('<a href="%s">%s</a>' % (href, panchor.text()))
    except:
        # something happened when looking for external links:
        # timeout, HTML parser error, etc.
        # we must not fail and only log the error
        app.logger.exception('')
    return links
def _main():
    # u'<title>': '<url>' sets
    mt_pages = {}
    wp_pages = {}

    # MT
    request = requests.get(MT_ARCHIVES_URL)
    document = PyQuery(request.content)
    archive_list = document('#pagebody .archive-list a')
    for archive in archive_list:
        archive = PyQuery(archive)
        mt_pages[archive.text()] = archive.attr('href')

    # WP
    fh = open(WP_ARCHIVES_FILE_PATH, 'r')
    document = PyQuery(fh.read(), parser='xml')
    items = document('channel item')
    for item in items:
        item = PyQuery(item)
        wp_pages[item('title').text()] = item('link').text()

    # Create .htaccess
    fh = open(BASE_DIR + '/tmp/.htaccess', 'a')
    for title, href in mt_pages.items():
        if title in wp_pages:
            fh.write('Redirect permanent %s %s\n' % (
                re.sub(r'http://kjirou\.sakura\.ne\.jp', '', href),
                wp_pages[title],
            ))
    fh.write('Redirect permanent /mt/index.xml http://blog.kjirou.net/feed\n')
    fh.write('Redirect permanent /mt/atom.xml http://blog.kjirou.net/feed\n')
    fh.write('Redirect permanent /mt/archives.html http://blog.kjirou.net\n')
    fh.write('Redirect permanent /mt http://blog.kjirou.net\n')
def getPageLinkIfValid(element, currentPageNumber):
    pyElement = PyQuery(element)
    pageNumberText = pyElement.find('span').text()
    if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber:
        return 'https://www.youtube.com' + pyElement.attr('href')
    return None
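# A pagination anchor shaped like YouTube's result pages, to exercise
# getPageLinkIfValid above; the markup is illustrative.
from pyquery import PyQuery

el = PyQuery('<a href="/results?page=3"><span>3</span></a>')
print(getPageLinkIfValid(el, 2))  # https://www.youtube.com/results?page=3
print(getPageLinkIfValid(el, 5))  # None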
def parse(html):
    '''return a list of dictionaries describing the stories on the front page'''
    elements = []
    p = PyQuery(html)

    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:  # More link
            continue
        children = map(PyQuery, subtext.children())
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:  # filter out ads
            continue
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
            'pos': len(elements) + 1,
            'title': a.text(),
            'url': url,
            'domain': urlparse(url).netloc.rpartition('www.')[2],
            'comments': comments,
            'submitter': submitted.text(),
            'points': int(span.text().split()[0]),
            'id': int(span.attr('id').split('_', 1)[1]),
            'ago': submitted[0].tail.split('ago')[0].strip(),
        })
    logging.warning('parsed %s elements', len(elements))
    return elements
def __extract(self, html):
    pq = PyQuery(html).find("main#main #mainArea table")
    selector_ = "thead tr:eq(0) th"
    date_order = [PyQuery(v).text().split('\n')[0] for v in PyQuery(pq).find(selector_)][3:]
    result = {d: {} for d in date_order}
    index = 0
    total = len(PyQuery(pq).find("tbody tr"))
    while index < total:
        td = PyQuery(pq).find("tbody tr:eq(%d) td:eq(0)" % index)
        room_type = td.text().split()[0]
        rowspan = int(td.attr('rowspan'))
        for i in xrange(index, index + rowspan):
            row = PyQuery(pq).find("tbody tr:eq(%d)" % i)
            # smoking or not
            smoking = PyQuery(row).find("td.alC.alM > img").attr("alt")
            room = "%s (%s)" % (room_type, smoking)
            if row.hasClass('clubCardCell'):
                member_type = 'member'
            else:
                member_type = 'guest'
            for i, v in enumerate(self.__extract_price_remain(row)):
                if room not in result[date_order[i]]:
                    result[date_order[i]][room] = {}
                result[date_order[i]][room][member_type] = v
        index += rowspan
    return result
def parsePage(content):
    doc = PyQuery(content)
    productNodeList = doc("ul#sm-offer-list > li")
    productList = []
    for node in productNodeList:
        nodeQ = PyQuery(node)
        p = Product()
        p["product_name"] = nodeQ('a[offer-stat="title"]').text()
        url = nodeQ('a[offer-stat="title"]').attr("href")
        if url.find("http") == 0:
            p["product_url"] = url
        else:
            p["product_url"] = "http:" + url
        p["product_price"] = nodeQ("span.sm-offer-priceNum").text()
        p["img_url"] = nodeQ('a[offer-stat="pic"] > img').attr("src")
        p["sku_id"] = nodeQ.attr("t-offer-id")
        p["store_name"] = nodeQ("a.sm-offer-companyName").text()
        p["store_url"] = nodeQ("a.sm-offer-companyName").attr("href")
        print p["store_url"]
        p["tags"] = []
        aList = nodeQ("div.sm-offer-subicon > a")
        for a in aList:
            s = PyQuery(a).attr("class")
            if s:
                p["tags"].append(s)
        p["tags"] = ", ".join(p["tags"])
        # parseProductPage(p, True)
        # parseStorePage(p)
        productList.append(p)
        # return productList  # for testing
    return productList
def extract_links(page):
    d = PyQuery(page)
    links = d('.newstitle>a')
    entries = []
    for link in links:
        link = PyQuery(link)
        # get title and link from html
        title = link.text().encode("utf-8")
        link = link.attr('href')
        entries.append((title, link))
    return entries
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$', 'rss/index.rss', href)
        if not abs_url_regex.search(href):
            new_href = re.sub(r'/index\.html$', '/', new_href)
        if href != new_href:
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def run(self):
    # collect the product detail page URLs
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        for span in pq('span.product_title'):
            a = PyQuery(span).parents('a')
            self.q.put(a.attr('href'))
    except:
        helper.log('[ERROR] => ' + self.url, 'eastbay')
        self.error_page_url_queue.put({
            'url': self.url,
            'gender': self.gender
        })
def fix_share_links(text, parser):
    filetext = text.decode('utf8')
    td_regex = re.compile(target_domain + '|')
    assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
    d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser)
    for share_class in ['.share_links a']:
        print "share_class : ", share_class
        for element in d(share_class):
            e = PyQuery(element)
            print "element : ", e
            href = e.attr('href')
            print "href : ", href
            print "domain : ", domain
            print "target_domain : ", target_domain
            new_href = re.sub(domain, target_domain, href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def fix_meta_image_links(text, parser):
    filetext = text.decode('utf8')
    td_regex = re.compile(target_domain + '|')
    assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
    d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')), parser=parser)
    for share_class in ['meta[property="og:image"]', 'meta[name="twitter:image"]']:
        print "share_class : ", share_class
        for element in d(share_class):
            e = PyQuery(element)
            href = e.attr('content')
            content_target_domain = target_domain.replace("/static", "")
            new_href = re.sub(domain, content_target_domain, href)
            e.attr('content', new_href)
            print "\t fix image link", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def fixLinks(text, parser):
    # extremely lazy implementation - beware.
    text = text.replace('pngg', 'png')
    text = text.replace('pngng', 'png')
    text = text.replace('pngpng', 'png')
    text = text.replace('PNGG', 'PNG')
    text = text.replace('PNGNG', 'PNG')
    text = text.replace('PNGPNG', 'PNG')
    text = text.replace('jpgg', 'jpg')
    text = text.replace('jpgpg', 'jpg')
    text = text.replace('jpgjpg', 'jpg')
    text = text.replace('jpegg', 'jpeg')
    text = text.replace('jpegeg', 'jpeg')
    text = text.replace('jpegpeg', 'jpeg')
    text = text.replace('http://localhost:2368/', 'https://blog.lucaperic.com/')
    text = text.replace(
        'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/',
        'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/index.rss'
    )
    text = text.replace('/author/luca/rss/', '/rss/index.rss')
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a'):
        e = PyQuery(element)
        href = e.attr('href')
        if not abs_url_regex.search(href):
            new_href = re.sub(r'rss/index\.html$', 'rss/index.rss', href)
            new_href = re.sub(r'/index\.html$', '/', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return d.html(method='html').encode('utf8')
    return d.__unicode__().encode('utf8')
def _process(self, data, pid):
    if '/product/' not in data._resp.effective_url:
        return
    props = dict([PQ(i).text().split(u':') for i in data('.detail-tab-pro-info li')])
    print props
    try:
        categories = [{re.findall('\d+', PQ(i).attr('href'))[0]: PQ(i).text()}
                      for i in data('.breadcrumbs a')[1:]]
    except:
        categories = []
    ret = {
        'title': data('#sec_productTitle').text(),
        'category_id': data('#hid_categoryId').attr('value'),
        'category_tree': categories,
        'keywords': data('meta[name=Keywords]').attr('content').split(u','),
        'property': props,
        'image': 'http://www.carrefour.cn%s' % data('li.select img').attr('bimg')
    }

    # Brand (u'品牌' is the "brand" property, u'相关品牌' the "related brands" column)
    if u'品牌' in props:
        brand = props[u'品牌']
        left_columns = data('.middle-left01')
        target = None
        for column in left_columns:
            if u'相关品牌' == PQ(column)('p.left-title').text():
                target = PQ(column)
        if target:
            foo = PQ(target('a')[0])
            url = foo.attr('href')
            brand_id = re.findall('b=(\d+)', url)[0]
            brand_text = foo.text()
            if brand_text == brand:
                ret['brand'] = {'brand_name': brand, 'brand_id': brand_id}

    self.save(ret)
def fixLinks(text, parser):
    d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
    for element in d('a, link'):
        e = PyQuery(element)
        href = e.attr('href')
        if href is None:
            continue
        if (not abs_url_regex.search(href)) or ('/rss/' in href):
            new_href = re.sub(r'rss/$', 'feed.rss', href)
            new_href = re.sub(r'index\.html$', '', new_href)
            new_href = re.sub(r'index\.html\#$', '', new_href)
            e.attr('href', new_href)
            print "\t", href, "=>", new_href
    if parser == 'html':
        return "<!DOCTYPE html>\n<html>" + d.html(method='html').encode('utf8') + "</html>"
    elif parser == 'xml':
        return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + d.__unicode__().encode('utf8')
    return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode('utf8') + "</html>"
def test_css_classes(self):
    viewlet = self.get_viewlet(self.portal)

    registry = getUtility(IRegistry)
    proxy = registry.forInterface(IFooterSettings)
    proxy.columns_count = 2

    doc = PyQuery(viewlet.render())
    footer = doc('#ftw-footer')

    child = footer.children()[0]
    child = PyQuery(child)
    if IS_PLONE_5:
        self.assertEqual(child.attr('class'), 'col-lg-6')
    else:
        self.assertEqual(child.attr('class'), 'column cell position-0 width-8')

    child = footer.children()[1]
    child = PyQuery(child)
    self.assertEqual(child.attr('class'), 'column cell position-8 width-8')
def parseNextPageUrl(self, category_page_content):
    doc = PyQuery(category_page_content)
    nodeList = doc('p.listspan').children('span > a')  # structure observed while debugging
    if not nodeList:
        nodeList = doc('p.listspan').children('a')  # saved pages use this format
    url = None
    for node in nodeList:
        nodeQ = PyQuery(node)
        if nodeQ.text().strip() == '>':
            url = nodeQ.attr('href')
            break
    if url:
        print self.merchant.filteruri(url)
        return self.merchant.filteruri(url)
def _lead_art_element(self):
    art_elements = self.element.find('layout').find('storytext').children()
    if not len(art_elements):
        return None
    el = PyQuery(art_elements[0])
    if el[0].tag != 'image':
        return None
    image_id = el.attr('refId')
    return PyQuery(self.element.children('image[id="%s"]' % image_id))
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    # Find if has children
    elem = PyQuery(elem)
    children = list(elem.contents())
    has_children = len(elem.children()) > 0
    contents = []
    if has_children:
        # Fix unwrapped children
        if not already_wrapped:
            children = fix_unwrapped_text(elem).contents()
        for child in children:
            child_dict = build_dict_from_sane_json(child, already_wrapped=True)
            if child_dict:
                contents.append(child_dict)
    else:
        contents = elem.html()

    extra = {}
    # Only tables need the HTML (to use later for extraction of relevant data)
    if elem.is_("table"):
        extra = {'original_html': str(elem)}
    if 'src' in elem[0].attrib:
        extra['src'] = elem.attr('src')
    if 'href' in elem[0].attrib:
        extra['href'] = elem.attr('href')

    return {
        'type': list(elem)[0].tag,
        'attrs': [],
        'layout': {},
        'contents': contents,
        'extra': extra
    }
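# A quick check of build_dict_from_sane_json on a childless element (so the
# fix_unwrapped_text helper it depends on is never called); output shape
# inferred from the function body.
from pyquery import PyQuery

node = PyQuery('<a href="/about">About</a>')
print(build_dict_from_sane_json(node))
# {'type': 'a', 'attrs': [], 'layout': {}, 'contents': 'About',
#  'extra': {'href': '/about'}}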
def scrape_category(url, title):
    category_slug = slugify(title)

    try:
        f = urlopen(url)
    except ValueError:
        if trace:
            print 'Retrying:', url
        url = 'http://eracks.com' + url.replace(' ', '%20')
        if trace:
            print 'As:', url
        f = urlopen(url)

    doc = html5lib.parse(f, treebuilder='lxml')  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    content = jQuery('td#content')
    # description = jQuery('td#content').html()
    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')
        skus = find_sku.findall(href)
        if skus:
            sku = skus[0]
            a.attr('href', '/%s/%s/' % (category_slug, slugify(sku)))
        elif href.startswith('Legacy'):
            sku = slugify(href.split('/')[-1])
        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'categories/' + category_slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    description = content.html()

    if trace:
        print description

    if dbteeth:
        cat = Categories.objects.get(name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        cat.description = description
        cat.save()
        print '..saved.'
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # all album info on this page
        "coser_id": None,  # coser id
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:  # "user does not exist"
        raise crawler.CrawlerException("account does not exist")
    # get the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("failed to extract coser id from page\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("extracted coser id has the wrong type\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # get the album info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # album id
            "album_title": None,  # album title
        }
        # get the album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("failed to extract album URL from album info\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("failed to extract album id from album URL %s\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info['album_id'] = album_id
        # get the album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
def getHashtagsAndMentions(tweetPQ):
    """Given a PyQuery instance of a tweet (tweetPQ),
    getHashtagsAndMentions gets the hashtags and mentions from a tweet
    using the tweet's anchor tags rather than parsing a tweet's text
    for words beginning with '#'s and '@'s.

    All hashtags are wrapped in anchor tags with an href attribute of
    the form '/hashtag/{hashtag name}?...' and all mentions are wrapped
    in anchor tags with an href attribute of the form
    '/{mentioned username}'.
    """
    anchorTags = tweetPQ("p.js-tweet-text")("a")
    hashtags = []
    mentions = []
    isFirstHT = True
    firstHT = ''
    for tag in anchorTags:
        tagPQ = PyQuery(tag)
        url = tagPQ.attr("href")
        if url is None or len(url) == 0 or url[0] != "/":
            continue

        # Mention anchor tags have a data-mentioned-user-id attribute.
        if not tagPQ.attr("data-mentioned-user-id") is None:
            mentions.append("@" + url[1:])
            continue

        hashtagMatch = re.match('/hashtag/\w+', url)
        if hashtagMatch is None:
            continue
        hashtag = hashtagMatch.group().replace("/hashtag/", "#")
        if isFirstHT:
            firstHT = hashtag
            isFirstHT = False
        hashtags.append(hashtag)

    return (" ".join(hashtags), " ".join(mentions), firstHT)
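# Synthetic tweet markup to exercise getHashtagsAndMentions above; per the
# docstring, a data-mentioned-user-id attribute marks mentions and
# /hashtag/ hrefs mark hashtags.
from pyquery import PyQuery

tweetPQ = PyQuery('<div><p class="js-tweet-text">'
                  '<a href="/hashtag/python?src=hash">#python</a> '
                  '<a href="/guido" data-mentioned-user-id="1">@guido</a>'
                  '</p></div>')
print(getHashtagsAndMentions(tweetPQ))  # ('#python', '@guido', '#python')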
def parseProductDetails(self, product_page_content, product_info):
    doc = PyQuery(product_page_content)
    product_info['price'] = re.sub('\s', '', doc('span[class="product_price emphasis "]').text())
    # the selector below yields the small thumbnails of the description images:
    # imgNodeList = doc('div[class="js-carousel-content car_content"] > div > img')
    imgNodeList = doc('div.js-slider-container > div > a > img')
    results = []
    for node in imgNodeList:
        nodeQ = PyQuery(node)
        url = nodeQ.attr('src')
        if url:
            results.append(url)
    product_info['img_urls'] = ', '.join(results)
def tweetPaser(tweets_html):
    tweetslist = []
    if tweets_html.strip() != '':
        scraped_tweets = PyQuery(tweets_html)
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')
        if len(tweets) != 0:
            for tweet_html in tweets:
                t = {}
                tweetPQ = PyQuery(tweet_html)
                t['user'] = tweetPQ("span:first.username.u-dir b").text()
                txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text())
                txt = txt.replace('# ', '#')
                txt = txt.replace('@ ', '@')
                t['tweet'] = txt
                t['id'] = tweetPQ.attr("data-tweet-id")
                t['retweets'] = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                t['favorites'] = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
                t['link'] = 'https://twitter.com' + tweetPQ.attr("data-permalink-path")
                t['mentions'] = re.compile('(@\\w+)').findall(t['tweet'])
                t['hashtags'] = re.compile('(#\\w+)').findall(t['tweet'])
                t['timestamp'] = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                tweetslist.append(t)
    return tweetslist
def test_calendar_tag_rendering_en_and_de(self, timezone_mock):
    timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
    page_with_apphook = self.create_base_pages(multilang=True)
    # make use of default tests self.app_config namespace, instead of
    # hard coding it
    t = self.get_template(self.app_config.namespace)
    context = self.get_context(page_with_apphook)
    with override('en'):
        html = t.render(SekizaiContext(context))
        table = PyQuery(html)('table.js-calendar-table')
    page_with_apphook.get_absolute_url()
    links = table.find('td.events, td.multiday-events').find('a')
    # test if tag rendered important elements
    self.assertEqual('1', table.attr('data-month-numeric'))
    self.assertEqual('2015', table.attr('data-year'))
    self.assertEqual('10', table.find('td.today').text())
    # should include DE only event as well
    expected_days = (13, 14, 15, 16, 17, 22, 23, 24, 25, 26, 27)
    for position, day in enumerate(expected_days):
        # page url may vary depending on fallback settings, check only
        # against the date.
        event_url = '/2015/1/{0}/'.format(day)
        rendered_url = links[position].attrib['href']
        self.assertGreater(rendered_url.find(event_url), -1)
def parseProductsByCategory(self, category_page_content, category_info):
    doc = PyQuery(category_page_content)
    productNodeList = doc('ul.ws-product-list:first > li.hproduct')
    productList = []
    for node in productNodeList:
        nodeQ = PyQuery(node)
        productInfo = self.newProduct()
        productInfo['name'] = nodeQ('h4.ws-product-title').text()
        productInfo['sku_id'] = nodeQ.attr('data-context-sku')
        productInfo['product_url'] = nodeQ('h4').parent('a').attr('href')
        productInfo['img_url'] = nodeQ('div.kor-product-photo > a > img').attr('src')
        productInfo['price'] = nodeQ('div.kor-product-sale-price > span.kor-product-sale-price-value').text()
        productInfo['likes'] = self.crawler.fetchSocialLikes(productInfo['product_url'])
        productInfo.set_categories(category_info)
        productList.append(productInfo)
    return productList
def __processImageTag(self, i, e):
    obj = PyQuery(e)
    style = obj.attr('style')
    if style is not None and style.find('display: none') != -1:
        obj.remove()
        return
    newObj = PyQuery("<img />")
    newObj.attr('src', obj.attr('rel:bf_image_src'))
    newObj.attr('style', obj.attr('style'))
    newObj.width(obj.width())
    newObj.height(obj.height())
    obj.replaceWith(newObj)
def getSonglist(playlistId):
    f = opener.open(urllib.request.Request(url_base.format(playlistId)))
    html = f.read().decode('utf-8')
    doc = PyQuery(html)
    songs = doc('#song-list-pre-cache ul li a')
    song_arr = []
    for song in songs:
        el = PyQuery(song)
        parser = urlparse(el.attr('href'))
        id = parse_qs(parser.query).get('id')[0]
        song_arr.append({
            'id': id,
            'url': url_download.format(id),
            'title': el.text()
        })
    return song_arr
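# getSonglist leans on module-level globals; plausible (unverified, assumed)
# definitions for the NetEase playlist page and download endpoint:
import urllib.request
from urllib.parse import urlparse, parse_qs

opener = urllib.request.build_opener()
url_base = 'https://music.163.com/playlist?id={}'  # assumption
url_download = 'https://music.163.com/song/media/outer/url?id={}.mp3'  # assumption

# for song in getSonglist('2075485046'):
#     print(song['title'], song['url'])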
def get_tweet_ids(self, term, get_info):
    """
    Given a search term or search phrase, find all the IDs of the
    result tweets.
    """
    tweet_ids = []
    full_tweets = []
    refreshCursor = ''
    while True:
        response = self.getJsonReponse(term, refreshCursor)
        refreshCursor = response['min_position']
        try:
            tweets = PyQuery(response['items_html'])('div.js-stream-tweet')
        except Exception:
            break

        # Exit when no more tweets loaded
        if len(tweets) == 0:
            break

        for tweetHTML in tweets:
            tweetPQ = PyQuery(tweetHTML)
            tweet_id = tweetPQ.attr("data-tweet-id")
            tweet_ids.append(tweet_id)
            if get_info:
                tweet_info = dict()
                tweet_info['id'] = tweet_id
                tweet_info['username'] = tweetPQ("span.username.js-action-profile-name b").text()
                tweet_info['text'] = self.text_format(
                    re.sub(r"[^\x00-\x7F]", "", tweetPQ("p.js-tweet-text").text())
                    .replace('# ', '#').replace('@ ', '@'))
                tweet_info['date'] = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
                tweet_info['date'] = datetime.datetime.fromtimestamp(tweet_info['date'])
                full_tweets.append(tweet_info)

        if len(tweet_ids) > 700:
            break

    return tweet_ids, full_tweets
def scrape_top(fragment_str):
    global v1
    v1.append(fragment_str)
    # parse the content of the '.athing' class,
    # getting the fields id, title, uri, rank
    s = PyQuery(fragment_str)
    post_id = s.attr('id')
    title = s(".storylink").text()[:256]
    title = "None" if not title else title
    uri = sanitize_url(URL, s(".storylink").attr("href"))
    try:
        rank = int(s(".rank").text()[:-1])
    except:
        rank = 0
    global v
    v.append([post_id, title, uri, rank])
    return post_id, title, uri, rank
def fetch_note_comments(self, url, dom, douban_id):
    comments = []
    strip_username = lambda el: re.findall(
        r'^http(?:s?)://www\.douban\.com/people/(.+)/$', el.attr('href')
    ).pop(0)
    while True:
        comment_items = dom('#comments .comment-item')
        for comment_item in comment_items:
            item_div = PyQuery(comment_item)
            quote_user_link = item_div('.content>.reply-quote>.pubdate>a')
            if quote_user_link:
                quote_user_name = quote_user_link.text()
                quote_user_id = strip_username(quote_user_link)
                quote_text = item_div('.content>.reply-quote>.all').text()
                blockquote = '{0}({1}):{2}'.format(quote_user_name, quote_user_id, quote_text)
            else:
                blockquote = None
            comments.append({
                'douban_id': item_div.attr('data-cid'),
                'content': item_div.outer_html(),
                'target_type': 'note',
                'target_douban_id': douban_id,
                'user': self.fetch_user(strip_username(item_div('.pic>a'))),
                'text': item_div('.content>p').text(),
                'created': item_div('.content>.author>span').text(),
                'quote': blockquote,
            })
        next_page = dom('#comments>.paginator>.next>a')
        if next_page:
            url = next_page.attr('href')
        else:
            break
        response = self.fetch_url_content(url)
        dom = PyQuery(response.text)
    return comments
def parsecsdn(i):
    url = "https://blog.csdn.net/weiqifa0/article/list/" + str(i)
    result = requests.post(url)
    content = result.text
    # convert the content into a PyQuery object
    datas = PyQuery(content)
    items = datas(".article-list a")
    for item in items:
        obj = {}
        # print(item)
        lineObj = PyQuery(item)
        # print(lineObj)
        title = lineObj.text()
        link = lineObj.attr("href")
        print(title, link)
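# parsecsdn takes a 1-based page index of the article list; a typical driver:
for page in range(1, 4):
    parsecsdn(page)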
def generate_sitemap(sitemap_file_name, html_text, print_html):
    pq = PyQuery(html_text)
    sitemap_text = '<?xml version="1.0" encoding="utf-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    for x in pq.find('a'):
        x = PyQuery(x)
        site_url = x.attr('href')
        sitemap_text += f'<url><loc>{site_url}</loc></url>\n'
    sitemap_text += '</urlset>'
    if print_html:
        print(f'--- {sitemap_file_name} ---')
        print(sitemap_text)
    file = open(sitemap_file_name, 'w', encoding='UTF-8')
    file.write(sitemap_text)
    file.close()
    return
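# Example invocation of generate_sitemap: any document whose <a> hrefs are
# already absolute will do; writes sitemap.xml next to the script.
html = ('<div><a href="https://example.com/">home</a>'
        '<a href="https://example.com/blog/">blog</a></div>')
generate_sitemap('sitemap.xml', html, print_html=True)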
def next(self):
    if self.i == self.categories_iter.length:
        raise StopIteration
    link = self.categories_iter[self.i]
    py_link = PyQuery(link)
    href = py_link.attr('href')
    html_class = href.split('/')[-1:][0]
    title = py_link.text()
    # thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
    url = href
    show = Show(title, url, html_class)
    show.clips = Episodes(self.crawler, url)
    self.i += 1
    return show
def parseCategories(self, homepage_content):
    categoryList = []
    doc = PyQuery(homepage_content)
    # only expand the first top-level category, Camp & Hike
    node = doc('div[class="mega-menu-container container js-mega-menus-target"] > div > div > section').eq(0)
    # a > h4 holds the category name; Accessories has no <a>, so it is skipped
    nodeList = PyQuery(node).find("div.row > div.col-xs-2 > a")
    for node in nodeList:
        nodeQ = PyQuery(node)
        categoryInfo = self.newCategory()
        categoryInfo.name = nodeQ.text()
        categoryInfo.url = nodeQ.attr('href')
        self.process_url(categoryInfo)
        if categoryInfo.name and categoryInfo.url:
            categoryInfo.parent_categories = ['Camp & Hike']
            categoryList.append(categoryInfo.formalize())
    return categoryList
def parseCategories(self, homepage_content):
    doc = PyQuery(homepage_content)
    nodeList = doc('ul#header-navigation-menu > li.menu-container')
    categoryList = []
    # drop the first three and the last two junk entries
    validNodeList = nodeList[3:10]
    for node in validNodeList:
        nodeQ = PyQuery(node)
        level1Name = nodeQ.children('a').text()
        level2NodeList = nodeQ.children('div > ul:first > li.indent-child > span')
        for level2Node in level2NodeList:
            level2NodeQ = PyQuery(level2Node)
            categoryInfo = self.newCategory()
            categoryInfo.name = level2NodeQ.text()
            categoryInfo.url = level2NodeQ.attr('href')
            categoryInfo.parent_categories = [level1Name]
            categoryList.append(categoryInfo.formalize())
    return categoryList
def get_doc_hyperlinking(doc: PyQuery, base_url: str) -> List[HyperLinkingInPage]:
    """
    Get the list of hyperlinks in a web page.

    Parameters
    ----------
    doc : PyQuery
        PyQuery object for the whole document
    base_url : str
        URL of the page, used to turn relative addresses into absolute ones
    """
    rlt = []
    doc.make_links_absolute(base_url=base_url)
    all_href = doc("a")
    body_text = get_pq_object_inner_text(doc)
    ls_href_to_query = []
    for link in all_href:
        link_obj = PyQuery(link)
        url = str(link_obj.attr("href"))
        if not url.startswith("http"):
            continue
        ls_href_to_query.append(link_obj)
    ls_start_pos = batch_get_dom_node_start_pos(doc, ls_href_to_query)
    for ui_ele, start_pos in zip(ls_href_to_query, ls_start_pos):
        if start_pos < 0:
            logger.error(f"Can't find ui object '{ui_ele}'")
        text = get_pq_object_inner_text(ui_ele)
        if text != body_text[start_pos:start_pos + len(text)]:
            logger.error(
                f"inner text is not equal with doc body '{text}' ?= '{body_text[start_pos:start_pos + len(text)]}'"
            )
        url = str(ui_ele.attr("href"))
        hyperlinking_in_page = HyperLinkingInPage(
            start_pos=start_pos,
            end_pos=start_pos + len(text),
            text=text,
            url=url,
            query_obj=ui_ele,
        )
        rlt.append(hyperlinking_in_page)
    return rlt