def download(self, **kwargs):
    """
    :param position - position of image or defaults to random
    :param category - archive category, see get_categories for the list
    :return: dict{'content': <image_content>, <some meta data>...}
    """
    category = kwargs.get('category', None)
    position = kwargs.get('position', 0)
    rand = False
    if position == 0:
        rand = True
    if position > 1:
        position -= 1  # since 0 is reserved reduce position
    if not category:
        category = self.default_cat
    category = category.lower()
    url = self.url_tpl(category=category)
    response = requests.get(url)
    sel = Selector(text=response.text)
    # get position
    total_items = int(sel.xpath("//p[@class='count']").re('\d+')[0])
    items = sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
    items_per_page = len(items)
    # find the right image by position
    if rand:
        position = random.randrange(0, total_items)
    if position < items_per_page:
        image = items[position]
    else:
        page = int(math.ceil(position / items_per_page))
        position -= items_per_page * (page - 1)
        url = "{}?page={}".format(url, page)
        response = requests.get(url)
        pos_sel = Selector(text=response.text)
        items = pos_sel.xpath("//div[@id='search_results']//a[img]/@href").extract()
        image = items[position]
    # retrieve image
    response = requests.get(urljoin(url, image))
    sel = Selector(text=response.text)
    image_url = sel.xpath("//div[@class='primary_photo']/a/img/@src").extract_first()
    image_url = utils.fix_url_http(image_url)
    meta = {
        'url': image_url,
        'title': sel.xpath("//div[@class='primary_photo']/a/img/@alt").extract_first(),
        'desc_title': sel.xpath("//div[@id='caption']/h2/text()").extract_first(),
        'desc': sel.xpath("//div[@id='caption']/p[not(@class)]/text()").extract_first(),
        'author': sel.xpath("//div[@id='caption']/p[@class='credit']/a/text()").extract_first(),
        'publication_date': sel.xpath("//div[@id='caption']/p[@class='publication_time']"
                                      "/text()").extract_first(),
    }
    image = Image(image_url, meta)
    return self.process_url(image, kwargs)
def test_has_class_tab(self):
    body = u"""
    <p CLASS="foo\tbar">First</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
        [u'First'])
def getimgsrc(pin_id):
    url = 'http://huaban.com/pins/%s/' % pin_id
    z = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
    sel = Selector(text=z.text)
    jscode = sel.xpath("//script[contains(., 'app.page = app.page')]/text()").extract_first()
    parsed_js = js2xml.parse(jscode)
    for i in parsed_js.xpath('//property[@name="pins"]//property[@name="key"]/string/text()'):
        print('http://img.hb.aicdn.com/' + i)
def get_classes(html):
    doc = Selector(text=html)
    classes = set(doc.xpath('//*[@class]/@class').extract())
    result = set()
    for cls in classes:
        for _cls in cls.split():
            result.add(_cls)
    return result
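# Hypothetical usage sketch for get_classes above (the sample HTML is invented, not
# from the original source): the helper returns every individual class name found
# anywhere in the document as a set.
if __name__ == '__main__':
    sample = '<div class="hero big"><p class="hero">hi</p></div>'
    print(get_classes(sample))  # {'hero', 'big'}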
def get_categories(self, response=None):
    if not response:
        response = requests.get(self.url)
    sel = Selector(text=response.text)
    categories = sel.xpath("//select[@id='search_category']"
                           "/option/text()").extract()
    categories = [c.split(' by ')[0].replace(' & ', '-') for c in categories]
    return categories
def post(inputs):
    posted = []
    failed = []
    for week in inputs:
        try:
            data = urllib.request.urlopen(week).read()
        except urllib.error.URLError as e:
            failed.append(week)
            print(week)
            print(e.reason)
            continue  # skip this week; 'data' would be stale or undefined here
        if type(data) is bytes:
            data = data.decode("utf-8")
        hxs = Selector(text=data)
        posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
        posted.append(posts)
    return posted
def mon(inputs):
    week = []
    errored_out = []
    for month in inputs:
        try:
            data = urllib.request.urlopen(month).read()
        except urllib.error.URLError as e:
            print(month)
            errored_out.append(month)
            print(e.reason)
            continue  # skip this month; 'data' would be stale or undefined here
        if type(data) is bytes:
            data = data.decode("utf-8")
        hxs = Selector(text=data)
        weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re(r'http://www.businessweek.com/archive/\d+-\d+/news/day\d+\.html')
        week.append(weeks)
    return week
def test_has_class_simple(self):
    body = u"""
    <p class="foo bar-baz">First</p>
    <p class="foo">Second</p>
    <p class="bar">Third</p>
    <p>Fourth</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
        [u'First', u'Second'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
        [u'Third'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
        [])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
        [u'First'])
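# A minimal standalone sketch of parsel's has-class() XPath function exercised by the
# test above; the HTML snippet here is invented for illustration. With several
# arguments, every named class must be present on the element.
from parsel import Selector

sel = Selector(text='<p class="foo bar">x</p><p class="baz">y</p>')
print(sel.xpath('//p[has-class("foo")]/text()').getall())         # ['x']
print(sel.xpath('//p[has-class("foo", "bar")]/text()').getall())  # ['x']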
def update_match_streams(self, matches: List[Match]) -> List[Match]:
    """Populate Match objects with stream urls"""
    updated = []
    for item in matches:
        # Populate stream data if match is live
        if not item['time_secs']:
            resp = self.session.get(item['url'])
            sel_detailed = Selector(text=resp.text)
            item['stream'] = sel_detailed.xpath("//div[@class='matches-streams']"
                                                "/span[.//a[re:test(text(),'english', 'i')]]"
                                                "//iframe/@src").extract_first()
            item['stream'] = clean_stream_url(item['stream'])
        updated.append(item)
    return updated
def test_set_xpathfunc(self):

    def myfunc(ctx):
        myfunc.call_count += 1

    myfunc.call_count = 0
    body = u"""
    <p CLASS="foo">First</p>
    """
    sel = Selector(text=body)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')
    set_xpathfunc('myfunc', myfunc)
    sel.xpath('myfunc()')
    self.assertEqual(myfunc.call_count, 1)
    set_xpathfunc('myfunc', None)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')
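# Hedged sketch of registering a custom XPath function, mirroring the test above.
# The function name 'is-even' and the sample HTML are illustrative assumptions;
# only set_xpathfunc itself comes from parsel.
from parsel import Selector
from parsel.xpathfuncs import set_xpathfunc

def is_even(ctx, nodes):
    # node-set arguments arrive as a list; use the first text node
    return int(nodes[0]) % 2 == 0

set_xpathfunc('is-even', is_even)
sel = Selector(text='<ul><li>1</li><li>2</li></ul>')
print(sel.xpath('//li[is-even(text())]/text()').getall())  # ['2']
set_xpathfunc('is-even', None)  # unregister again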
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSS expression, or an XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="Given expression is an XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.
    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()
    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
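# Possible command-line invocations of the entry point above. The script name
# 'parsel_cli.py' and the HTML content are made up for illustration:
#   $ echo '<p class="lead">Hi</p>' | python parsel_cli.py 'p.lead::text'
#   Hi
#   $ python parsel_cli.py --xpath '//p/@class' page.html
#   lead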
def get_url(self, file_path):
    with open(file_path, encoding='utf8') as f:
        text = f.read()
    html = Selector(text, type='html')
    result = html.xpath("//div[@class='tocList__title clearIt']//a[@class='ref nowrap']/@href").getall()
    isbn = html.xpath("//span[@class='bookInfo__isbn__print']/text()").getall()
    eisbn = html.xpath("//span[@class='bookInfo__isbn__pdf']/text()").getall()
    pages = html.xpath("//div[@class='tocList__pages']/text()").getall()
    accept_date = html.xpath("//span[@class='conf-date']/text()").extract_first('')
    meeting_name = html.xpath("//h1[@class='bookInfo__title']/text()").extract_first('')
    meeting_place = html.xpath("//span[@class='conf-loc']//text()").extract_first('')
    if not result:  # getall() returns a list, so check for emptiness rather than None
        return
    fw = open(r'D:\code\proceedings\big_json\20190812_1.big_json', 'a', encoding='utf-8')
    for i, item in enumerate(result):
        self.i += 1
        url = 'https://ascelibrary.org' + item
        name = re.findall('10.1061/(.*)', item)[0]
        page = pages[i]
        if eisbn != [] and "ISBN (PDF)" in eisbn[0]:
            eisbns = eisbn[0].replace("ISBN (PDF):", "").replace("-", "").strip()
        else:
            eisbns = ""
        if isbn != [] and "ISBN (print)" in isbn[0]:
            isbns = isbn[0].replace("ISBN (print):", "").replace("-", "").strip()
        else:
            isbns = ""
        if meeting_place == "":
            name_place = html.xpath("//div[@class='conf-date-loc']/text()").getall()[0]
            temp = name_place.split("|")
            meeting_place = temp[1]
            accept_date = temp[0]
        lists = [url, name, isbns, eisbns, page, accept_date, meeting_name, meeting_place]
        self.write_info(lists, fw)
def extract(self, html):
    """Extract data from lethain.com."""
    selector = Selector(html)
    page_div = selector.xpath('//div[@class="page"]')
    text_div = selector.xpath('//div[@class="text"]')
    return {
        'titles': [page_div.xpath('string(.//h2)').extract_first()],
        'dates': [page_div.xpath('.//span[@class="date"]/text()').extract_first()],
        'descriptions': [' '.join(text_div.xpath('string(.//p)').extract())],
        'tags': page_div.xpath('.//span[@class="tag"]/a/text()').extract(),
        'images': text_div.xpath('.//img/@src').extract(),
    }
def handler_detail_msm_sp(detail_urls_content, url):
    if '访问验证-安居客' not in detail_urls_content:
        # print(url)
        lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"', detail_urls_content, re.S)
        real_lat_lng = lat_lng[0]
        xpath_css = Selector(text=detail_urls_content)
        house_facilities = xpath_css.xpath('//ul[@class="mod-peitao clearfix"]/li[not(contains(@class,"gray"))]')
        real_house_facilities = []
        for rs in house_facilities:
            one = rs.xpath('./p/text()').extract_first()
            real_house_facilities.append(one)
        sp_item = {}
        sp_houses = xpath_css.xpath('//*[@id="fy_info"]/ul/li')
        for house_msg in sp_houses:
            key1 = str(house_msg.xpath('./span[1]/text()').extract_first()).replace(':', '')
            key = sp_house_config.get(house_msg.xpath('./span[1]/text()').extract_first().replace(':', ''))
            print(str(house_msg.xpath('./span[2]').extract_first()).replace('\n', '').replace(' ', ''))
            sp_item[key1] = remove_tags(str(house_msg.xpath('./span[2]').extract_first()).replace('\n', '').replace(' ', ''))
        house_resources_l = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="litem"]/li')
        for house_resource in house_resources_l:
            key1 = house_resource.xpath('./span[1]/text()').extract_first()
            key = sp_house_config.get(house_resource.xpath('./span[1]/text()').extract_first().replace(':', ''))
            sp_item[key1] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()).replace('\n', '').replace(' ', ''))
        house_resources_r = xpath_css.xpath('//div[@class="itemCon clearfix"]/ul[@class="ritem"]/li')
        for house_resource in house_resources_r:
            key1 = house_resource.xpath('./span[1]/text()').extract_first()
            key = sp_house_config.get(house_resource.xpath('./span[1]/text()').extract_first().replace(':', ''))
            sp_item[key1] = remove_tags(str(house_resource.xpath('./span[2]').extract_first()))
        describes = xpath_css.xpath('//*[@id="xzl_desc"]/div').extract_first()
        real_describe = remove_tags(str(describes))
        shop_name = xpath_css.xpath('//div[@class="item-mod"]/h3/b/text()').extract_first().strip()
        print(shop_name)
        print(real_house_facilities)
        print(real_lat_lng)
        print(real_describe.strip())
        public_time = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[1].root
        house_number = xpath_css.xpath('//*[@id="xzl_desc"]/h3/div/text()')[2].root
        print(public_time, house_number)
        print(sp_item)
    else:
        print('有验证码')
def getHTMLId(page):
    try:
        '''
        page: page number of the listing
        return: a list of detail-page ids
        '''
        # URL parameters
        params = {'start': f'{25*page}'}
        r = requests.get(url, params=params, timeout=30)  # send the request
        r.raise_for_status()  # check the HTTP status
        # build an XPath selector for the page
        selectors = Selector(r.text)
        # select the target content with XPath; returns a list
        detail_urls = selectors.xpath(
            '//div[@class="hd"]/a/@href').getall()
        return list(set(detail_urls))  # deduplicate
    except:
        return ""
def thsmn_test(self):
    # Tonghuashun (10jqka) simulated-trading check
    thsmn_base_url = "http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?"
    search_people_url = "http://t.10jqka.com.cn/trace/?page={0}&order=weight&show=pic".format(1)
    thsmn_text = self.resp_text(url=search_people_url, url_name="同花顺模拟页")
    thsmn_se = Selector(thsmn_text)
    people_num = thsmn_se.xpath(
        "//div[@id='sortshowtable']/ul/li/@data-zid").getall()
    if not people_num:
        return "同花顺模拟获取用户账号出问题"
    data = {'zidStr': ','.join([each_num for each_num in people_num])}
    thsmn_url = thsmn_base_url + urlencode(data) + '.html'
    response_text = self.resp_text(url=thsmn_url, url_name="同花顺模拟url")
    json_moni = json.loads(response_text).get("isT")
    if json_moni != True:
        return "同花顺模拟出问题"
def parse_content(node: Selector):
    s = ''
    for item in node.xpath('./node()'):
        if isinstance(item.root, str):
            s += item.root.strip()
        elif isinstance(item.root, lxml.html.HtmlElement):
            if item.root.tag == 'img':
                alt = item.root.attrib.get('alt', '')
                if bgm_face.match(alt):
                    s += alt
                continue
            elif item.root.tag == 'div':
                s += unicodedata.normalize(
                    'NFKD',
                    item.get().strip(),
                ).replace('<br>\r\n', '\n')
            else:
                s += item.get().strip()
            # continue
            # if item.root.tag == 'br':
            #     s += '\n'
            # elif item.root.tag == 'img':
            #     alt = item.root.attrib.get('alt', '')
            #     if bgm_face.match(alt):
            #         s += alt
            #         continue
            #     link = item.root.attrib.get('src')
            #     s += f'[img]{link}[/img]'
            # elif item.root.tag == 'a':
            #     href = (item.root.attrib['href'])
            #     text = (item.xpath('./text()').extract_first())
            #     if href == text:
            #         s += f'[url]{href}[/url]'
            #     else:
            #         s += f'[url={href}]{text}[/url]'
            # elif item.root.tag == 'span':
            #     text = item.xpath('./text()').extract_first()
            #     for key, value in SPAN_BACK_MAP.items():
            #         if key in item.root.attrib.get('style'):
            #             s += f'[{value}]{text}[/{value}]'
            #             break
            #     else:
            #         raise ValueError(item.root.tag + ' is not impl ed yet')
    return s
def download(self, **kwargs):
    """
    Download and set image from wallhaven.cc
    :param position - position of image to choose from listed from 1 to 24, default is 0 = random.
    :param categories - categories to download from in 000 format, where every number
    represents binary for [general, anime, people] list.
    :param purity - purity of content in 000 format, where every number represents
    binary for [sfw, sketchy, _].
    :param sorting - sorting type from available see WallhavenDownloader.sorting_types .
    """
    # Make url from arguments
    order = 'desc'
    categories = kwargs.get('categories', '')
    purity = kwargs.get('purity', '')
    sorting = kwargs.get('sorting', '')
    page, position, rand = self._make_position(kwargs.get('position', 0))
    url = self.base_url
    for arg in ['categories', 'purity', 'sorting', 'order', 'page']:
        value = locals()[arg]
        if value:
            url = add_or_replace_parameter(url, arg, locals()[arg])
    # Download and parse items
    resp = requests.get(url)
    if resp.status_code != 200:
        self.logger.error('Failed to download image list {}'.format(resp.url))
        return
    list_sel = Selector(text=resp.text)
    items = list_sel.xpath("//section[@class='thumb-listing-page']//figure/a/@href").extract()
    item = random.choice(items) if rand else items[position - 1]
    resp = requests.get(item)
    if resp.status_code != 200:
        self.logger.error('Failed to download image page {}'.format(resp.url))
        return
    sel = Selector(text=resp.text)
    image_url = sel.xpath("//img[@id='wallpaper']/@src").extract_first()
    meta = {
        'id': sel.xpath("//img[@id='wallpaper']/@data-wallpaper-id").extract_first(),
        'tags': sel.xpath("//ul[@id='tags']//li/a/text()").extract(),
        'views': sel.xpath("//dt[contains(text(),'Views')]/following-sibling::dd[1]/text()").extract_first(),
        'favorites': sel.xpath("//dt[contains(text(),'Favorites')]"
                               "/following-sibling::dd[1]//text()").extract_first(),
        'res': sel.xpath("//h3/text()").extract_first(),
    }
    image = Image(image_url, meta)
    return self.process_url(image, kwargs)
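# Quick illustration of the add_or_replace_parameter helper used above to build the
# listing URL (presumably w3lib.url's; the example URL here is invented):
from w3lib.url import add_or_replace_parameter

url = 'https://wallhaven.cc/search?order=desc'
url = add_or_replace_parameter(url, 'purity', '100')
url = add_or_replace_parameter(url, 'page', '2')
print(url)  # https://wallhaven.cc/search?order=desc&purity=100&page=2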
def get_problem(self, remote_oj, remote_problem):
    url = 'http://acm.zucc.edu.cn/problem.php?id={}'.format(remote_problem)
    res = self.request.get(url=url)
    selector = Selector(res.text)
    title = selector.xpath(
        '/html/body/div[1]/div[2]/div[1]/center/h3/text()').get('').split(':')[1].strip()
    data = {
        'time_limit': float(
            selector.xpath(
                '/html/body/div[1]/div[2]/div[1]/center/span[2]/span/text()'
            ).get('').strip()),
        'memory_limit': float(
            selector.xpath(
                '/html/body/div[1]/div[2]/div[1]/center/text()[2]').get('').replace('MB', '').strip()),
        'description': selector.xpath(
            '/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/text()').get('').strip(),
        'input': selector.xpath(
            '/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/text()').get('').strip(),
        'output': selector.xpath(
            '/html/body/div[1]/div[2]/div[2]/div[3]/div[2]/text()').get('').strip(),
        'sample_input': selector.xpath('//*[@id="sampleinput"]/text()').get('').replace('\r', '').strip(),
        'sample_output': selector.xpath('//*[@id="sampleoutput"]/text()').get('').replace('\r', '').strip()
    }
    return {
        'title': title,
        'description': self.problem_format.format(**data)
    }
def THS_DATA(self) -> Dict:
    # date cutoff (Tonghuashun / 10jqka)
    time_test = time.strftime("%Y-%m-%d", time.localtime())
    # mapping of stock name -> net amount
    name_jinge_dict = {}
    ths_response_list = self.__r.spider_ths()
    for each_ths_response in ths_response_list:
        se = Selector(each_ths_response)
        for each_table in se.xpath("//div[@class='zdph']/table"):
            date = each_table.xpath("//td[1]/text()").get()
            stock_name = each_table.xpath("//td[2]/a/text()").get()
            jinge = each_table.xpath("//td[7]/text()").get()
            if date == time_test:
                name_jinge_dict[stock_name] = jinge
    return name_jinge_dict
def get_xinpan_detail(start_url_content):
    detail_urls_content = start_url_content
    if '访问验证-安居客' not in detail_urls_content:
        # lat_lng = re.findall(r'lat: "(.*?)",.*?lng: "(.*?)"', detail_urls_content, re.S)
        # real_lat_lng = lat_lng[0]
        xpath_css = Selector(text=detail_urls_content)
        item = {}
        # if 'zu' in url:
        house_msgs_l = xpath_css.xpath('//*[@id="container"]/div[1]/div[1]/div/div[2]/ul/li')[:-2]
        for house_msg in house_msgs_l:
            key1 = house_msg.xpath('./div[1]/text()').extract_first()
            if '楼盘特点' in key1:
                item[key1] = [
                    i for i in str(remove_tags(str(house_msg.xpath('./div[2]').extract_first()).replace('\n', ''))).strip().split(' ')
                    if i]
            else:
                # key = house_config.get(house_msg.xpath('./span[1]/text()').extract_first())
                item[key1] = remove_tags(str(house_msg.xpath('./div[2]').extract_first()).replace('\n', '').replace(' ', ''))
        print(item)
    else:
        print('有验证码')
async def get_urls_in_playlist(session, playlist_url=''):
    """get each url of videos in playlist"""
    try:
        payload = await get_post_args(session)
        payload['playlist'] = playlist_url
    except:
        raise ExtractException('failed: extract playlist')
    async with session.post(
            "http://www.downvids.net/videoflv.php",
            proxy='socks5://127.0.0.1:1080',
            data=payload,  # todo: may need data=json.dumps(payload) instead
    ) as res:
        text = await res.text()
        selector = Selector(text=text)
        video_urls = selector.xpath(
            "//span[@class='thumb vcard author']/a/@href").extract()
        for url in video_urls:
            yield url
def extract_character_names(sel: Selector) -> Iterable[AnimeCharacter]:
    """Extract the names of the anime characters."""
    # pylint: disable=line-too-long
    maybe_name_anchors = \
        sel.xpath("(//h2[contains(., 'Characters')]/following-sibling::div)[1]//a[not(./img)]")
    for maybe_name_anchor in maybe_name_anchors:
        href = maybe_name_anchor.attrib["href"]
        match = re.search(r"/character/\d+/(?P<name>[^/]+)$", href)
        if not match:
            continue
        name = re.sub(r"_+|\s+", " ", match.group("name")).strip()
        # pylint: disable=line-too-long
        role = get_all_text(
            maybe_name_anchor.xpath("./following-sibling::div/small"))
        if re.search(r"main", role, flags=re.IGNORECASE):
            role = "main"
        elif re.search(r"support(ing)?|secondary", role, flags=re.IGNORECASE):
            role = "secondary"
        yield AnimeCharacter(name=name, url=href, role=role)
def _get_top_100(self):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    }
    url = "http://www.gutenberg.org/browse/scores/top"
    response = requests.get(url, headers=headers)
    se = Selector(response.text)
    books = []
    for item in se.xpath(
            "//h2[@id='books-last1']/following-sibling::ol[1]/li"):
        href = item.xpath("./a/@href").extract_first()
        id = href.split("/")[-1]
        books.append({
            "Title": item.xpath("./a/text()").extract_first(),
            "Url": "http://www.gutenberg.org" + href,
            "GutenbergId": id
        })
    return books
def _parse_spellcheck_items(cls, response):
    selector = Selector(text=response.text)
    content_part = selector.xpath(
        "/html/body/div[2]/div/div/div[2]/div[3]/div[1]/div[1]")
    return {
        'title': ''.join(content_part.css('h1 ::text').getall()),
        'description': [
            ''.join(p.css('::text').getall())
            for p in content_part.css('* > p')
        ],
        'recommend list': [{
            'word': ''.join(li.css('span ::text').getall()),
            'link': None if response.ok else li.xpath('a/@href').get()
        } for li in content_part.css('* > ul > li')]
    }
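# Small sketch of the ''.join(node.css('::text').getall()) pattern used above to collect
# the full text of an element, including text inside child tags (the HTML is invented):
from parsel import Selector

part = Selector(text='<h1>Did you mean <em>parsel</em>?</h1>').css('h1')
print(''.join(part.css('::text').getall()))  # 'Did you mean parsel?'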
async def detail(**kwargs):
    session = kwargs['session']
    next_url = kwargs['next_url']
    title = kwargs['title']
    print(next_url)
    print(title)
    async with session.get(
            url=next_url,
            headers=HEADERS,
            proxy=PROXY_STR,
    ) as response:
        text = await response.text()
        resp = Selector(text=text)
        nodes = resp.xpath('//div[@class="kl2-1"]//img/@src').extract()
        nodes = list(set(nodes))
        for img in nodes:
            # print(img)
            await download_img(session=session, url=img, title=title)
            print('next image')
def __init__(self, link: str):
    "Scrapes the product detail from trendyol.com."
    kaynak = "trendyol.com"
    if link.startswith('https://m.'):
        url = link.replace('https://m.', 'https://')
    elif link.startswith('https://ty.gl'):
        try:
            kisa_link_header = requests.get(link, headers=self.kimlik, allow_redirects=False).headers['location']
            url = self.ayristir("adjust_redirect=", "&adjust_t=", unquote(kisa_link_header))
        except KeyError:
            return None
    else:
        url = link
    try:
        istek = requests.get(url, headers=self.kimlik, allow_redirects=True)
    except requests.exceptions.ConnectionError:
        return None
    secici = Selector(istek.text)
    # affiliate = "https://tr.rdrtr.com/aff_c?offer_id=3107&aff_id=24172&url=" + quote(url) + "%26utm_source%3Daff_t%26utm_medium%3Dcps%26utm_campaign%3Dgelirortaklari%26utm_subaff%3D{aff_id}%26adjust_tracker%3D21ouxa_bfy1cc%26adjust_campaign%3Dperformics_tr%26adjust_adgroup%3D1%26adjust_label%3D{transaction_id}"
    try:
        trendyol_veri = {
            "link": url.split('?')[0],
            "marka": secici.xpath("//h1[@class='pr-new-br']/a/text()").get().strip() if secici.xpath("//h1[@class='pr-new-br']/a/text()").get() else secici.xpath("//h1[@class='pr-new-br']/text()").get().strip(),
            "baslik": secici.xpath("//h1[@class='pr-new-br']/span/text()").get().strip(),
            "resim": secici.xpath("//img[@class='ph-gl-img']/@src").get(),
            "gercek": secici.xpath("//span[@class='prc-org']/text()").get(),
            "indirimli": secici.xpath("//span[@class='prc-slg prc-slg-w-dsc']/text()").get() or secici.xpath("//span[@class='prc-slg']/text()").get(),
            "kampanya": secici.xpath("//div[@class='pr-bx-pr-dsc']/text()").get(),
            "son_fiyat": secici.xpath("//span[@class='prc-dsc']/text()").get(),
            "yorumlar": self.trendyol_yorum(url),
            # "link": self.link_kisalt.tinyurl.short(url.split('?')[0])
        }
    except AttributeError:
        trendyol_veri = None
    kekik_json = {"kaynak": kaynak, 'veri': trendyol_veri}
    self.kekik_json = kekik_json if kekik_json['veri'] != [] else None
    self.kaynak = kaynak
def parse_interval(self, response):
    interval_rides = "jQuery('#interval-rides').html(\""
    lines = response.text.split('\n')
    for line in lines:
        if line.startswith(interval_rides):
            content = line[len(interval_rides):-3]
            content = content.replace("\\n", "")
            content = content.replace("\\'", "'")
            content = content.replace('\\"', '"')
            content = content.replace("\\\\\"", "'")
            selector = Selector(text=content)
            activities = selector.xpath(
                '//div[@class="content react-feed-component"]//@data-react-props'
            ).extract()
            for activity in activities:
                # Read the JSON representation of the activity
                try:
                    activity = activity.replace("\\", "")
                    activity_json = json.loads(activity)
                    if "activity" in activity_json:
                        # Check if it's backcountry skiing and located in New England.
                        if activity_json["activity"]["type"] == "BackcountrySki":
                            activity_location = activity_json["activity"]["timeAndLocation"]["location"]
                            if any(state in activity_location for state in self.states):
                                # Get the activity ID
                                activity_id = activity_json["activity"]["id"]
                                request = scrapy.Request(
                                    url=f"https://www.strava.com/activities/{activity_id}",
                                    dont_filter=True,
                                    callback=self.parse_activity)
                                yield request
                except:
                    self.logger.error(activity)
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    result = []
    stmt = 'insert ignore into journal(journal_name,url,eissn,cover_url,active) Values(%s,%s,%s,%s,%s)'
    active = 0
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        if filename == 'active.html':
            active = 1
        else:
            active = 0
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            sel = Selector(text=text)
            for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                journal_name = liTag.xpath(
                    './div/h4/a/text()').extract_first().strip()
                url = liTag.xpath(
                    './div/h4/a/@href').extract_first().replace(
                        'journal', 'loi')
                eissn = liTag.xpath(
                    './div/div/div/span[@class="meta__eissn"]/text()'
                ).extract_first().replace('eISSN: ', '').strip()
                cover_url = liTag.xpath(
                    './div/a/img/@src').extract_first().strip()
                result.append(
                    (journal_name, url, eissn, cover_url, active))
            utils.printf(len(result))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_index')
def spider_thsmn(self) -> Any:
    base_url = "http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?"
    time_test = time.strftime("%Y%m%d", time.localtime())
    stock_name_list = []      # holds the stock names
    stock_BS_list = []        # holds the buy/sell flags
    stock_BS_times_list = []  # holds the buy/sell quantities
    # http://t.10jqka.com.cn/trace/trade/getLastEnOrHold/?zidStr=60503016,44983608,36010761,56395121,58626061,40787461,47391107,62884256,25869277,37401557
    # fetch the account ids
    for page in range(1, 4):
        search_people_url = "http://t.10jqka.com.cn/trace/?page={0}&order=weight&show=pic".format(page)
        text = self.resp_text(search_people_url)
        se = Selector(text)
        people_num = se.xpath(
            "//div[@id='sortshowtable']/ul/li/@data-zid").getall()
        data = {'zidStr': ','.join([each_num for each_num in people_num])}
        new_url = base_url + urlencode(data) + '.html'
        # request again with the collected account ids
        response_text = self.resp_text(new_url)
        json_moni = json.loads(response_text)
        result_moni = json_moni.get("result")
        for each_count_num in people_num:
            each_stock_mesg = result_moni.get(each_count_num)
            if each_stock_mesg:
                # order date
                updata_time = each_stock_mesg['wtrq']
                # stock name
                stock_name = each_stock_mesg['zqmc']
                # buy/sell flag
                BS = each_stock_mesg['mmlb']
                # number of shares
                stock_times = each_stock_mesg['wtsl']
                if updata_time == time_test:
                    stock_name_list.append(stock_name)
                    stock_BS_list.append(BS)
                    stock_BS_times_list.append(stock_times)
                else:
                    pass
    return stock_name_list, stock_BS_list, stock_BS_times_list
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'pishuinfo', 4)
    result = []
    stmt = 'insert ignore into video(video_id,stat) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for href in sel.xpath('//*[@id="TitleList"]/div/a/@href'):
            video_id = href.re('.*ID=(\d+)&isHost=.*')[0]
            result.append((video_id, 0))
        utils.printf(len(result))
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_index')
def lhc_wiki_events():
    LHC_WIKI_EVENTS_URL = "https://lhc.net.br/wiki/Categoria:Eventos"
    response = requests.get(LHC_WIKI_EVENTS_URL)
    selector = Selector(text=response.text)
    raw_events = selector.xpath(
        "//script[contains(text(), 'window.eventCalendarData.push')]/text()"
    ).re_first(r"window.eventCalendarData.push\((.*)\)")
    events = json.loads(raw_events)
    lhc_events = []
    for event_data in events:
        event = Event(
            name=event_data.get("title"),
            begin=event_data.get("start"),
            end=event_data.get("end"),
            url=urljoin("https://lhc.net.br", event_data.get("url", "")),
            location="Laboratório Hacker de Campinas",
        )
        lhc_events.append(event)
    return lhc_events
def getimagelist_href(link: str):
    __image_list = []
    try:
        print("Collecting images in the link: " + link)
        __response = requests.get(link, timeout=10)
        __selector = Selector(__response.text)
        if __response.status_code == 200:
            __image_list = __selector.xpath('//img/@src').getall()
            print("Images collected!")
    except Exception as exp:
        print("Error in the link")
    __new_list = []
    for i in __image_list:
        if not (i[0:1] == "/"):
            __new_list.append(link + "/" + i)
        else:
            __new_list.append(link + i)
    print("Done!")
    print(__new_list)
    return __new_list
def get_global_data_BSV():
    """
    Fetch the BSV price page and the per-T/day mining payoff.
    """
    ## TODO: need to check whether the coin is BTC... other coins need a different retrieval method...
    logger.info("爬取bsv每T每天的收益")
    url = "https://explorer.viawallet.com/bsv"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
    }
    z = requests.get(url, headers=headers, timeout=60)
    sel = Selector(text=z.text)
    jscode = sel.xpath(
        '//script[contains(.,"coin_per_t_per_day")]/text()'
    ).extract_first()
    parse_js = js2xml.parse(jscode)
    mining_payoff_btc = float(
        parse_js.xpath('//*[@name="coin_per_t_per_day"]/string/text()')[0].strip()
    )
    return mining_payoff_btc
async def parse_data(self, session, html):
    '''Process the page data.'''
    selector = Selector(html)
    result_list = selector.xpath('//a[@class="col-xs-6 col-sm-3"]')
    for result in result_list:
        img_url = result.xpath('./img/@data-original').extract_first()
        img_title = result.xpath('./img/@alt').extract_first()
        all_title = img_title + '.' + img_url.split('.')[-1]
        content = await self.fetch_img(session, img_url)
        try:
            with open(path + "\\" + all_title, mode='wb') as f:
                print("下载完成:", all_title)
                f.write(content)
        except Exception as e:
            print(e)
def generate_sections(data):
    sections = []
    if len(data) > 0:
        sections.append(Section(index=0, title="Todos", subsections=[]))
    for index, element in enumerate(data):
        sel = Selector(text=element)
        title_text = sel.xpath("//h4/text()").get()  # TODO: check whether this xpath returns the child; if not, use just "h4/text()"
        if title_text is not None and title_text != "":
            subsections = generate_subsections(element)
            section = Section(
                index=index + 1,
                title=title_text,
                subsections=subsections,
                source=data,
            )
            sections.append(section)
    return sections
def parse_oddsList(self, response):
    # #team_fight_table tr[class!=LotteryListTitle
    play_list = response.css(
        "#team_fight_table tr:not(.LotteryListTitle)").extract()
    for p in play_list:
        play_sel = Selector(text=p)
        # get the matchid
        matchid = play_sel.xpath("//@matchid").extract_first()
        # build the odds_url
        odds_url = "/soccer/match/" + matchid + "/odds/"
        playInfo = copy.deepcopy(response.meta["playInfoObj"])
        playInfo["id"] = matchid
        playInfo["play_urls"] = odds_url
        yield scrapy.Request(url=self.base_url + odds_url,
                             headers=self.headers,
                             meta={
                                 'cookiejar': response.meta['cookiejar'],
                                 "playInfoObj": playInfo
                             },
                             callback=self.parse_playInfo)
def scrap_profiles(driver):
    """
    Select the profile from the webpage and scrape it.
    """
    try:
        sel = Selector(text=driver.page_source)
        root = driver.find_element_by_class_name("pv-top-card")
        name = root.find_elements_by_xpath(
            "//section/div/div/div/*/li")[0].text.strip()
        job_title = sel.xpath('//h2/text()').getall()[1]
        ln_url = driver.current_url
        # upsert to Employee Model
        Employee.objects.get_or_create(name=name,
                                       designation=job_title.strip(),
                                       company='Mambu')
        time.sleep(5)
    except:
        print('failed to scrape profile')
        pass
def crawl_qoo10(keyword, num=10):
    url = 'https://www.qoo10.sg/s/' + keyword + '?keyword=' + keyword + '&keyword_auto_change='
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    parser = soup.find_all('div', attrs={"class": "bd_lst_item"})
    itemlist = parser[1].select('tr')
    itemlist = itemlist[2:-1]
    count = 0
    finalitemlist = []
    for item in itemlist:
        selector = Selector(str(item))
        href_links = selector.xpath('//a/@href').getall()
        for i in href_links:
            if str(i) != '#none' and str(i) != "#":
                link = str(i)
                break
        titles = str(item).split('title="')[1:]
        for j in titles:
            formatted = j.split('"')[0]
            if str(formatted) != 'Click to Play Video':
                title = str(formatted)
                break
        price = item.select('.prc')
        price = str(price).split('strong>')[1][:-2]
        price = price[2:]
        price = price.replace(',', '')
        finalitemlist.append((title, float(price), link))
        count += 1
        if count == num:
            break
    sorted_itemList = sorted(finalitemlist, key=lambda x: x[1])
    return sorted_itemList
def parse_type(self, response):
    # parse the match type
    logging.debug(response.url)
    sch_type = response.css("div#m_id").extract()
    type_len = len(sch_type)
    if type_len == 0:
        # no type available
        scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
        logging.debug(scheduleInfo)
        scheduleInfo["sch_type"] = "无"
        scheduleInfo["id"] = response.meta["scheduleInfoObj"]["id"] + "_0"
        yield scrapy.Request(url=buildRandomUrl(response.url),
                             headers=self.headers,
                             meta={
                                 'cookiejar': 1,
                                 "scheduleInfoObj": scheduleInfo
                             },
                             callback=self.parse_group)
    else:
        for type in sch_type:
            #
            scheduleInfo = copy.deepcopy(response.meta["scheduleInfoObj"])
            logging.debug(scheduleInfo)
            # logging.debug("sch_type html: " + type)
            sch_type_sel = Selector(text=type)
            name = sch_type_sel.css("a::text").extract_first()
            if name == None or name == "" or name == "null":
                continue
            url = sch_type_sel.xpath("//a/@href").extract_first()
            scheduleInfo["sch_type"] = name
            scheduleInfo["id"] = response.meta["scheduleInfoObj"]["id"] + "_" + name
            #
            yield scrapy.Request(url=self.base_url + url,
                                 headers=self.headers,
                                 meta={
                                     'cookiejar': 1,
                                     "scheduleInfoObj": scheduleInfo
                                 },
                                 callback=self.parse_group)
def parse_html(self, message):
    utils.printf('%s:解析起始页开始...' % self.provider)
    conn = utils.init_db('mysql', 'ydylcnbook', 4)
    result = []
    stmt = 'insert ignore into book(bookid,cover_url) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for aTag in sel.xpath('//ul[@class="list-book-1"]/li/a'):
            bookid = aTag.xpath('./@href').extract_first().split('=')[-1]
            cover_url = aTag.xpath('./div/div/img/@src').extract_first()
            result.append((bookid, cover_url))
        utils.printf(len(result))
        utils.parse_results_to_sql(conn, stmt, result)
        cnt += len(result)
        utils.printf(cnt)
    conn.close()
    utils.printf('%s:解析起始页完成...' % self.provider)
    self.senddistributefinish('startdown_list')
async def main():
    async with aiohttp.ClientSession() as session:
        response = await session.get(
            'https://kartochki-domana.com.ua/ru/product-category/podarochnie-nabori/'
        )
        html = await response.text()
        sel = Selector(text=html)
        prod_urls = sel.xpath('//h3[@class="product_title"]/a/@href').getall()
        for prod_url in prod_urls:
            if prod_url not in added_prod_urls:
                await queue.put(prod_url)
                added_prod_urls.add(prod_url)
        print(queue.qsize())
        # pprint(added_prod_urls)
        tasks = []
        for _ in range(50):
            task = asyncio.Task(worker(session))
            tasks.append(task)
        await asyncio.gather(*tasks)
def _find_match(self, sel: Selector) -> Match:
    xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
    item = Match()
    item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
    item['id'] = (re.findall('matches/(\d+)', item['url']) or [None])[0]
    item['game'] = next((g for g in self.games if g in item['url'].lower()))
    item['time'] = xpath("td[@class='status']/span/text()")
    item['time_secs'] = time_to_seconds(item['time'])
    item['timestamp'] = int((datetime.now() + timedelta(seconds=item['time_secs'])).timestamp())
    item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
    item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
    item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
    item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
    item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    scores = sel.css('.score::text').extract()
    item['t1_score'] = scores[0] if scores else None
    item['t2_score'] = scores[1] if len(scores) > 1 else None
    return item
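# Standalone illustration of the extract_first(default='') pattern wrapped by the xpath
# lambda above: missing nodes fall back to '' instead of None (the HTML is invented).
from parsel import Selector

row = Selector(text='<td class="status"><span> LIVE </span></td>')
print(row.xpath("//td[@class='status']/span/text()").extract_first(default='').strip())  # 'LIVE'
print(row.xpath("//td[@class='missing']/text()").extract_first(default=''))              # ''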
if __name__ == '__main__':
    print("in main")
    totalWeeks = []
    totalPosts = []
    url = 'http://www.businessweek.com/archive/news.html#r=404'
    data = urllib.request.urlopen(url).read()
    data = data.decode("utf-8")
    sel = Selector(text=data)
    months = sel.xpath('//ul/li/a').re(r'http://www.businessweek.com/archive/\d+-\d+/news.html')
    # admittMonths = 12*(2015-1991) + 8
    m = []
    for i in months:
        m.append([i])
    totalWeeks = []
    pool = Pool(8)
    totalWeeks = pool.map(mon, m)
    totalWeeks = [ent for sublist in totalWeeks for ent in sublist]
    print(len(totalWeeks))
    # club = [ent for sublist in totalWeeks for ent in sublist]
    # print(len(club))
    club = [ent for sublist in totalWeeks for ent in sublist]
    print(len(club))
    d = []
    for i in club:
        pass
    except urllib.error.HTTPError:
        pass
    except timeout:
        pass
    else:
        fail.append(s[i])
        print("failed to retrieve info from ", s[i], i)
        flag = True
    if flag == True:
        pass
    else:
        clap = response.read()
        clap = clap.decode("utf-8")
        h = Selector(text=clap)
        date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract()
        if date:
            pass
        else:
            date = h.xpath('//meta[@content][@name="parsely-pub-date"]/@content').extract()
        key = h.xpath('//meta[@content][@name="keywords"]/@content').extract()
        info = h.xpath('//div[@id = "article_body"]/p//text()').extract()
        if not info:
            info = h.xpath('//div[@class = "article-body__content"]/p//text()').extract()
        if len(info) > 1:
            info = ' '.join(str(r) for r in info)
            info = info.replace(u"\xa0", u" ")
        if "T" in date[0]:
            date, t = date[0].split('T')
        else:
            date = date[0]
def test_make_links_absolute(self):
    text = u'<a href="file.html">link to file</a>'
    sel = Selector(text=text, base_url='http://example.com')
    sel.root.make_links_absolute()
    self.assertEqual(u'http://example.com/file.html',
                     sel.xpath('//a/@href').extract_first())
def find_history(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find recent matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//h2[contains(text(),'Recent')]/..//tr"))
def find_matches(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find live and upcoming matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//table[@id='gb-matches']//tr"))