def apply_selector_annotations(annotations, target_page):
    page = Selector(text=target_page)
    converted_annotations = []
    annotations = _merge_annotations_by_selector(annotations)
    for annotation in annotations:
        if not annotation.get('selector'):
            accepted_elements = set(
                chain(*[[elem.root for elem in page.css(sel)]
                        for sel in annotation.get('accept_selectors', []) if sel])
            )
            rejected_elements = set(
                chain(*[[elem.root for elem in page.css(sel)]
                        for sel in annotation.get('reject_selectors', []) if sel])
            )
            elems = accepted_elements - rejected_elements
        else:
            elems = [elem.root for elem in page.css(annotation['selector'])]
        if elems:
            tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems]
            tagid = min(tagids)
            if tagid is not None:
                annotation['tagid'] = tagid
                converted_annotations.append(annotation)
    return converted_annotations
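# A minimal, self-contained sketch of the accept/reject set-difference logic used
# above, calling parsel directly. The HTML, the selectors and the data-tagid values
# are made up for illustration only; recent parsel/Scrapy Selectors expose the
# underlying lxml element as `.root` (older versions used `_root`).
from itertools import chain
from parsel import Selector

html = ('<div data-tagid="1">'
        '<p class="a" data-tagid="2">keep</p>'
        '<p class="a b" data-tagid="3">reject</p>'
        '</div>')
page = Selector(text=html)
accepted = set(chain(*[[el.root for el in page.css(s)] for s in ['p.a'] if s]))
rejected = set(chain(*[[el.root for el in page.css(s)] for s in ['p.b'] if s]))
elems = accepted - rejected
# The smallest data-tagid among the surviving elements is kept, as in the function above.
print(min(int(e.attrib.get('data-tagid', 1e9)) for e in elems))  # -> 2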
def parse_item(self, response):
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    # The even and odd rows were handled by two identical loops; a grouped CSS
    # selector covers both kinds of rows in one pass.
    for site in sel.css('table.tablelist tr.even, table.tablelist tr.odd'):
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()[0]
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()[0]
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    info('parsed ' + str(response))
    return items
def parse_podcastlist(self, response):
    """Extract podcast name and url from the list of podcasts"""
    sel = Selector(response)
    urls = sel.css("div#selectedcontent div ul li a::attr(href)").extract()
    names = sel.css("div#selectedcontent div ul li a::text").extract()
    for url, name in zip(urls, names):
        _id = get_id_from_url(url)
        item = ItunesItem(name=name, url=url, itunesId=_id)
        yield item
def apply_selector_annotations(annotations, target_page): page = Selector(text=target_page) converted_annotations = [] annotations = _merge_annotations_by_selector(annotations) for annotation in annotations: if not annotation.get('selector'): accepted_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('accept_selectors', []) if sel]) ) rejected_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('reject_selectors', []) if sel]) ) elems = accepted_elements - rejected_elements else: elems = [elem._root for elem in page.css(annotation['selector'])] if not elems: continue tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems] tagid = min(tagids) if tagid is not None: annotation['tagid'] = tagid converted_annotations.append(annotation) # Create container for repeated field annotation if (annotation.get('repeated') and not annotation.get('item_container') and len(annotation.get('annotations')) == 1): parent = _get_parent(elems, page) field = annotation['annotations'].values()[0][0]['field'] container_id = '%s#parent' % annotation['id'] if len(parent): converted_annotations.append({ 'item_container': True, 'id': container_id, 'annotations': {'#portia-content': '#dummy'}, 'text-content': '#portia-content', 'container_id': annotation['container_id'], 'field': field, 'tagid': parent.attrib.get('data-tagid') }) annotation['item_container'] = True annotation['field'] = field annotation['container_id'] = container_id return converted_annotations
def parseUpdate(self, strProjectUrl): #更新頁面的檔案名稱格式為: projectID + "_blog_" + pageIndex + ".html" #其中pageIndex從0開始,至少會有1個,直接判斷檔案是否存在來判斷有多少個分頁 projectID = getFileNameInUrl(strProjectUrl) dicUpdateResult = {} i = 0 while True: blogPageFilePath = self.__LOCAL_PAGE_PATH + projectID + self.__LOCAL_PAGR_BLOG_SUFFIXES + "_" + str(i) + self.__LOCAL_PAGE_EXTENSION if os.path.isfile(blogPageFilePath) == True: with open(blogPageFilePath, "rb") as file: strBlogPageSource = file.read() root = Selector(text = strBlogPageSource) updateElements = root.css(".content > .well.simple") for updateElement in updateElements: dicUpdateResult["strUrl"] = strProjectUrl #更新資訊標題 strUpdateTitle = updateElement.css("h2 > a::text").extract_first() dicUpdateResult["strUpdateTitle"] = strUpdateTitle #更新資訊內容 strUpdateContent = "" for x in updateElement.css(".blogpost-content *::text").extract(): strUpdateContent = strUpdateContent + purifyString(x) strUpdateContent = purifyString(strUpdateContent) dicUpdateResult["strUpdateContent"] = strUpdateContent #更新資訊日期 strUpdateDate = updateElement.css("h2 > small > time::attr(datatime)").extract_first() dicUpdateResult["strUpdateDate"] = strUpdateDate i = i+1 else: break self.__lstUpdateResult.append(dicUpdateResult)
def parse_alpha(self, response):
    """ extract the alpha letters links"""
    sel = Selector(response)
    urls = sel.css("ul.alpha li a::attr(href)").extract()
    for url in urls:
        yield Request(url, callback=self.parse_page)
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div#content div.article div.topic-content')
    item_loader = ItemLoader(item=HouseRentingDoubanItem(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='table.infobox *::text')
    item_loader.add_css(field_name='title', css='div#content > h1:first-child::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='h3 span.from a::text')
    item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
    item_loader.add_css(field_name='author_link', css='h3 span.from a::attr(href)')
    item_loader.add_css(field_name='content', css='div.topic-content div#link-report *::text',
                        re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='h3 span:last-child::text',
                        re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')
    yield item_loader.load_item()
def parse(self, response): """ Extract the main genres""" sel = Selector(response) selector = "div#genre-nav div ul li a.top-level-genre::attr(href)" urls = sel.css(selector).extract() for url in urls: yield Request(url, callback=self.parse_alpha)
def parse(self, response):
    province_block_headers = response.css(
        "#LiveAccordionWrapper1053 h3 a.LA-ui-accordion-header").extract()
    for province_block_header in province_block_headers:
        province_block_headers_selector = Selector(text=province_block_header)
        province_block_id = str.join('', province_block_headers_selector.css("::attr('href')").extract())
        province_name = str.join('', province_block_headers_selector.css("::text").extract())
        province_block = response.css(province_block_id)
        rows = province_block.css('table.tg tr')
        for row in rows:
            row_text = str.join('', row.css('::text').extract())
            if 'provincia' in row_text.lower():
                province_name = row_text.lower().replace('provincia :', '')
                continue
            if 'circ' in row_text.lower():
                province_name = row_text.lower().replace('provincia :', '')
                continue
            if 'senador' in row_text.lower():
                position = "senador"
                continue
            if 'diputado' in row_text.lower():
                position = "diputado"
                continue
            if row.css('.tg-5mgg'):
                party_header = row.css('.tg-5mgg')
                party_header_colspan = str.join('', party_header.css("::attr('colspan')").extract())
                print(party_header_colspan)
                if (party_header_colspan == '4'):
                    party = str.join('', party_header.css('td.tg-5mgg::text').extract())
                else:
                    party = str.join('', party_header.css('::text').extract())
                continue
            candidate_name = row_text
            print(province_name)
            print(party)
            print(position)
            print(candidate_name)
            yield self.create_item(province_name, party, position, candidate_name)
def parse_exercise(self, response): try: muscle_primary_selector = Selector( text=response.css('div.profile-info-value').extract()[4]) except Exception: muscle_primary_selector = Selector(text='') try: muscle_additional_selector = Selector( text=response.css('div.profile-info-value').extract()[5]) except Exception: muscle_additional_selector = Selector(text='') try: male_first_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[0]) except Exception: male_first_image = '' try: male_second_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[1]) except Exception: male_second_image = '' try: female_first_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[2]) except Exception: female_first_image = '' try: female_second_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[3]) except Exception: female_second_image = '' yield { 'name': response.css('div.page-header h1::text').extract()[0], 'rules': response.css('div ol li::text').extract(), 'm': 'Растяжка', 'mp': muscle_primary_selector.css('div::text').extract(), 'ma': muscle_additional_selector.css('div::text').extract(), 'url': response.url, 't': '', '1m': self.image_base_url + male_first_image, '2m': self.image_base_url + male_second_image, '1w': self.image_base_url + female_first_image, '2w': self.image_base_url + female_second_image, 'i': '', }
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div.main-wrap')
    item_loader = ItemLoader(item=HouseRenting58Item(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='div.house-title > h1::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='div.house-basic-info div.house-agent-info p.agent-name > a::text')
    item_loader.add_css(field_name='image_urls', css='div.basic-pic-list > ul > li > img::attr(data-src)',
                        re=r'(.*)\?.*')
    item_loader.add_css(field_name='author_link', css='div.house-basic-info div.house-agent-info p.agent-name > a::attr(href)')
    item_loader.add_css(field_name='content', css='ul.introduce-item *::text')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='p.house-update-info::text')
    item_loader.add_css(field_name='price', css='div.house-pay-way *::text')
    item_loader.add_css(field_name='detail', css='div.house-desc-item > ul > li > span::text')
    yield item_loader.load_item()
def parse(self, response):
    sel = Selector(response)
    cities = sel.css("#report1 tr")
    for city in cities:
        item = CityItem()
        item["id"] = city.css(":nth-child(1)::text").extract()
        item["name"] = city.css(":nth-child(2)::text").extract()
        item["date"] = city.css(":nth-child(3)::text").extract()
        item["AQI"] = city.css(":nth-child(4)::text").extract()
        item["level"] = city.css(":nth-child(5)::text").extract()
        item["prime"] = city.css(":nth-child(6)::text").extract()
        yield item
    next_page = int(sel.css("#report1 tr:nth-last-child(2) input:first-child::attr(value)")[0].extract()) + 1
    total_page = int(sel.css("#report1 tr:nth-last-child(2) td:first-child font::text")[1].extract())
    if next_page <= total_page:
        yield scrapy.Request(
            url="http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=&startdate="
                + self.month_before_yesterday + "&enddate=" + self.yesterday
                + "&page=" + str(next_page),
            callback=self.parse)
def parse_page(self, response): """ Extract the paginate numbers links """ sel = Selector(response) selector = ("ul.paginate li a:not(a.paginate-more)" ":not(a.paginate-previous)" "::attr(href)") urls = sel.css(selector).extract() self.parse_podcastlist(response) for url in urls: yield Request(url, callback=self.parse_podcastlist)
def parseStartupFollowersToJson(self, strUrl):
    lstStrFollowers = []
    strStartupFollowersFilePath = spiderForAngellist.getFollowersLocalFilePath(
        strUrl, self.__strDate, self.__strCategory, self.__strSubCategory)
    if(os.path.isfile(strStartupFollowersFilePath)):
        with open(strStartupFollowersFilePath, "rb") as file:
            # Read the locally cached page into a string
            strPageSource = file.read()
        root = Selector(text=strPageSource)
        lstStrFollowers = root.css('div.text > div.name > a::text').extract()
    return lstStrFollowers
def port_sample(sample): """Convert slybot samples made before slybot 0.13 to new format.""" if not sample.get("annotated_body"): if not sample.get("plugins"): sample["plugins"] = {"annotations-plugin": {"extracts": []}} return sample # Handle empty body if not sample.get("plugins"): sample["plugins"] = load_annotations(sample.get("annotated_body", u"")) del sample["annotated_body"] # Group annotations by type annotations = sample["plugins"]["annotations-plugin"]["extracts"] try: sel = Selector(text=add_tagids(sample["original_body"])) except KeyError: annotated = sample["annotated_body"] sample["original_body"] = annotated sel = Selector(text=add_tagids(annotated)) annotations = port_standard(annotations, sel, sample) standard_annos, generated_annos, variant_annos = [], [], [] for a in annotations: if a.get("generated"): generated_annos.append(a) elif a.get("variants", 0) > 0: variant_annos.append(a) else: standard_annos.append(a) if not annotations: return sample new_annotations = [] a = find_element(annotations[0], sel) for b in annotations[1:]: b = find_element(b, sel) a = find_common_parent(a, b) container_id = gen_id() parent = a.getparent() new_annotations.append(_create_container(a if parent is None else parent, container_id, selector=sel)) for a in standard_annos: a.pop("variant", None) new_annotations.extend(standard_annos) new_annotations.extend(port_generated(generated_annos, sel)) new_annotations.extend(port_variants(variant_annos, sel)) for a in new_annotations: if not (a.get("item_container") and a.get("container_id")): a["container_id"] = container_id tagid = a.pop("tagid", None) or a.pop("data-tagid", None) elems = sel.css(a["selector"]) elem = elems[0].root # Update annotations sample["plugins"]["annotations-plugin"]["extracts"] = new_annotations sample["version"] = SLYBOT_VERSION return sample
async def get_film(self, url):
    source = await html_source(url)
    bt_url = re.findall(r'href="(attach-dialog-fid-.*\.htm)"', source)
    selector = Selector(text=source)
    film_name = selector.re(r'\[BT下载\].*B\b')
    film_name = film_name[0] if film_name else ''
    bt_name = selector.css('td:nth-child(1) > a::text').extract_first()
    if film_name and bt_name:
        try:
            bt_url = bt_url[0].replace('dialog', 'download')
        except IndexError as e:
            bt_url = ''
    return film_name, bt_name, self.domain + bt_url
def parse(self, response):
    if response.css("div.content-base>section>div").extract_first() is None:
        return
    div = re.sub(">\s*<", "><", response.css('div.content-base>section>div').extract_first())
    div = re.sub("[\s]{2,}", "", div)
    div = re.sub(">\s*/\s<*", "><", div)
    div = re.sub(">\s*:\s<*", "><", div)
    rows = Selector(text=div).css('table>tbody>tr').extract()
    for row in rows:
        s = Selector(text=row)
        content = s.css('td::text').extract()
        if content[2] == "Computer":
            ua = s.css('td.useragent>a::text').extract_first()
            yield {'useragent': ua}
    page = Selector(text=div).css('#pagination>a').extract()
    url = Selector(text=page[-2]).css('::attr(href)').extract_first()
    if Selector(text=div).css('#pagination>span.current::text').extract_first() == '10':
        return
    yield scrapy.Request(url=response.urljoin(url), callback=self.parse)
async def get_page_item(self, page_num: int):
    """
    :param page_num: get_page_url
    :return:
    """
    item_url = self.base_url.format(genre=self.genre, page_num=page_num)
    content = await self.get_html_content(item_url)
    selector = Selector(text=content)
    urls = list(set(selector.css('#maincontent a::attr(href)').extract()))
    page_items = (url for url in urls
                  if url.startswith('http://www.meizitu.com/a/'))
    for item in page_items:
        await self.get_item(item)
def parse_page(self, body): sel = Selector(text=body) # chuyển từ text sang Selector table = sel.css('.dxgvTable_SisTheme') data_rows = table.css('.dxgvDataRow_SisTheme') number_subjects = len(data_rows) with open('sis2.json', 'a') as f: for i in range(number_subjects): data = data_rows[i].css('.dxgv::text').getall() btn_collapse = self.driver.find_elements_by_class_name( 'dxGridView_gvDetailCollapsedButton_SisTheme') if i == 0 or i == 1: btn_collapse[0].click() else: btn_collapse[i - 1].click() sleep(0.5) detail = Selector(text=self.driver.page_source).css( '.dxgvDetailCell_SisTheme b::text').getall() if len(detail) == 3: condition_subject = None english_name = detail[0] short_name = detail[1] faculity = detail[2] else: condition_subject = detail[0] english_name = detail[1] short_name = detail[2] faculity = detail[3] json_row = { 'ma_hoc_phan': data[0], 'ten_hoc_phan': data[1], 'thoi_luong': data[2], # thoi luong 'so_tin_chi': data[3], # so tin chi 'tc_hoc_phi': data[4], # tin chi hoc phi 'trong_so': data[5], 'hoc_phan_dieu_kien': condition_subject, 'ten_tieng_anh': english_name, 'ten_viet-tat': short_name, 'vien_quan_ly': faculity } f.write(json.dumps(json_row, ensure_ascii=False)) f.write('\n') f.close()
def parse(self, response): """ `parse` should always `yield` Meeting items. Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping needs. """ # Create even section groupings for each meeting, even though committees are in same el sections = [] for idx, section_group in enumerate(response.css(".list-items")[1:3]): if idx == 0: sections.append(section_group) else: section_split = section_group.extract().split("<hr>") for split in section_split: sections.append(Selector(text=split)) for section in sections: self._validate_location(section) title = self._parse_title(section) classification = self._parse_classification(title) year_match = re.search( r"\d{4}", " ".join(section.css("h2 *::text").extract())) if not year_match: continue year_str = year_match.group() for split_text in section.extract().split("<br>"): item = Selector(text=split_text) item_text = re.sub(r"\s+", " ", " ".join( item.css("*::text").extract())).strip() start = self._parse_start(item_text, year_str) if not start: continue meeting = Meeting( title=title, description="", classification=classification, start=start, end=None, all_day=False, time_notes= "Details may change, confirm with staff before attending", location=self.location, links=self._parse_links(item, response), source=response.url) meeting["status"] = self._get_status(meeting) meeting["id"] = self._get_id(meeting) yield meeting
def get_category_links(self): """获取职业类别链接""" print('开始爬取拉勾网首页的职业类别链接') category_links = set() url = 'https://www.lagou.com/' headers = { 'Host': 'www.lagou.com', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'Referer': 'https://www.lagou.com/', 'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'), } response = requests.get(url, headers=headers) if response.status_code == 200: selector = Selector(text=response.text) categories = selector.css('.menu_box .menu_main .category-list') for category in categories: links = category.css('a::attr(href)') for link in links: category_link = link.extract() category_links.add(category_link) menu_subs = selector.css('.menu_sub.dn dd') for sub in menu_subs: links = sub.css('a::attr(href)') for link in links: category_link = link.extract() category_links.add(category_link) print(category_links) return category_links else: print('获取职业类别链接失败')
def parse_job(self, response): """Parse a joblink into a JobItem. """ s = Selector(response) item = JobItem() item["url"] = response.url item["site"] = "LandingJobs" item["title"] = s.css("h1::text").extract_first() item["text"] = s.xpath( '//section[@class="ld-job-details"]//text()').extract() item["text"].extend( s.xpath( '//section[@class="ld-job-offer-section"]//text()').extract()) yield item
def parse_category_page(self, response, category):
    video_list = response.css(
        "#contentHolder li .promoBlock .promoBlockWrap").getall()
    print(f"Found {len(video_list)} videos")
    for video in video_list:
        if isinstance(video, str):
            video = Selector(text=video)
        title = video.css(".text .textWrap h3 a::text").extract_first()
        title = title.strip().replace("\r", "").replace("\n", "")
        url = video.css(
            ".text .textWrap h3 a::attr('href')").extract_first()
        image = video.css(".image a img::attr('src')").extract_first()
        yield {
            "title": title,
            "author": "",
            "url": TheRsaSpider.base_url + url,
            "image": TheRsaSpider.base_url + image,
            "category": category,
            "source": TheRsaSpider.name
        }
def parse_stock_availability(self, body):
    sel = Selector(text=body)
    store_list = sel.css(".store-navigation .js-pickup-store-list")
    for store_data in store_list.css(".pickup-store-list-entry"):
        store_info = store_data.css(".js-select-store-label")
        store_name = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-name").get()
        store_address = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-address").get()
        store_city = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-city").get()
        stock_info = store_info.css(".store-availability .available").get()
        result = [store_name, store_address, store_city, stock_info]
        self.stock_availability += result
def parse(self, response): """Get the pagination links and hand them off. """ s = Selector(response) pagination = s.css(".pagination") pagelinks = pagination.xpath( '//a[contains(@href, "&page=")]/@href').extract() for pagelink in pagelinks: request = Request( urljoin(self.root, pagelink), callback=self.parse_jobspage, dont_filter=True, ) yield request
def parse_1(self, response):
    selector = Selector(response)
    items = []
    room = selector.css('#anchor-info')[0]
    item = DouyutestItem()
    # item = {}
    # Use relative XPath ("./...") so the queries are evaluated against `room`
    # instead of the document root.
    item['room_name'] = room.xpath('./div[2]/div[1]/h1/text()').extract_first()
    item['room_visitor'] = room.xpath('./div[2]/div[3]/ul/li[2]/div/div[2]/a/text()').extract_first()
    item['room_owner'] = room.xpath('./div[2]/div[3]/ul/li[1]/div/a/text()').extract_first()
    item['room_popularity'] = room.xpath('./div[2]/div[2]/dl/dd/a[2]/text()').extract_first()
    items.append(item)
    # yield item
    # print repr(item).decode("unicode-escape") + '\n'
    return items
def __parse_images(self, text):
    selector = Selector(text=text)
    for script in selector.css('script').getall():
        if 'init' in script:
            break
    else:
        raise ValueError("Script not found")
    match = scriptparser.search(script)
    if not match:
        raise ValueError("Script not parsed")
    fargs = match.group(1)
    fargs = '[' + fargs.replace("'", '"').strip() + ']'
    imgs_splited = json.loads(fargs)[0]
    imgs = [''.join(i[:3]) for i in imgs_splited]
    return imgs
def parse2(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    sel = Selector(text=jsonresponse['content'], type="html")
    links = sel.css('a.s-job-title::attr(href)').extract()
    for i in links:
        item = items.DmozItem()
        item["site"] = "secretcv"
        item["firma"] = i.replace("-", " ").split("/")[3]
        a = i.replace("-", " ").replace("is ilanlari", "").split("/")[4]
        a = " ".join(a.split())
        item["ilanAd"] = re.sub("\d+", "", a)
        item["ilanID"] = i.split("/")[4].split("-")[-1].split("i")[-1]
        # return item
        yield Request(i, self.parse3, meta={"item": item})
def get_products():
    # Extract product information
    html = browser.page_source  # Get the page source
    selector = Selector(text=html)  # Parse it with Scrapy's Selector
    items = selector.css('li.gl-item')
    for item in items:
        product = {
            'img_url': item.css('a[target="_blank"] img').extract_first(),
            'title': item.css('div.p-name a::attr(title)').extract_first(),
            'price': item.css('div div.p-price strong i::text').extract_first(),
            'shop': item.css('div.p-shop span a::attr(title)').extract_first()
        }
        print(product)
        save_to_mongo(product)  # Save to the database
def replay(self, response):
    title1 = response.meta['title1']
    s1 = Selector(response)
    topic_replay = s1.css(
        'ul.topic-reply li.clearfix div.bg-img-green h4 a::text,'
        'ul.topic-reply li.clearfix div.bg-img-green h4::text,'
        'ul.topic-reply li.clearfix p::text').extract()
    for x in topic_replay:
        # Strip whitespace, \t, \n and \r from both ends of x.
        x1 = x.strip(' \t\n\r')
        if x1 != '':
            self.topic_replay_end.append(x1)
    replay2 = ''.join(s1.xpath('//*[@id="comments"]//text()').extract())
    f = open("/Users/vivi/PycharmProjects/DoubanGroup/%s.txt" % title1, "a")
    print(replay2, file=f)
    f.close()
def parse_chouse_list(self, response):
    """Extract links to sold-house listings"""
    sel = Selector(response)
    # Lianjia sometimes returns no data for a given neighbourhood
    total = sel.css('.resultDes .total span::text').extract_first()
    total = int(total)
    if total > 0:
        # Extract listing links
        links = sel.css(
            ".listContent li .info .title a::attr(href)").extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_chouse_detail)
        # Follow pagination
        page_data = sel.css(
            ".house-lst-page-box::attr(page-data)").extract_first()
        page_data = json.loads(page_data)
        if page_data['curPage'] == 1 and page_data['totalPage'] > 1:
            price = response.url.replace(self.base_url + '/chengjiao/', '')
            for x in range(2, page_data['totalPage'] + 1, 1):
                url = self.base_url + '/chengjiao/' + 'pg' + str(x) + price
                yield scrapy.Request(url=url, callback=self.parse_chouse_list)
def parse_mypage(self, response): sel = Selector(response) item = CrawlSpiderItem() item['title'] = sel.xpath('//h1/text()').extract_first() item['price'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/span[1]/text()').extract_first() item['area'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[1]/text()').re('\d+')[0] item['house_type'] = \ sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[2]/text()').extract_first().split(' ')[0] item['floor'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[3]/text()').extract_first() item['house_head'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[4]/text()').extract_first() item['metro'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[5]/text()').extract_first() item['community'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[6]/a[1]/text()').extract_first() item['position'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[7]/a[1]/text()').extract_first() item['real_position'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[7]/a[2]/text()').extract_first() item['community_introduce'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[1]/span[2]/text()').extract_first() item['transportation'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[2]/span[2]/text()').extract_first() item['surround_facility'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[3]/span[2]/text()').extract_first() item['public_time'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[8]/text()').extract_first() item['publisher_name'] = sel.xpath( '/html/body/div[4]/div[2]/div[2]/div[3]/div/div[1]/a[1]/text()').extract_first() item['publisher_img_url'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/a/img/@src').extract_first() item['publisher_id'] = sel.xpath( '/html/body/div[4]/div[2]/div[2]/div[3]/div/div[1]/span/text()').extract_first() item['publisher_evaluate'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[1]/text()').re(':(.*)/')) item['evaluate_num'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[1]/a/text()').re(r'\d+')) item['publisher_with_checking'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[2]/text()').re(r"\d+")) item['phone_number'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[3]/text()').re('\d+')) item['lease'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[1]/text()').extract_first() item['pay_way'] = '-'.join( sel.xpath('//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[2]/text()').re(r'\w+')) item['house_state'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[3]/text()').extract_first() item['heating_method'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[4]/text()').extract_first() item['house_facility'] = [tem.strip() for tem in sel.css( '#introduction > div > div.introContent > div.feature > div.zf-tag > ul > li.tags::text').extract() if tem.strip()] # item['look_house_num'] = sel.xpath('//*[@id="record"]/div[2]/div[3]/span/text()').extract() item['pic_url'] = sel.xpath('//*[@id="topImg"]/div[2]/ul/li/img/@src').extract() yield item
def parse(self, response):
    t_selector = Selector(text=browser.page_source)
    all = t_selector.css("a::attr(href)").extract()
    all_urls = filter(lambda x: True if x.startswith("https") else False, all)
    for url in all_urls:
        match_obj = re.match("(.*www.lagou.com/(zhaopin|jobs)/).*", url)
        if match_obj:
            request_url = match_obj.group()
            yield scrapy.Request(request_url, headers=self.headers, callback=self.do_items)
        else:
            # Not a job page, so just follow the link for further crawling
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
def is_cached_response_fresh(self, response, request):
    if super().is_cached_response_fresh(response, request):
        try:
            body = gunzip(response.body)
        except OSError:
            body = response.body
        h = HtmlResponse(url=response.url, body=body)
        s = Selector(h)
        company_name = s.css("h2 > span:first-child::text").extract()
        return company_name and company_name[0].strip()
    else:
        return False
def parse(self, resp):
    sel = Selector(resp)
    for link in sel.css(".viewMore a"):
        txt = link.xpath("text()").extract_first().replace(
            "View all ", "").strip()
        href = link.xpath("@href").extract_first()
        req = Request(
            resp.urljoin(href),
            callback=self.parse_data_page
        )
        req.meta["title"] = txt
        yield req
def parse(self, response):
    print('ttttttt', response.url)
    sel = Selector(response)
    num = sel.css(
        'div#table-pagination ::attr(data-number)').extract_first()
    next_num = int(num) + 2
    if int(num):
        next_url = re.sub('(\d+)\.', str(next_num) + ".", response.url)
    else:
        next_url = re.sub('.jhtml', '-2.jhtml', response.url)
    # next url
    yield Request(url=next_url, priority=1, callback=self.parse)
async def extract_user_pageing_api(self, ResText, webpage_url): try: selector = Selector(text=ResText) except TypeError: return None output = [] for article in selector.css("li[data-articleid]"): ele = dict() ele['vid'] = article.css('::attr(data-articleid)').extract_first() ele['webpage_url'] = f"https://www.xinpianchang.com/a{ele['vid']}?from=UserProfile" ele['cover'] = article.css( 'img[class*="lazy-img"]::attr(_src)').extract_first() ele['upload_ts'] = self.string2timestamp(string=article.css( '.video-hover-con p[class*="fs_12"]::text').extract_first(), format='%Y-%m-%d 发布') # ele['duration'] = self.format_duration(article.css('.duration::text').extract_first()) ele['duration'] = self.string2duration( string=article.css('.duration::text').extract_first(), format="%M' %S''") ele['description'] = self.format_desc( article.css('.desc::text').extract_first()) ele['playlist_url'] = webpage_url ele['title'] = self.format_desc( article.css('.video-con-top p::text').extract_first()) ele['category'] = self.format_category( article.css('.new-cate .c_b_9 ::text').extract()) ele['view_count'] = self.format_count( article.css('.icon-play-volume::text').extract_first()) ele['like_count'] = self.format_count( article.css('.icon-like::text').extract_first()) ele['role'] = article.css('.user-info .role::text').extract_first() ele['from'] = self.from_ output.append(ele) else: has_more = selector.css( "li[data-more]::attr(data-more)").extract_first() return output, has_more, {}
def crawl_ips(self): headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"} for i in range(1, 5): response = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) selector = Selector(text=response.text) all_trs = selector.css("#ip_list tr") ip_list = [] for tr in all_trs[1:]: ip = tr.css("td::text")[0].extract() port = tr.css("td::text")[1].extract() # print(tr.css("td:nth-child(4) > a::text").extract()) if [] == tr.css("td:nth-child(4) > a::text").extract(): server_address = '' else: server_address = tr.css("td:nth-child(4) > a::text").extract()[0] anonymous = tr.css("td::text")[4].extract() ip_type = tr.css("td::text")[5].extract() speed = tr.css("td:nth-child(7) > div::attr(title)").extract()[0] speed = re.sub("[^0-9\.]", "", speed) # con_time = tr.css("td:nth-child(8) > div::attr(title)").extract()[0] con_time = re.sub("[^0-9\.]", "", con_time) # alive_time = tr.css("td::text")[10].extract() check_time = tr.css("td::text")[11].extract() status = 1 now_time = int(time.time()) # print(check_time) # print(ip, port, server_address, anonymous) ip_list.append((ip, port, server_address, anonymous, ip_type, speed, con_time, alive_time, check_time, status, now_time, now_time)) # insert into database for ip_info in ip_list: insert_sql = ''' insert into proxy_ip (ip, port, server_address, anonymous, type, speed, con_time, alive_time, check_time, status, created_at, updated_at) values('{0}', {1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', {9}, {10}, {11})'''.format( ip_info[0], ip_info[1], ip_info[2], ip_info[3], ip_info[4], ip_info[5], ip_info[6], ip_info[7], ip_info[8], ip_info[9], ip_info[10], ip_info[11]) # print(insert_sql) # return cursor.execute(insert_sql) db.commit() print("insert ip list over " + str(i) + " pages") print("insert ip list end @@@@")
def get_id():
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1, 3):
        req = requests.get("https://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        # print(req.text)
        sel = Selector(text=req.text)
        raw = (sel.css('#ip_list tr'))
        for ip_raw in raw[1:]:
            tdcss = ip_raw.css('td')
            ip = tdcss[1].css('::text').extract()[0]
            port = tdcss[2].css('::text').extract()[0]
            print("{0}:{1}".format(ip, port))
def parse(self, resp):
    hxs = Selector(resp)
    for row in hxs.css("div.detail_list > ul > li"):
        item = MangaChapterItem()
        cells = row.xpath("span")
        if not cells:
            continue
        try:
            item['name'], item['link'] = extract_link(cells[0].xpath("a"))
            item['date'] = self.parsedate(
                cells[-1].xpath('text()').extract()[0])
            yield item
        except IndexError:
            pass
def clean_content(self, content):
    # <a class="headerlink" href="#check" title="Permalink to this headline">¶</a>
    # the headline link carries its own pilcrow glyph
    content = content.replace(u'>\xb6<', u'><')
    # selenium LanguagePreference
    sel = Selector(text=content)
    # content = content.replace(sel.css('div#codeLanguagePreference').extract_first(), '')  # may be None
    for div in sel.css('div#codeLanguagePreference').extract():
        content = content.replace(div, '')
    for lang in ['java', 'csharp', 'ruby', 'php', 'perl', 'javascript']:
        for div in sel.css('div.highlight-%s' % lang).extract():
            # print len(content)
            content = content.replace(div, '')
    # liaoxuefeng comment
    content = content.replace('<h3>Comments</h3>', '')
    content = content.replace('<h3>Make a comment</h3>', '')
    # http://lxml.de/
    for div in sel.css('div.sidemenu').extract():
        content = content.replace(div, '')
    return content
def apply_selector_annotations(annotations, target_page): page = Selector(text=target_page) converted_annotations = [] annotations = _merge_annotations_by_selector(annotations) for annotation in annotations: if not annotation.get('selector'): accepted_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('accept_selectors', []) if sel])) rejected_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('reject_selectors', []) if sel])) elems = accepted_elements - rejected_elements else: elems = [elem._root for elem in page.css(annotation['selector'])] if elems: tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems] tagid = min(tagids) if tagid is not None: annotation['tagid'] = tagid converted_annotations.append(annotation) return converted_annotations
def parse_start_url(self, response):
    print('parse_start_url --------> ' + response.url)
    self.page = 1
    self.first_page_url = response.url
    self.play_num = []
    self.play_num = get_album_simple(self.first_page_url)
    sel = Selector(response=response)
    ls = sel.css('.pagingBar_page::text').extract()
    if '下一页' in ls:
        self.pages = int(ls[-2])
        r = requests.get(self.first_page_url + str(self.pages), headers=self.headers)
        if r.status_code == 200:
            # `r` is a requests response, so build the Selector from its text
            sel = Selector(text=r.text)
            self.last_page_album_count = sel.css(
                '.discoverAlbum_item').extract().__len__()
            self.page_album_count = 12
    else:
        self.pages = 1
        r = requests.get(self.first_page_url, headers=self.headers)
        sel = Selector(text=r.text)
        self.page_album_count = sel.css(
            '.discoverAlbum_item').extract().__len__()
def parse_page(self, response): """ Parses one page of the forum """ for bp in response.css(".blockpost"): bp_selector = Selector(text=bp.extract()) message = "".join(bp_selector.xpath( "//div[@class='postmsg']/node()[not(local-name() = 'div' and @class='postsignature') and not(local-name() = 'p' and @class='postedit')]" ).extract()).strip() signature = bp_selector.css(".postsignature").xpath("p/node()").extract() modification = bp_selector.css(".postedit").extract() if len(modification) > 0: s = modification[0] modification = str(compute_date(s[s.find("(")+1:s.find(")")])) else: modification = False author_link_list = bp_selector.xpath("//strong/a/@href").extract() post = Post( author=bp_selector.xpath("//strong/a/text()").extract()[0] if len(author_link_list) > 0 else bp_selector.xpath("//strong/text()").extract()[0], author_id=extract_identifier(author_link_list[0]) if len(author_link_list) > 0 else None, number=int(bp_selector.xpath("//h2/span/span/text()").extract()[0][1:]), datetime=str(compute_date(bp_selector.xpath("//h2/span/a/text()").extract()[0])), content=message, signature="".join(signature).strip() if len(signature) > 0 else False, modification=modification, thread=extract_identifier(response.request.url) ) yield post
def get_mm_label():
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'  # avoid garbled text
    index = 500
    if r.status_code == 200:
        # `r` is a requests response, so build the Selector from its text
        sel = Selector(text=r.text)
        label_lists = sel.css('.tag ul li a').extract()
        for x in label_lists:
            s = Selector(text=x)
            item = MMLabel()
            item["mml_cover"] = s.css('a img::attr(src)').extract()[0]
            item["mml_label"] = s.css('a::text').extract()[0]
            item["mml_order"] = index
            item.meta.id = item['mml_order']
            print('get label ---> ' + item['mml_label'])
            try:
                rs = search.query("term", mml_order=item["mml_order"]).execute()
                if len(rs) <= 0:
                    item.save()
            except Exception as e:
                print(e.__cause__)
            index -= 1
def get_page_for(number, cj):
    url = "http://www.haushalt.fm.nrw.de/grafik/ajax.php"
    suffix = ""
    if number:
        suffix = "?selection={}".format("+".join(number.split()))
    response = requests.get(url + suffix, cookies=cj)
    s = Selector(text=response.text)
    results = {}
    for tr in s.css("tr:not(:first-child)"):
        id = tr.css("::attr(id)").extract_first()
        name = tr.css(".col3 ::text").extract_first()
        results[id] = name
        if len(id) < len("n n n"):
            results = {**results, **get_page_for(id, cj)}
    return results
def parse_job(self, response): """Parse a joblink into a JobItem. """ s = Selector(response) item = JobItem() item["url"] = response.url item["site"] = "VirtualVocations" item["title"] = s.css("h1::text").extract_first() item["text"] = s.xpath('//div[@id="job_details"]//text()').extract() try: posted = s.xpath('//div[@class="col-sm-6"]/p/text()')[8].extract() item["date_posted"] = parse_date(posted).isoformat() except Exception as e: self.logger.error(e) yield item
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div#content div.article div.topic-content')
    item_loader = ItemLoader(item=HouseRentingDoubanItem(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='table.infobox *::text')
    item_loader.add_css(field_name='title', css='div#content > h1:first-child::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='h3 span.from a::text')
    # item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
    item_loader.add_css(field_name='author_link', css='h3 span.from a::attr(href)')
    item_loader.add_css(field_name='content', css='div.topic-content div#link-report *::text',
                        re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='h3 span:last-child::text',
                        re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')
    yield item_loader.load_item()
def _parse_docx(self, attachment): items = [] docx_bytes = BytesIO(attachment) docx_str = "" with ZipFile(docx_bytes) as zf: for zip_info in zf.infolist(): if zip_info.filename == "word/document.xml": with zf.open(zip_info) as docx_file: docx_str = StringIO(docx_file.read().decode()) if not docx_str: return # Remove MS Word namespaces on tags to use selectors sel = Selector(text=docx_str.getvalue()) sel.remove_namespaces() year_str = "".join([ p.strip() for p in sel.css("tbl > tr")[:1].css("tc:first-of-type") [:1].css("*::text").extract() if p.strip() ]) for table in sel.css("tbl"): month_str = "".join([ p.strip() for p in table.css("tr")[1:2].css("tc:first-of-type") [:1].css("*::text").extract() if p.strip() ]).title() for cell in table.css("tc > p"): cell_str = re.sub( r"((?<=[\-–]) | (?=[\-–])|@)", "", re.sub(r"\s+", " ", " ".join(cell.css("*::text").extract())).strip(), ).strip() if (len(cell_str) <= 2 or (len(cell_str) > 2 and cell_str.startswith("201")) or not cell_str[0].isdigit()): continue items.append(self._parse_item(cell_str, month_str, year_str)) return items
def get_selector(html, field):
    """
    html : Selector or HtmlResponse
    execute the xpath config if existed in field
    """
    try:
        selector = html
        if not (isinstance(html, Selector) or isinstance(html, SelectorList)):
            selector = Selector(html)
        if contains(field, 'css'):
            selector = selector.css(field['css'])
        if contains(field, 'xpath'):
            # xpath can be a multiple
            selector = xpath(selector, field['xpath'])
        return selector
    except:
        return None
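# A hypothetical usage sketch of the css-then-xpath chaining that get_selector
# performs. `contains` and `xpath` above are project helpers not shown here, so
# this sketch uses parsel directly; the field config values are made up.
from parsel import Selector

sel = Selector(text='<div class="quote"><span class="text">hello</span></div>')
field = {'css': 'div.quote', 'xpath': './span[@class="text"]/text()'}
print(sel.css(field['css']).xpath(field['xpath']).get())  # -> 'hello'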
def parse(self, response): """ Parses the http://forum.ubuntu-fr.org page for forums """ for bt in response.css(".blocktable"): bt_selector = Selector(text=bt.extract()) category = bt_selector.xpath("//h2/span/text()").extract()[0] if category in excluded_categories: continue for tr in bt_selector.xpath("//tbody/tr"): tr_selector = Selector(text=tr.extract()) description = tr_selector.css(".forumdesc").xpath("text()").extract() link = tr_selector.xpath("//a/@href").extract()[0] identifier = extract_identifier(link) forum = Forum( identifier=identifier, name=tr_selector.xpath("//h3/a/text()").extract()[0], url=make_url(link), category=category, description=description[0] if len(description) > 0 else None, parent=None ) subforum_names = tr_selector.xpath("//div/a/text()").extract() + tr_selector.xpath("//div/a/strong/text()").extract() subforum_links = tr_selector.xpath("//div/a/@href").extract() subforums = [Forum( identifier=extract_identifier(link), name=name, url=make_url(link), category=category, description=None, parent=identifier ) for name, link in zip(subforum_names, subforum_links)] forums = [forum] + subforums for forum in forums: yield forum
def parseProjectListPage(self, strTypePageUrl):
    # From the project list page, extract whether each project was funded successfully plus its summary
    projectListFilePath = self.getProjectListFilePath(strTypePageUrl)
    strFileListPageSource = None
    with open(projectListFilePath, "rb") as file:
        strFileListPageSource = file.read()
    root = Selector(text=strFileListPageSource)
    lstProjectItem = root.css(".portfolio-item-wrapper")
    for projectItem in lstProjectItem:
        strUrl = projectItem.css(".portfolio-thumb > a[href*='project']::attr(href)").extract_first()
        strID = getFileNameInUrl(strUrl)
        strDescription = projectItem.css(".portfolio-thumb > a > .portfolio-zoom::text").extract_first()
        strDescription = purifyString(strDescription)
        intStatus = 0
        successItem = projectItem.css(".ribbon-green.rgreen")
        failedItem = projectItem.css(".ribbon-green.rblue")
        if(len(successItem) > 0):
            intStatus = 1
        elif(len(failedItem) > 0):
            intStatus = 2
        self.__dicProjectInfo[strID] = {"strDescription": strDescription, "intStatus": intStatus}
def parseStartupActivityPressToJson(self, strUrl):
    lstActivityPress = []
    strStartupActivityPressFilePath = spiderForAngellist.getActivityPressLocalFilePath(
        strUrl, self.__strDate, self.__strCategory, self.__strSubCategory)
    if(os.path.isfile(strStartupActivityPressFilePath)):
        with open(strStartupActivityPressFilePath, "rb") as file:
            # Read the locally cached page into a string
            strPageSource = file.read()
        root = Selector(text=strPageSource)
        lstDivActivityPress = root.css('div.startups-show-helpers.active')
        for divActivityPress in lstDivActivityPress:
            dicActivityPress = {}
            dicActivityPress['strUrl'] = strUrl
            dicActivityPress['strSourceUrl'] = divActivityPress.css('div.headline > a::attr(href)').extract_first()
            dicActivityPress['strSourceDomain'] = divActivityPress.css('div.type_and_actions > span.type::text').extract_first()
            dicActivityPress['strTitle'] = divActivityPress.css('div.headline > a::text').extract_first()
            dicActivityPress['strContent'] = divActivityPress.css('div.summary::text').extract_first()
            dicActivityPress['strDate'] = divActivityPress.css('div.timestamp > span::text').extract_first()
            lstActivityPress.append(dicActivityPress)
    self.__lstStartupActivityPressResult[strUrl] = lstActivityPress
def parseQnA(self, strProjectUrl):
    projectID = getFileNameInUrl(strProjectUrl)
    qaPageFilePath = self.__LOCAL_PAGE_PATH + projectID + self.__LOCAL_PAGR_QA_SUFFIXES + self.__LOCAL_PAGE_EXTENSION
    strQAPageSource = None
    if os.path.isfile(qaPageFilePath) == True:
        with open(qaPageFilePath, "rb") as file:
            # Read the locally cached page into a string
            strQAPageSource = file.read()
        root = Selector(text=strQAPageSource)
        lstQnaElement = root.css("#openQA + h2 + div .panel.panel-default")
        for qnaElement in lstQnaElement:
            dicQnaResult = {}
            dicQnaResult["strUrl"] = strProjectUrl
            # Q&A question
            strQnaQuestion = qnaElement.css(".panel-heading .panel-title > a::text").extract_first()
            dicQnaResult["strQnaQuestion"] = purifyString(strQnaQuestion)
            # Q&A answer
            strQnaAnswer = qnaElement.css(".panel-collapse > .panel-body::text").extract_first()
            dicQnaResult["strQnaAnswer"] = purifyString(strQnaAnswer)
            # Q&A answer date
            strQnaDate = qnaElement.css(".panel-collapse > .panel-body > small::text").extract_first()
            strQnaDate = purifyString(strQnaDate)
            dicQnaResult["strQnaDate"] = strQnaDate[5:len(strQnaDate)]
            self.__lstQnaResult.append(dicQnaResult)
def getAllRazorLinks(self, store_id):
    store_razor_link = self.db.get_one_store(store_id)[-1]
    self.browser.get(store_razor_link)
    time.sleep(5)
    t_selector = Selector(text=self.browser.page_source)
    item5lines = t_selector.css('.J_TItems .item5line1')
    for item5line in item5lines:
        items = item5line.css('.item .detail')
        for item in items:
            razor_dict = {}
            razor_link = item.css('a ::attr(href)').extract_first('')
            razor_name = item.css('a ::text').extract_first()
            razor_id = 'null'
            mbj = re.match('.*?id=(\d+).*', razor_link)
            if mbj:
                razor_id = mbj.group(1)
            razor_dict['razor_id'] = razor_id
            razor_dict['name'] = razor_name
            razor_dict['link'] = razor_link
            razor_dict['store_id'] = store_id
            razor_dict['date'] = datetime.now().date()
            self.db.add_one_razor(razor_dict)
# Scrapy grab div with multiple classes?
from scrapy import Selector

sel = Selector(text='<div class="product product-small">I am a product!</div>')
print(sel.css('.product').extract())
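# A single-class selector like '.product' already matches elements that also carry
# other classes; to require both classes at once, chain them with no space between
# them. A minimal illustration (the '.product-large' class is made up):
print(sel.css('.product.product-small').extract())  # matches: element has both classes
print(sel.css('.product.product-large').extract())  # no match: []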
def parseSyndicateToJson(self, strUrl, strSyndicateUrl): dicSyndicateResult = {}; strSyndicateFilePath = spiderForAngellist.getSyndicateLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) if(os.path.isfile(strSyndicateFilePath)): print("[parserForAngellist] Parsing " + strSyndicateFilePath) with open(strSyndicateFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) dicSyndicateResult['strUrl'] = strSyndicateUrl dicSyndicateResult['strCrawlTime'] = self.__strDate dicSyndicateResult['strName'] = root.css('div.gridspan.antialiased > h1::text').extract_first() dicSyndicateResult['strManager'] = root.css('div.managers > div.fixed_width.u-inlineBlock > div > a.u-uncoloredLink::text').extract() intTypicalInvestment = 0 fCarryPerDeal = 0.0 intBackerCount = 0 strTerms = root.css('ul.syndicate_terms > li::text').extract() for strTerm in strTerms: if "Total Carry Per Deal:" in strTerm: strCarryPerDeal = strTerm.replace('Total Carry Per Deal:', '').replace('%','').strip() fCarryPerDeal = float(strCarryPerDeal) dicSyndicateResult['fCarryPerDeal'] = fCarryPerDeal elif "Typical Investment:" in strTerm: # strTypicalInvestment = strTerm[strTerm.rfind('$')+1:].strip().replace(',', '') # intTypicalInvestment = int(strTypicalInvestment) # dicSyndicateResult['intTypicalInvestment'] = intTypicalInvestment # Use str instead strTypicalInvestment = strTerm[strTerm.rfind(':')+1:].strip() dicSyndicateResult['strTypicalInvestment'] = intTypicalInvestment elif "Backed by" in strTerm: strBackerCount = strTerm[strTerm.find('Backed by')+9:strTerm.find('Accredited Investor')].strip() intBackerCount = int(strBackerCount) dicSyndicateResult['intBackerCount'] = intBackerCount intBackedBy = 0 intDealsPerYear = 0 divSyndicateSummaryItems = root.css('ul.syndicate_summary > li') for divSyndicateSummaryItem in divSyndicateSummaryItems: strLabel = divSyndicateSummaryItem.css('div.syndicate_summary_label::text').extract_first().strip() if "Backed By" in strLabel: strBackedBy = divSyndicateSummaryItem.css('div.syndicate_summary_value::text').extract_first().strip() strCurrency = strBackedBy[:1] strBackedBy = strBackedBy[1:] if(strCurrency == u'$'): strCurrency = 'USD' elif (strCurrency == u'€'): strCurrency = 'EUR' intBase = 1 if(strBackedBy[-1:] == u'K'): intBase = 1000 strBackedBy = strBackedBy[:-1] elif(strBackedBy[-1:] == u'M'): intBase = 1000000 strBackedBy = strBackedBy[:-1] intBackedBy = int(locale.atof(strBackedBy.replace(",", "")) * intBase) dicSyndicateResult['strCurrency'] = strCurrency dicSyndicateResult['intBackedBy'] = intBackedBy elif "Expected Deals/Year" in strLabel: strDealsPerYear = divSyndicateSummaryItem.css('div.syndicate_summary_value::text').extract_first().strip() intDealsPerYear = int(strDealsPerYear) dicSyndicateResult['intDealsPerYear'] = intDealsPerYear lstStrBackers = root.css('div.gridspan > div.feature > figure > h3 > a.profile-link::text').extract() lstOverflowBackers = root.css('div.gridspan > ul.overflow > li > h4 > a.profile-link::text').extract() lstStrBackers.extend(lstOverflowBackers) dicSyndicateResult['lstStrBackers'] = lstStrBackers self.__lstSyndicateResult[strSyndicateUrl] = dicSyndicateResult
def parsePeopleToJson(self, strUrl): strObjectID = getFileNameInUrl(spiderForAngellist.getPureUrl(strUrl)) dicInvestorResult = {}; dicInvestorResult['strUrl'] = strUrl dicInvestorResult['strCrawlTime'] = self.__strDate strPeopleFilePath = spiderForAngellist.getPeopleLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) print("[parserForAngellist] Parsing " + strPeopleFilePath) if(os.path.isfile(strPeopleFilePath)): with open(strPeopleFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) strName = root.css("h1.js-name::text").extract_first().strip() dicInvestorResult['strName'] = strName lstDivInfo = root.css("div.tags > span.tag") strLocation = '' lstStrRole = [] for divInfo in lstDivInfo: if(divInfo.css('span.fontello-location.icon')): if(divInfo.css("::attr(title)") and (divInfo.css("::attr(title)").extract_first() != '')): strAllLocation = divInfo.css("::attr(title)").extract_first().strip() elif(divInfo.css("::attr(oldtitle)")): strAllLocation = divInfo.css("::attr(oldtitle)").extract_first().strip() else: strAllLocation = divInfo.css("::text").extract_first().strip() lstStrLocation = strAllLocation.split(',') lstStrLocation = map(unicode.strip, lstStrLocation) strLocation = lstStrLocation[0] elif(divInfo.css('span.fontello-tag-1.icon')): if(divInfo.css("::attr(title)") and (divInfo.css("::attr(title)").extract_first() != '')): strRole = divInfo.css("::attr(title)").extract_first().strip() elif(divInfo.css("::attr(oldtitle)")): strRole = divInfo.css("::attr(oldtitle)").extract_first().strip() else: strRole =divInfo.css("::text").extract_first().strip() lstStrRole = strRole.split(',') lstStrRole = map(unicode.strip, lstStrRole) dicInvestorResult['lstStrRole'] = lstStrRole dicInvestorResult['strLocation'] = strLocation dicLocation = self.parseLocation(strLocation) print("location parse complete") # strGeonameId = geonames.search(q=strLocation)[0]['geonameId'] # dicGeoname = geonames.get(strGeonameId) # bbox = dicGeoname['bbox'] # strCountry = dicGeoname['countryCode'] # strContinent = dicGeoname['continentCode'] # dicCity = geonames.findCity(north=bbox['north'], south=bbox['south'], east=bbox['east'], west=bbox['west'])[0] # strCity = dicCity['name'] dicInvestorResult['strCity'] = dicLocation['strCity'] dicInvestorResult['strCountry'] = dicLocation['strCountry'] dicInvestorResult['strContinent'] = dicLocation['strContinent'] intFollower = 0 if(root.css("a.followers_count.follow_link")): strFollower = root.css("a.followers_count.follow_link::text").extract_first().strip() strFollower = strFollower.split(' ')[0].replace(",", "") intFollower = int(strFollower) dicInvestorResult['intFollower'] = intFollower intFollowing = 0 if(root.css("a.following_count.follow_link")): strFollowing = root.css("a.following_count.follow_link::text").extract_first().strip() strFollowing = strFollowing.split(' ')[0].replace(",", "") intFollowing = int(strFollowing) dicInvestorResult['intFollowing'] = intFollowing lstStrMarket = [] lstStrMarketIndustry = [] lstAboutContent = root.css("div.s-grid0-colMd24.s-vgBottom2.field") for aboutContent in lstAboutContent: if(aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip() == 'Locations'): strLocation = aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip(); lstStrMarket = aboutContent.css('div.s-grid-colMd17 > div.item > div.module_taggings > div.content > div.value > span.tag > a::text').extract() 
elif(aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip() == 'Markets'): strMarket = aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip(); lstStrMarketIndustry = aboutContent.css('div.s-grid-colMd17 > div.item > div.module_taggings > div.content > div.value > span.tag > a::text').extract() dicInvestorResult['lstStrMarketIndustry'] = lstStrMarketIndustry dicInvestorResult['lstStrMarket'] = lstStrMarket lstExperience = [] lstDivExperience = root.css('div.feature.startup_roles.experience') for divExperience in lstDivExperience: dicExperienceResult = {} dicExperienceResult['strUrl'] = strUrl dicExperienceResult['strName'] = strName strCompany = divExperience.css('a.u-unstyledLink::text').extract_first().strip() dicExperienceResult['strCompany'] = strCompany strRole = divExperience.css('div.line > span.medium-font::text').extract_first().strip() dicExperienceResult['strRole'] = strRole lstExperience.append(dicExperienceResult) self.__lstExperinceResult[strUrl] = lstExperience #print("[parserForAngellist] lstExperience " + str(lstExperience)) lstReference = [] lstDivReference = root.css('div.profiles-show.review') for divReference in lstDivReference: dicReferenceResult = {} dicReferenceResult['strUrl'] = strUrl dicReferenceResult['strName'] = strName strContent = divReference.css('div.review-content::text').extract_first().strip() dicReferenceResult['strContent'] = strContent # strAuthor = divReference.css('div.annotation > div.profile-link::text').extract_first().strip() # dicReferenceResult['strAuthor'] = strAuthor lstStrAuthorContext = divReference.css('div.annotation').xpath('.//text()').extract() lstStrAuthorContext = map(unicode.strip, lstStrAuthorContext) lstStrAuthorContext = filter(lambda x: len(x) > 1, lstStrAuthorContext) strAuthor = lstStrAuthorContext[0] strAuthorContext = ','.join(lstStrAuthorContext) # strAuthorContext = divReference.css('div.annotation').extract_first().strip() dicReferenceResult['strAuthor'] = strAuthor dicReferenceResult['strAuthorContext'] = strAuthorContext lstReference.append(dicReferenceResult) self.__lstReferenceResult[strUrl] = lstReference #print("[parserForAngellist] lstReference " + str(lstReference)) self.__lstInverstorResult[strUrl] = dicInvestorResult # strInvestorJsonFilePath = parserForAngellist.getInvestorJsonFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) # saveObjToJson(dicInvestorResult, strInvestorJsonFilePath) # print("[parserForAngellist.] Result " + str(dicInvestorResult)) dicSyndicateResult = {}; divSyndicate = root.css("div.back_syndicate_button") if(divSyndicate): uSyndicateUrl = divSyndicate.css("a::attr(href)").extract_first().strip() strSyndicateUrl = parserForAngellist.PARSE_BASE_URL + str(uSyndicateUrl) self.parseSyndicateToJson(strUrl, strSyndicateUrl)
def parseStartupToJson(self, strUrl): strObjectID = getFileNameInUrl(strUrl) dicStartupResult = {}; dicStartupResult['strUrl'] = strUrl dicStartupResult['strCrawlTime'] = self.__strDate strStartupFilePath = spiderForAngellist.getOverviewLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) print("[parserForAngellist] Parsing " + strStartupFilePath) if(os.path.isfile(strStartupFilePath)): with open(strStartupFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) strCompany = root.css('div.text > div.name_holder > h1.name::text').extract_first().strip() dicStartupResult['strCompany'] = strCompany # Some company didn't have intros, and h2 has some parsing error strIntro = root.css('div.main.standard > div.text').css('p::text').extract_first() dicStartupResult['strIntro'] = strIntro lstStrProduct = root.css('div.product_desc > div.show.windows > div.content::text').extract() dicStartupResult['lstStrProduct'] = lstStrProduct lstStrFounders = [] lstStrFoundersDesc = [] divFoundersSection = root.css('div.founders.section > div.startups-show-sections > div.startup_roles') if(divFoundersSection): lstDivFounders = divFoundersSection.css('div.text') for divFounder in lstDivFounders: strFounderName = divFounder.css('div.name > a::text').extract_first() lstFounderDescs = divFounder.css('div.bio > p').css('::text').extract() # lstFounderDescs = map(unicode.strip, lstFounderDescs) lstFounderDescs = filter(lambda x: len(x) > 1, lstFounderDescs) strFounderDesc = ''.join(lstFounderDescs) lstStrFounders.append(strFounderName) lstStrFoundersDesc.append(strFounderDesc) # lstStrFoundersName = divFoundersSection.css('div.text > div.name > a::text').extract() # lstStrFoundersName = divFoundersSection.css('div.text > div.name > a::text').extract() dicStartupResult['lstStrFounders'] = lstStrFounders dicStartupResult['lstStrFoundersDesc'] = lstStrFoundersDesc lstStrTeam = [] lstStrTeamDesc = [] divTeamSection = root.css('div.team.section > div.startups-show-sections > div.group') if(divTeamSection): lstDivTeam = divTeamSection.css('div.text') for divTeam in lstDivTeam: strTeamName = divTeam.css('div.name > a::text').extract_first() lstTeamDescs = divTeam.css('div.bio > p').css('::text').extract() lstTeamDescs = filter(lambda x: len(x) > 1, lstTeamDescs) strTeamDesc = ''.join(lstTeamDescs) lstStrTeam.append(strTeamName) lstStrTeamDesc.append(strTeamDesc) dicStartupResult['lstStrTeam'] = lstStrTeam dicStartupResult['lstStrTeamDesc'] = lstStrTeamDesc lstLocationIndustry = root.css('div.main.standard > div.text > div.tags').css('a.tag::text').extract() strLocation = '' lstIndustry = [] if(len(lstLocationIndustry) > 0): strLocation = lstLocationIndustry[0] lstIndustry = lstLocationIndustry[1:] dicStartupResult['strLocation'] = strLocation dicStartupResult['lstIndustry'] = lstIndustry dicLocation = self.parseLocation(strLocation) dicStartupResult['strCity'] = dicLocation['strCity'] dicStartupResult['strCountry'] = dicLocation['strCountry'] dicStartupResult['strContinent'] = dicLocation['strContinent'] lstStrFollowers = self.parseStartupFollowersToJson(strUrl) dicStartupResult['lstStrFollowers'] = lstStrFollowers lstStrInvestor = [] divFundingSection = root.css('div.past_financing.section.startups-show-sections') if(divFundingSection): lstStrInvestor = divFundingSection.css('ul.roles > li.role').css('div.name > a::text').extract() dicStartupResult['lstStrInvestor'] = lstStrInvestor isFundraising = False divFundraisingHeader = 
root.css('div.fundraising.header') if(divFundraisingHeader): strFundraising = divFundraisingHeader.css('::text').extract_first() if "Fundraising" in strFundraising: isFundraising = True dicStartupResult['isFundraising'] = isFundraising lstStartupSeries = [] if(divFundingSection): lstDivStartupSeries = divFundingSection.css('div.startups-show-sections.startup_rounds > ul.startup_rounds.with_rounds > li.startup_round') for divStartupSeries in lstDivStartupSeries: dicStartupSeriesResult = {} dicStartupSeriesResult['strUrl'] = strUrl dicStartupSeriesResult['strCrawlTime'] = self.__strDate dicStartupSeriesResult['strCompany'] = strCompany strSeriesType = '' divStartupSeriesType = divStartupSeries.css('div.details.inner_section > div.header > div.type') if(divStartupSeriesType): strSeriesType = divStartupSeriesType.css('::text').extract_first().strip() dicStartupSeriesResult['strSeriesType'] = strSeriesType strSeriesMoney = u'Unknown' intSeriesMoney = 0 divStartupSeriesMoney = divStartupSeries.css('div.details.inner_section > div.raised') if(divStartupSeriesMoney): lstStrSeriesMoney = divStartupSeriesMoney.css('::text').extract() strSeriesMoney = "".join(lstStrSeriesMoney).strip() if(strSeriesMoney != u'Unknown'): strCurrency = strSeriesMoney[:1] strSeriesMoney = strSeriesMoney[1:] if(strCurrency == u'$'): strCurrency = 'USD' elif (strCurrency == u'€'): strCurrency = 'EUR' intBase = 1 if(strSeriesMoney[-1:] == u'K'): intBase = 1000 strSeriesMoney = strSeriesMoney[:-1] elif(strSeriesMoney[-1:] == u'M'): intBase = 1000000 strSeriesMoney = strSeriesMoney[:-1] intSeriesMoney = int(locale.atof(strSeriesMoney.replace(",", "")) * intBase) if(intSeriesMoney == 0): dicStartupSeriesResult['intSeriesMoney'] = strSeriesMoney else: dicStartupSeriesResult['intSeriesMoney'] = intSeriesMoney dicStartupSeriesResult['strCurrency'] = strCurrency strSeriesDate = '' divStartupSeriesDate = divStartupSeries.css('div.details.inner_section > div.header > div.date_display') if(divStartupSeriesDate): strSeriesDate = divStartupSeriesDate.css('::text').extract_first() dicStartupSeriesResult['strSeriesDate'] = strSeriesDate lstStrInvestor = divStartupSeries.css('div.participant > div.text > div.name > a::text').extract() lstStrInvestorUrl = divStartupSeries.css('div.participant > div.text > div.name > a::attr(href)').extract() dicStartupSeriesResult['lstStrInvestor'] = lstStrInvestor dicStartupSeriesResult['lstStrInvestorUrl'] = lstStrInvestorUrl lstStartupSeries.append(dicStartupSeriesResult) # print("[parserForAngellist] Startup Series" + str(dicStartupSeriesResult)) # import pdb; pdb.set_trace() self.__lstStartupSeriesResult[strUrl] = lstStartupSeries self.parseStartupActivityPressToJson(strUrl) self.__lstStartupResult[strUrl] = dicStartupResult