def apply_selector_annotations(annotations, target_page):
    page = Selector(text=target_page)
    converted_annotations = []
    annotations = _merge_annotations_by_selector(annotations)
    for annotation in annotations:
        if not annotation.get('selector'):
            accepted_elements = set(
                chain(*[[elem.root for elem in page.css(sel)]
                        for sel in annotation.get('accept_selectors', []) if sel])
            )
            rejected_elements = set(
                chain(*[[elem.root for elem in page.css(sel)]
                        for sel in annotation.get('reject_selectors', []) if sel])
            )
            elems = accepted_elements - rejected_elements
        else:
            elems = [elem.root for elem in page.css(annotation['selector'])]
        if elems:
            tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems]
            tagid = min(tagids)
            if tagid is not None:
                annotation['tagid'] = tagid
                converted_annotations.append(annotation)
    return converted_annotations
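# A minimal, self-contained sketch of the accept/reject set-difference logic used
# above, calling parsel directly. The HTML, the selectors and the data-tagid values
# are made up for illustration only; recent parsel/Scrapy Selectors expose the
# underlying lxml element as `.root` (older versions used `_root`).
from itertools import chain
from parsel import Selector

html = ('<div data-tagid="1">'
        '<p class="a" data-tagid="2">keep</p>'
        '<p class="a b" data-tagid="3">reject</p>'
        '</div>')
page = Selector(text=html)
accepted = set(chain(*[[el.root for el in page.css(s)] for s in ['p.a'] if s]))
rejected = set(chain(*[[el.root for el in page.css(s)] for s in ['p.b'] if s]))
elems = accepted - rejected
# The smallest data-tagid among the surviving elements is kept, as in the function above.
print(min(int(e.attrib.get('data-tagid', 1e9)) for e in elems))  # -> 2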
def parse_item(self, response):
    items = []
    sel = Selector(response)
    base_url = get_base_url(response)
    # The even and odd rows were handled by two identical loops; a grouped CSS
    # selector covers both kinds of rows in one pass.
    for site in sel.css('table.tablelist tr.even, table.tablelist tr.odd'):
        item = TencentItem()
        item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
        relative_url = site.css('.l.square a').xpath('@href').extract()[0]
        item['detailLink'] = urljoin_rfc(base_url, relative_url)
        item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()[0]
        item['workLocation'] = site.css('tr > td:nth-child(4)::text').extract()[0]
        item['recruitNumber'] = site.css('tr > td:nth-child(3)::text').extract()[0]
        item['publishTime'] = site.css('tr > td:nth-child(5)::text').extract()[0]
        items.append(item)
        # print repr(item).decode("unicode-escape") + '\n'
    info('parsed ' + str(response))
    return items
def parse_podcastlist(self, response):
    """Extract podcast name and url from the list of podcasts"""
    sel = Selector(response)
    urls = sel.css("div#selectedcontent div ul li a::attr(href)").extract()
    names = sel.css("div#selectedcontent div ul li a::text").extract()
    for url, name in zip(urls, names):
        _id = get_id_from_url(url)
        item = ItunesItem(name=name, url=url, itunesId=_id)
        yield item
def apply_selector_annotations(annotations, target_page): page = Selector(text=target_page) converted_annotations = [] annotations = _merge_annotations_by_selector(annotations) for annotation in annotations: if not annotation.get('selector'): accepted_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('accept_selectors', []) if sel]) ) rejected_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('reject_selectors', []) if sel]) ) elems = accepted_elements - rejected_elements else: elems = [elem._root for elem in page.css(annotation['selector'])] if not elems: continue tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems] tagid = min(tagids) if tagid is not None: annotation['tagid'] = tagid converted_annotations.append(annotation) # Create container for repeated field annotation if (annotation.get('repeated') and not annotation.get('item_container') and len(annotation.get('annotations')) == 1): parent = _get_parent(elems, page) field = annotation['annotations'].values()[0][0]['field'] container_id = '%s#parent' % annotation['id'] if len(parent): converted_annotations.append({ 'item_container': True, 'id': container_id, 'annotations': {'#portia-content': '#dummy'}, 'text-content': '#portia-content', 'container_id': annotation['container_id'], 'field': field, 'tagid': parent.attrib.get('data-tagid') }) annotation['item_container'] = True annotation['field'] = field annotation['container_id'] = container_id return converted_annotations
def parseUpdate(self, strProjectUrl): #更新頁面的檔案名稱格式為: projectID + "_blog_" + pageIndex + ".html" #其中pageIndex從0開始,至少會有1個,直接判斷檔案是否存在來判斷有多少個分頁 projectID = getFileNameInUrl(strProjectUrl) dicUpdateResult = {} i = 0 while True: blogPageFilePath = self.__LOCAL_PAGE_PATH + projectID + self.__LOCAL_PAGR_BLOG_SUFFIXES + "_" + str(i) + self.__LOCAL_PAGE_EXTENSION if os.path.isfile(blogPageFilePath) == True: with open(blogPageFilePath, "rb") as file: strBlogPageSource = file.read() root = Selector(text = strBlogPageSource) updateElements = root.css(".content > .well.simple") for updateElement in updateElements: dicUpdateResult["strUrl"] = strProjectUrl #更新資訊標題 strUpdateTitle = updateElement.css("h2 > a::text").extract_first() dicUpdateResult["strUpdateTitle"] = strUpdateTitle #更新資訊內容 strUpdateContent = "" for x in updateElement.css(".blogpost-content *::text").extract(): strUpdateContent = strUpdateContent + purifyString(x) strUpdateContent = purifyString(strUpdateContent) dicUpdateResult["strUpdateContent"] = strUpdateContent #更新資訊日期 strUpdateDate = updateElement.css("h2 > small > time::attr(datatime)").extract_first() dicUpdateResult["strUpdateDate"] = strUpdateDate i = i+1 else: break self.__lstUpdateResult.append(dicUpdateResult)
def parse_alpha(self, response):
    """ extract the alpha letters links"""
    sel = Selector(response)
    urls = sel.css("ul.alpha li a::attr(href)").extract()
    for url in urls:
        yield Request(url, callback=self.parse_page)
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div#content div.article div.topic-content')
    item_loader = ItemLoader(item=HouseRentingDoubanItem(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='table.infobox *::text')
    item_loader.add_css(field_name='title', css='div#content > h1:first-child::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='h3 span.from a::text')
    item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
    item_loader.add_css(field_name='author_link', css='h3 span.from a::attr(href)')
    item_loader.add_css(field_name='content', css='div.topic-content div#link-report *::text',
                        re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='h3 span:last-child::text',
                        re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')
    yield item_loader.load_item()
def parse(self, response): """ Extract the main genres""" sel = Selector(response) selector = "div#genre-nav div ul li a.top-level-genre::attr(href)" urls = sel.css(selector).extract() for url in urls: yield Request(url, callback=self.parse_alpha)
def parse(self, response):
    province_block_headers = response.css(
        "#LiveAccordionWrapper1053 h3 a.LA-ui-accordion-header").extract()
    for province_block_header in province_block_headers:
        province_block_headers_selector = Selector(text=province_block_header)
        province_block_id = str.join('', province_block_headers_selector.css("::attr('href')").extract())
        province_name = str.join('', province_block_headers_selector.css("::text").extract())
        province_block = response.css(province_block_id)
        rows = province_block.css('table.tg tr')
        for row in rows:
            row_text = str.join('', row.css('::text').extract())
            if 'provincia' in row_text.lower():
                province_name = row_text.lower().replace('provincia :', '')
                continue
            if 'circ' in row_text.lower():
                province_name = row_text.lower().replace('provincia :', '')
                continue
            if 'senador' in row_text.lower():
                position = "senador"
                continue
            if 'diputado' in row_text.lower():
                position = "diputado"
                continue
            if row.css('.tg-5mgg'):
                party_header = row.css('.tg-5mgg')
                party_header_colspan = str.join('', party_header.css("::attr('colspan')").extract())
                print(party_header_colspan)
                if (party_header_colspan == '4'):
                    party = str.join('', party_header.css('td.tg-5mgg::text').extract())
                else:
                    party = str.join('', party_header.css('::text').extract())
                continue
            candidate_name = row_text
            print(province_name)
            print(party)
            print(position)
            print(candidate_name)
            yield self.create_item(province_name, party, position, candidate_name)
def parse_exercise(self, response): try: muscle_primary_selector = Selector( text=response.css('div.profile-info-value').extract()[4]) except Exception: muscle_primary_selector = Selector(text='') try: muscle_additional_selector = Selector( text=response.css('div.profile-info-value').extract()[5]) except Exception: muscle_additional_selector = Selector(text='') try: male_first_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[0]) except Exception: male_first_image = '' try: male_second_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[1]) except Exception: male_second_image = '' try: female_first_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[2]) except Exception: female_first_image = '' try: female_second_image = re.sub('\.\.\/', '', response.css('ul.ace-thumbnails img::attr(src)').extract()[3]) except Exception: female_second_image = '' yield { 'name': response.css('div.page-header h1::text').extract()[0], 'rules': response.css('div ol li::text').extract(), 'm': 'Растяжка', 'mp': muscle_primary_selector.css('div::text').extract(), 'ma': muscle_additional_selector.css('div::text').extract(), 'url': response.url, 't': '', '1m': self.image_base_url + male_first_image, '2m': self.image_base_url + male_second_image, '1w': self.image_base_url + female_first_image, '2w': self.image_base_url + female_second_image, 'i': '', }
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div.main-wrap')
    item_loader = ItemLoader(item=HouseRenting58Item(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='div.house-title > h1::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='div.house-basic-info div.house-agent-info p.agent-name > a::text')
    item_loader.add_css(field_name='image_urls', css='div.basic-pic-list > ul > li > img::attr(data-src)',
                        re=r'(.*)\?.*')
    item_loader.add_css(field_name='author_link', css='div.house-basic-info div.house-agent-info p.agent-name > a::attr(href)')
    item_loader.add_css(field_name='content', css='ul.introduce-item *::text')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='p.house-update-info::text')
    item_loader.add_css(field_name='price', css='div.house-pay-way *::text')
    item_loader.add_css(field_name='detail', css='div.house-desc-item > ul > li > span::text')
    yield item_loader.load_item()
def parse(self, response):
    sel = Selector(response)
    cities = sel.css("#report1 tr")
    for city in cities:
        item = CityItem()
        item["id"] = city.css(":nth-child(1)::text").extract()
        item["name"] = city.css(":nth-child(2)::text").extract()
        item["date"] = city.css(":nth-child(3)::text").extract()
        item["AQI"] = city.css(":nth-child(4)::text").extract()
        item["level"] = city.css(":nth-child(5)::text").extract()
        item["prime"] = city.css(":nth-child(6)::text").extract()
        yield item
    next_page = int(sel.css("#report1 tr:nth-last-child(2) input:first-child::attr(value)")[0].extract()) + 1
    total_page = int(sel.css("#report1 tr:nth-last-child(2) td:first-child font::text")[1].extract())
    if next_page <= total_page:
        yield scrapy.Request(
            url="http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp?city=&startdate="
                + self.month_before_yesterday + "&enddate=" + self.yesterday
                + "&page=" + str(next_page),
            callback=self.parse)
def parse_page(self, response): """ Extract the paginate numbers links """ sel = Selector(response) selector = ("ul.paginate li a:not(a.paginate-more)" ":not(a.paginate-previous)" "::attr(href)") urls = sel.css(selector).extract() self.parse_podcastlist(response) for url in urls: yield Request(url, callback=self.parse_podcastlist)
def parseStartupFollowersToJson(self, strUrl):
    lstStrFollowers = []
    strStartupFollowersFilePath = spiderForAngellist.getFollowersLocalFilePath(
        strUrl, self.__strDate, self.__strCategory, self.__strSubCategory)
    if(os.path.isfile(strStartupFollowersFilePath)):
        with open(strStartupFollowersFilePath, "rb") as file:
            # Read the locally cached page into a string
            strPageSource = file.read()
        root = Selector(text=strPageSource)
        lstStrFollowers = root.css('div.text > div.name > a::text').extract()
    return lstStrFollowers
def port_sample(sample): """Convert slybot samples made before slybot 0.13 to new format.""" if not sample.get("annotated_body"): if not sample.get("plugins"): sample["plugins"] = {"annotations-plugin": {"extracts": []}} return sample # Handle empty body if not sample.get("plugins"): sample["plugins"] = load_annotations(sample.get("annotated_body", u"")) del sample["annotated_body"] # Group annotations by type annotations = sample["plugins"]["annotations-plugin"]["extracts"] try: sel = Selector(text=add_tagids(sample["original_body"])) except KeyError: annotated = sample["annotated_body"] sample["original_body"] = annotated sel = Selector(text=add_tagids(annotated)) annotations = port_standard(annotations, sel, sample) standard_annos, generated_annos, variant_annos = [], [], [] for a in annotations: if a.get("generated"): generated_annos.append(a) elif a.get("variants", 0) > 0: variant_annos.append(a) else: standard_annos.append(a) if not annotations: return sample new_annotations = [] a = find_element(annotations[0], sel) for b in annotations[1:]: b = find_element(b, sel) a = find_common_parent(a, b) container_id = gen_id() parent = a.getparent() new_annotations.append(_create_container(a if parent is None else parent, container_id, selector=sel)) for a in standard_annos: a.pop("variant", None) new_annotations.extend(standard_annos) new_annotations.extend(port_generated(generated_annos, sel)) new_annotations.extend(port_variants(variant_annos, sel)) for a in new_annotations: if not (a.get("item_container") and a.get("container_id")): a["container_id"] = container_id tagid = a.pop("tagid", None) or a.pop("data-tagid", None) elems = sel.css(a["selector"]) elem = elems[0].root # Update annotations sample["plugins"]["annotations-plugin"]["extracts"] = new_annotations sample["version"] = SLYBOT_VERSION return sample
async def get_film(self, url):
    source = await html_source(url)
    bt_url = re.findall(r'href="(attach-dialog-fid-.*\.htm)"', source)
    selector = Selector(text=source)
    film_name = selector.re(r'\[BT下载\].*B\b')
    film_name = film_name[0] if film_name else ''
    bt_name = selector.css('td:nth-child(1) > a::text').extract_first()
    if film_name and bt_name:
        try:
            bt_url = bt_url[0].replace('dialog', 'download')
        except IndexError as e:
            bt_url = ''
    return film_name, bt_name, self.domain + bt_url
def parse(self, response):
    if response.css("div.content-base>section>div").extract_first() is None:
        return
    div = re.sub(">\s*<", "><", response.css('div.content-base>section>div').extract_first())
    div = re.sub("[\s]{2,}", "", div)
    div = re.sub(">\s*/\s<*", "><", div)
    div = re.sub(">\s*:\s<*", "><", div)
    rows = Selector(text=div).css('table>tbody>tr').extract()
    for row in rows:
        s = Selector(text=row)
        content = s.css('td::text').extract()
        if content[2] == "Computer":
            ua = s.css('td.useragent>a::text').extract_first()
            yield {'useragent': ua}
    page = Selector(text=div).css('#pagination>a').extract()
    url = Selector(text=page[-2]).css('::attr(href)').extract_first()
    if Selector(text=div).css('#pagination>span.current::text').extract_first() == '10':
        return
    yield scrapy.Request(url=response.urljoin(url), callback=self.parse)
async def get_page_item(self, page_num: int):
    """
    :param page_num: get_page_url
    :return:
    """
    item_url = self.base_url.format(genre=self.genre, page_num=page_num)
    content = await self.get_html_content(item_url)
    selector = Selector(text=content)
    urls = list(set(selector.css('#maincontent a::attr(href)').extract()))
    page_items = (url for url in urls
                  if url.startswith('http://www.meizitu.com/a/'))
    for item in page_items:
        await self.get_item(item)
def parse_page(self, body): sel = Selector(text=body) # chuyển từ text sang Selector table = sel.css('.dxgvTable_SisTheme') data_rows = table.css('.dxgvDataRow_SisTheme') number_subjects = len(data_rows) with open('sis2.json', 'a') as f: for i in range(number_subjects): data = data_rows[i].css('.dxgv::text').getall() btn_collapse = self.driver.find_elements_by_class_name( 'dxGridView_gvDetailCollapsedButton_SisTheme') if i == 0 or i == 1: btn_collapse[0].click() else: btn_collapse[i - 1].click() sleep(0.5) detail = Selector(text=self.driver.page_source).css( '.dxgvDetailCell_SisTheme b::text').getall() if len(detail) == 3: condition_subject = None english_name = detail[0] short_name = detail[1] faculity = detail[2] else: condition_subject = detail[0] english_name = detail[1] short_name = detail[2] faculity = detail[3] json_row = { 'ma_hoc_phan': data[0], 'ten_hoc_phan': data[1], 'thoi_luong': data[2], # thoi luong 'so_tin_chi': data[3], # so tin chi 'tc_hoc_phi': data[4], # tin chi hoc phi 'trong_so': data[5], 'hoc_phan_dieu_kien': condition_subject, 'ten_tieng_anh': english_name, 'ten_viet-tat': short_name, 'vien_quan_ly': faculity } f.write(json.dumps(json_row, ensure_ascii=False)) f.write('\n') f.close()
def parse(self, response): """ `parse` should always `yield` Meeting items. Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping needs. """ # Create even section groupings for each meeting, even though committees are in same el sections = [] for idx, section_group in enumerate(response.css(".list-items")[1:3]): if idx == 0: sections.append(section_group) else: section_split = section_group.extract().split("<hr>") for split in section_split: sections.append(Selector(text=split)) for section in sections: self._validate_location(section) title = self._parse_title(section) classification = self._parse_classification(title) year_match = re.search( r"\d{4}", " ".join(section.css("h2 *::text").extract())) if not year_match: continue year_str = year_match.group() for split_text in section.extract().split("<br>"): item = Selector(text=split_text) item_text = re.sub(r"\s+", " ", " ".join( item.css("*::text").extract())).strip() start = self._parse_start(item_text, year_str) if not start: continue meeting = Meeting( title=title, description="", classification=classification, start=start, end=None, all_day=False, time_notes= "Details may change, confirm with staff before attending", location=self.location, links=self._parse_links(item, response), source=response.url) meeting["status"] = self._get_status(meeting) meeting["id"] = self._get_id(meeting) yield meeting
def get_category_links(self): """获取职业类别链接""" print('开始爬取拉勾网首页的职业类别链接') category_links = set() url = 'https://www.lagou.com/' headers = { 'Host': 'www.lagou.com', 'Pragma': 'no-cache', 'Upgrade-Insecure-Requests': '1', 'Referer': 'https://www.lagou.com/', 'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 ' '(KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'), } response = requests.get(url, headers=headers) if response.status_code == 200: selector = Selector(text=response.text) categories = selector.css('.menu_box .menu_main .category-list') for category in categories: links = category.css('a::attr(href)') for link in links: category_link = link.extract() category_links.add(category_link) menu_subs = selector.css('.menu_sub.dn dd') for sub in menu_subs: links = sub.css('a::attr(href)') for link in links: category_link = link.extract() category_links.add(category_link) print(category_links) return category_links else: print('获取职业类别链接失败')
def parse_job(self, response): """Parse a joblink into a JobItem. """ s = Selector(response) item = JobItem() item["url"] = response.url item["site"] = "LandingJobs" item["title"] = s.css("h1::text").extract_first() item["text"] = s.xpath( '//section[@class="ld-job-details"]//text()').extract() item["text"].extend( s.xpath( '//section[@class="ld-job-offer-section"]//text()').extract()) yield item
def parse_category_page(self, response, category):
    video_list = response.css(
        "#contentHolder li .promoBlock .promoBlockWrap").getall()
    print(f"Found {len(video_list)} videos")
    for video in video_list:
        if isinstance(video, str):
            video = Selector(text=video)
        title = video.css(".text .textWrap h3 a::text").extract_first()
        title = title.strip().replace("\r", "").replace("\n", "")
        url = video.css(
            ".text .textWrap h3 a::attr('href')").extract_first()
        image = video.css(".image a img::attr('src')").extract_first()
        yield {
            "title": title,
            "author": "",
            "url": TheRsaSpider.base_url + url,
            "image": TheRsaSpider.base_url + image,
            "category": category,
            "source": TheRsaSpider.name
        }
def parse_stock_availability(self, body):
    sel = Selector(text=body)
    store_list = sel.css(".store-navigation .js-pickup-store-list")
    for store_data in store_list.css(".pickup-store-list-entry"):
        store_info = store_data.css(".js-select-store-label")
        store_name = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-name").get()
        store_address = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-address").get()
        store_city = store_info.css(
            ".pickup-store-info .pickup-store-list-entry-city").get()
        stock_info = store_info.css(".store-availability .available").get()
        result = [store_name, store_address, store_city, stock_info]
        self.stock_availability += result
def parse(self, response): """Get the pagination links and hand them off. """ s = Selector(response) pagination = s.css(".pagination") pagelinks = pagination.xpath( '//a[contains(@href, "&page=")]/@href').extract() for pagelink in pagelinks: request = Request( urljoin(self.root, pagelink), callback=self.parse_jobspage, dont_filter=True, ) yield request
def parse_1(self, response):
    selector = Selector(response)
    items = []
    room = selector.css('#anchor-info')[0]
    item = DouyutestItem()
    # item = {}
    # Use relative XPath ("./...") so the queries are evaluated against `room`
    # instead of the document root.
    item['room_name'] = room.xpath('./div[2]/div[1]/h1/text()').extract_first()
    item['room_visitor'] = room.xpath('./div[2]/div[3]/ul/li[2]/div/div[2]/a/text()').extract_first()
    item['room_owner'] = room.xpath('./div[2]/div[3]/ul/li[1]/div/a/text()').extract_first()
    item['room_popularity'] = room.xpath('./div[2]/div[2]/dl/dd/a[2]/text()').extract_first()
    items.append(item)
    # yield item
    # print repr(item).decode("unicode-escape") + '\n'
    return items
def __parse_images(self, text):
    selector = Selector(text=text)
    for script in selector.css('script').getall():
        if 'init' in script:
            break
    else:
        raise ValueError("Script not found")
    match = scriptparser.search(script)
    if not match:
        raise ValueError("Script not parsed")
    fargs = match.group(1)
    fargs = '[' + fargs.replace("'", '"').strip() + ']'
    imgs_splited = json.loads(fargs)[0]
    imgs = [''.join(i[:3]) for i in imgs_splited]
    return imgs
def parse2(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    sel = Selector(text=jsonresponse['content'], type="html")
    links = sel.css('a.s-job-title::attr(href)').extract()
    for i in links:
        item = items.DmozItem()
        item["site"] = "secretcv"
        item["firma"] = i.replace("-", " ").split("/")[3]
        a = i.replace("-", " ").replace("is ilanlari", "").split("/")[4]
        a = " ".join(a.split())
        item["ilanAd"] = re.sub("\d+", "", a)
        item["ilanID"] = i.split("/")[4].split("-")[-1].split("i")[-1]
        # return item
        yield Request(i, self.parse3, meta={"item": item})
def get_products():
    # Extract product information
    html = browser.page_source  # Get the page source
    selector = Selector(text=html)  # Parse it with Scrapy's Selector
    items = selector.css('li.gl-item')
    for item in items:
        product = {
            'img_url': item.css('a[target="_blank"] img').extract_first(),
            'title': item.css('div.p-name a::attr(title)').extract_first(),
            'price': item.css('div div.p-price strong i::text').extract_first(),
            'shop': item.css('div.p-shop span a::attr(title)').extract_first()
        }
        print(product)
        save_to_mongo(product)  # Save to the database
def replay(self, response):
    title1 = response.meta['title1']
    s1 = Selector(response)
    topic_replay = s1.css(
        'ul.topic-reply li.clearfix div.bg-img-green h4 a::text,'
        'ul.topic-reply li.clearfix div.bg-img-green h4::text,'
        'ul.topic-reply li.clearfix p::text').extract()
    for x in topic_replay:
        # Strip whitespace, \t, \n and \r from both ends of x.
        x1 = x.strip(' \t\n\r')
        if x1 != '':
            self.topic_replay_end.append(x1)
    replay2 = ''.join(s1.xpath('//*[@id="comments"]//text()').extract())
    f = open("/Users/vivi/PycharmProjects/DoubanGroup/%s.txt" % title1, "a")
    print(replay2, file=f)
    f.close()
def parse_chouse_list(self, response):
    """Extract links to sold-house listings"""
    sel = Selector(response)
    # Lianjia sometimes returns no data for a given neighbourhood
    total = sel.css('.resultDes .total span::text').extract_first()
    total = int(total)
    if total > 0:
        # Extract listing links
        links = sel.css(
            ".listContent li .info .title a::attr(href)").extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_chouse_detail)
        # Follow pagination
        page_data = sel.css(
            ".house-lst-page-box::attr(page-data)").extract_first()
        page_data = json.loads(page_data)
        if page_data['curPage'] == 1 and page_data['totalPage'] > 1:
            price = response.url.replace(self.base_url + '/chengjiao/', '')
            for x in range(2, page_data['totalPage'] + 1, 1):
                url = self.base_url + '/chengjiao/' + 'pg' + str(x) + price
                yield scrapy.Request(url=url, callback=self.parse_chouse_list)
def parse_mypage(self, response): sel = Selector(response) item = CrawlSpiderItem() item['title'] = sel.xpath('//h1/text()').extract_first() item['price'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[1]/span[1]/text()').extract_first() item['area'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[1]/text()').re('\d+')[0] item['house_type'] = \ sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[2]/text()').extract_first().split(' ')[0] item['floor'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[3]/text()').extract_first() item['house_head'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[4]/text()').extract_first() item['metro'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[5]/text()').extract_first() item['community'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[6]/a[1]/text()').extract_first() item['position'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[7]/a[1]/text()').extract_first() item['real_position'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[7]/a[2]/text()').extract_first() item['community_introduce'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[1]/span[2]/text()').extract_first() item['transportation'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[2]/span[2]/text()').extract_first() item['surround_facility'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[2]/div[3]/ul/li[3]/span[2]/text()').extract_first() item['public_time'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/p[8]/text()').extract_first() item['publisher_name'] = sel.xpath( '/html/body/div[4]/div[2]/div[2]/div[3]/div/div[1]/a[1]/text()').extract_first() item['publisher_img_url'] = sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/a/img/@src').extract_first() item['publisher_id'] = sel.xpath( '/html/body/div[4]/div[2]/div[2]/div[3]/div/div[1]/span/text()').extract_first() item['publisher_evaluate'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[1]/text()').re(':(.*)/')) item['evaluate_num'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[1]/a/text()').re(r'\d+')) item['publisher_with_checking'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[2]/span[2]/text()').re(r"\d+")) item['phone_number'] = '-'.join( sel.xpath('/html/body/div[4]/div[2]/div[2]/div[3]/div/div[3]/text()').re('\d+')) item['lease'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[1]/text()').extract_first() item['pay_way'] = '-'.join( sel.xpath('//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[2]/text()').re(r'\w+')) item['house_state'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[3]/text()').extract_first() item['heating_method'] = sel.xpath( '//*[@id="introduction"]/div/div[2]/div[1]/div[2]/ul/li[4]/text()').extract_first() item['house_facility'] = [tem.strip() for tem in sel.css( '#introduction > div > div.introContent > div.feature > div.zf-tag > ul > li.tags::text').extract() if tem.strip()] # item['look_house_num'] = sel.xpath('//*[@id="record"]/div[2]/div[3]/span/text()').extract() item['pic_url'] = sel.xpath('//*[@id="topImg"]/div[2]/ul/li/img/@src').extract() yield item
def parse(self, response):
    t_selector = Selector(text=browser.page_source)
    all = t_selector.css("a::attr(href)").extract()
    all_urls = filter(lambda x: True if x.startswith("https") else False, all)
    for url in all_urls:
        match_obj = re.match("(.*www.lagou.com/(zhaopin|jobs)/).*", url)
        if match_obj:
            request_url = match_obj.group()
            yield scrapy.Request(request_url, headers=self.headers, callback=self.do_items)
        else:
            # Not a job page, so just follow the link for further crawling
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)
def is_cached_response_fresh(self, response, request):
    if super().is_cached_response_fresh(response, request):
        try:
            body = gunzip(response.body)
        except OSError:
            body = response.body
        h = HtmlResponse(url=response.url, body=body)
        s = Selector(h)
        company_name = s.css("h2 > span:first-child::text").extract()
        return company_name and company_name[0].strip()
    else:
        return False
def parse(self, resp):
    sel = Selector(resp)
    for link in sel.css(".viewMore a"):
        txt = link.xpath("text()").extract_first().replace(
            "View all ", "").strip()
        href = link.xpath("@href").extract_first()
        req = Request(
            resp.urljoin(href),
            callback=self.parse_data_page
        )
        req.meta["title"] = txt
        yield req
def parse(self, response):
    print('ttttttt', response.url)
    sel = Selector(response)
    num = sel.css(
        'div#table-pagination ::attr(data-number)').extract_first()
    next_num = int(num) + 2
    if int(num):
        next_url = re.sub('(\d+)\.', str(next_num) + ".", response.url)
    else:
        next_url = re.sub('.jhtml', '-2.jhtml', response.url)
    # next url
    yield Request(url=next_url, priority=1, callback=self.parse)
async def extract_user_pageing_api(self, ResText, webpage_url): try: selector = Selector(text=ResText) except TypeError: return None output = [] for article in selector.css("li[data-articleid]"): ele = dict() ele['vid'] = article.css('::attr(data-articleid)').extract_first() ele['webpage_url'] = f"https://www.xinpianchang.com/a{ele['vid']}?from=UserProfile" ele['cover'] = article.css( 'img[class*="lazy-img"]::attr(_src)').extract_first() ele['upload_ts'] = self.string2timestamp(string=article.css( '.video-hover-con p[class*="fs_12"]::text').extract_first(), format='%Y-%m-%d 发布') # ele['duration'] = self.format_duration(article.css('.duration::text').extract_first()) ele['duration'] = self.string2duration( string=article.css('.duration::text').extract_first(), format="%M' %S''") ele['description'] = self.format_desc( article.css('.desc::text').extract_first()) ele['playlist_url'] = webpage_url ele['title'] = self.format_desc( article.css('.video-con-top p::text').extract_first()) ele['category'] = self.format_category( article.css('.new-cate .c_b_9 ::text').extract()) ele['view_count'] = self.format_count( article.css('.icon-play-volume::text').extract_first()) ele['like_count'] = self.format_count( article.css('.icon-like::text').extract_first()) ele['role'] = article.css('.user-info .role::text').extract_first() ele['from'] = self.from_ output.append(ele) else: has_more = selector.css( "li[data-more]::attr(data-more)").extract_first() return output, has_more, {}
def crawl_ips(self): headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"} for i in range(1, 5): response = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers) selector = Selector(text=response.text) all_trs = selector.css("#ip_list tr") ip_list = [] for tr in all_trs[1:]: ip = tr.css("td::text")[0].extract() port = tr.css("td::text")[1].extract() # print(tr.css("td:nth-child(4) > a::text").extract()) if [] == tr.css("td:nth-child(4) > a::text").extract(): server_address = '' else: server_address = tr.css("td:nth-child(4) > a::text").extract()[0] anonymous = tr.css("td::text")[4].extract() ip_type = tr.css("td::text")[5].extract() speed = tr.css("td:nth-child(7) > div::attr(title)").extract()[0] speed = re.sub("[^0-9\.]", "", speed) # con_time = tr.css("td:nth-child(8) > div::attr(title)").extract()[0] con_time = re.sub("[^0-9\.]", "", con_time) # alive_time = tr.css("td::text")[10].extract() check_time = tr.css("td::text")[11].extract() status = 1 now_time = int(time.time()) # print(check_time) # print(ip, port, server_address, anonymous) ip_list.append((ip, port, server_address, anonymous, ip_type, speed, con_time, alive_time, check_time, status, now_time, now_time)) # insert into database for ip_info in ip_list: insert_sql = ''' insert into proxy_ip (ip, port, server_address, anonymous, type, speed, con_time, alive_time, check_time, status, created_at, updated_at) values('{0}', {1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', {9}, {10}, {11})'''.format( ip_info[0], ip_info[1], ip_info[2], ip_info[3], ip_info[4], ip_info[5], ip_info[6], ip_info[7], ip_info[8], ip_info[9], ip_info[10], ip_info[11]) # print(insert_sql) # return cursor.execute(insert_sql) db.commit() print("insert ip list over " + str(i) + " pages") print("insert ip list end @@@@")
def get_id():
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    for i in range(1, 3):
        req = requests.get("https://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        # print(req.text)
        sel = Selector(text=req.text)
        raw = (sel.css('#ip_list tr'))
        for ip_raw in raw[1:]:
            tdcss = ip_raw.css('td')
            ip = tdcss[1].css('::text').extract()[0]
            port = tdcss[2].css('::text').extract()[0]
            print("{0}:{1}".format(ip, port))
def parse(self, resp):
    hxs = Selector(resp)
    for row in hxs.css("div.detail_list > ul > li"):
        item = MangaChapterItem()
        cells = row.xpath("span")
        if not cells:
            continue
        try:
            item['name'], item['link'] = extract_link(cells[0].xpath("a"))
            item['date'] = self.parsedate(
                cells[-1].xpath('text()').extract()[0])
            yield item
        except IndexError:
            pass
def clean_content(self, content):
    # <a class="headerlink" href="#check" title="Permalink to this headline">¶</a>
    # the headline link carries its own pilcrow glyph
    content = content.replace(u'>\xb6<', u'><')
    # selenium LanguagePreference
    sel = Selector(text=content)
    # content = content.replace(sel.css('div#codeLanguagePreference').extract_first(), '')  # may be None
    for div in sel.css('div#codeLanguagePreference').extract():
        content = content.replace(div, '')
    for lang in ['java', 'csharp', 'ruby', 'php', 'perl', 'javascript']:
        for div in sel.css('div.highlight-%s' % lang).extract():
            # print len(content)
            content = content.replace(div, '')
    # liaoxuefeng comment
    content = content.replace('<h3>Comments</h3>', '')
    content = content.replace('<h3>Make a comment</h3>', '')
    # http://lxml.de/
    for div in sel.css('div.sidemenu').extract():
        content = content.replace(div, '')
    return content
def apply_selector_annotations(annotations, target_page): page = Selector(text=target_page) converted_annotations = [] annotations = _merge_annotations_by_selector(annotations) for annotation in annotations: if not annotation.get('selector'): accepted_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('accept_selectors', []) if sel])) rejected_elements = set( chain(*[[elem._root for elem in page.css(sel)] for sel in annotation.get('reject_selectors', []) if sel])) elems = accepted_elements - rejected_elements else: elems = [elem._root for elem in page.css(annotation['selector'])] if elems: tagids = [int(e.attrib.get('data-tagid', 1e9)) for e in elems] tagid = min(tagids) if tagid is not None: annotation['tagid'] = tagid converted_annotations.append(annotation) return converted_annotations
def parse_start_url(self, response):
    print('parse_start_url --------> ' + response.url)
    self.page = 1
    self.first_page_url = response.url
    self.play_num = []
    self.play_num = get_album_simple(self.first_page_url)
    sel = Selector(response=response)
    ls = sel.css('.pagingBar_page::text').extract()
    if '下一页' in ls:
        self.pages = int(ls[-2])
        r = requests.get(self.first_page_url + str(self.pages), headers=self.headers)
        if r.status_code == 200:
            # `r` is a requests response, so build the Selector from its text
            sel = Selector(text=r.text)
            self.last_page_album_count = sel.css(
                '.discoverAlbum_item').extract().__len__()
            self.page_album_count = 12
    else:
        self.pages = 1
        r = requests.get(self.first_page_url, headers=self.headers)
        sel = Selector(text=r.text)
        self.page_album_count = sel.css(
            '.discoverAlbum_item').extract().__len__()
def parse_page(self, response): """ Parses one page of the forum """ for bp in response.css(".blockpost"): bp_selector = Selector(text=bp.extract()) message = "".join(bp_selector.xpath( "//div[@class='postmsg']/node()[not(local-name() = 'div' and @class='postsignature') and not(local-name() = 'p' and @class='postedit')]" ).extract()).strip() signature = bp_selector.css(".postsignature").xpath("p/node()").extract() modification = bp_selector.css(".postedit").extract() if len(modification) > 0: s = modification[0] modification = str(compute_date(s[s.find("(")+1:s.find(")")])) else: modification = False author_link_list = bp_selector.xpath("//strong/a/@href").extract() post = Post( author=bp_selector.xpath("//strong/a/text()").extract()[0] if len(author_link_list) > 0 else bp_selector.xpath("//strong/text()").extract()[0], author_id=extract_identifier(author_link_list[0]) if len(author_link_list) > 0 else None, number=int(bp_selector.xpath("//h2/span/span/text()").extract()[0][1:]), datetime=str(compute_date(bp_selector.xpath("//h2/span/a/text()").extract()[0])), content=message, signature="".join(signature).strip() if len(signature) > 0 else False, modification=modification, thread=extract_identifier(response.request.url) ) yield post
def get_mm_label():
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'  # avoid garbled text
    index = 500
    if r.status_code == 200:
        # `r` is a requests response, so build the Selector from its text
        sel = Selector(text=r.text)
        label_lists = sel.css('.tag ul li a').extract()
        for x in label_lists:
            s = Selector(text=x)
            item = MMLabel()
            item["mml_cover"] = s.css('a img::attr(src)').extract()[0]
            item["mml_label"] = s.css('a::text').extract()[0]
            item["mml_order"] = index
            item.meta.id = item['mml_order']
            print('get label ---> ' + item['mml_label'])
            try:
                rs = search.query("term", mml_order=item["mml_order"]).execute()
                if len(rs) <= 0:
                    item.save()
            except Exception as e:
                print(e.__cause__)
            index -= 1
def get_page_for(number, cj):
    url = "http://www.haushalt.fm.nrw.de/grafik/ajax.php"
    suffix = ""
    if number:
        suffix = "?selection={}".format("+".join(number.split()))
    response = requests.get(url + suffix, cookies=cj)
    s = Selector(text=response.text)
    results = {}
    for tr in s.css("tr:not(:first-child)"):
        id = tr.css("::attr(id)").extract_first()
        name = tr.css(".col3 ::text").extract_first()
        results[id] = name
        if len(id) < len("n n n"):
            results = {**results, **get_page_for(id, cj)}
    return results
def parse_job(self, response): """Parse a joblink into a JobItem. """ s = Selector(response) item = JobItem() item["url"] = response.url item["site"] = "VirtualVocations" item["title"] = s.css("h1::text").extract_first() item["text"] = s.xpath('//div[@id="job_details"]//text()').extract() try: posted = s.xpath('//div[@class="col-sm-6"]/p/text()')[8].extract() item["date_posted"] = parse_date(posted).isoformat() except Exception as e: self.logger.error(e) yield item
def parse_item(self, response):
    selector = Selector(response=response)
    selector.css('div#content div.article div.topic-content')
    item_loader = ItemLoader(item=HouseRentingDoubanItem(), selector=selector, response=response)
    item_loader.add_css(field_name='title', css='table.infobox *::text')
    item_loader.add_css(field_name='title', css='div#content > h1:first-child::text')
    item_loader.add_value(field_name='source', value=self.name)
    item_loader.add_css(field_name='author', css='h3 span.from a::text')
    # item_loader.add_css(field_name='image_urls', css='div.topic-content div#link-report img::attr(src)')
    item_loader.add_css(field_name='author_link', css='h3 span.from a::attr(href)')
    item_loader.add_css(field_name='content', css='div.topic-content div#link-report *::text',
                        re=r'\s*(.*)\s*')
    item_loader.add_value(field_name='source_url', value=response.url)
    item_loader.add_css(field_name='publish_time', css='h3 span:last-child::text',
                        re=r'\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s*')
    yield item_loader.load_item()
def _parse_docx(self, attachment): items = [] docx_bytes = BytesIO(attachment) docx_str = "" with ZipFile(docx_bytes) as zf: for zip_info in zf.infolist(): if zip_info.filename == "word/document.xml": with zf.open(zip_info) as docx_file: docx_str = StringIO(docx_file.read().decode()) if not docx_str: return # Remove MS Word namespaces on tags to use selectors sel = Selector(text=docx_str.getvalue()) sel.remove_namespaces() year_str = "".join([ p.strip() for p in sel.css("tbl > tr")[:1].css("tc:first-of-type") [:1].css("*::text").extract() if p.strip() ]) for table in sel.css("tbl"): month_str = "".join([ p.strip() for p in table.css("tr")[1:2].css("tc:first-of-type") [:1].css("*::text").extract() if p.strip() ]).title() for cell in table.css("tc > p"): cell_str = re.sub( r"((?<=[\-–]) | (?=[\-–])|@)", "", re.sub(r"\s+", " ", " ".join(cell.css("*::text").extract())).strip(), ).strip() if (len(cell_str) <= 2 or (len(cell_str) > 2 and cell_str.startswith("201")) or not cell_str[0].isdigit()): continue items.append(self._parse_item(cell_str, month_str, year_str)) return items
def get_selector(html, field):
    """
    html : Selector or HtmlResponse
    execute the xpath config if existed in field
    """
    try:
        selector = html
        if not (isinstance(html, Selector) or isinstance(html, SelectorList)):
            selector = Selector(html)
        if contains(field, 'css'):
            selector = selector.css(field['css'])
        if contains(field, 'xpath'):
            # xpath can be a multiple
            selector = xpath(selector, field['xpath'])
        return selector
    except:
        return None
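# A hypothetical usage sketch of the css-then-xpath chaining that get_selector
# performs. `contains` and `xpath` above are project helpers not shown here, so
# this sketch uses parsel directly; the field config values are made up.
from parsel import Selector

sel = Selector(text='<div class="quote"><span class="text">hello</span></div>')
field = {'css': 'div.quote', 'xpath': './span[@class="text"]/text()'}
print(sel.css(field['css']).xpath(field['xpath']).get())  # -> 'hello'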
def parse(self, response): """ Parses the http://forum.ubuntu-fr.org page for forums """ for bt in response.css(".blocktable"): bt_selector = Selector(text=bt.extract()) category = bt_selector.xpath("//h2/span/text()").extract()[0] if category in excluded_categories: continue for tr in bt_selector.xpath("//tbody/tr"): tr_selector = Selector(text=tr.extract()) description = tr_selector.css(".forumdesc").xpath("text()").extract() link = tr_selector.xpath("//a/@href").extract()[0] identifier = extract_identifier(link) forum = Forum( identifier=identifier, name=tr_selector.xpath("//h3/a/text()").extract()[0], url=make_url(link), category=category, description=description[0] if len(description) > 0 else None, parent=None ) subforum_names = tr_selector.xpath("//div/a/text()").extract() + tr_selector.xpath("//div/a/strong/text()").extract() subforum_links = tr_selector.xpath("//div/a/@href").extract() subforums = [Forum( identifier=extract_identifier(link), name=name, url=make_url(link), category=category, description=None, parent=identifier ) for name, link in zip(subforum_names, subforum_links)] forums = [forum] + subforums for forum in forums: yield forum
def parseProjectListPage(self, strTypePageUrl):
    # From the project list page, extract whether each project was funded successfully plus its summary
    projectListFilePath = self.getProjectListFilePath(strTypePageUrl)
    strFileListPageSource = None
    with open(projectListFilePath, "rb") as file:
        strFileListPageSource = file.read()
    root = Selector(text=strFileListPageSource)
    lstProjectItem = root.css(".portfolio-item-wrapper")
    for projectItem in lstProjectItem:
        strUrl = projectItem.css(".portfolio-thumb > a[href*='project']::attr(href)").extract_first()
        strID = getFileNameInUrl(strUrl)
        strDescription = projectItem.css(".portfolio-thumb > a > .portfolio-zoom::text").extract_first()
        strDescription = purifyString(strDescription)
        intStatus = 0
        successItem = projectItem.css(".ribbon-green.rgreen")
        failedItem = projectItem.css(".ribbon-green.rblue")
        if(len(successItem) > 0):
            intStatus = 1
        elif(len(failedItem) > 0):
            intStatus = 2
        self.__dicProjectInfo[strID] = {"strDescription": strDescription, "intStatus": intStatus}
def parseStartupActivityPressToJson(self, strUrl):
    lstActivityPress = []
    strStartupActivityPressFilePath = spiderForAngellist.getActivityPressLocalFilePath(
        strUrl, self.__strDate, self.__strCategory, self.__strSubCategory)
    if(os.path.isfile(strStartupActivityPressFilePath)):
        with open(strStartupActivityPressFilePath, "rb") as file:
            # Read the locally cached page into a string
            strPageSource = file.read()
        root = Selector(text=strPageSource)
        lstDivActivityPress = root.css('div.startups-show-helpers.active')
        for divActivityPress in lstDivActivityPress:
            dicActivityPress = {}
            dicActivityPress['strUrl'] = strUrl
            dicActivityPress['strSourceUrl'] = divActivityPress.css('div.headline > a::attr(href)').extract_first()
            dicActivityPress['strSourceDomain'] = divActivityPress.css('div.type_and_actions > span.type::text').extract_first()
            dicActivityPress['strTitle'] = divActivityPress.css('div.headline > a::text').extract_first()
            dicActivityPress['strContent'] = divActivityPress.css('div.summary::text').extract_first()
            dicActivityPress['strDate'] = divActivityPress.css('div.timestamp > span::text').extract_first()
            lstActivityPress.append(dicActivityPress)
    self.__lstStartupActivityPressResult[strUrl] = lstActivityPress
def parseQnA(self, strProjectUrl):
    projectID = getFileNameInUrl(strProjectUrl)
    qaPageFilePath = self.__LOCAL_PAGE_PATH + projectID + self.__LOCAL_PAGR_QA_SUFFIXES + self.__LOCAL_PAGE_EXTENSION
    strQAPageSource = None
    if os.path.isfile(qaPageFilePath) == True:
        with open(qaPageFilePath, "rb") as file:
            # Read the locally cached page into a string
            strQAPageSource = file.read()
        root = Selector(text=strQAPageSource)
        lstQnaElement = root.css("#openQA + h2 + div .panel.panel-default")
        for qnaElement in lstQnaElement:
            dicQnaResult = {}
            dicQnaResult["strUrl"] = strProjectUrl
            # Q&A question
            strQnaQuestion = qnaElement.css(".panel-heading .panel-title > a::text").extract_first()
            dicQnaResult["strQnaQuestion"] = purifyString(strQnaQuestion)
            # Q&A answer
            strQnaAnswer = qnaElement.css(".panel-collapse > .panel-body::text").extract_first()
            dicQnaResult["strQnaAnswer"] = purifyString(strQnaAnswer)
            # Q&A answer date
            strQnaDate = qnaElement.css(".panel-collapse > .panel-body > small::text").extract_first()
            strQnaDate = purifyString(strQnaDate)
            dicQnaResult["strQnaDate"] = strQnaDate[5:len(strQnaDate)]
            self.__lstQnaResult.append(dicQnaResult)
def getAllRazorLinks(self, store_id):
    store_razor_link = self.db.get_one_store(store_id)[-1]
    self.browser.get(store_razor_link)
    time.sleep(5)
    t_selector = Selector(text=self.browser.page_source)
    item5lines = t_selector.css('.J_TItems .item5line1')
    for item5line in item5lines:
        items = item5line.css('.item .detail')
        for item in items:
            razor_dict = {}
            razor_link = item.css('a ::attr(href)').extract_first('')
            razor_name = item.css('a ::text').extract_first()
            razor_id = 'null'
            mbj = re.match('.*?id=(\d+).*', razor_link)
            if mbj:
                razor_id = mbj.group(1)
            razor_dict['razor_id'] = razor_id
            razor_dict['name'] = razor_name
            razor_dict['link'] = razor_link
            razor_dict['store_id'] = store_id
            razor_dict['date'] = datetime.now().date()
            self.db.add_one_razor(razor_dict)
# Scrapy grab div with multiple classes?
from scrapy import Selector

sel = Selector(text='<div class="product product-small">I am a product!</div>')
print(sel.css('.product').extract())
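# A single-class selector like '.product' already matches elements that also carry
# other classes; to require both classes at once, chain them with no space between
# them. A minimal illustration (the '.product-large' class is made up):
print(sel.css('.product.product-small').extract())  # matches: element has both classes
print(sel.css('.product.product-large').extract())  # no match: []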
def parseSyndicateToJson(self, strUrl, strSyndicateUrl): dicSyndicateResult = {}; strSyndicateFilePath = spiderForAngellist.getSyndicateLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) if(os.path.isfile(strSyndicateFilePath)): print("[parserForAngellist] Parsing " + strSyndicateFilePath) with open(strSyndicateFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) dicSyndicateResult['strUrl'] = strSyndicateUrl dicSyndicateResult['strCrawlTime'] = self.__strDate dicSyndicateResult['strName'] = root.css('div.gridspan.antialiased > h1::text').extract_first() dicSyndicateResult['strManager'] = root.css('div.managers > div.fixed_width.u-inlineBlock > div > a.u-uncoloredLink::text').extract() intTypicalInvestment = 0 fCarryPerDeal = 0.0 intBackerCount = 0 strTerms = root.css('ul.syndicate_terms > li::text').extract() for strTerm in strTerms: if "Total Carry Per Deal:" in strTerm: strCarryPerDeal = strTerm.replace('Total Carry Per Deal:', '').replace('%','').strip() fCarryPerDeal = float(strCarryPerDeal) dicSyndicateResult['fCarryPerDeal'] = fCarryPerDeal elif "Typical Investment:" in strTerm: # strTypicalInvestment = strTerm[strTerm.rfind('$')+1:].strip().replace(',', '') # intTypicalInvestment = int(strTypicalInvestment) # dicSyndicateResult['intTypicalInvestment'] = intTypicalInvestment # Use str instead strTypicalInvestment = strTerm[strTerm.rfind(':')+1:].strip() dicSyndicateResult['strTypicalInvestment'] = intTypicalInvestment elif "Backed by" in strTerm: strBackerCount = strTerm[strTerm.find('Backed by')+9:strTerm.find('Accredited Investor')].strip() intBackerCount = int(strBackerCount) dicSyndicateResult['intBackerCount'] = intBackerCount intBackedBy = 0 intDealsPerYear = 0 divSyndicateSummaryItems = root.css('ul.syndicate_summary > li') for divSyndicateSummaryItem in divSyndicateSummaryItems: strLabel = divSyndicateSummaryItem.css('div.syndicate_summary_label::text').extract_first().strip() if "Backed By" in strLabel: strBackedBy = divSyndicateSummaryItem.css('div.syndicate_summary_value::text').extract_first().strip() strCurrency = strBackedBy[:1] strBackedBy = strBackedBy[1:] if(strCurrency == u'$'): strCurrency = 'USD' elif (strCurrency == u'€'): strCurrency = 'EUR' intBase = 1 if(strBackedBy[-1:] == u'K'): intBase = 1000 strBackedBy = strBackedBy[:-1] elif(strBackedBy[-1:] == u'M'): intBase = 1000000 strBackedBy = strBackedBy[:-1] intBackedBy = int(locale.atof(strBackedBy.replace(",", "")) * intBase) dicSyndicateResult['strCurrency'] = strCurrency dicSyndicateResult['intBackedBy'] = intBackedBy elif "Expected Deals/Year" in strLabel: strDealsPerYear = divSyndicateSummaryItem.css('div.syndicate_summary_value::text').extract_first().strip() intDealsPerYear = int(strDealsPerYear) dicSyndicateResult['intDealsPerYear'] = intDealsPerYear lstStrBackers = root.css('div.gridspan > div.feature > figure > h3 > a.profile-link::text').extract() lstOverflowBackers = root.css('div.gridspan > ul.overflow > li > h4 > a.profile-link::text').extract() lstStrBackers.extend(lstOverflowBackers) dicSyndicateResult['lstStrBackers'] = lstStrBackers self.__lstSyndicateResult[strSyndicateUrl] = dicSyndicateResult
def parsePeopleToJson(self, strUrl): strObjectID = getFileNameInUrl(spiderForAngellist.getPureUrl(strUrl)) dicInvestorResult = {}; dicInvestorResult['strUrl'] = strUrl dicInvestorResult['strCrawlTime'] = self.__strDate strPeopleFilePath = spiderForAngellist.getPeopleLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) print("[parserForAngellist] Parsing " + strPeopleFilePath) if(os.path.isfile(strPeopleFilePath)): with open(strPeopleFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) strName = root.css("h1.js-name::text").extract_first().strip() dicInvestorResult['strName'] = strName lstDivInfo = root.css("div.tags > span.tag") strLocation = '' lstStrRole = [] for divInfo in lstDivInfo: if(divInfo.css('span.fontello-location.icon')): if(divInfo.css("::attr(title)") and (divInfo.css("::attr(title)").extract_first() != '')): strAllLocation = divInfo.css("::attr(title)").extract_first().strip() elif(divInfo.css("::attr(oldtitle)")): strAllLocation = divInfo.css("::attr(oldtitle)").extract_first().strip() else: strAllLocation = divInfo.css("::text").extract_first().strip() lstStrLocation = strAllLocation.split(',') lstStrLocation = map(unicode.strip, lstStrLocation) strLocation = lstStrLocation[0] elif(divInfo.css('span.fontello-tag-1.icon')): if(divInfo.css("::attr(title)") and (divInfo.css("::attr(title)").extract_first() != '')): strRole = divInfo.css("::attr(title)").extract_first().strip() elif(divInfo.css("::attr(oldtitle)")): strRole = divInfo.css("::attr(oldtitle)").extract_first().strip() else: strRole =divInfo.css("::text").extract_first().strip() lstStrRole = strRole.split(',') lstStrRole = map(unicode.strip, lstStrRole) dicInvestorResult['lstStrRole'] = lstStrRole dicInvestorResult['strLocation'] = strLocation dicLocation = self.parseLocation(strLocation) print("location parse complete") # strGeonameId = geonames.search(q=strLocation)[0]['geonameId'] # dicGeoname = geonames.get(strGeonameId) # bbox = dicGeoname['bbox'] # strCountry = dicGeoname['countryCode'] # strContinent = dicGeoname['continentCode'] # dicCity = geonames.findCity(north=bbox['north'], south=bbox['south'], east=bbox['east'], west=bbox['west'])[0] # strCity = dicCity['name'] dicInvestorResult['strCity'] = dicLocation['strCity'] dicInvestorResult['strCountry'] = dicLocation['strCountry'] dicInvestorResult['strContinent'] = dicLocation['strContinent'] intFollower = 0 if(root.css("a.followers_count.follow_link")): strFollower = root.css("a.followers_count.follow_link::text").extract_first().strip() strFollower = strFollower.split(' ')[0].replace(",", "") intFollower = int(strFollower) dicInvestorResult['intFollower'] = intFollower intFollowing = 0 if(root.css("a.following_count.follow_link")): strFollowing = root.css("a.following_count.follow_link::text").extract_first().strip() strFollowing = strFollowing.split(' ')[0].replace(",", "") intFollowing = int(strFollowing) dicInvestorResult['intFollowing'] = intFollowing lstStrMarket = [] lstStrMarketIndustry = [] lstAboutContent = root.css("div.s-grid0-colMd24.s-vgBottom2.field") for aboutContent in lstAboutContent: if(aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip() == 'Locations'): strLocation = aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip(); lstStrMarket = aboutContent.css('div.s-grid-colMd17 > div.item > div.module_taggings > div.content > div.value > span.tag > a::text').extract() 
elif(aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip() == 'Markets'): strMarket = aboutContent.css("div.s-grid-colMd5 > div.u-uppercase::text").extract_first().strip(); lstStrMarketIndustry = aboutContent.css('div.s-grid-colMd17 > div.item > div.module_taggings > div.content > div.value > span.tag > a::text').extract() dicInvestorResult['lstStrMarketIndustry'] = lstStrMarketIndustry dicInvestorResult['lstStrMarket'] = lstStrMarket lstExperience = [] lstDivExperience = root.css('div.feature.startup_roles.experience') for divExperience in lstDivExperience: dicExperienceResult = {} dicExperienceResult['strUrl'] = strUrl dicExperienceResult['strName'] = strName strCompany = divExperience.css('a.u-unstyledLink::text').extract_first().strip() dicExperienceResult['strCompany'] = strCompany strRole = divExperience.css('div.line > span.medium-font::text').extract_first().strip() dicExperienceResult['strRole'] = strRole lstExperience.append(dicExperienceResult) self.__lstExperinceResult[strUrl] = lstExperience #print("[parserForAngellist] lstExperience " + str(lstExperience)) lstReference = [] lstDivReference = root.css('div.profiles-show.review') for divReference in lstDivReference: dicReferenceResult = {} dicReferenceResult['strUrl'] = strUrl dicReferenceResult['strName'] = strName strContent = divReference.css('div.review-content::text').extract_first().strip() dicReferenceResult['strContent'] = strContent # strAuthor = divReference.css('div.annotation > div.profile-link::text').extract_first().strip() # dicReferenceResult['strAuthor'] = strAuthor lstStrAuthorContext = divReference.css('div.annotation').xpath('.//text()').extract() lstStrAuthorContext = map(unicode.strip, lstStrAuthorContext) lstStrAuthorContext = filter(lambda x: len(x) > 1, lstStrAuthorContext) strAuthor = lstStrAuthorContext[0] strAuthorContext = ','.join(lstStrAuthorContext) # strAuthorContext = divReference.css('div.annotation').extract_first().strip() dicReferenceResult['strAuthor'] = strAuthor dicReferenceResult['strAuthorContext'] = strAuthorContext lstReference.append(dicReferenceResult) self.__lstReferenceResult[strUrl] = lstReference #print("[parserForAngellist] lstReference " + str(lstReference)) self.__lstInverstorResult[strUrl] = dicInvestorResult # strInvestorJsonFilePath = parserForAngellist.getInvestorJsonFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) # saveObjToJson(dicInvestorResult, strInvestorJsonFilePath) # print("[parserForAngellist.] Result " + str(dicInvestorResult)) dicSyndicateResult = {}; divSyndicate = root.css("div.back_syndicate_button") if(divSyndicate): uSyndicateUrl = divSyndicate.css("a::attr(href)").extract_first().strip() strSyndicateUrl = parserForAngellist.PARSE_BASE_URL + str(uSyndicateUrl) self.parseSyndicateToJson(strUrl, strSyndicateUrl)
def parseStartupToJson(self, strUrl): strObjectID = getFileNameInUrl(strUrl) dicStartupResult = {}; dicStartupResult['strUrl'] = strUrl dicStartupResult['strCrawlTime'] = self.__strDate strStartupFilePath = spiderForAngellist.getOverviewLocalFilePath(strUrl, self.__strDate, self.__strCategory, self.__strSubCategory) print("[parserForAngellist] Parsing " + strStartupFilePath) if(os.path.isfile(strStartupFilePath)): with open(strStartupFilePath, "rb") as file: #讀取本地端文件檔案內容到字串 strPageSource = file.read() root = Selector(text=strPageSource) strCompany = root.css('div.text > div.name_holder > h1.name::text').extract_first().strip() dicStartupResult['strCompany'] = strCompany # Some company didn't have intros, and h2 has some parsing error strIntro = root.css('div.main.standard > div.text').css('p::text').extract_first() dicStartupResult['strIntro'] = strIntro lstStrProduct = root.css('div.product_desc > div.show.windows > div.content::text').extract() dicStartupResult['lstStrProduct'] = lstStrProduct lstStrFounders = [] lstStrFoundersDesc = [] divFoundersSection = root.css('div.founders.section > div.startups-show-sections > div.startup_roles') if(divFoundersSection): lstDivFounders = divFoundersSection.css('div.text') for divFounder in lstDivFounders: strFounderName = divFounder.css('div.name > a::text').extract_first() lstFounderDescs = divFounder.css('div.bio > p').css('::text').extract() # lstFounderDescs = map(unicode.strip, lstFounderDescs) lstFounderDescs = filter(lambda x: len(x) > 1, lstFounderDescs) strFounderDesc = ''.join(lstFounderDescs) lstStrFounders.append(strFounderName) lstStrFoundersDesc.append(strFounderDesc) # lstStrFoundersName = divFoundersSection.css('div.text > div.name > a::text').extract() # lstStrFoundersName = divFoundersSection.css('div.text > div.name > a::text').extract() dicStartupResult['lstStrFounders'] = lstStrFounders dicStartupResult['lstStrFoundersDesc'] = lstStrFoundersDesc lstStrTeam = [] lstStrTeamDesc = [] divTeamSection = root.css('div.team.section > div.startups-show-sections > div.group') if(divTeamSection): lstDivTeam = divTeamSection.css('div.text') for divTeam in lstDivTeam: strTeamName = divTeam.css('div.name > a::text').extract_first() lstTeamDescs = divTeam.css('div.bio > p').css('::text').extract() lstTeamDescs = filter(lambda x: len(x) > 1, lstTeamDescs) strTeamDesc = ''.join(lstTeamDescs) lstStrTeam.append(strTeamName) lstStrTeamDesc.append(strTeamDesc) dicStartupResult['lstStrTeam'] = lstStrTeam dicStartupResult['lstStrTeamDesc'] = lstStrTeamDesc lstLocationIndustry = root.css('div.main.standard > div.text > div.tags').css('a.tag::text').extract() strLocation = '' lstIndustry = [] if(len(lstLocationIndustry) > 0): strLocation = lstLocationIndustry[0] lstIndustry = lstLocationIndustry[1:] dicStartupResult['strLocation'] = strLocation dicStartupResult['lstIndustry'] = lstIndustry dicLocation = self.parseLocation(strLocation) dicStartupResult['strCity'] = dicLocation['strCity'] dicStartupResult['strCountry'] = dicLocation['strCountry'] dicStartupResult['strContinent'] = dicLocation['strContinent'] lstStrFollowers = self.parseStartupFollowersToJson(strUrl) dicStartupResult['lstStrFollowers'] = lstStrFollowers lstStrInvestor = [] divFundingSection = root.css('div.past_financing.section.startups-show-sections') if(divFundingSection): lstStrInvestor = divFundingSection.css('ul.roles > li.role').css('div.name > a::text').extract() dicStartupResult['lstStrInvestor'] = lstStrInvestor isFundraising = False divFundraisingHeader = 
root.css('div.fundraising.header') if(divFundraisingHeader): strFundraising = divFundraisingHeader.css('::text').extract_first() if "Fundraising" in strFundraising: isFundraising = True dicStartupResult['isFundraising'] = isFundraising lstStartupSeries = [] if(divFundingSection): lstDivStartupSeries = divFundingSection.css('div.startups-show-sections.startup_rounds > ul.startup_rounds.with_rounds > li.startup_round') for divStartupSeries in lstDivStartupSeries: dicStartupSeriesResult = {} dicStartupSeriesResult['strUrl'] = strUrl dicStartupSeriesResult['strCrawlTime'] = self.__strDate dicStartupSeriesResult['strCompany'] = strCompany strSeriesType = '' divStartupSeriesType = divStartupSeries.css('div.details.inner_section > div.header > div.type') if(divStartupSeriesType): strSeriesType = divStartupSeriesType.css('::text').extract_first().strip() dicStartupSeriesResult['strSeriesType'] = strSeriesType strSeriesMoney = u'Unknown' intSeriesMoney = 0 divStartupSeriesMoney = divStartupSeries.css('div.details.inner_section > div.raised') if(divStartupSeriesMoney): lstStrSeriesMoney = divStartupSeriesMoney.css('::text').extract() strSeriesMoney = "".join(lstStrSeriesMoney).strip() if(strSeriesMoney != u'Unknown'): strCurrency = strSeriesMoney[:1] strSeriesMoney = strSeriesMoney[1:] if(strCurrency == u'$'): strCurrency = 'USD' elif (strCurrency == u'€'): strCurrency = 'EUR' intBase = 1 if(strSeriesMoney[-1:] == u'K'): intBase = 1000 strSeriesMoney = strSeriesMoney[:-1] elif(strSeriesMoney[-1:] == u'M'): intBase = 1000000 strSeriesMoney = strSeriesMoney[:-1] intSeriesMoney = int(locale.atof(strSeriesMoney.replace(",", "")) * intBase) if(intSeriesMoney == 0): dicStartupSeriesResult['intSeriesMoney'] = strSeriesMoney else: dicStartupSeriesResult['intSeriesMoney'] = intSeriesMoney dicStartupSeriesResult['strCurrency'] = strCurrency strSeriesDate = '' divStartupSeriesDate = divStartupSeries.css('div.details.inner_section > div.header > div.date_display') if(divStartupSeriesDate): strSeriesDate = divStartupSeriesDate.css('::text').extract_first() dicStartupSeriesResult['strSeriesDate'] = strSeriesDate lstStrInvestor = divStartupSeries.css('div.participant > div.text > div.name > a::text').extract() lstStrInvestorUrl = divStartupSeries.css('div.participant > div.text > div.name > a::attr(href)').extract() dicStartupSeriesResult['lstStrInvestor'] = lstStrInvestor dicStartupSeriesResult['lstStrInvestorUrl'] = lstStrInvestorUrl lstStartupSeries.append(dicStartupSeriesResult) # print("[parserForAngellist] Startup Series" + str(dicStartupSeriesResult)) # import pdb; pdb.set_trace() self.__lstStartupSeriesResult[strUrl] = lstStartupSeries self.parseStartupActivityPressToJson(strUrl) self.__lstStartupResult[strUrl] = dicStartupResult