Example no. 1
    def parseAlbumPage(self, response):

        self._logger.debug("Parsing Album Page at url %s ", response.url)
        artist_name = response.meta['artist_name']
        album_name = response.meta['album_name']
        self._logger.debug("Parsing Album %s for Artist %s", album_name, artist_name)
        hxs = Selector(response)
        album = AlbumScraperItem()
        album['item_type'] = 'Album'
        album['album_name'] = album_name
        album['artist_name'] = artist_name

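        # pull label, title, release year and styles from fixed cells of the album info table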
        album_label = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[6]/td[2]/text()').extract()
        if (len(album_label) > 0):
            album['album_label'] = album_label[0]
        album_title = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[2]/td[2]/text()').extract()
        if (len(album_title) > 0):
            print album_title[0]
        released_year = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[3]/td[2]/text()').extract()
        if (len(released_year) > 0):
            album['album_year'] = released_year[0]
        album_styles = hxs.xpath('/html/body/table[2]/tr/td[2]/table/tr[5]/td[2]/ul/li')
        style_list = []
        for album_style in album_styles:
            style = album_style.xpath('./text()').extract()
            if (len(style) > 0):
                style_list.append(style[0])

        album['album_styles'] = ','.join(style_list)

        yield album
Example no. 2
 def parse_review(self, response):
     hxs = Selector(response)
     asin = response.meta['asin']
     title = FmtSQLCharater(first_item(hxs.xpath('//title/text()').extract()))
     title = title.replace(u'Amazon.com: Customer Reviews: ', '')
     rlist = hxs.xpath("//div[@id='cm_cr-review_list']/div[@class='a-section review']")
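     # each review sits in its own div; build one Review item per block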
     for div in rlist:
         r = Review()
         r['product_id'] = asin
         r['product_name'] = title
         r['review_id'] = first_item(div.xpath('@id').extract())
         votes = FmtSQLCharater(first_item(div.xpath('div[1]/span/text()').extract()))
         match = re.search(u'(.+) people found this helpful', votes, re.I)
         if match:
             r['total_feedback_num'] = match.group(1)
             r['total_helpful_num'] = match.group(1)  # the pattern captures only one group
         #
         r['full_star'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[1]/i/span/text()").extract()))
         r['title'] = FmtSQLCharater(first_item(div.xpath("div[2]/a[2]/text()").extract()))
         r['cust_name'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[1]/a/text()").extract()))
         r['creation_date'] = FmtSQLCharater(first_item(div.xpath("div[3]/span[4]/text()").extract()))
         #r['creation_date'] = r['creation_date'].replace(u'于 ', '').replace('年', '/').replace(u'月', '/').replace(u'日', '/')
         r['body'] = first_item(div.xpath("div[5]/span").extract())
         yield r
     # next page: a full page of 10 reviews means there may be more
     if len(rlist) == 10:
         page = response.meta['page'] + 1
         log.msg('Request Product[%s]-[%d] page review ...' % (asin, page))
         yield Request(
             url=self.review_url.replace('<?asin?>', asin).replace('<?page?>', str(page)),
             callback=self.parse_review,
             headers=self.headers,
             meta={'page': page, 'asin': asin}
         )
Example no. 3
    def parse_item(self, response):
        #self.log('AshfordSpider#parse_item...')
        self._logger.info('AshfordSpider#parse_item...')
        item = AshfordItem()
        sel = Selector(response)
        self._enrich_base_data(item, response, is_update=False)
        self._enrich_same_part(item, response)
        item['prodName'] = ''.join(sel.xpath(' //*[@id="prodName"]/a/text()').extract()).strip()
        item['prod_desc'] = (''.join(sel.xpath('//*[@id="fstCont"]/h3/text()').extract()).strip())
        item['detail'] = format_html_string(''.join(sel.xpath('//div[@id="tab1_info"]').extract()).strip())
        item['Brand'] = ''.join(sel.xpath('//h1[@id="prodName"]/a[@id="sameBrandProduct"]/text()[1]').extract()).strip()
        item['product_images'] = list(set(sel.xpath('//a[contains(@href,"/images/catalog/") and contains(@href,"XA.jpg")]/@href').extract()))
        item['image_urls'] = [urljoin(response.url, i) for i in item['product_images']]
        chinese_url = response.url.replace('www.', 'zh.')

        response.meta['item_half'] = item
        self.crawler.stats.inc_crawled_pages(
            crawlid=response.meta['crawlid'],
            spiderid=response.meta['spiderid'],
            appid=response.meta['appid']
        )

        yield Request(
            url=chinese_url,
            meta=response.meta,
            callback=self.parse_chinese_detail,
            dont_filter=True
            )
Example no. 4
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    if not body:
        return {"annotations-plugin": {"extracts": []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath("//*[@data-scrapy-annotate]"):
        attributes = elem.root.attrib
        annotation = json.loads(unquote(attributes["data-scrapy-annotate"]))
        if isinstance(elem.root, _Element) and elem.root.tag.lower() == "ins":
            annotation.update(find_generated_annotation(elem))
        else:
            annotation["tagid"] = attributes.get("data-tagid")
        if "id" not in annotation:
            annotation["id"] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation["id"])
        annotations.append(annotation)
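    # collect "ignore" annotations from elements carrying any of the IGNORE_ATTRIBUTES markers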
    for elem in sel.xpath("//*[@%s]" % "|@".join(IGNORE_ATTRIBUTES)):
        attributes = elem.root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        ignore = {attribute[len("data-scrapy-") :]: True}
        if "id" not in ignore:
            ignore["id"] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore["id"])
        annotations.append(ignore)
    return {"annotations-plugin": {"extracts": annotations}}
Example no. 5
    def stockholder(self,response):
        hxs = Selector(response)

        data = response.meta['data']
        data['type'] = 'StockHolder'
        data['reference'] = response.url

        date_str = ''.join(hxs.xpath('//tr[position() =1]/td[1]/text()').extract()).replace(u'(Ultimo Período Informado)','').strip()
        #06 / 2015
        try:
            date_struct = time.strptime(date_str,'%m / %Y')
            data['stock_date'] = time.strftime('%Y-%m-%dT%H:%M:%SZ',date_struct)
        except:
            self.log('No Information: %s' % response.url)

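        # every remaining table row is one shareholder; an empty name marks a company-level row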
        stock_list = hxs.xpath('//tr[position() >1]')
        for item in stock_list:
            data['type'] = 'StockHolder'
            data['stock_name'] = ''.join(item.xpath('td[1]/text()').extract()).strip()
            data['stock_percentage'] = ''.join(item.xpath('td[4]/text()').extract()).strip()
            if data['stock_name'] == '':
                data['type'] = 'Company'
            m = hashlib.md5()
            to_hash = data['type'] + data['company_ICN'] + data['company_name'] + \
                      data['stock_name'] + data['stock_percentage']
            m.update(to_hash.encode('utf-8'))
            data['id'] = m.hexdigest()
            yield data
Example no. 6
        def xt(cls, response):
            XPATH_BI_creator = cls.XPATH.format("Erstunterzeichner")
            XPATH_PET_creator = cls.XPATH.format("eine Petition")

            creators = []

            raw_creators_list = response.xpath(XPATH_PET_creator).extract()
            if len(raw_creators_list) > 0:
                # PET started by members of parliament
                for raw_creator in raw_creators_list:
                    creator_sel = Selector(text=raw_creator)
                    raw_parl_id_url = creator_sel.xpath("//a/@href").extract()
                    name = u''
                    parl_id = u''
                    if len(raw_parl_id_url) > 0:
                        raw_parl_id = raw_parl_id_url[0].split("/")
                        if len(raw_parl_id) > 2:  # need at least three path segments to read index 2
                            parl_id = raw_parl_id[2]
                    raw_name = creator_sel.xpath("//a/text()").extract()
                    if len(raw_name) > 0:
                        name = raw_name[0]
                    if parl_id != u'' and name != u'':
                        creators.append((parl_id, name))
            else:
                raw_creators_list = response.xpath(XPATH_BI_creator).extract()
                if len(raw_creators_list) > 0:
                    # BI first signed by a person
                    name = _clean(raw_creators_list[0].split("\t")[1])
                    creators.append(("", name))
                # VBG seem to have no visible "starter"

            return creators
Example no. 7
    def parse_business_page(self, response):
        #
        # Set up xpaths for populating item entries
        #
        hxs               = Selector(response)
        contact           = hxs.xpath('//*[@id="main-content"]/div[1]/div[1]/div/section[2]/div[1]')
        bNameXPath_list   = hxs.xpath('//*[@id="main-content"]/div[1]/div[1]/h1/text()').extract()        
        bStreetXPath_list = contact.xpath('./p[@class="street-address"]/text()').extract()
        bCityState_list   = contact.xpath('./p[@class="city-state"]/text()').extract()
        bPhone_list       = contact.xpath('./p[@class="phone"]/text()').extract()

        #
        # Grab specific business fields
        #
        businessItem           = YellowpagesItem()
        businessItem['Name']   = bNameXPath_list[0] if bNameXPath_list else ''
        businessItem['Street'] = bStreetXPath_list[0] if bStreetXPath_list else ''
        businessItem['Phone']  = bPhone_list[0] if bPhone_list else ''
        if bCityState_list:
            city_state_string      = bCityState_list[0]
            businessItem['City'], businessItem['State'], businessItem['Postal'] = city_state_string.split()
            businessItem['City'] = businessItem['City'].strip(',')
            businessItem['Postal'] = int(businessItem['Postal'])
            businessItem['Street'] = businessItem['Street'].strip(',')
        else:
            city_state_string = ''

        return businessItem
Example no. 8
def extract_combine_JCXX(response):
    """
    Extract regulatory information (e.g. http://00225516.11315.com/).
    """
    sel = Selector(text=response.body)
    xpath_result = [response.url[7:15]] 
    # stores the values extracted via xpath; response.url[7:15] is the company id
    keywords1 = ['企业法人营业执照',
    '组织机构代码','税务登记证','银行开户许可证',
    '第三方征信认证']
    xpath_syn1 = [u"//a[text()='%s']/@href"%i for i in keywords1]
    for i in xpath_syn1:
    # extract the links for business license, organization code certificate, etc.
        tmp = sel.xpath(i).extract()
        if len(tmp) == 0:
            xpath_result.append("")
        elif len(tmp) == 1 and 'java' not in tmp[0]:   # exactly one element and not "javascript:void(0);"
            xpath_result.append("http://00225516.11315.com"+tmp[0].strip())
        else:
            log.msg("error for_JCXX xpath_syn1 xpath_result=%s\
                "%"\001".join(xpath_result), level=log.ERROR)

    keywords = ['1-2.质量检查信息','1-3.行政许可资质',\
    '1-4.行政监管信息','1-5.商标/专利/著作权信息',\
    '1-6.人民法院的判决信息','1-7.人民法院判定的被执行人信息',\
    '1-8.人民法院核定的失信被执行人信息']
    # xpath_syn = [u"//a[text()='%s']/ancestor::div[@class=\
    # 'f-cb bdsolid']/div"%i for i in keywords] #xpath语句
    xpath_syn = [u"//a[text()='%s']/../../div" % i for i in keywords]  # xpath expressions
    for i in xpath_syn:
        raw_re = sel.xpath(i)
        check_total = raw_re.xpath("./a[1]/text()").extract() 
        # check whether there is any info; an empty check_total means there is none
        check_a = raw_re.xpath("./a")
        # check whether the summary is subdivided; more than one <a> tag means there are sub-categories

        if len(check_total) == 0:
            # no information at all: emit "0" for every category joined by "|"
            xpath_result.append("|".join(["0" for i in xrange(0,len(check_a))])+"|") 
            # the trailing "|" separates the counts from the (empty) url
        elif len(check_a) == 1:
            # only a total count: concatenate the count and its url
            all_JCXX = check_total[0].strip()
            all_JCXX_url = check_a.xpath("./@href").extract()[0]
            all_JCXX_url = "http://00225516.11315.com" + all_JCXX_url
            xpath_result.append(str(all_JCXX) + "|" + all_JCXX_url)
        else:
            # a total count with sub-entries: iterate and pull out each sub-entry
            info_me = []
            all_JCXX_url = check_a.xpath("./@href").extract()[0]
            all_JCXX_url = "http://00225516.11315.com" + all_JCXX_url
            for i in xrange(1, len(check_a)+1):
                s = "./a[%s]/text()" %str(i)
                ex_out = raw_re.xpath(s).extract()
                if len(ex_out) == 0:
                    info_me.append("0")  # the <a> node has no text, append "0" as a placeholder
                else:
                    info_me.append(ex_out[0].strip())
            xpath_result.append("|".join(info_me) + "|" + all_JCXX_url)
    return "\001".join(xpath_result)  # join all fields with "\001"
Example no. 9
    def parse_user_page(self, response):
        html = response.body

        # f=open('E:/PyCharm/CatPackages/resources/doc/user.html','w')
        # f.write(response.body)
        # f.close()

        html = html.decode('utf-8')

        sel = Selector(text=html)
        uid = sel.xpath('//script').re("\$CONFIG\['oid'\]='(\d+?)';")[0]
        pid = sel.xpath('//script').re("\$CONFIG\['page_id'\]='(\d+?)';")[0]
        name = sel.xpath('//script').re("\$CONFIG\['onick'\]='(.*?)';")[0]
        print(uid)
        print(pid)
        print(name)
        # import pdb
        # pdb.set_trace()

        # (follow_num, fan_num, post_num) = sel.xpath('//script').re("<strong.*?>(\d+)<.*?/strong>")
        # follow_num = int(follow_num)
        # fan_num = int(fan_num)
        # post_num = int(post_num)

        # verify = sel.xpath('//script').re("W_icon icon_verify_(v|club)") is not None

        yield UserItem(uid=uid, pid=pid, name=name)
Example no. 10
    def parse_detail(self,response):
        item = CrawldetailsItem()
        sel = Selector(response)

        try:
            item["kd"] = response.meta['kd']
            item["title"] = self.get_text(sel,'//*[@id="job_detail"]/dt/h1/@title')
            item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip()
            item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0]
            item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0]
            industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0]
            item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip()
            scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0]
            item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip()
            phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0]
            item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip()
            item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0]
            item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0]
            item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0]
            item["description"] = self.get_text(sel,'//*[@id="job_detail"]/dd[2]')
            item["url"] = response.url
            item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8]
            item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')


        except Exception, e:
            print e
Example no. 11
    def parse(self, response):
        hxs = Selector(response)
        item = MartinScrapperItem()

        try:
            titles = hxs.xpath(
                '//div[@class="wpb_wrapper"]//div[@class="wpb_wrapper"]/text()'
            ).extract()
            print(titles)
            clean_titles = [
                ct for ct in [
                    re.sub(r'\s+',' ',title).strip() for title in titles
                ] if ct
            ]
            info_list = hxs.xpath(
                    '//div[@class="wpb_wrapper"]//div[@class="wpb_wrapper"]/p/text()'
            ).extract()
            clean_info_list = [
                ct for ct in [
                    re.sub(r'\s+',' ',info).strip() for info in info_list
                ] if ct
            ]
            print(clean_info_list)
            #print(info)
        except KeyError:
            self.log('Unable to find title', level=log.WARNING)
        return item
Example no. 12
    def parse_homepage(self, response):
        sel = Selector(response)

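        # helper: build a Request for one country's sights page; returns None when the country filter excludes it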
        def func(node, hot):
            country_url = node.xpath('./@href').extract()[0].strip()
            country_name = node.xpath('./text()').extract()[0].strip()
            ret = node.xpath('./span[@class="en"]/text()').extract()
            country_engname = ret[0].lower().strip() if ret else None

            if 'country' in self.param and country_engname.lower() not in self.param['country']:
                return None

            sights_url = urlparse.urljoin(country_url, './sight')
            m = {"country_name": country_name, "country_url": country_url, "country_popular": hot,
                 "country_engname": country_engname, "sights_url": sights_url}
            return Request(url=sights_url, callback=self.parse_countrysights, meta={"country": m})

        for req in map(lambda node: func(node, False),
                       sel.xpath('//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/a[@href]')):
            yield req

        for req in map(lambda node: func(node, True),
                       sel.xpath(
                               '//div[@id="allcitylist"]/div[contains(@class,"line")]/ul/li/p[@class="hot"]/a[@href]')):
            yield req
Example no. 13
    def pages(self, response):
        """
        Extract the page count for each case category and request every page.
        """
        sel = Selector(text=response.body)
        # self.file_haveRequested.write(response.url+"\n")
        self.cases(response)   # extract the content of the first page
        iscontinue = len(sel.xpath("//div[@id='bottom_right_con_five_xsaj']//ul"))
        if iscontinue:  # if the current page is not empty
            try:
                pages = sel.xpath("//div[@id='bottom_right_con_five_xsaj']//script").re("createPageHTML\(([\d]*?),")[0]
                baseurl = response.url
                for i in range(1, int(pages)+1):  # for test
                    fp = self.url_fingerprint(baseurl+"index_"+str(i)+".htm")
                    if fp not in self.url_have_seen:
                        self.url_have_seen.add(fp)
                        yield Request(baseurl+"index_"+str(i)+".htm", 
                            callback = self.cases, dont_filter=False)
                    else:
                        pass
                    # yield Request(baseurl+"index_"+str(i)+".htm", 
                    #         callback = self.cases, dont_filter=False)                    


            except Exception, e:
                log.msg("only_one url==%s== error=%s" %(response.url,\
                    e), level=log.ERROR)
Example no. 14
    def parse_location(self, response):
        sel = Selector(response)
        print(" **************** LOCATION LIST *************")
        print(response.url)
        print(" **************** LOCATION LIST *************")

        location = sel.xpath("//ul[@class='geoList']")
        for loc in location:
            state_link = loc.xpath("li/a/@href").extract()
            print(" **************** Attraction List starts *************")

            for link in state_link:
                url_link = response.urljoin(link)
                print(url_link)
                # "https://www.tripadvisor.com/Attractions-g34345-Activities-Key_West_Florida_Keys_Florida.html"
                yield scrapy.Request(url_link, callback=self.parse_attraction)
            print(" **************** Attraction List  ends *************")



            # yield scrapy.Request(url_link,callback=self.parse_test)

        locations = sel.xpath("//a[@class='guiArw sprite-pageNext  pid0']/@href").extract()
        print(" **************** LOCATION LIST  PAGINATION  starts *************")
        print(locations)
        print(" **************** LOCATION Link *************")

        for location in locations:
            if location:
                location_link = response.urljoin(location)
                print(location_link)
                yield scrapy.Request(location_link, callback=self.parse_location)
        print(" **************** LOCATION Link *************")

        print(" **************** LOCATION LIST  PAGINATION  ends *************")
Example no. 15
    def stepOne(self, response):
        
        hxs = Selector(response)
        # untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[@class="preview untranslated priority-normal no-warnings"]/td[@class="original"]')
        untranslatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "untranslated") ]/td[@class="original"]')

        for rows in untranslatedRows:

            aux = WordpressTranslationHackItem()
            aux['originalString'] = ''

            for r in rows.xpath('./child::node()').extract():        
                aux['originalString'] = aux['originalString'] + r.strip() + ' '        
            
            self.untranslated.append( aux )
                
            # print ( self.untranslated[-1] )
            # print ( '------------------' )
            # pdb.set_trace()

        paginaSiguiente = []
        paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')        

        try:            
            fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() )
            return fullUrl_toNextPage
        except Exception:
            return None
Example no. 16
def load_annotations(body):
    """Create slybot annotations from annotated html."""
    if not body:
        return {'annotations-plugin': {'extracts': []}}
    sel = Selector(text=add_tagids(body))
    existing_ids = set()
    annotations = []
    for elem in sel.xpath('//*[@data-scrapy-annotate]'):
        attributes = elem._root.attrib
        annotation = json.loads(unquote(attributes['data-scrapy-annotate']))
        if (isinstance(elem._root, _Element) and
                elem._root.tag.lower() == 'ins'):
            annotation.update(find_generated_annotation(elem))
        else:
            annotation['tagid'] = attributes.get('data-tagid')
        if 'id' not in annotation:
            annotation['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(annotation['id'])
        annotations.append(annotation)
    for elem in sel.xpath('//*[@%s]' % '|@'.join(IGNORE_ATTRIBUTES)):
        attributes = elem._root.attrib
        for attribute in IGNORE_ATTRIBUTES:
            if attribute in attributes:
                break
        ignore = {attribute[len('data-scrapy-'):]: True}
        if 'id' not in ignore:
            ignore['id'] = gen_id(disallow=existing_ids)
        existing_ids.add(ignore['id'])
        annotations.append(ignore)
    return {'annotations-plugin': {'extracts': annotations}}
Example no. 17
 def detail(self, response):
     """
     extract detail info
     """
     sel = Selector(text=response.body)
     
     condition = sel.xpath(self.xpathSen["brand"]).extract()
     if len(condition) != 0:
         xpath_keys = ["type_auto","brand","level","BSX",
              "CSJG","ZWGS","PL","RLXS","QDFS"]
         xpath_conf = ["DDTC","DDTJZY","ESP","GPS","DSXH",
              "DCLD","DGLFXP"]
         keys_info = []
         for xpath_str in xpath_keys:
             tmp = sel.xpath(self.xpathSen[xpath_str]).extract()
             try:
                 keys_info.append(tmp[0])
             except Exception, e:
                 keys_info.append("")
                 log.msg("error info=%s keys_info=%s" %(e, "\001".join(keys_info)), level=log.ERROR)
         
         conf_info = []
         for xpath_s in xpath_conf:
             tmp = sel.xpath(self.xpathSen[xpath_s]).extract()
             try:
                 conf_info.append(tmp[0])
             except Exception, e:
                 conf_info.append("-")
                 log.msg("error info=%s conf_info=%s"%(e, \
                     "\001".join(conf_info)), level=log.ERROR)
Example no. 18
 def parse(self, response):
     zip_file = open('CANADA_ZIPCODES.txt', 'r+')
     zip_list = filter(None, zip_file.read().split('\n'))
     for zip_item in zip_list:
         print "*** zip_item"
         print zip_item
         geo_url = 'https://maps.google.com/?q=%s canada'%(zip_item)
         try:
             map_url_content = requests.get(geo_url).content
         except:
             sleep(15)
             map_url_content = requests.get(geo_url).content
         sleep(3)
         sell = Selector(text=map_url_content)
         map_error_1 = sell.xpath(
             '//div[@class="sp-error-msg"]|//div[@class="noprint res"]/div//div[contains(@id,"marker_B")]')
         latlong = ' '.join(sell.xpath('//script').extract()) if not map_error_1 else ''
         lat_lng = re.findall(r'",\[(-?\d+\.?\d*),(-?\d+\.?\d*)\]\]', latlong, re.I)
         venue_latitude, venue_longitude = lat_lng[0] if lat_lng else ('', '')
         print venue_latitude, venue_longitude
         if not venue_latitude or not venue_longitude:
             with open('missing_lat_lng.txt', 'a+') as d:
                 print "*** DROPPED ZIP - %s"%(zip_item)
                 d.write(zip_item+'\n')
             print "NO LATITUDE OR LONGITUDE"
         else:
             fetch_url = 'http://api.invisalign.com/svc/rd?pc=%s&cl=CA&lat=%s&lng=%s&it=us'%(zip_item, venue_latitude, venue_longitude)
             meta_data = {'venue_latitude': venue_latitude,
                          'venue_longitude': venue_longitude,
                          'zip_code': zip_item}
             yield Request(url = fetch_url, dont_filter=True, callback=self.parse_result, meta=meta_data)
Example no. 19
    def parse_channel(self, response):
        hxs = Selector(response)
        item = response.meta['record']
        item['video_url'] = hxs.xpath("body//div[@id='divVideoHolder']/@videosrc").extract()[0]
        item["title"] = hxs.xpath("body//div[@id='divTitrGrid']/text()").extract()[0]

        return item
Example no. 20
 def parse_item(self, response):
     items = []
     sel = Selector(response)
     print("test1")
     products = sel.xpath('//*[@id="coreProductInfos"]/div[2]')
    # breadcrumbs = sel.xpath('//div[@id ="contentWrapper"]')\
     table = sel.xpath('//tr[contains(td, "techDataCol")]')
     category = sel.xpath('//*[@id="contentWrapper"]/div[1]/span[2]/a/span/text()').extract()
     print(category)
     for product in products:
         if 'Geheugen' in category:
             item = Memory()
             print (table.xpath('//td/text()').extract())
             item['Category'] = category
             item['Name'] = product.xpath('//td[contains(td[1], "Modelnaam")]/td[2]/table/tbody/tr/td/text()').extract()
             item['Brand'] = product.xpath('//*[@id="details"]/div[4]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/text()').extract()
             item['Quantity'] = product.xpath('//tr[contains(td[1], "Aantal")]/td[2]/text()').extract()
             item['Size'] = product.xpath('//tr[contains(td[1], "Modulegrootte")]/td[2]/text()').extract()
             item['PriceGB'] = product.xpath('//tr[contains(td[1], "Prijs per GB")]/td[2]/text()').extract()
             item['Type'] = product.xpath('//tr[contains(td[1], "Geheugentype")]/td[2]/text()').extract()
             item['Specification'] = product.xpath('//tr[contains(td[1], "Geheugen Specificatie")]/td[2]/text()').extract()
             item['LowVoltage'] = product.xpath('//tr[contains(td[1], "Low Voltage DDR")]/td[2]/text()').extract()
             item['Voltage'] = product.xpath('//tr[contains(td[1], "Spanning")]/td[2]/text()'). extract()
             item['Warranty'] = product.xpath('//tr[contains(td[1], "Fabrieksgarantie")]/td[2]/text()').extract()
             item['Ean'] = product.xpath('//tr[contains(td[1], "EAN")]/td[2]/text()').extract()
             item['Sku'] = product.xpath('//tr[contains(td[1], "SKU")]/td[2]/text()').extract()
             print("Geheugen!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
             items.append(item)
         return items
Example no. 21
    def parse_state_url(self, response):  # draw the state
        sel = Selector(response)
        tempcountryname = sel.xpath(
            '//div[@id="MediaWeatherRegion"]/div[@class="hd"]/div[@class="yom-bread"]/text()').extract()
        match = re.search(r'[\w\s]+$', tempcountryname[0])
        if match:
            countryname = match.group().strip()
        else:
            self.log('country name not found', log.WARNING)
            return

        data_1 = response.meta['data']

        for node in sel.xpath('//div[@id="page1"]/ul/li/a'):
            state_name = node.xpath('./span/text()').extract()[0].strip()
            state_href = node.xpath('./@href').extract()[0]

            yield Request(url='https://weather.yahoo.com' + state_href, callback=self.parse_city,
                          meta={'data': {'data_1': data_1, 'countryname': countryname, 'state': state_name}})

            country_code = data_1['countrycode']

            # Get states and provinces
            item = YahooCityItem()
            item['country'] = {'countrycode': country_code, 'countryname': countryname}
            item['state'] = state_name
            item['level'] = 1
            item['abroad'] = data_1['abroad']
            yield item
Example no. 22
	def parse_job_details(self, response):
		hxs = Selector(response)
		item = BrightermondaySampleItem()
		item['link'] = response.url
		item['title'] = hxs.xpath('//h2/text()').extract()[0]
		item['desc'] = hxs.xpath('//article[@class="resultDetail"]/p/text()').extract()[0]
		return item
Example no. 23
        def xt(cls, response):
            persons = []
            raw_persons = response.xpath(cls.XPATH).extract()
            for raw_person in raw_persons:
                person = Selector(text=raw_person)
                if person.xpath('//th'):
                    continue
                source_link = person.xpath(
                    '//td//a/@href').extract()[0]
                reversed_name = _clean(
                    Selector(
                        text=remove_tags(raw_person, 'img')
                    ).xpath('//td//a/text()').extract()[0])

                (pres_start_date, pres_end_date) = cls.xt_pres_date(
                    raw_person)

                mandate = {
                    'title': u'RechnungshofpräsidentIn',
                    'short': u'RH-PräsidentIn',
                    'start_date': pres_start_date,
                    'end_date': pres_end_date
                }
                persons.append({
                    'source_link': source_link,
                    'reversed_name': reversed_name,
                    'mandate': mandate,
                })

            return persons
Example no. 24
    def parse(self, response):
        sel = Selector(response)
        data = sel.xpath("//table[@class='infobox']")
        title = sel.xpath("//header[@id='WikiaPageHeader']//h1/text()").extract()[0].strip()
        print "===================NAME======================"
        title = title.replace(" (episode)", "")
        print title

        season_episode_str = data.xpath("normalize-space(tr[2]/td[1]/text())").extract()[0]
        season_id = season_episode_str.split("Season ", 1)[1].rpartition(",")[0]
        episode_id = season_episode_str.split("episode ", 1)[1]
        # this title_card is too small.
        title_card = data.xpath("tr[2]/td/div/div/a/@href").extract()[0]
        # this doesn't work for some things. Scraping for this has moved to ep_detail_2
        # production_code = data.xpath("normalize-space(tr[3]/td/text())").extract()[0]
        e, e_created = Episode.objects.get_or_create(title=title)
        e.season_id = season_id
        e.episode_id = episode_id
        e.link = response.request.url
        e.save()

        # Note for characters. Towards the end, there is /a[1].The [1] is there because I only want the first link.
        # Sometimes something like Hunson Abadeer (name not revealed until "Return to the Nightosphere") will appear.
        # Both Hunson Abadeer and Return ... will be a tags, but Return is obviously not a character.
        characters = sel.xpath("//div[@id='mw-content-text']/*[self::h3 or self::h2][span[@id='Major_characters' or @id='Minor_characters']]/following-sibling::*[1]/li/a[1]/text() | "
                               "//div[@id='mw-content-text']/*[self::h3 or self::h2][span[@id='Major_characters' or @id='Minor_characters']]/following-sibling::*[1]/li/ul/li/a[1]/text()").extract()
        for char in characters:
            c, c_created = Character.objects.get_or_create(name=char)
            e.characters.add(c)
        print title_card
        print characters
Example no. 25
    def stepTwo(self, response):
        hxs = Selector(response)
        translatedRows = hxs.xpath('//table[@id="translations"]/tr[ contains(@class, "status-current") ]/td[@class="original"]')
       
        # print ( len(untranslatedRows) )
        # pdb.set_trace()

        for rows in translatedRows:

            aux = ""

            for r in rows.xpath('./child::node()').extract():        
                aux = aux + r.strip() + ' '        
            
            i = self.compareStrings(aux) 
            
            if i is not None:
                #scrapy item
                # traductionItem = W
                # traductionItem['originalString'] = aux
                self.untranslated[i]['translatedString'] = rows.xpath('./..//td[@class="translation foreign-text"]/text()').extract()[0].strip()
                
        paginaSiguiente = []
        paginaSiguiente = hxs.xpath('//div[@class="paging"]/a[@class="next"]/@href')

        try:            
            fullUrl_toNextPage = response.urljoin( paginaSiguiente[0].extract() )
            return fullUrl_toNextPage
        except Exception:
            return None
Example no. 26
 def parse(self, response):
     selector = Selector(response)
     posts = selector.xpath('//div[@class="articleh"]') + selector.xpath('//div[@class="articleh odd"]')
     for index, post in enumerate(posts):
         item = GubaPostItem()
         item['stock_id'] = re.search('\d+', response.url).group(0)
         item['read_count'] = int(post.xpath('span[@class="l1"]/text()').extract()[0])
         item['comment_count'] = int(post.xpath('span[@class="l2"]/text()').extract()[0])
         item['username'] = post.xpath('span[@class="l4"]/text()').extract()
         item['updated_time'] = post.xpath('span[@class="l5"]/text()').extract()[0]
         link = post.xpath('span[@class="l3"]/a/@href').extract()
         print item['updated_time']
         if link:
             if link[0].startswith('/'):
                 link = "http://guba.eastmoney.com/" + link[0][1:]
             else:
                 link = "http://guba.eastmoney.com/" + link[0]
             item['url'] = link
             yield Request(url=link, meta={'item': item, 'PhantomJS': True}, callback=self.parse_post)
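     # queue pages 2-4 by rewriting the trailing _N suffix of the current list URL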
     for pagenum in xrange(2, 5):
         url = response.url.split('_')
         if len(url) == 1:
             nextpage = url[0][:-5] + '_' + str(pagenum) + '.html'
         elif len(url) == 2:
             nextpage = url[0] + '_' + str(pagenum) + '.html'
         else:
             break
         yield Request(url=nextpage, callback=self.parse)
Example no. 27
    def parse_item(self, response):

        selector = Selector(response)
        companyInfo = selector.xpath('//td[@class="cont_company"]//td[@class="td_r"]/text()')
        jobInfo = selector.xpath('//*[@id="DataList1"]//table/tr')
        contactInfo = selector.xpath('//td[@class="cont_contact"]')
        contact_text = contactInfo.xpath('text()').extract()[0] + ' ' + contactInfo.xpath('text()').extract()[1] + ' ' + contactInfo.xpath('text()').extract()[2]

        #print self.mailre.findall(contact_text)
        #print self.phonePartern.match(contactInfo.xpath('text()').extract()[0])
        #print self.emainPartern(contactInfo.xpath('text()').extract()[1])
        #print (contactInfo.xpath('text()').extract()[2]).replace(' ','')

        for each in jobInfo:
            item = TsrcwItem()
            print each.extract()
            jobList = []
            try:
                for i in each.xpath('td[@class="td-grey"]/text()'):
                    if not (i.extract()).strip() == "":
                        jobList.append((i.extract()).strip())
                item['email'] = self.mailre.findall(contact_text)[0]
                item['companyName'] = (companyInfo.extract()[0]).strip()
                item['industryName'] = (companyInfo.extract()[1]).strip()
                item['companyNature'] = (companyInfo.extract()[2]).strip()
                item['jobName'] = (each.xpath('td[@class="td-grey"]/a/text()').extract()[0]).strip()
                item['jobDetail'] = self.baseUrl+(each.xpath('td[@class="td-grey"]/a/@href').extract()[0]).strip()
                item['jobRegion'] = jobList[0]
                item['requiredDegree'] = jobList[1]
                item['salary'] = jobList[2]
                item['endDate'] = jobList[3]
                yield item
            except Exception,e:
                continue
Example no. 28
    def parse_item_page(self, response):
        sel = Selector(response)
        item = response.meta['item']

        review_urls = sel.xpath('//a[@class="a-link-emphasis a-text-bold"]/@href').extract()
        self.logger.debug('if this product has review: ' + str(len(review_urls)))
        if review_urls:
            this_review_url = review_urls[0]
            ix = 0

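            # walk the "next page" chain with blocking requests.get while yielding a scrapy Request per review page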
            while True:
                yield Request(url=this_review_url, meta={'item': item}, callback=self.parse_review_content_page)
                response = requests.get(this_review_url)
                this_review_url_sel = Selector(text=response.text)
                next_review_urls = this_review_url_sel.xpath('//li[@class="a-last"]/a/@href').extract()
                self.logger.debug('next_review_urls :'.join(next_review_urls))
                if next_review_urls:
                    this_review_url = next_review_urls[0]
                    this_review_url = 'http://www.amazon.com' + this_review_url
                    self.logger.debug(this_review_url)
                    ix += 1
                    self.logger.debug(ix)
                else:
                    break
        else:
            yield Request(url=response.url, meta={'item': item}, callback=self.return_invalid_review)
Example no. 29
    def parse_item(self, response):
        video = ItemLoader(item=YoutubeVideoItem(), response=response)

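        # the sentiment-bar tooltip text holds "likes / dislikes"; it is split later to fill both fields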
        info = response.xpath(
            '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/ytd-sentiment-bar-renderer/paper-tooltip/div/text()'
        ).get()
        comments = response.xpath('//*[@id="body"]').getall()

        if info is not None:
            info = info.split("/")

        video.add_value('url', '')
        video.add_value(
            'date',
            response.xpath(
                '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[2]/yt-formatted-string/text()'
            ).get())
        video.add_value(
            'title',
            response.xpath(
                '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/h1/yt-formatted-string/text()'
            ).get())
        video.add_value(
            'views',
            response.xpath(
                '/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[4]/div[1]/div/div[5]/div[2]/ytd-video-primary-info-renderer/div/div/div[1]/div[1]/yt-view-count-renderer/span[1]/text()'
            ).get())
        video.add_value(
            'category',
            response.xpath(
                '//*[@class="content content-line-height-override style-scope ytd-metadata-row-renderer"]/a/text()'
            ).get())

        if info is None:
            info = []
            for i in range(0, 2):
                info.append("0")

        video.add_value('likes', info[0].strip())
        video.add_value('dislikes', info[1].strip())

        for comment in comments:
            s = Selector(text=comment)
            item = ItemLoader(item=YoutubeCommentItem(), response=response)
            item.add_value('id', s.xpath('//*[@id="author-text"]/@href').get())
            item.add_value(
                'date',
                s.xpath(
                    '//*[@class="published-time-text above-comment style-scope ytd-comment-renderer"]/a/text()'
                ).get())
            item.add_value(
                'name',
                self.clean(
                    s.xpath('//*[@id="author-text"]/span/text()').get()))
            item.add_value(
                'picture',
                s.xpath(
                    '//div[@id="author-thumbnail"]/a/yt-img-shadow/img/@src').
                get())
            item.add_value('content',
                           s.xpath('//*[@id="content-text"]/text()').get())
            item.add_value(
                'likes',
                self.clean(
                    s.xpath('//*[@id="vote-count-middle"]/text()').get()))

            video.add_value('comments', item.load_item())

        yield video.load_item()
Example no. 30
# encoding: utf-8
from scrapy import Selector

__author__ = 'mtianyan'
__date__ = '2018/1/25 0025 21:26'
import requests

response = requests.get(
    "https://www.aqistudy.cn/historydata/daydata.php?city=%E6%9C%AC%E6%BA%AA&month=201502"
)
sel = Selector(response)
rows = sel.xpath("//tr").extract()[1:]  # skip the table header row
pass
Example no. 31
    def parse_coin_detail_info(self, response):
        selector = Selector(response)
        coin = Coin()

        # current price
        coin_price = selector.xpath('//div[@class="coinprice"]').extract()
        current_price = re.findall(r'<div class="coinprice">(.*?)<span',
                                   coin_price[0], re.S)
        if len(current_price) != 0:
            coin['price'] = current_price[0]
            coin['time'] = datetime.utcnow().replace(tzinfo=utc)
            print(coin['price'], ' ', coin['time'])

        # lowest and highest price
        low_height = selector.xpath('//div[@class="lowHeight"]').extract()
        prices = re.findall(
            r'<div class="lowHeight">.*?<span class="value">(.*?)</span></div>.*?<div>.*?<span class="value">(.*?)</span></div>',
            low_height[0], re.S)
        if len(prices) != 0:
            coin['highest_price'] = prices[0][0]
            coin['lowest_price'] = prices[0][1]
            print(coin['highest_price'], ' ', coin['lowest_price'])

        # coin description
        desc = selector.xpath('//div[@class="des"]/a').extract()
        description = re.findall(r'<a href="(.*?)" target="_blank">', desc[0],
                                 re.S)
        if len(description) != 0:
            desc_url = base_url + description[0]
            print(desc_url)
            response = requests.get(desc_url)
            desc_selector = Selector(response)
            desc_content = desc_selector.xpath(
                '//div[@class="boxContain"]/div/p').extract()
            coin['description'] = self.tool.replace(''.join(
                i.strip() for i in desc_content))
            print(coin['description'])

        # market information
        market = selector.xpath(
            '//div[@id="baseInfo"]/div[@class="firstPart"]/div/div[@class="value"]'
        ).extract()
        values = []
        for value in market:
            market_value = re.findall(r'<div class="value">(.*?)<', value,
                                      re.S)
            values.append(market_value[0])

        if len(values) != 0:
            coin['market_capitalization'] = values[0]  # circulating market cap
            coin['market_count'] = values[1]  # circulating supply
            coin['publish_count'] = values[2]  # total issued
            coin['tx_count'] = values[3]  # trading volume
            print(coin['market_capitalization'], ' ', coin['market_count'],
                  ' ', coin['publish_count'], ' ', coin['tx_count'])

        # basic info list
        items = selector.xpath(
            '//div[@id="baseInfo"]/div[@class="secondPark"]/ul/li').extract()
        for item in items:
            base_info = re.findall(
                r'<li>.*?<span class="tit">(.*?)</span>.*?<span class="value">(.*?)</span>.*?</li>',
                item, re.S)
            if len(base_info) != 0:
                if base_info[0][0] == '英文名:':
                    coin['english_name'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['english_name'])
                elif base_info[0][0] == '中文名:':
                    coin['chinese_name'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['chinese_name'])
                elif base_info[0][0] == '上架交易所:':
                    coin['exchanger_count'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['exchanger_count'])
                elif base_info[0][0] == '发行时间:':
                    coin['publish_time'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['publish_time'])
                elif base_info[0][0] == '白皮书:':
                    coin['white_paper'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['white_paper'])
                elif base_info[0][0] == '网站:':
                    websites = re.findall(
                        r'<a href="(.*?)" rel="nofollow" target="_blank">',
                        base_info[0][1], re.S)
                    if len(websites) != 0:
                        office_websites = []
                        for website in websites:
                            office_websites.append(
                                self.tool.replace(website).strip())
                        coin['website'] = office_websites
                        print(coin['website'])
                elif base_info[0][0] == '区块站:':
                    explorers = []
                    block_explorers = re.findall(
                        r'<a href="(.*?)" rel="nofollow" target="_blank">',
                        base_info[0][1], re.S)
                    if block_explorers:
                        for block_explorer in block_explorers:
                            explorers.append(
                                self.tool.replace(block_explorer).strip())
                        coin['block_explorer'] = explorers
                        print(coin['block_explorer'])
                elif base_info[0][0] == '是否代币:':
                    coin['is_token'] = self.tool.replace(
                        base_info[0][1]).strip()
                    print(coin['is_token'])
                elif base_info[0][0] == '众筹价格:':
                    ico_price = re.findall(r'<a href="#ico">(.*?)</a>',
                                           base_info[0][1], re.S)
                    coin['ico_price'] = self.tool.replace(ico_price[0]).strip()
                    print(coin['ico_price'])

        yield coin
Example no. 32
def download():
    try:
        cur.execute(
            "select class_id, doc_website from jdk_class where class_id <= 4240"
        )
        lists = cur.fetchall()
        for every in lists:
            print every[0]
            while 1 == 1:
                try:
                    sel = Selector(requests.get(every[1], timeout=10))
                except Exception, e:
                    print 'timeout'
                    continue
                break

            block_list = sel.xpath('//div[@class="details"]/ul/li/ul')
            for block in block_list:
                details = block.xpath('li/ul')
                for each in details:
                    full_declaration = each.xpath('li/pre').extract()[0]
                    method_name = each.xpath('li/h4/text()').extract()[0]
                    # print method_name
                    cur.execute(
                        "select method_id from jdk_method where full_declaration = '"
                        + full_declaration + "' and name = '" + method_name +
                        "' and class_id = " + str(every[0]))
                    method_id = cur.fetchall()[0][0]
                    # print "method_id: " + str(method_id)
                    if each.xpath('li/dl/dt/span[@class="throwsLabel"]'):
                        if each.xpath(
                                'li/dl/dt/span[@class="throwsLabel"]/text()'
                        ).extract()[0] == "Throws:":
                            exception_count = 0
                            exceptions = []
                            following_tags_dd = each.xpath(
                                'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dd'
                            )
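                            # if another <dt> follows, keep only the <dd> nodes between "Throws:" and that <dt>; otherwise every following <dd> is an exception entry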
                            if each.xpath(
                                    'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dt'
                            ):
                                following_tags_dt = each.xpath(
                                    'li/dl/dt/span[@class="throwsLabel"]/parent::*/following-sibling::dt'
                                )
                                next_dt = following_tags_dt[0]
                                preceding_tags_dd = next_dt.xpath(
                                    'preceding-sibling::dd')
                                set_following_tags_dd = set(
                                    list(following_tags_dd.extract()))
                                set_preceding_tags_dd = set(
                                    list(preceding_tags_dd.extract()))

                                exceptions = list(set_following_tags_dd
                                                  & set_preceding_tags_dd)
                                exception_count = len(exceptions)
                            else:
                                exception_count = len(following_tags_dd)
                                exceptions = following_tags_dd.extract()

                            print exception_count
                            print exceptions

                            for ex in exceptions:
                                temp_str = ex[:ex.find("</code>")]
                                temp_str = temp_str.replace(
                                    "<dd>",
                                    "").replace("<code>",
                                                "").replace("</a>", "")
                                exception_class = temp_str[temp_str.find(">") +
                                                           1:]
                                print exception_class
                                description = ex[ex.find("</code>") +
                                                 9:].replace(
                                                     "</dd>", "").replace(
                                                         "\n", "").replace(
                                                             "  ", "").strip()
                                if description == "dd>":
                                    description = ''
                                print description
                                cur.execute(
                                    "insert into jdk_exception(name, class_id, method_id, description) values(%s, %s, %s, %s)",
                                    (exception_class, every[0], method_id,
                                     description))
                                conn.commit()

    except Exception, e:
        print Exception, ":", e
Example no. 33
def parse(html, source_url=u''):
    response = Selector(text=html)
    # locate the content area
    content_html = response.xpath(u'//div[@class="lph-article-comView"]')
    if not content_html:
        return

    # strip unwanted inner tags (script, style, a, iframe)
    content_items = content_html.xpath(u'*[not(name(.)="script") '
                                       u' and not(name(.)="style")'
                                       u' and not(name(.)="a")'
                                       u' and not(name(.)="iframe")]|text()')
    if not content_items:
        return

    # post date
    post_date = response.xpath(u'//td[@class="time"]/text()').extract_first(
        u'').strip()

    # author
    post_user = response.xpath(u'//a[@rel="nofollow"]/text()').extract_first(
        u'')

    # source
    src_ref = u'雷锋网'

    # assemble the new content markup
    content_html = u"""<div class="lphArticle-detail">
                            <div class="lph-article-comView">
                             %s
                           </div>
                      </div>
                   """ % (u''.join(content_items.extract()), )

    content_html = content_html.replace(u'https://static.leiphone.com/uploads/new/category/pic/201801/5a5dd347356f7'
                                        u'.jpg?imageMogr2/thumbnail/!740x140r/gravity/Center/crop/740x140/quality/90'
                                        u'', u'')\
        .replace(u'雷锋网原创文章,未经授权禁止转载。详情见。', '')\
        .replace(u'雷锋网原创文章,未经授权禁止转载。详情见', '')\
        .replace(u'<a href="http://dwz.cn/4ErMxZ" rel="nofollow" target="_blank">转载须知</a>。', u'') \
        .replace(u'转载须知。', u'') \
        .replace(u'转载须知', u'') \
        .replace(u'雷锋网版权文章,未经授权禁止转载。详情见。', u'')\
        .replace(u'雷锋网版权文章,未经授权禁止转载。详情见', u'')

    # tag content that should be stripped
    clear_paths_in = []
    style_in_list = []
    style_need_replace = []

    title = response.xpath(
        u'//meta[@property="og:title"]/@content | //title/text()'
    ).extract_first(u'')

    content_item = {
        u'title': title,
        u'content_html': content_html,
        u'post_date': post_date,
        u'style_in_list': style_in_list,
        u'style_need_replace': style_need_replace,
        u'clear_paths_in': clear_paths_in
    }

    return content_item
Example no. 34
 def parse(self, response):
     root = Selector(response)
     item = HorseRacingItem()
     for each in root.xpath('//select[@id="raceDateSelect"]'):
         item['date_pages'] = each.xpath('.//option/@value').extract()
         return item
Example no. 35
    def parse_house_info(self, resp):
        """
        Parse the detail page of a second-hand (resale) house listing.
        :return:
        """
        item = dict()
        response = Selector(resp)
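        # generalXpath: find the <span> whose text equals the label, then take its parent's text (the field value)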
        generalXpath = "//span[text()='{}']/../text()"
        # Lianjia listing code
        item['houseCode'] = response.xpath(
            "//div[@class='houseRecord']/span[2]/text()").extract_first(
                "").strip()
        # residential community name
        item['houseName'] = response.xpath(
            "//div[@class='communityName']/a[1]/text()").extract_first(
                "").strip()
        # orientation
        item['houseDirection'] = response.xpath(
            generalXpath.format("房屋朝向")).extract_first("").strip()
        # floor plan
        item['houseType'] = response.xpath(
            generalXpath.format("房屋户型")).extract_first("").strip()
        # elevator
        item['houseElevator'] = response.xpath(
            generalXpath.format("配备电梯")).extract_first("").strip()
        # location (district / region)
        item['houseAddress'] = response.xpath(
            "//div[@class='areaName']/a/text()").extract_first("").strip()
        item['houseDistrict'] = response.xpath(
            "//div[@class='areaName']/span[@class='info']/a[2]/text()"
        ).extract_first("").strip()
        item['houseRegion'] = response.xpath(
            "//div[@class='areaName']/span[@class='info']/a[1]/text()"
        ).extract_first("").strip()
        # floor
        item['houseFloor'] = response.xpath(
            generalXpath.format("所在楼层")).extract_first("").strip()
        # built area
        item['houseSize'] = response.xpath(
            generalXpath.format("建筑面积")).extract_first("").strip()
        # renovation status
        item['houseStatus'] = response.xpath(
            generalXpath.format("装修情况")).extract_first("").strip()
        # price per square meter
        item['houseUnitPrice'] = response.xpath(
            "//span[@class='unitPriceValue']/text()").extract_first(
                "").strip()
        # total price
        item['houseAllPrice'] = response.xpath(
            "//div[@class='price ']/span[@class='total']/text()"
        ).extract_first("").strip()
        # year built
        item['houseYear'] = response.xpath(
            "//div[@class='area']/div[@class='subInfo']/text()").re_first(
                r"(\d+)")

        # source URL
        item['url'] = resp.url

        # longitude and latitude
        positions = self.pattern_position.search(resp.text)
        # pull the coordinates out of the matched groups
        item['Longitude'] = positions.group(1)
        item['Latitude'] = positions.group(2)
        self.db.update_set('houseCode', item)
        self.lianjia_spider_log.info(f'parse item success:{resp.url}')
Esempio n. 36
0
          </tr>
        </table>
    </div>

</body>
</html>
'''
#An XPath locator for a tag is not unique: many different XPath expressions can select the same tag

#Build a Selector from the HTML string
sel = Selector(text=html)

#Keeping the XPath in a variable makes the locator configurable;
#since the markup may change at any time, updating one variable is easier to manage
age_name_xpath = "//div[1]/div/p[1]/text()"
age_name_tap = sel.xpath(age_name_xpath).extract()
#Guard with an if to avoid an IndexError when nothing matches
if age_name_tap:
    name = age_name_tap[0]

#When matching on class with @class=..., the full class attribute value must be given
teacher_tap = sel.xpath(
    "//div[@class='teacher_info info']/p/text()").extract()[0]

#Alternatively, contains() matches on part of the class attribute
teacher_tap = sel.xpath(
    "//div[contains(@class,'teacher_info')]/p/text()").extract()[0]

#To get the class attribute itself, select @class
teacher_class = sel.xpath(
    "//div[contains(@class,'teacher_info')]/@class").extract()[0]
Esempio n. 37
0
    def parse(self, response):
        sites = json.loads(response.text)

        spider_name = response.meta['spider_name']

        #HTML of the page
        data = sites["items_html"]
        min_position = sites["min_position"]

        #position of the first tweet
        position = ''

        if 'max_position' in sites:
            position = sites["max_position"]
        else:
            position = min_position.split('-')[2]


        if data == "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n":
            print ("抓取完成!!!,更新种子")
            self.db.updateSeedTag(spider_name)
            self.db.updateSeedCountLocation(spider_name, position)
        else:
            #whether there is a next page
            #has_more_items = sites["has_more_items"]

            item = SpiderTwitterItem()

            # get the tweet author
            twitter_author = re.compile('data-name="(.+)" data-user-id=').findall(data)[0]

            selector_app = Selector(text=data)
            twitter_group = selector_app.xpath("//li[@class='js-stream-item stream-item stream-item\n']").extract()
            twitter_group_count = len(twitter_group)

            next_page_id = ""

            for twitter_personal in twitter_group:
                selector_content = Selector(text=twitter_personal)
                twitter_id = selector_content.xpath("//li[@class='js-stream-item stream-item stream-item\n']/@data-item-id").extract()

                if len(twitter_id) > 0:
                    next_page_id = twitter_id[0]

                    if self.db.getTwitterById(next_page_id):

                        # check whether we have reached the previously recorded position
                        if self.db.isSeedLocation(spider_name, next_page_id):

                            print ("%s最新推文抓取完毕"%spider_name)
                            self.db.updateSeedCountLocation(spider_name, position)
                            return

                        print ("%s已存在,进行去重过滤"%next_page_id)
                        continue
                    else:
                        item['twitter_id'] = twitter_id

                else:
                    item['twitter_id'] = ''

                twitter_content_whole = ""
                twitter_content_list = selector_content.xpath("//div[@class='js-tweet-text-container']").extract()

                for twitter_content in twitter_content_list:
                    selector_content_text = Selector(text=twitter_content)
                    twitter_content_text = selector_content_text.xpath("//text()").extract()
                    twitter_content_text_num = len(twitter_content_text)
                    for i in range(twitter_content_text_num):
                        if twitter_content_text[i] != "  " and twitter_content_text[i] != "\n  ":
                            twitter_content_add = twitter_content_text[i].replace("\n","")
                            twitter_content_whole += twitter_content_add

                twitter_content_whole_trun = twitter_content_whole.replace('"','\\"')
                twitter_href = selector_content.xpath("//small[@class='time']/a/@href").extract()
                twitter_time = selector_content.xpath("//small[@class='time']/a/@title").extract()
                twitter_num = selector_content.xpath("//span[@class='ProfileTweet-actionCountForAria']/text()").extract()
               
                if len(twitter_num) > 0:
                    twitter_reply = twitter_num[0]
                    twitter_trunsmit = twitter_num[1]
                    twitter_zan = twitter_num[2]
                else:
                    twitter_reply = ''
                    twitter_trunsmit = ''
                    twitter_zan = ''

                twitter_img = selector_content.xpath("//div[@class='AdaptiveMedia-photoContainer js-adaptive-photo ']/@data-image-url").extract()
                print ("目标:%s" % twitter_id[0])
                print ("内容:%s" % twitter_content_whole_trun)
                if len(twitter_author) > 0:
                    author = twitter_author
                    item['twitter_author'] = author
                else:
                    item['twitter_author'] = ''
                if len(twitter_id) > 0:
                    tw_id = twitter_id[0]
                    item['twitter_id'] = tw_id
                else:
                    item['twitter_id'] = ''
                if twitter_content_whole:
                    content = twitter_content_whole_trun
                    item['twitter_content'] = content
                else:
                    item['twitter_content'] = ''
                if len(twitter_href) > 0:
                    href = "https://twitter.com%s"%twitter_href[0]
                    item['twitter_href'] = href
                else:
                    item['twitter_href'] = ''
                if len(twitter_time) > 0:
                    time = twitter_time[0]
                    item['twitter_time'] = time
                else:
                    item['twitter_time'] = ''
                if len(twitter_num) > 0:
                    reply = twitter_reply
                    item['twitter_reply'] = reply
                else:
                    item['twitter_reply'] = ''
                if len(twitter_num) > 0:
                    trunsmit = twitter_trunsmit
                    item['twitter_trunsmit'] = trunsmit
                else:
                    item['twitter_trunsmit'] = ''
                if len(twitter_num) > 0:
                    zan = twitter_zan
                    item['twitter_zan'] = zan
                else:
                    item['twitter_zan'] = ''
                if len(twitter_img) == 1:
                    img = twitter_img[0]
                    item['twitter_img'] = img
                elif len(twitter_img) > 1:
                    img_list = []
                    for img in twitter_img:
                        img_list.append(img)
                    item['twitter_img'] = img_list
                else:
                    item['twitter_img'] = ''
                yield item

            print ("下一页等待中...")

            #has_more_items == true means there is another page
            yield Request(url=self.next_page_url.format(spider_name,self.now_time, next_page_id, position), callback=self.parse,headers={'Referer': "https://twitter.com/"}, meta={'spider_name': spider_name})
Esempio n. 38
0
    def parse(self, response):
        driver = response.meta['driver']
        driver.maximize_window()
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[0])
        for _, value in self.df.iterrows():
            cntr = 199
            while True:
                location = value['Location']
                category = value['Category']
                subCat = value['Subcategory']
                url = f"{value['URL']}{cntr}"
                driver.get(url)
                cntr += 1
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((
                        By.XPATH,
                        "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
                    )))

                html = driver.page_source
                respObj = Selector(text=html)

                count = respObj.xpath(
                    "normalize-space(//b[contains(@class, 'count')]/text())"
                ).get()
                pCount = int("".join(re.findall(r'\d+', count)))

                driver.switch_to.window(driver.window_handles[1])
                items = respObj.xpath(
                    "(//div[@class='js-project-group'])[2]//h3/parent::a[@class='soft-black mb3']"
                )
                for item in items:
                    title = item.xpath("normalize-space(.//h3/text())").get()
                    if title not in self.li:
                        self.li.append(title)
                        url = item.xpath(".//@href").get()
                        driver.get(url)
                        time.sleep(1)
                        WebDriverWait(driver, 15).until(
                            EC.visibility_of_element_located((
                                By.XPATH,
                                "//a[@data-modal-title='About the creator']")))
                        html1 = driver.page_source
                        respObj1 = Selector(text=html1)
                        title = respObj1.xpath(
                            "normalize-space(//h2/span/a/text())").get()
                        creator = respObj1.xpath(
                            "normalize-space(//a[@data-modal-title='About the creator']/text())"
                        ).get()
                        backers = respObj1.xpath(
                            "normalize-space(//b[contains(text(), 'backers')]/text())"
                        ).get()
                        money = respObj1.xpath(
                            "normalize-space(//span[@class='money']/text())"
                        ).get()
                        driver.find_element_by_xpath(
                            "//a[@data-modal-title='About the creator']"
                        ).click()
                        time.sleep(2)
                        html2 = driver.page_source
                        respObj2 = Selector(text=html2)
                        yield {
                            'Title':
                            title,
                            'Creator':
                            creator,
                            'Backers':
                            backers.replace(" backers", ""),
                            'Money':
                            money,
                            'Website':
                            respObj2.xpath(
                                "//h4[contains(text(), 'Websites')]/following-sibling::ul/li/a/@href"
                            ).getall(),
                            'Location':
                            location,
                            'Category':
                            category,
                            'Sub Category':
                            subCat
                        }
                    else:
                        pass
                driver.switch_to.window(driver.window_handles[0])
                a = pCount // 12
                if pCount % 12 != 0:
                    a += 1
                else:
                    a += 0
                if cntr > 200:
                    break
Esempio n. 39
0
 def parse(self, response):
     user_item = UserItem()
     user_item['crawl_time'] = int(time.time())
     selector = Selector(response)
     user_item['_id'] = re.findall('(\d+)/info', response.url)[0]
     user_info_text = ";".join(
         selector.xpath('body/div[@class="c"]//text()').extract())
     nick_name = re.findall('昵称;?:?(.*?);', user_info_text)
     gender = re.findall('性别;?:?(.*?);', user_info_text)
     place = re.findall('地区;?:?(.*?);', user_info_text)
     brief_introduction = re.findall('简介;?:?(.*?);', user_info_text)
     birthday = re.findall('生日;?:?(.*?);', user_info_text)
     sex_orientation = re.findall('性取向;?:?(.*?);', user_info_text)
     sentiment = re.findall('感情状况;?:?(.*?);', user_info_text)
     vip_level = re.findall('会员等级;?:?(.*?);', user_info_text)
     authentication = re.findall('认证;?:?(.*?);', user_info_text)
     labels = re.findall('标签;?:?(.*?)更多>>', user_info_text)
     if nick_name and nick_name[0]:
         user_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
     if gender and gender[0]:
         user_item["gender"] = gender[0].replace(u"\xa0", "")
     if place and place[0]:
         place = place[0].replace(u"\xa0", "").split(" ")
         user_item["province"] = place[0]
         if len(place) > 1:
             user_item["city"] = place[1]
     if brief_introduction and brief_introduction[0]:
         user_item["brief_introduction"] = brief_introduction[0].replace(
             u"\xa0", "")
     if birthday and birthday[0]:
         user_item['birthday'] = birthday[0]
     if sex_orientation and sex_orientation[0]:
         if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
             user_item["sex_orientation"] = "同性恋"
         else:
             user_item["sex_orientation"] = "异性恋"
     if sentiment and sentiment[0]:
         user_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
     if vip_level and vip_level[0]:
         user_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
     if authentication and authentication[0]:
         user_item["authentication"] = authentication[0].replace(
             u"\xa0", "")
     if labels and labels[0]:
         user_item["labels"] = labels[0].replace(u"\xa0",
                                                 ",").replace(';',
                                                              '').strip(',')
     education_info = selector.xpath('//div[contains(text(),"学习经历")]/following-sibling::div[1]'). \
         xpath('string(.)').extract()
     if education_info:
         user_item['education'] = education_info[0].replace(u"\xa0", "")
     work_info = selector.xpath('//div[contains(text(),"工作经历")]/following-sibling::div[1]'). \
         xpath('string(.)').extract()
     if work_info:
         user_item['work'] = work_info[0].replace(u"\xa0", "")
     request_meta = response.meta
     request_meta['item'] = user_item
     yield Request(self.base_url + '/u/{}'.format(user_item['_id']),
                   callback=self.parse_further_information,
                   meta=request_meta,
                   dont_filter=True,
                   priority=1)
Esempio n. 40
0
class JCpenneySpider(BaseCheckoutSpider):
    name = 'jcpenney_checkout_products'
    allowed_domains = ['jcpenney.com'
                       ]  # do not remove comment - used in find_spiders()

    SHOPPING_CART_URL = 'http://www.jcpenney.com/jsp/cart/viewShoppingBag.jsp'
    CHECKOUT_PAGE_URL = "https://www.jcpenney.com/dotcom/" \
                        "jsp/checkout/secure/checkout.jsp"

    def start_requests(self):
        yield scrapy.Request('http://www.jcpenney.com/')

    def _get_colors_names(self):
        swatches = self._find_by_xpath(
            '//ul[@class="small_swatches"]'
            '/li[not(@class="sku_not_available_select")]'
            '//a[not(span[@class="no_color"]) and '
            'not(span[@class="color_illegal"])]/img')
        return [x.get_attribute("name") for x in swatches]

    def select_size(self, element=None):
        default_attr_xpath = '*//div[@id="skuOptions_size"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_size"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'
        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_color(self, element=None, color=None):
        default_attr_xpath = '*//li[@class="swatch_selected"]'
        avail_attr_xpath = ('*//*[@class="small_swatches"]'
                            '//a[not(span[@class="no_color"]) and '
                            'not(span[@class="color_illegal"])]')

        if color and color in self.available_colors:
            default_attr_xpath = '*//*[@class="small_swatches"]//a' \
                                 '[img[@name="%s"]]' % color

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)
        self._find_by_xpath('//h1')[0].click()
        time.sleep(1)

    def click_condition(self, default_xpath, all_xpaths):
        return self._find_by_xpath(default_xpath) or self._find_by_xpath(
            all_xpaths)

    def select_attribute(self, default_attr_xpath, avail_attr_xpath, element):
        max_retries = 20
        retries = 0
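        # Try the preferred (default) option first, falling back to any available one via the
        # base-class _click_attribute helper, then poll until the 'page_loader' element
        # disappears or max_retries seconds have passed.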
        if self.click_condition(default_attr_xpath, avail_attr_xpath):
            self._click_attribute(default_attr_xpath, avail_attr_xpath,
                                  element)
            while self.driver.find_elements(
                    By.ID, 'page_loader') and retries < max_retries:
                time.sleep(1)
                retries += 1
            print(inspect.currentframe().f_back.f_code.co_name)

    def select_width(self, element=None):
        default_attr_xpath = '*//div[@id="skuOptions_width"]//' \
                             'li[@class="sku_select"]'
        avail_attr_xpath = '*//*[@id="skuOptions_width"]//' \
                           'li[not(@class="sku_not_available" or @class="sku_illegal")]/a'

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_waist(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_waist"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_waist"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_inseam(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_inseam"]//li[@class="sku_select"]')
        avail_attr_xpath = ('*//*[@id="skuOptions_inseam"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_neck(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_neck size"]//li[@class="sku_select"]')

        avail_attr_xpath = ('*//*[@id="skuOptions_neck size"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def select_sleeve(self, element=None):
        default_attr_xpath = (
            '*//*[@id="skuOptions_sleeve"]//li[@class="sku_select"]')

        avail_attr_xpath = ('*//*[@id="skuOptions_sleeve"]//'
                            'li[not(@class="sku_not_available" '
                            'or @class="sku_illegal")]')

        self.select_attribute(default_attr_xpath, avail_attr_xpath, element)

    def _parse_attributes(self, product, color, quantity):
        time.sleep(10)
        self.select_color(product, color)
        self.select_size(product)
        self.select_width(product)
        self.select_waist(product)
        self.select_inseam(product)
        self.select_neck(product)
        self.select_sleeve(product)
        self._set_quantity(product, quantity)

    def _get_products(self):
        return self._find_by_xpath(
            '//*[@id="regularPP"]|//*[contains(@class,"product_row")]')

    def _add_to_cart(self):
        addtobagbopus = self._find_by_xpath('//*[@id="addtobagbopus"]')
        addtobag = self._find_by_xpath('//*[@id="addtobag"]')

        if addtobagbopus:
            self._click_on_element_with_id('addtobagbopus')
        elif addtobag:
            self._click_on_element_with_id('addtobag')
        time.sleep(5)

    def _do_others_actions(self):
        skip_this_offer = self._find_by_xpath(
            '//a[contains(@href,"javascript:skipThisOffer")]')
        if skip_this_offer:
            skip_this_offer[0].click()
            time.sleep(4)

    def _set_quantity(self, product, quantity):
        quantity_option = Select(
            self.driver.find_element_by_xpath('*//*[@name="prod_quantity"]'))
        try:
            quantity_option.select_by_value(str(quantity))
            quantity_selected = quantity_option.first_selected_option.text
            if quantity_selected != str(quantity):
                time.sleep(4)
            self.log('Quantity "{}" selected'.format(quantity))
        except:
            pass

    def _get_product_list_cart(self):
        time.sleep(1)
        self.page_source = self.driver.page_source
        self.page_selector = Selector(text=self.page_source)
        try:
            item_info = re.findall('var jcpORDERJSONjcp = (\{.+?\});',
                                   self.page_source, re.MULTILINE)[0]
            self.item_info = json.loads(item_info)
            return self.item_info
        except IndexError:
            return None

    def _get_products_in_cart(self, product_list):
        return product_list.get('purchasedItems')

    def _get_subtotal(self):
        return self.item_info.get('merchantTotalWithSavings')

    def _get_total(self):
        return self.item_info.get('orderTotal')

    def _get_item_name(self, item):
        return item.get('displayName')

    def _get_item_id(self, item):
        return item.get('itemNumber')[2:]

    def _get_item_price(self, item):
        return str(item.get('lineTotalPrice'))

    def _get_item_price_on_page(self, item):
        price_on_page_from_json = float(item.get('lineUnitPrice'))
        price_on_page_from_html = self.page_selector.xpath(
            '//span[contains(@data-anid, "product_CurrentSellingPrice")]/text()'
        ).re(FLOATING_POINT_RGEX)
        price_on_page_from_html = float(is_empty(price_on_page_from_html, 0))
        return price_on_page_from_json if price_on_page_from_json >= 0 else price_on_page_from_html

    def _get_item_color(self, item):
        selector = scrapy.Selector(text=self.page_source)
        color_new = is_empty(
            selector.xpath(
                '//span[@class="size" and '
                'contains(text(),"color:")]/text()').re('color\:\n(.+)'))
        color_old = is_empty(
            selector.xpath(
                '//span[@class="size" and contains(text(),"color:")]'
                '/strong/text()').extract())
        return color_new or color_old

    def _get_item_quantity(self, item):
        return item.get('quantity')

    def _enter_promo_code(self, promo_code):
        self.log('Enter promo code: {}'.format(promo_code))
        promo_field = self._find_by_xpath('//*[@id="cr-code"]')[0]
        promo_field.send_keys(promo_code)
        time.sleep(2)
        promo_field.send_keys(Keys.ENTER)
        time.sleep(5)
        self.driver.refresh()
        time.sleep(5)
        self.item_info = self._get_product_list_cart()

    def _remove_promo_code(self):
        self.log('Remove promo code')
        try:
            remove_field = self._find_by_xpath(
                '//a[@title="remove" and @class="cr-remove"]')[0]
            remove_field.click()
            time.sleep(10)
        except IndexError:
            self.log('Invalid promo code')

    def _get_promo_total(self):
        return self._get_total()

    def _get_promo_subtotal(self):
        return str(self._get_subtotal())

    def _parse_no_longer_available(self):
        return bool(self._find_by_xpath('//*[@class="error_holder"]'))
Esempio n. 41
0
    def fetch_userdata(self, url):
        user = YelpUser()
        response = requests.get(url)
        page = Selector(response)
        user.yelp_id = url[url.rfind('=') + 1:]
        user.name = page.xpath(
            '//div[@class="user-profile_info arrange_unit"]/h1/text()'
        ).extract_first()
        user.location = page.xpath(
            '//div[@class="user-profile_info arrange_unit"]/h3/text()'
        ).extract_first()
        user.tagline = page.xpath(
            '//p[@class="user-tagline"]/text()').extract_first()
        user.friends_count = page.xpath(
            '//li[@class="friend-count"]/strong/text()').extract_first()
        user.reviews_count = page.xpath(
            '//li[@class="review-count"]/strong/text()').extract_first()
        user.photos_count = page.xpath(
            '//li[@class="photo-count"]/strong/text()').extract_first()
        user.image_url = page.xpath(
            '//div[@class="user-profile_avatar"]//img/@src').extract_first()

        if (MUST_DOWNLOAD_USER_IMAGE):
            if (os.path.exists(BASE_DIR + '/UserImages') == False):
                os.mkdir(BASE_DIR + '/UserImages')
            with open(BASE_DIR + 'UserImages/' + user.yelp_id + '.jpg',
                      'wb') as f:
                f.write(requests.get(user.image_url).content)  # write the raw image bytes
            user.image_path = BASE_DIR + 'UserImages/' + user.yelp_id + '.jpg'

        sidebar = page.xpath('//div[@class="user-details-overview_sidebar"]')
        extra_data = {}
        for ysection in sidebar.xpath('.//div[@class="ysection"]'):
            key = ysection.xpath('.//h4/text()').extract_first()
            if (key == 'Rating Distribution'):
                starts_distribution = ysection.xpath(
                    './/td[@class="histogram_count"]/text()').extract()
                extra_data[key] = dict()
                extra_data[key]['5 stars'] = starts_distribution[0]
                extra_data[key]['4 stars'] = starts_distribution[1]
                extra_data[key]['3 stars'] = starts_distribution[2]
                extra_data[key]['2 stars'] = starts_distribution[3]
                extra_data[key]['1 stars'] = starts_distribution[4]
            elif (key == 'Review Votes' or key == 'Stats'):
                items = ysection.xpath('.//ul/li')
                items_title = ysection.xpath(
                    './/ul/li/text()[not(normalize-space(.)="")]').extract()
                # strip surrounding whitespace (re-binding the loop variable would not modify the list)
                items_title = [t.strip() for t in items_title]
                extra_data[key] = dict()
                for title, item in dict(zip(items_title, items)).items():
                    extra_data[key][title.strip()] = item.xpath(
                        './/strong/text()').extract_first()
            elif (key.find('Compliments') != -1):
                items = ysection.xpath('.//li')
                extra_data['Compliments'] = dict()
                for item in items:
                    compliment = item.xpath('.//span/@class').extract_first()
                    extra_data['Compliments'][
                        self.compliments[compliment]] = item.xpath(
                            './/small/text()').extract_first()
        user.meta = json.dumps(extra_data)
        return user
Esempio n. 42
0
<NOSCRIPT><em>Địa chỉ email này được bảo vệ bởi JavaScript.<BR>Bạn cần kích hoạt Javascript để có thể xem.</em></NOSCRIPT>

                            </div>

"""
sel = Selector(text=t)

# address_str = u'Địa chỉ'
# print address_str

# # address_str = u'dcdc'

# print sel.xpath('//*[@class="left-detail"]/div[contains(., \''+ address_str +'\')]/div[2]//text()').extract()

# //*[@id="product-detail"]/div[8]/table/tbody/tr/td[1]/div/div[2]/div[2]/div[2]
# //*[@id="product-detail"]/div[8]/table/tbody/tr/td[1]/div/div[2]

email_data = sel.xpath('//*[@class="right"]/script//text()').extract_first()

from HTMLParser import HTMLParser

h = HTMLParser()

import re

email_extract = re.search(r"mailto\:(.*)'", email_data)
# guard against a missing match to avoid an AttributeError/NameError
if email_extract and email_extract.group(1):
    email = email_extract.group(1)
    email = h.unescape(email)
    print email
Esempio n. 43
0
    def parse(self, response):
        page = Selector(response)
        review_boxes = page.xpath(
            '//ul[@class="ylist ylist-bordered reviews"]/li')
        del review_boxes[0]
        for review_box in review_boxes:
            rv = Review()
            rv.business_id = self.biz_id
            rv.user_id = review_box.xpath(
                './/li[@class="user-name"]/a/@href').extract_first()
            if rv.user_id != None:
                user_url = rv.user_id
                rv.user_id = rv.user_id[rv.user_id.rfind("=") + 1:]
                if (self.session.query(YelpUser).filter(
                        YelpUser.yelp_id == rv.user_id).count() == 0):
                    user = self.fetch_userdata('https://www.yelp.com' +
                                               user_url)
                    self.session.add(user)

            else:
                user = YelpUser()
                user.yelp_id = None
                user.name = "Qype User"
                user.location = review_box.xpath(
                    './/li[@class="user-location responsive-hidden-small"]/b/text()'
                ).extract_first().strip()
                user.photos_count = review_box.xpath(
                    './/li[@class="photo-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.friends_count = review_box.xpath(
                    './/li[@class="friend-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.reviews_count = review_box.xpath(
                    './/li[@class="review-count responsive-small-display-inline-block"]/b/text()'
                ).extract_first()
                user.meta = None
                self.session.add(user)

            rv.text = review_box.xpath(
                './/div[@class="review-content"]/p/text()').extract_first()
            rv.rating = review_box.xpath(
                './/div[@class="review-content"]/div[@class="biz-rating biz-rating-large clearfix"]/div/div/@title'
            ).extract_first()
            rv.rating = rv.rating[0:rv.rating.find(" ")]
            rv.date = review_box.xpath(
                './/div[@class="review-content"]/span[@class="rating-qualifier"]/text()'
            ).extract_first()
            self.session.add(rv)

        if (self.session.query(CrawlData).filter(
                CrawlData.url == response.url).count() != 0):
            crawl_data = CrawlData()
            crawl_data.body = response.body
            crawl_data.requestHeader = str(response.request.headers)
            crawl_data.url = response.url
            self.session.add(crawl_data)

        self.session.commit()
        next_page = page.xpath('//link[@rel="next"]/@href').extract_first()
        if (next_page != None):
            yield response.follow(next_page, self.parse)
Esempio n. 44
0
time.sleep(2)

try:
  cookiesBtnElem = driver.find_element_by_xpath("//button[text()='Accetta']")
  driver.execute_script("arguments[0].click()", cookiesBtnElem)
  time.sleep(1)
except:
  pass

while True:
  pageCntr += 1
  html = driver.page_source
  respObj = Selector(text=html)

  #if pageCntr > 27:
  cards = respObj.xpath("//div[@data-list-type='Catalog']/div[@id]")
  for card in cards:
    urlList.append(card.xpath(".//a[contains(@id, 'app_lnk')]/@href").get())

  nextPageType1 = respObj.xpath(f"//a[@data-page and text()='{pageCntr}']")
  nextPageType2 = respObj.xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']")
  
  if nextPageType1:
    nextBtnElem = driver.find_element_by_xpath(f"//a[@data-page and text()='{pageCntr}']")
    driver.execute_script("arguments[0].click()", nextBtnElem)
    time.sleep(2)
    print(f"\n\n PAGE-{pageCntr}")
  elif nextPageType2:
    nextBtnElem = driver.find_element_by_xpath(f"//span[contains(@class, 'pagination') and text()='{pageCntr}']")
    driver.execute_script("arguments[0].click()", nextBtnElem)
    time.sleep(2)
Esempio n. 45
0
 def _all_principal_td(self):
     page_selector = Selector(text=self._content)
     return page_selector.xpath(
         '//td[starts-with(@headers, "LINK BREAK_COUNTRY_NAME")]')
# Data can be extracted from web pages with Beautiful Soup, pyquery or regular expressions.
# Scrapy also provides its own extraction tool: Selector. Built on lxml, it supports XPath selectors,
# CSS selectors and regular expressions, with very high parsing speed and accuracy.
# 1. Direct use: Selector is an independent module; a selector object can be built directly from the
#    Selector class and its methods such as xpath() and css() called to extract data.
# For a piece of HTML, build a Selector object and extract data like this:
from scrapy import Selector

body = '<html><head><title>Hello World</title></head><body></body></html>'
selector = Selector(text=body)
title = selector.xpath('//title/text()').extract_first()    # find the text inside <title>; appending text() to the XPath extracts the text node
print(title)
# This runs outside the Scrapy framework: the Selector is used standalone by passing the text parameter
# at construction, which creates a Selector object; extraction then works just like inside Scrapy,
# by calling the xpath() or css() methods.
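
# A short sketch added here (not part of the original notes): the same standalone Selector also exposes
# css() and re_first() from the standard parsel/Scrapy API, so the title can equally be pulled out with a
# CSS selector or with a regular expression applied to the XPath result.
print(selector.css('title::text').extract_first())              # -> 'Hello World'
print(selector.xpath('//title/text()').re_first(r'Hello \w+'))  # -> 'Hello World'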

# 2. Scrapy shell: Selector is mainly used together with Scrapy. In a Scrapy callback the response object
#    can call xpath() or css() directly, so the Scrapy shell is used here to simulate the request process
#    and to understand the extraction methods.
# Use the sample page from the official documentation: http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# Start the Scrapy shell by typing on the command line:
scrapy shell http://doc.scrapy.org/en/latest/_static/selectors-sample1.html
# This enters Scrapy shell mode: Scrapy issues a request to the URL given on the command line and hands
# the usable request and response variables to the shell; commands typed at the prompt call methods on
# these objects and the results are shown immediately.
# The demo examples all analyse this page's source as the target; the source is:
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
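
# A hedged sketch (not from the original notes) of the kind of commands that could be typed at the shell
# prompt for this sample page, using the response variable that scrapy shell provides:
>>> response.xpath('//title/text()').extract_first()          # the page title text
>>> response.css('#images a::attr(href)').extract()           # every link href inside div#images
>>> response.xpath('//a/img/@src').re(r'(.*?)_thumb\.jpg')    # regex applied to the matched @src values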
Esempio n. 47
0
 def parse(self, response):
     """
     Parse the detail page
     """
     sel = Selector(text=response.body)
     print len(sel.xpath(u"//b[text()='单位名称']")) != 0, "parse condition"
     log.msg("parse condition=%s"%str(len(sel.xpath(u"//b[text()='单位名称']")) != 0), level=log.INFO)
     if (len(sel.xpath(u"//b[text()='单位名称']")) != 0): #check whether this is a captcha page
         pass
     else:
         log.msg("code=%s,  %s"%(str(response.status),response.body), level=log.INFO)
         raise UnknownResponseError
     #========================================================
     """
     Part 1: enterprise credit profile
     """
     item = DetailInformation()
     item['basic_info'] = fundation_info_extract(response)
     #========================================================
     #========================================================
     """
     Part 2: government regulatory information
     """
     item['regulatory_info'] = extract_combine_JCXX(response)
     #========================================================
     #========================================================
     """
     Part 3: industry evaluation information
     """
     keywords_list = ['2-1.体系/产品/行业认证信息',
         '2-2.行业协会(社会组织)评价信息',\
         '2-3.水电气通讯等公共事业单位评价']
     item['envaluated_info'] = block_info_extract(response,\
         keywords_list)
     #========================================================
     """
     Part 4: media evaluation information
     """
     keywords_list = ['3-1.媒体评价信息']
     item['media_env'] = block_info_extract(response, keywords_list)
     #========================================================
     """
     Part 5: finance and credit information
     """
     #url = 'http://www.11315.com/\
     #getTradeLendingCount?companyId=%s'%response.url[7:15]
     #header = {'User-Agent':"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36",
     #  'Referer':response.url}
     #req = urllib2.Request(url=url, headers=header)
     #xtml = urllib2.urlopen(req)
     #Nums = xtml.read()
     #print Nums, "this is Nums"
     #Nums = eval(Nums).split(",")
     #print Nums, "this is anothor Nums"
     #total = str(sum([int(i) for i in Nums]))
     #Nums.insert(0, total)  #insert at the head
     #if total == '0':
     #    t_url = ""
     #else:
     #    t_url = sel.xpath(u"//script").re(ur"html\(\'<a href=\"([\w\W]*?)\"")[0]
     #Nums.append(t_url)
     #Nums_re = "|".join(Nums)
     keywords_list = ['4-2.民间借贷评价信息']
     item["credit_fin"] = block_info_extract(response, keywords_list)
     #=======================================================
     """
     Part 6: business operation information
     """
     #keywords_list = ['5-3.水电煤气电话费信息',
     #'5-4.纳税信息']                          #would need to run JS or simulate the request; hardly worth it for two rows of data
     #item['operation_info'] = block_info_extract(response, keywords_list)
     #========================================================
     """
     Part 7: market feedback information
     """
     keywords_list = ['6-1.消费者评价信息',
     '6-2.企业之间履约评价','6-3.员工评价信息',
     '6-4.其他']
     item['feedback_info'] = block_info_extract(response, keywords_list)
     #========================================================
     return item
Esempio n. 48
0
def main():
    adsl = ADSL()
    result = []
    df_input = pd.read_excel('sku.xlsx')
    sku_list = df_input['sku'].values
    start = 0
    length = len(sku_list)

    while 1:

        if start == length:
            break
        print('Crawling item {}'.format(start + 1))
        sku = sku_list[start]
        options = webdriver.ChromeOptions()
        options.add_argument(
            '--user-agent=Mozilla/5.0 (Windows NT 999999.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        )

        options.add_argument('--headless')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--disable-gpu')

        driver = webdriver.Chrome(executable_path=r'chromedriver.exe',
                                  chrome_options=options)
        wait = WebDriverWait(driver, TIMEOUT)  # maximum time to wait for the page to load

        url = 'https://item.jd.com/{}.html'.format(sku)
        try:
            driver.get(url)
        except Exception as e:
            print(e)
            start += 1
            continue

        try:
            wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//a[@id="InitCartUrl"]')))
        except:
            print('Request timed out, retrying')
            start += 1
            continue

        text = driver.page_source
        resp = Selector(text=text)
        title = resp.xpath('//div[@class="sku-name"]/text()').extract()
        if len(title) > 1:
            title = title[1].strip()
        else:
            title = title[0].strip()
        price = resp.xpath(
            '//span[@class="p-price"]/span[2]/text()').extract_first()
        comment = resp.xpath(
            '//div[@id="comment-count"]/a/text()').extract_first()

        try:
            activity_type = resp.xpath(
                '//div[@class="activity-type"]/strong/text()').extract_first()
        except:
            activity_type = None

        area = resp.xpath(
            '//div[@class="ui-area-text"]/text()').extract_first()
        store = resp.xpath(
            '//div[@id="store-prompt"]/strong/text()').extract_first()
        d = {}

        d['title'] = title
        d['price'] = price
        d['comment'] = comment
        d['activity_type'] = activity_type
        d['area'] = area
        d['store'] = store
        d['sku'] = str(sku)
        d['url'] = url

        result.append(d)
        time.sleep(2 * random.randint(2, 6))
        driver.close()
        start += 1

        adsl.reconnect()

        df = pd.DataFrame(result)
        df.to_csv(output_filename, encoding='gbk', mode='a', header=False)

    print('Crawling finished, {} items crawled in total'.format(length))
Esempio n. 49
0
# -*- coding: utf-8 -*-
from scrapy import Selector
import requests
response = requests.get("https://www.baidu.com").text
select = Selector(text=response)
title = select.xpath("//title/text()").extract_first()
print(title)
Esempio n. 50
0
    def parse(self, response):
        driver = response.meta['driver']
        for _, value in self.df.iterrows():
            driver.get(value['url'])
            time.sleep(2)

            html = driver.page_source
            resp_obj = Selector(text=html)

            check1 = resp_obj.xpath("//div[@data-type='items']")
            check2 = resp_obj.xpath(
                "//span[text()='Shop by Category' or text()='Shop by category']/parent::span/parent::button/following-sibling::div/div/ul/li"
            )
            check3 = resp_obj.xpath(
                "//h2[text()='Shop by category']/parent::div/parent::div/following-sibling::div//div[@class='TempoCategoryTile-tile valign-top']"
            )
            if check1:
                cntr = 1
                while True:
                    html = driver.page_source
                    resp_obj = Selector(text=html)
                    listings = resp_obj.xpath("//div[@data-type='items']")
                    for prods in listings:
                        product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                        product_name = prods.xpath(
                            "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                        ).get()
                        price = prods.xpath(
                            "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                        ).get()
                        if not product_name:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                            ).get()
                        if not price:
                            price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                        yield {
                            'product_url': product_url,
                            'product_name': product_name,
                            'product_price': price,
                            'lvl1_cat': value['lvl1_cat'],
                            'lvl2_cat': value['lvl2_cat'],
                            'lvl3_cat': value['lvl3_cat'],
                            'lvl4_cat': None
                        }

                    next_page = resp_obj.xpath(
                        "//span[text()='Next Page']/parent::button")
                    cntr += 1
                    if next_page:
                        next_page = resp_obj.xpath(
                            f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                        ).get()
                        driver.get(f"https://www.walmart.com{next_page}")
                        time.sleep(2)
                    else:
                        break

            elif check2:
                driver.execute_script("window.open('');")
                driver.switch_to.window(driver.window_handles[1])
                for listings in check2:
                    lvl4_cat = listings.xpath(".//a/span/text()").get()
                    url = listings.xpath(".//a/@href").get()
                    driver.get(f"https://www.walmart.com{url}")
                    cntr = 1
                    while True:
                        html = driver.page_source
                        resp_obj = Selector(text=html)
                        listings = resp_obj.xpath("//div[@data-type='items']")
                        for prods in listings:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                            ).get()
                            price = prods.xpath(
                                "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                            ).get()
                            if not product_name:
                                product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                                product_name = prods.xpath(
                                    "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                                ).get()
                            if not price:
                                price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                            yield {
                                'product_url': product_url,
                                'product_name': product_name,
                                'product_price': price,
                                'lvl1_cat': value['lvl1_cat'],
                                'lvl2_cat': value['lvl2_cat'],
                                'lvl3_cat': value['lvl3_cat'],
                                'lvl4_cat': lvl4_cat
                            }

                        next_page = resp_obj.xpath(
                            "//span[text()='Next Page']/parent::button")
                        cntr += 1
                        if next_page:
                            next_page = resp_obj.xpath(
                                f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                            ).get()
                            driver.get(f"https://www.walmart.com{next_page}")
                            time.sleep(2)
                        else:
                            break
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            elif check3:
                driver.execute_script("window.open('');")
                driver.switch_to.window(driver.window_handles[1])
                for listings in check3:
                    lvl4_cat = listings.xpath(".//span/text()").get()
                    url = listings.xpath(".//following-sibling::a/@href").get()
                    driver.get(f"https://www.walmart.com{url}")
                    cntr = 1
                    while True:
                        html = driver.page_source
                        resp_obj = Selector(text=html)
                        listings = resp_obj.xpath("//div[@data-type='items']")
                        for prods in listings:
                            product_url = f'''https://www.walmart.com{prods.xpath(".//div[@class='search-result-product-title gridview']/a/@href").get()}'''
                            product_name = prods.xpath(
                                "normalize-space(.//div[@class='search-result-product-title gridview']/a/span/text())"
                            ).get()
                            price = prods.xpath(
                                "normalize-space(.//span[@class='price-main-block']/span/span/text())"
                            ).get()
                            if not product_name:
                                product_url = f'''https://www.walmart.com{prods.xpath(".//span[text()='Product Title']/parent::div/a/@href").get()}'''
                                product_name = prods.xpath(
                                    "normalize-space(.//span[text()='Product Title']/parent::div/a/span/text())"
                                ).get()
                            if not price:
                                price = f'''{prods.xpath("normalize-space(.//span[@class='price price-main'][1]/span/text())").get()} - {prods.xpath("normalize-space(.//span[@class='price price-main'][2]/span/text())").get()}'''
                            yield {
                                'product_url': product_url,
                                'product_name': product_name,
                                'product_price': price,
                                'lvl1_cat': value['lvl1_cat'],
                                'lvl2_cat': value['lvl2_cat'],
                                'lvl3_cat': value['lvl3_cat'],
                                'lvl4_cat': lvl4_cat
                            }

                        next_page = resp_obj.xpath(
                            "//span[text()='Next Page']/parent::button")
                        cntr += 1
                        if next_page:
                            next_page = resp_obj.xpath(
                                f"//ul[@class='paginator-list']/li/a[text()='{cntr}']/@href"
                            ).get()
                            driver.get(f"https://www.walmart.com{next_page}")
                            time.sleep(2)
                        else:
                            break
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            else:
                pass
Esempio n. 51
0
def amica(report_label, product, model):
    from globals import file_path

    if product[7].startswith('http'):
        page_address = product[7]
        driver.get(product[7])
        html = requests.get(product[7]).text  # use .text (str) rather than .content (bytes): Selector(text=...) expects a string
        sel = Selector(text=html)
    else:
        search = product[1][product[1].lower().find('amica') + len('amica') +
                            1:]
        amica_link = f'https://www.amica.pl/szukaj/{search}'
        driver.get(amica_link)
        html = requests.get(amica_link).text  # Selector(text=...) expects a str, not bytes
        sel = Selector(text=html)

        # Find the model on the Amica site
        try:
            for i in range(len(sel.xpath('//div[@class="container"]'))):
                if driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]').text == model:
                    page_address = driver.find_element_by_xpath(
                        f'//h3[@class="prodSymbol"][{i + 1}]/a').get_attribute(
                            'href')
                    break

        except NoSuchElementException:
            report_label[
                'text'] += f"Could not find {model} on the Amica site. Skipping it."
            return -1

        driver.find_element_by_css_selector(
            '#produkty > div.moreProducts > div > div > div > div > div > div > div.image > a'
        ).click()
    sleep(1)
    driver.find_element_by_css_selector(
        '#menu01 > div > div.product-view__media > img').click()

    first = driver.find_element_by_css_selector(
        '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper > '
        'div > div > img').get_attribute('src')

    # Save the photos and process them into thumbnails
    i = 0
    while i < 15:
        if i == 0:
            res = requests.get(first)
        else:
            desc_img = driver.find_element_by_css_selector(
                '#prod_app > div.medialightbox__overlay > div > div.cool-lightbox__inner > div.cool-lightbox__wrapper '
                '> div > div > img').get_attribute('src')
            if desc_img == first:
                break
            res = requests.get(desc_img)
        with open(f'{file_path}/{model}/obrazki_produktu/{i}.jpg',
                  'wb') as file_format:
            file_format.write(res.content)
        try:
            driver.find_element_by_xpath(
                '//*[@id="prod_app"]/div[4]/div/div[2]/div[2]/button[2]/div'
            ).click()
        except ElementNotInteractableException:
            pass

        sleep(1)
        i = i + 1

    for y in range(i):
        im = Image.open(f'{file_path}/{model}/obrazki_produktu/{y}.jpg')
        file_format = im.format
        width, height = im.size
        if width > height:
            ratio = width / 600
        else:
            ratio = height / 600
        new_width = round(width / ratio)
        new_height = round(height / ratio)
        im = im.resize((new_width, new_height))
        if file_format == 'PNG':
            im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'PNG')
        elif file_format == 'JPEG':
            im.save(f'{file_path}/{model}/obrazki_produktu/{y}.jpg', 'JPEG')
        else:
            print(f"Nie umiem zrobić zdjęcia nr {y} :'( (typ {file_format})")
    driver.find_element_by_tag_name('body').send_keys(Keys.ESCAPE)

    html = requests.get(page_address).text  # Selector(text=...) expects a str, not bytes
    sel = Selector(text=html)

    raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()

    for i in range(len(raw)):
        raw[i] = raw[i].replace('\n', '')
        raw[i] = raw[i].replace('\t', '')
        raw[i] = raw[i].replace('\xa0', '')
        raw[i] = raw[i].replace('\r', '')
        raw[i] = raw[i].replace('  ', '')

    t = raw[0]
    t = t[t.find('"descTitle":'):]
    t = t[:t.find('}]}')]
    desc = []
    imgs = []
    while t.find('"descTitle":') != -1:
        t = t[t.find('"descTitle":') + 13:]
        desc.append(t[:t.find('"')])
        t = t[t.find('"descIconUrl":') + 15:]
        imgs.append(t[:t.find('"')])
        t = t[t.find('"descText":') + 12:]
        desc.append(t[:t.find('"')])

    for i in range(len(imgs)):
        imgs[i] = imgs[i].replace('\\', '')

    # download the description images to local disk
    for i, img in enumerate(imgs):
        res = requests.get(img)
        with open(f'{file_path}/{model}/obrazki_opisu/{i}.jpg',
                  'wb') as file_format:
            file_format.write(res.content)

    for i in range(len(desc)):
        desc[i] = desc[i].replace('\\u0105', 'ą')
        desc[i] = desc[i].replace('\\u0119', 'ę')
        desc[i] = desc[i].replace('\\u0107', 'ć')
        desc[i] = desc[i].replace('\\u0144', 'ń')
        desc[i] = desc[i].replace('\\u015b', 'ś')
        desc[i] = desc[i].replace('\\u015a', 'Ś')
        desc[i] = desc[i].replace('\\u00f3', 'ó')
        desc[i] = desc[i].replace('\\u0141', 'Ł')
        desc[i] = desc[i].replace('\\u0142', 'ł')
        desc[i] = desc[i].replace('\\u017a', 'ź')
        desc[i] = desc[i].replace('\\u017b', 'Ż')
        desc[i] = desc[i].replace('\\u017c', 'ż')
        desc[i] = desc[i].replace('\\u0179', 'Ź')
        desc[i] = desc[i].replace('\\u00ae', '®')
        desc[i] = desc[i].replace('\\u00b0', '°')
        desc[i] = desc[i].replace('\u00b0', '°')
        desc[i] = desc[i].replace('\u2070', '°')
        desc[i] = desc[i].replace('\\u2070', '°')
        desc[i] = desc[i].replace('\\u2013', '-')
        desc[i] = desc[i].replace('\u2013', '-')
        desc[i] = desc[i].replace('\\u2026', '...')
        desc[i] = desc[i].replace('\u2026', '...')
        desc[i] = desc[i].replace('\\n', '')
        desc[i] = desc[i].replace('\\/', '/')

    j = 0
    fin = ['<div class="product-description-section">']
    for i in range(0, len(desc), 6):
        fin.append('<div class="three-col-equaly">')
        try:
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j}.jpg"/><br/><h2 class="important-header">{desc[i]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 1]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 1}.jpg"/><br/><h2 class="important-header"> {desc[i + 2]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 3]}</p></div>')
            fin.append(
                f'<div><img src="https://matrixmedia.pl/media/wysiwyg/Amica/'
                f'{model}/{j + 2}.jpg"/><br/><h2 class="important-header"> {desc[i + 4]}</h2>'
            )
            fin.append(f'<p style="font-size: large;">{desc[i + 5]}</p></div>')
        except IndexError:
            pass
        finally:
            fin.append('</div>')
        j = j + 3
    fin.append('</div>')

    reg = ''.join(fin)
    reg = reg.replace(
        '*Zdjęcie ma charakter poglądowy i może nie przedstawiać dokładnego modelu produktu.',
        '')
    print("------------ OPIS GRAFICZNY ------------")
    print(reg + '\n\n')
    """ OPIS TECHNICZNY """
    html = requests.get(page_address).text  # Selector(text=...) expects a str, not bytes
    sel = Selector(text=html)

    tech_raw = sel.xpath('/html/body/div[1]/script[4]/text()').extract()
    tech_raw2 = tech_raw[0]
    tech_d = tech_raw2[tech_raw2.find('"attrGroupData"'):tech_raw2.
                       find('"docFilesDataList"')]

    tech_desc_1 = []
    while tech_d.find('"attrName":') != -1:
        tech_d = tech_d[tech_d.find('"attrName":') + 12:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])
        tech_d = tech_d[tech_d.find('"attrValue":') + 13:]
        tech_desc_1.append(tech_d[:tech_d.find('"')])

    tech_d2 = tech_d[tech_d.find(tech_desc_1[-1]):]

    tech_desc_2 = []
    while tech_d2.find('"attrValue":') != -1:
        tech_d2 = tech_d2[tech_d2.find('"attrValue":') + 13:]
        tech_desc_2.append(tech_d2[:tech_d2.find('"')])

    tech_desc = [
        '<table id="plan_b" class="data-table"><tbody><tr class="specs_category"><td '
        'colspan="2">Specyfikacja</td></tr>'
    ]
    for i in range(0, len(tech_desc_1), 2):
        tech_desc.append(f'<tr><td class="c_left">{tech_desc_1[i]}</td>')
        tech_desc.append(f'<td class="c_left">{tech_desc_1[i + 1]}</td></tr>')

    for i in range(len(tech_desc_2)):
        if i == 0:
            tech_desc.append('<tr><td class="c_left">Funkcje</td>')
            tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>')
        else:
            tech_desc.append('<tr><td class="c_left"></td>')
            tech_desc.append(f'<td class="c_left">{tech_desc_2[i]}</td></tr>')
    tech_desc.append('</tbody></table>')

    for i in range(len(tech_desc)):
        tech_desc[i] = tech_desc[i].replace('\\u0105', 'ą')
        tech_desc[i] = tech_desc[i].replace('\\u0119', 'ę')
        tech_desc[i] = tech_desc[i].replace('\\u0107', 'ć')
        tech_desc[i] = tech_desc[i].replace('\\u0144', 'ń')
        tech_desc[i] = tech_desc[i].replace('\\u015b', 'ś')
        tech_desc[i] = tech_desc[i].replace('\\u015a', 'Ś')
        tech_desc[i] = tech_desc[i].replace('\\u00f3', 'ó')
        tech_desc[i] = tech_desc[i].replace('\\u0141', 'Ł')
        tech_desc[i] = tech_desc[i].replace('\\u0142', 'ł')
        tech_desc[i] = tech_desc[i].replace('\\u017a', 'ź')
        tech_desc[i] = tech_desc[i].replace('\\u017b', 'Ż')
        tech_desc[i] = tech_desc[i].replace('\\u017c', 'ż')
        tech_desc[i] = tech_desc[i].replace('\\u0179', 'Ź')
        tech_desc[i] = tech_desc[i].replace('\\u00ae', '®')
        tech_desc[i] = tech_desc[i].replace('\\u00b0', '°')
        tech_desc[i] = tech_desc[i].replace('\u00b0', '°')
        tech_desc[i] = tech_desc[i].replace('\u2070', '°')
        tech_desc[i] = tech_desc[i].replace('\\u2070', '°')
        tech_desc[i] = tech_desc[i].replace('\\u2013', '-')
        tech_desc[i] = tech_desc[i].replace('\u2013', '-')
        tech_desc[i] = tech_desc[i].replace('\\u2026', '...')
        tech_desc[i] = tech_desc[i].replace('\u2026', '...')
        tech_desc[i] = tech_desc[i].replace('\\n', '')
        tech_desc[i] = tech_desc[i].replace('\\/', '/')
        tech_desc[i] = tech_desc[i].replace(':', '')

    tech = ''.join(tech_desc)
    print('------------ OPIS TECHNICZNY ------------')
    print(tech + '\n\n')
    """ OPIS KRÓTKI """
    for i in range(len(tech_desc_1)):
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0105', 'ą')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0119', 'ę')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0107', 'ć')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0144', 'ń')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u015b', 'ś')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u015a', 'Ś')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00f3', 'ó')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0141', 'Ł')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0142', 'ł')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017a', 'ź')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017b', 'Ż')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u017c', 'ż')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u0179', 'Ź')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00ae', '®')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u00b0', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\u00b0', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2070', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2070', '°')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2013', '-')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2013', '-')
        tech_desc_1[i] = tech_desc_1[i].replace('\\u2026', '...')
        tech_desc_1[i] = tech_desc_1[i].replace('\u2026', '...')
        tech_desc_1[i] = tech_desc_1[i].replace('\\n', '')
        tech_desc_1[i] = tech_desc_1[i].replace('\\/', '/')
        tech_desc_1[i] = tech_desc_1[i].replace(':', '')

    n = min(len(tech_desc_1), 12)

    short = ['<ul>']
    for i in range(0, n, 2):
        short.append(f'<li>{tech_desc_1[i]}: {tech_desc_1[i + 1]}</li>')
    short.append('</ul>')

    short = '\n'.join(short)
    print('------------ OPIS KRÓTKI ------------')
    print(short + '\n\n')

    return [reg, short, tech]
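The three escape-replacement loops above repeat the same character table. A table-driven helper, shown here only as a sketch rather than part of the original script, keeps the mapping in one place (note that capital Ź is the six-character escape '\\u0179'):

# Sketch of a table-driven alternative to the repeated .replace() chains above.
# The keys mirror the escape sequences the original code handles.
PL_ESCAPES = {
    '\\u0105': 'ą', '\\u0119': 'ę', '\\u0107': 'ć', '\\u0144': 'ń',
    '\\u015b': 'ś', '\\u015a': 'Ś', '\\u00f3': 'ó', '\\u0141': 'Ł',
    '\\u0142': 'ł', '\\u0179': 'Ź', '\\u017a': 'ź', '\\u017b': 'Ż',
    '\\u017c': 'ż', '\\u00ae': '®', '\\u00b0': '°', '\\u2070': '°',
    '\u2070': '°', '\\u2013': '-', '\u2013': '-',
    '\\u2026': '...', '\u2026': '...', '\\n': '', '\\/': '/',
}


def unescape_pl(text):
    """Apply every replacement from PL_ESCAPES to a single string."""
    for needle, replacement in PL_ESCAPES.items():
        text = text.replace(needle, replacement)
    return text


# e.g. desc = [unescape_pl(part) for part in desc]
# (the technical lists additionally strip ':' as in the original code)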
Esempio n. 52
0

import requests
from scrapy import Selector


def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4,n)], start=1):
        print(f"Element {idx}: {element}", end=sp)


sp = '\n\n'
url = 'https://www.cdc.gov/nchs/tutorials/NHANES/index_continuous.htm'
html = requests.get(url).text  # Selector(text=...) expects str, not bytes



xpath = '//p'
xpath2 = '//*'
sel = Selector(text=html)
sll = sel.xpath('//p')[2].extract()  # extract the 3rd element (here a paragraph) of the SelectorList
sll_ = sel.xpath('//p')  # without extract(), the SelectorList gives a 36-line preview of the paragraphs
slla = sel.xpath('//p').extract()
sllf = sel.xpath('//p').extract_first()



# print(sll, slla, sllf, sep=sp)

print(number_of_element(xpath, html), number_of_element(xpath2, html), preview_result(xpath, html), sep=sp)
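number_of_element is not defined in this snippet; judging from how it is called above, a minimal version (an assumed helper, shown only so the example is self-contained) might be:

def number_of_element(Xpath, inputtext):
    # Hypothetical helper consistent with the call above: count the nodes an XPath matches.
    sel = Selector(text=inputtext)
    return len(sel.xpath(Xpath).extract())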
Esempio n. 53
0
    def parse_detail(self, response):
        try:
            # Note: data extraction is incomplete
            data = Selector(text=response.body.decode('gbk'))
            items = str(data.xpath('string(.)').extract()[0]).replace(
                '\xa0', '').replace('\u3000', '')
            # Common fields
            fileTitle = data.xpath(
                '//td[@class="fh tac bw fwb f18-0 pl2 b0"]/text()'
            ).extract_first()
            # Body title
            textTitle = data.xpath(
                '//td[@class="fh vat bw f8-0 b1"]/table[1]//tr[1]/td[@align="center"]/text()'
            ).extract_first()
            supllyType = response.meta.get('supllyType').strip()
            administration = response.meta.get('administration').strip()
            supplyNoticeTitle = response.meta.get('supplyNoticeTitle').strip()
            publishTime = response.meta.get('publishTime').strip()
            projectName = ''
            parcelNumber = ''
            parcelLocation = ''
            landPurpose = ''
            landArea = ''
            transferTimeLimit = ''
            transferPrice = ''
            landPurposeDetail = ''
            transferUnit = ''
            remark = ''
            publicityPeriod = ''
            contactUnit = ''
            unitAddr = ''
            postalCode = ''
            contactTel = ''
            contacter = ''
            email = ''
            lanServiceCondition = ''

            # Notice type
            # noticeType =
            # Publicity period
            publicityPeriod = reFunction(u'公示期:([\s\S]*)三、',
                                         reFunction('四、[\s\S]*',
                                                    items)).strip()
            # Contact unit
            contactUnit = reFunction(u'联系单位:([\s\S]*)单位地址',
                                     reFunction('四、[\s\S]*', items)).strip()
            # Unit address
            unitAddr = reFunction(u'单位地址:([\s\S]*)邮政编码',
                                  reFunction('四、[\s\S]*', items)).strip()
            # Postal code
            postalCode = reFunction(u'邮政编码:([\s\S]*)联系电话',
                                    reFunction('四、[\s\S]*', items)).strip()
            # Contact telephone
            contactTel = reFunction(u'联系电话:([\s\S]*)联 系 人',
                                    reFunction('四、[\s\S]*', items)).strip()
            # Contact person
            contacter = reFunction(u'联 系 人:([\s\S]*)电子邮件',
                                   reFunction('四、[\s\S]*', items)).strip()
            # Email
            email = reFunction(u'电子邮件:([\w\.\@]*)(?:[\S]*)',
                               reFunction('四、[\s\S]*', items)).strip()
            if '宗地编号' in items:
                for item in [
                        '宗地编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                    [0].split('宗地编号')[1:]
                ]:
                    # Parcel number
                    parcelNumber = reFunction('宗地编号:(?:\s*)([\s\S]*)地块位置',
                                              item).strip()
                    # Parcel location
                    parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                                item).strip()
                    # Land purpose
                    landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                             item).strip()
                    # Land area (hectares)
                    landArea = reFunction(
                        '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                    # Project name
                    projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                             item).strip()
                    # Transfer term (years)
                    transferTimeLimit = reFunction(
                        '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                    # Transaction price (10,000 CNY)
                    transferPrice = reFunction(
                        '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                    # Land purpose details (purpose name, area)
                    landPurposeDetail = reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() if reFunction(
                            '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                            item).strip() else reFunction(
                                '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                    # Transferee unit
                    transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                              item).strip()
                    # Land use conditions
                    lanServiceCondition = reFunction(
                        '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                    # Remarks
                    # remark = reFunction(u'备注:(?:\s*)([\w}/,、\u4e00-\uffe5()《》:\-\.<≤。{\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5]*)(?:\s*)', item).strip()
                    remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                        item).strip()
                    # Crawl time
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    # Crawled URL
                    url = response.url
                    # Unique identifier
                    md5Mark = encrypt_md5(parcelNumber + publishTime +
                                          parcelLocation + url)

                    # Store the data
                    csvFile = [
                        administration, supplyNoticeTitle, publishTime,
                        fileTitle, textTitle, projectName, parcelNumber,
                        parcelLocation, landPurpose, landArea,
                        transferTimeLimit, transferPrice, landPurposeDetail,
                        transferUnit, remark, publicityPeriod, contactUnit,
                        unitAddr, postalCode, contactTel, contacter, email,
                        lanServiceCondition, crawlingTime, url, md5Mark
                    ]
                    self.fileDetail.write(','.join([
                        _.replace(',', ' ').replace('\n', '').replace(
                            '\r', '') if _ else _ for _ in csvFile
                    ]))
                    self.fileDetail.write('\n')
                    yield
                    #TODO
            elif '地块编号' in items:
                for item in [
                        '地块编号' + _ for _ in re.findall('([\s\S]*)二、', items)
                    [0].split('地块编号')[1:]
                ]:
                    # Plot number
                    parcelNumber = reFunction('地块编号:(?:\s*)([\s\S]*)地块位置',
                                              item).strip()
                    # Parcel location
                    parcelLocation = reFunction('地块位置:(?:\s*)([\s\S]*)土地用途:',
                                                item).strip()
                    # Land purpose
                    landPurpose = reFunction('土地用途:(?:\s*)([\s\S]*)土地面积\(公顷\)',
                                             item).strip()
                    # Land area (hectares)
                    landArea = reFunction(
                        '土地面积\(公顷\):(?:\s*)([\w}/\.{]*)(?:\s*)', item).strip()
                    # Project name
                    projectName = reFunction('项目名称:(?:\s*)([\s\S]*)土地用途明细',
                                             item).strip()
                    # Transfer term (years)
                    transferTimeLimit = reFunction(
                        '出让年限:(?:\s*)([\s\S]*)成交价\(万元\)', item).strip()
                    # Transaction price (10,000 CNY)
                    transferPrice = reFunction(
                        '成交价\(万元\):(?:\s*)([\s\S]*)土地用途明细', item).strip()
                    # Land purpose details (purpose name, area)
                    landPurposeDetail = reFunction(
                        '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                        item).strip() if reFunction(
                            '(?:\s*)面积\(公顷\)(?:\s*)([\w}/\.{]*)受让单位',
                            item).strip() else reFunction(
                                '(?:\s*)([\d\.]*)(?:[\s]*)受让单位', item).strip()
                    # Transferee unit
                    transferUnit = reFunction('受让单位:(?:\s*)([\w}/{]*)(?:\s*)',
                                              item).strip()
                    # Land use conditions
                    lanServiceCondition = reFunction(
                        '土地使用条件:(?:\s*)([\s\S]*)备注', item).strip()
                    # Remarks
                    remark = reFunction(u'备注:(?:\s*)([\s\S]*)(?:\s*)[二、]?',
                                        item).strip()
                    # Crawl time
                    crawlingTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                    # Crawled URL
                    url = response.url
                    # Unique identifier
                    md5Mark = encrypt_md5(parcelNumber + publishTime +
                                          parcelLocation + url)

                    # Store the data
                    csvFile = [
                        administration, supplyNoticeTitle, publishTime,
                        fileTitle, textTitle, projectName, parcelNumber,
                        parcelLocation, landPurpose, landArea,
                        transferTimeLimit, transferPrice, landPurposeDetail,
                        transferUnit, remark, publicityPeriod, contactUnit,
                        unitAddr, postalCode, contactTel, contacter, email,
                        lanServiceCondition, crawlingTime, url, md5Mark
                    ]
                    self.fileDetail.write(','.join([
                        _.replace(',', ' ').replace('\n', '').replace(
                            '\r', '') if _ else _ for _ in csvFile
                    ]))
                    self.fileDetail.write('\n')

            #TODO
        except Exception as e:
            self.log(f'Failed to parse detail page, error: {e}', level=logging.ERROR)
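reFunction and encrypt_md5 are helpers imported elsewhere in the project; judging from how they are used above, they presumably behave roughly like the following sketch (an assumption, not the project's actual code):

import hashlib
import re


def reFunction(pattern, text):
    # Assumed behaviour: return the first capture group (or the whole match when
    # the pattern has no group), and '' when nothing matches, so .strip() is safe.
    match = re.search(pattern, text)
    if not match:
        return ''
    return match.group(1) if match.groups() else match.group(0)


def encrypt_md5(text):
    # Assumed behaviour: hex digest of the UTF-8 encoded input, used as a row key.
    return hashlib.md5(text.encode('utf-8')).hexdigest()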
Esempio n. 54
0
def preview_result(Xpath, inputtext):
    sel = Selector(text=inputtext)
    result = sel.xpath(Xpath).extract()
    n = len(result)
    for idx, element in enumerate(result[:min(4,n)], start=1):
        print(f"Element {idx}: {element}", end=sp)
Esempio n. 55
0
    def parse(self, response):

        pagesource = Selector(response)

        tax_rate = .01
        interest = 0.0435
        loan_term = 30
        insurance = .5
        dp_percentage = 0.25

        total_page = re.findall(
            r"\d+",
            response.xpath('//span[@class="pageText"]//text()').extract()
            [0])[1]
        current_page = re.findall(
            r"\d+",
            response.xpath('//span[@class="pageText"]//text()').extract()
            [0])[0]

        search_results = pagesource.xpath(
            "//div[@class='MapHomeCardReact HomeCard']")

        for search in search_results:
            entry = RedfinTestItem()
            entry['price'] = float(''.join(
                re.findall(
                    r"\d+",
                    search.xpath(
                        './/span[@data-rf-test-name="homecard-price"]//text()'
                    ).extract()[0])))
            entry['street'] = search.xpath(
                './/span[@data-rf-test-id="abp-streetLine"]//text()').extract(
                )[0]
            entry['citystatezip'] = search.xpath(
                './/span[@data-rf-test-id="abp-cityStateZip"]//text()'
            ).extract()[0]
            entry['zipcode'] = re.findall(
                r"\d+",
                search.xpath(
                    './/span[@data-rf-test-id="abp-cityStateZip"]//text()').
                extract()[0])
            entry['HOA'] = ''.join(
                re.findall(
                    r"\d+",
                    search.xpath(
                        './/span[@data-rf-test-name="homecard-amenities-hoa"]//text()'
                    ).extract()[0]))
            entry['Beds'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[0])
            entry['Baths'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[1])
            entry['SQFT'] = ''.join(
                search.xpath('.//div[@class="value"]//text()').extract()[2])

            entry['year_built'] = search.xpath(
                './/span[@data-rf-test-name="homecard-amenities-year-built"]//text()'
            ).extract()[0]
            entry['rent'] = get_rent(str(entry['street']),
                                     str(entry['zipcode']))
            entry['mortgage_pmt'] = float(
                Loan(entry['price'] * (1 - dp_percentage), interest,
                     loan_term).monthly_payment)
            entry['insurance'] = insurance * make_float(entry['SQFT'])
            if entry['insurance'] == 0:
                entry['insurance'] = 60
            entry['tax'] = entry['price'] * tax_rate / 12
            entry['total_pmt'] = make_float(
                entry['HOA']
            ) + entry['mortgage_pmt'] + entry['insurance'] + entry['tax']
            entry['cashflow'] = get_cashflow(entry['rent'], entry['total_pmt'])
            #, entry['price_estimate']
            yield entry

        if int(total_page) > int(current_page):
            if int(current_page) == 1:
                next_page = response.url + "/page-2"
            else:
                next_page = re.sub(r"page-\d+",
                                   "page-" + str(int(current_page) + 1),
                                   response.url)
            yield Request(next_page, callback=self.parse)
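Loan comes from an external mortgage library; its monthly_payment should correspond to the standard amortization formula, sketched here for reference (an illustration of the math, not the library's implementation):

def monthly_payment(principal, annual_rate, years):
    # Standard amortization: principal * r / (1 - (1 + r) ** -n),
    # where r is the monthly rate and n the number of monthly installments.
    r = annual_rate / 12.0
    n = years * 12
    if r == 0:
        return principal / n
    return principal * r / (1 - (1 + r) ** -n)


# e.g. a 400,000 price with a 25% down payment at 4.35% over 30 years:
# monthly_payment(400000 * (1 - 0.25), 0.0435, 30)  # ≈ 1493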
    def parse(self, response):
        ''' 
        Scrape archive for articles
        Parameters
        ----------
        self:
            the PostillonSpider object
        response:
            The response from a scrapy request
        '''
        def init_selenium_driver():
            '''
            Initialize and return a Firefox or Chrome Selenium driver, depending on the SELENIUM_DRIVER option.

            Returns
            -------
            A Firefox or Chrome Selenium driver, depending on the SELENIUM_DRIVER option.
            '''
            if SELENIUM_DRIVER == 'Firefox':
                firefoxOptions = webdriver.FirefoxOptions()
                firefoxOptions.headless = True
                desired_capabilities = firefoxOptions.to_capabilities()
                driver = webdriver.Firefox(
                    desired_capabilities=desired_capabilities)
            else:  # Chrome driver
                chrome_options = Options()
                chrome_options.headless = True
                driver = webdriver.Chrome(options=chrome_options)
            return driver

        def get_closed_elements():
            '''
            Returns all or some closed year and month elements, depending on the limit definitions.

            Returns
            -------
            All or some closed year and month elements, depending on the limit definitions.
            '''
            # Get all closed months of year to crawl, that are newer or equal to the limit specified by LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL
            if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
                # get year
                element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                    'year-' + str(YEAR_TO_CRAWL))

                # Get closed months
                xpath = ".//li[contains(@class, 'closed') and (contains(@class, 'month-12')"
                for month in range(LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL - 1, 12):
                    xpath += " or contains(@class, 'month-" + "{:02d}".format(
                        month + 1) + "')"
                xpath = xpath + ")]"

                closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_xpath(
                    xpath)
                closed_elements.append(element_of_YEAR_TO_CRAWL)

            # Get all closed months of year to crawl
            elif YEAR_TO_CRAWL:
                element_of_YEAR_TO_CRAWL = driver.find_element_by_class_name(
                    'year-' + str(YEAR_TO_CRAWL))

                closed_elements = element_of_YEAR_TO_CRAWL.find_elements_by_class_name(
                    'closed')
                closed_elements.append(element_of_YEAR_TO_CRAWL)

            # Get all closed years/months of the entire archive
            else:
                # also finds closed months inside closed years
                closed_elements = driver.find_elements_by_class_name('closed')

            return closed_elements

        def waitForLoad():
            '''
            Wait until at least 1 article per year has been loaded.
            If the current year is being crawled, wait until an article from January or LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL
            has been loaded (because the current month of the current year is already loaded on page load).

            '''
            CURRENT_YEAR = datetime.now().year
            TIMEOUT = 20
            wait = WebDriverWait(driver, TIMEOUT)
            try:
                # xpath for a tag with class 'date' whose content includes '2020', '1.2020' or '<LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL>.2020',
                # depending on what is to be crawled
                xpath = "//a/div/div/div[contains(@class, 'date') and contains(string(), '"
                if YEAR_TO_CRAWL:
                    # If the current year is crawled wait for an article of the first month to be loaded.
                    # This is necessary because the current month is already loaded on page load.
                    if YEAR_TO_CRAWL == CURRENT_YEAR:
                        if LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL:
                            xpath += str(
                                LIMIT_MIN_MONTH_OF_YEAR_TO_CRAWL) + "."
                        else:
                            xpath += "1."

                    xpath += str(YEAR_TO_CRAWL) + "')]"
                    wait.until(
                        EC.presence_of_element_located((By.XPATH, xpath)))

                # Wait for 1 article per year
                else:
                    base_xpath = xpath
                    for i in range(2008, CURRENT_YEAR + 1):
                        # xpath for tag with class 'date' and the content that includes the year i
                        xpath = base_xpath + str(i) + "')]"
                        wait.until(
                            EC.presence_of_element_located((By.XPATH, xpath)))

            except TimeoutException as e:
                logging.warning(
                    "TimeoutException has been thrown while waiting for articles to load: %s",
                    e)

        def click_elements(elements):
            '''
            Click all elements in elements

            Parameters
            ----------
            elements:
                HTML Elements to be clicked
            '''
            for element in elements:
                try:
                    # element.click() causes Exception: "could not be scrolled into view"
                    driver.execute_script("arguments[0].click();", element)
                    # print("click: " + element.get_attribute('class').split()[1])

                except Exception as e:
                    logging.warning(
                        "An exception has been thrown while clicking closed years/months: %s",
                        e)

        driver = init_selenium_driver()
        driver.get(root)

        # Close all years/months
        click_elements(driver.find_elements_by_class_name('open'))

        # Open closed years/months to load articles
        click_elements(get_closed_elements())

        # Wait for articles to be loaded
        waitForLoad()

        # Hand-off between Selenium and Scrapy
        sel = Selector(text=driver.page_source)

        # for all ul tags with class 'month-inner' get all contained li tags and get their direct a-tag children
        articleList = sel.xpath('//ul[@class="month-inner"]//li/a')

        articleList = utils.limit_crawl(articleList, TESTRUN_ARTICLES_LIMIT)

        if articleList:
            for article in articleList:
                # extract the value of the href attribute from article
                long_url = article.xpath('./@href').extract()[0]
                # extract the content of div-tags with class 'date' contained by article
                published_time = article.xpath(
                    './/div[@class="date"]/text()').extract()
                published_time = published_time[0] if len(
                    published_time) > 0 else ''

                if long_url and not utils.is_url_in_db(long_url):
                    yield scrapy.Request(long_url,
                                         callback=self.parse_article,
                                         cb_kwargs=dict(
                                             long_url=long_url,
                                             published_time=published_time))

                else:
                    utils.log_event(utils(), self.name, long_url, 'exists',
                                    'info')
                    logging.info('%s already in db', long_url)

        # Quit the selenium driver and close every associated window
        driver.quit()
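The driver setup above passes desired_capabilities, which current Selenium 4 releases no longer accept; run against a recent Selenium, the initialization would likely need the options-based form instead (a sketch assuming Selenium 4):

from selenium import webdriver

# Hypothetical Selenium 4 equivalent of init_selenium_driver():
# pass the Options object directly instead of desired_capabilities.
if SELENIUM_DRIVER == 'Firefox':
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('-headless')
    driver = webdriver.Firefox(options=firefox_options)
else:
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless=new')
    driver = webdriver.Chrome(options=chrome_options)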
Esempio n. 57
0
from scrapy import Selector

body = " <html><head><title>Hello World</title></head><body></body> </html> "
selector = Selector(text=body)
title = selector.xpath('//title/text()').extract_first()
print(title)
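On recent Scrapy/parsel versions the same extraction is usually written with .get() and .getall(), the documented replacements for extract_first() and extract():

from scrapy import Selector

body = "<html><head><title>Hello World</title></head><body></body></html>"
selector = Selector(text=body)
print(selector.xpath('//title/text()').get())     # 'Hello World'
print(selector.xpath('//title/text()').getall())  # ['Hello World']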
Esempio n. 58
0
 def parseNews(self, response):
     self.response_body_decode(response)
     sel = Selector(response)
     homeurl = tools.getHomeUrl(response.url)
     brandname = response.meta['brandname']
     news = None  # news holds the SelectorList for the main body of the article
     pagerule = None
     # Check whether the page rule has already been determined
     if 'pagerule' in response.meta:
         pagerule = response.meta['pagerule']
         news = sel.xpath(pagerule['pageform'])
     else:
         # Match each rule in the news-page rule library, then crawl pages of that type
         for each_rule in newspage_type.page_rules:
             news = sel.xpath(each_rule['pageform'])
             if len(news) > 0:
                 pagerule = each_rule
                 break
     if pagerule is None:
         raise ValueError('Error processing (' + response.url +
                          ') This page do not have corresponding rules')
     # Get the allpage and nextpage URLs
     if pagerule['allpage'] is None:
         allpage = []
     else:
         allpage = news.xpath(pagerule['allpage']).extract()
     if pagerule['nextpage'] is None:
         nextpage = []
     else:
         nextpage = news.xpath(pagerule['nextpage']).extract()
     # If a "read full article" URL is present, handle it here
     if len(allpage) > 0:
         if tools.isCompleteUrl(allpage[0]):
             url = allpage[0]
         else:
             url = homeurl + allpage[0]
         r = Request(url, callback=self.parseNews)
         r.meta['brandname'] = brandname
         r.meta['pagerule'] = pagerule
         yield r
     elif len(nextpage) > 0:
         # If a next-page link is present, handle it here
         if tools.isCompleteUrl(nextpage[0]):
             url = nextpage[0]
         else:
             url = homeurl + nextpage[0]
         # Extract the current page's title, date and content, store them in article, and pass them on with the next request
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         article = {
             'brandname': brandname,
             'title': title,
             'date': date,
             'content': content
         }
         r = Request(url, callback=self.parseNextPage)
         r.meta['article'] = article
         r.meta['pagerule'] = pagerule
         yield r
     else:
         # If the article has only one page, extract the content directly
         title = news.xpath(pagerule['title']).extract()
         date = self.getDate(news, response.url, pagerule['date'])
         content = self.getContent(news, pagerule['content'])
         item = NewsItem()
         item['brandname'] = brandname
         item['date'] = date
         item['title'] = "".join(title)
         item['content'] = "".join(content)
         yield item
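newspage_type.page_rules is an external rule library; from the keys parseNews() reads, each rule is presumably a dict along these lines (a hypothetical entry, not an actual rule from the project):

# Hypothetical rule entry illustrating the keys parseNews() expects.
EXAMPLE_PAGE_RULE = {
    'pageform': '//div[@class="article"]',        # XPath selecting the news body
    'title': './/h1/text()',                      # title, relative to the body
    'date': './/span[@class="time"]/text()',      # publication date
    'content': './/div[@class="content"]//text()',
    'allpage': './/a[@class="read-all"]/@href',   # "read full article" link, or None
    'nextpage': './/a[@class="next"]/@href',      # next-page link, or None
}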
Esempio n. 59
-1
    def reviews_parse(self,response):
        hxs = Selector(response)
        # print 11111111
        item = reviewsItem()

        sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]/ul')
        # sites = hxs.xpath('//*[@class="article"]/div[2]/div[@class="item"]/div[@class="info"]')
        for site in sites:
            item['userID'] = re.findall('people/(.+)/collect',response.url)
            # print response.url
            item['moviename'] = site.xpath('li[@class="title"]/a/em/text()').extract()
            item['movieID'] = site.xpath('li[@class="title"]/a/@href').re('subject/(.+)/$')

            moviesUrl = site.xpath('li[@class="title"]/a/@href').extract()[0]
            yield Request(url=moviesUrl, callback=self.movie_parse)

            item['ratingdate'] = site.xpath('li[3]/span[@class="date"]/text()').extract()
            if re.findall('rating\d+-t',site.xpath('li[3]/span[1]/@class').extract()[0]):
                item['rating'] = site.xpath('li[3]/span[1]/@class').re('\d+')
            else:
                item['rating'] = [u'']
            if site.xpath('li[4]/span[@class="comment"]/text()').extract():
                item['comment'] = site.xpath('li[4]/span[@class="comment"]/text()').extract()
            else:
                item['comment'] = [u'']
            yield item
            # print item

        if hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract():
            nextreviewsUrl = hxs.xpath('//*[@class="paginator"]/span[@class="next"]/a/@href').extract()[0]
            # print nextreviewsUrl
            yield Request(url=nextreviewsUrl, callback=self.reviews_parse)
        pass
Esempio n. 60
-1
    def on_detail_page(self, response):
        if response.url == response.old_url:
            try:
                hxs = Selector(text=response.content)

                summary = hxs.xpath('//div[@class="card-summary-content"]/*').extract()
                content = []
                for ctx in summary:
                    text = clean_html_text(ctx)
                    content.append(text)
                content_text = " ".join(content)
                content_text=content_text.replace("[1]","")
                content_text=content_text.replace("[2]","")
                
                item_dict={}
                items = hxs.xpath('//div[@class="baseInfoWrap"]/div/div/*')
                
                for item in items:
                    title = item.xpath('./span/text()').extract()
                    title_value = item.xpath('./div/text()').extract()
                    print("key:value", to_value(title), to_value(title_value))
                    item_dict[to_value(title)] = to_value(title_value)
                
                item_dict['summary'] = content_text
                imgs = hxs.xpath('//div[@class="lemma-picture summary-pic"]/a/img/@src').extract()
                item_dict['logo'] = to_value(imgs)
                print(item_dict)
                # save_content(self.site.name, url, json.dumps(item_dict))
                # update_url(self.site.name, url, 200)
                return item_dict
            except Exception as e:
                # update_url(self.site.name, url, 500)
                logging.error(e)
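clean_html_text and to_value are project helpers that are not shown here; from the way to_value is applied to extract() results, it presumably collapses a list into a single string, roughly like this assumed sketch:

def to_value(extracted):
    # Hypothetical helper: first extracted string, stripped, or '' when nothing matched.
    if not extracted:
        return ''
    first = extracted[0] if isinstance(extracted, (list, tuple)) else extracted
    return str(first).strip()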