Example #1
    def filter(self, response):

        print ",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"
        print self.filter_district
        print self.filter_date_from 
        print self.filter_date_to
        

        if self.filter_date_from != '':
            conditionItem270 = Selector(response).xpath('//input[@id="TAB_QueryConditionItem270"]/@value').extract()[0]
            self.conditionData += "|" + conditionItem270.encode('ascii','ignore') + ":%s~%s"% (self.filter_date_from,self.filter_date_to)

        if self.filter_district != '':
            self.city = self.filter_district
            for dist in self.dist_dict_list:
                if dist['name'] == self.filter_district:
                    self.dis_code = dist['value']
                    break 
            if self.dis_code == '':
                raise Exception("Wrong input for district")
            if len(self.dis_code) == 2:
                self.city = ''
                self.state = self.filter_district
            else:
                statecode = self.dis_code[0:2]
                for state in self.dist_dict_list:
                    if state['value'] == statecode:
                        self.state = state['name']
                        break 
            conditionItem256 = Selector(response).xpath('//input[@id="TAB_QueryConditionItem256"]/@value').extract()[0]
            dist_pref =  "|" + conditionItem256.encode('ascii','ignore') + ":%s%%~" % self.dis_code
            dist_name = unicode(self.filter_district,"utf-8").encode("utf8")
            self.conditionData += dist_pref + dist_name 
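The filter above relies on a few spider attributes that are not part of the snippet. A minimal sketch of their assumed shape (district names and codes are purely illustrative):

    # hypothetical attribute setup the filter() above expects
    conditionData = ''                 # query string sent to the site, extended by filter()
    filter_date_from = '2017-01-01'
    filter_date_to = '2017-12-31'
    filter_district = 'Shenzhen'
    dist_dict_list = [
        {'name': 'Guangdong', 'value': '44'},    # two-digit value -> treated as a state
        {'name': 'Shenzhen', 'value': '4403'},   # longer value -> district, prefixed by its state code
    ]
    dis_code = ''
    city = ''
    state = ''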
    def parse_helper(self, response):
        has_content = False
        topic = Selector(response)\
                .xpath("//div[@class='topic-item-info content']/h4/text()")\
                .extract_first()

        # First, crawl all applications on this page
        for app in Selector(response).xpath("//div[@class='nofloat']"):
            has_content = True

            item = HuaweiAppStoreTopicAppItem()
            item["topic"] = topic.encode("utf-8")
            item["title"] = app.xpath(".//img[@class='app']/@title")\
                    .extract_first().encode("utf-8")
            item["appid"] = app.xpath(".//a[1]/@href")\
                    .re('http://appstore.huawei.com:80/app/(C\d+)')[0]
            item["image"] = app.xpath(".//img[@class='app']/@lazyload")\
                    .extract_first().encode("utf-8")
            item["desc"] = u''.join(app.xpath(".//p[@class='ft-light']/text()")\
                    .extract()).encode("utf-8")
            yield item

        # Then, try to find the next page
        if has_content:
            yield scrapy.Request(\
                    self.get_next_page(response.url), self.parse_helper)
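parse_helper depends on a HuaweiAppStoreTopicAppItem and a get_next_page helper that are not shown. A rough sketch, assuming topic URLs simply end with a page number:

    import scrapy

    class HuaweiAppStoreTopicAppItem(scrapy.Item):
        # fields filled in by parse_helper()
        topic = scrapy.Field()
        title = scrapy.Field()
        appid = scrapy.Field()
        image = scrapy.Field()
        desc = scrapy.Field()

    # hypothetical pagination helper on the spider
    def get_next_page(self, url):
        base, page = url.rsplit('/', 1)       # assumes ".../topic/<id>/<page>"
        return '%s/%d' % (base, int(page) + 1)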
Example #4
    def parse(self, response):
        dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
        filename = os.sep.join([dirname, response.url.split('/')[-1]])
        article_text = Selector(response).xpath(
            '//div[@class="post"]').extract()[0]

        parser = Selector(text=article_text)

        article_title = parser.xpath(
            '//a[@id="cb_post_title_url"]/text()').extract()[0]
        title_link = parser.xpath(
            '//a[@id="cb_post_title_url"]/@href').extract()[0]

        article_text = article_text.replace(title_link, title_link[6:])

        item = ArticleItem()
        item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
        item['image_names'] = [x.split('/')[-1] for x in item['image_urls']]

        # process image links.
        for url in item['image_urls']:
            article_text = article_text.replace(url, url[6:])

        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, 'wb') as fp:
            fp.write(self.html_start_l + article_title.encode('utf-8') +
                     self.html_start_r +
                     article_text.encode('utf-8', 'ignore') + self.html_end)

        return item
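The cnblogs parser above saves each post into a small HTML skeleton; ArticleItem and the html_start_l / html_start_r / html_end fragments come from elsewhere in the project. One way they might look (the markup itself is an assumption):

    import scrapy

    class ArticleItem(scrapy.Item):
        image_urls = scrapy.Field()    # read by Scrapy's ImagesPipeline
        images = scrapy.Field()
        image_names = scrapy.Field()

    # hypothetical HTML skeleton used when writing the article to disk
    html_start_l = '<html><head><meta charset="utf-8"><title>'
    html_start_r = '</title></head><body>'
    html_end = '</body></html>'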
	def parse_item(self, response):
		text = Selector(response).xpath("//body//text()").re('(\w+)')
		
		for word in text:
			newtext = word.encode('utf8')
			hxs = HtmlXPathSelector(response)
			item = BSiteItem()
			if newtext == 'aerospace' or newtext == 'Aerospace' or newtext == 'AEROSPACE':
				print 'True'
				test = response.url
				print test
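The keyword test in parse_item compares against three hard-coded spellings of "aerospace". A case-insensitive check, sketched against the same response, covers those and any other capitalisation:

    for word in Selector(response).xpath("//body//text()").re('(\w+)'):
        if word.lower() == 'aerospace':
            print 'True'
            print response.url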
Example #6
    def parse(self, response):
        dirname = os.sep.join(['root'] + response.url.split('/')[2:-1])
        filename = os.sep.join(
            [dirname, response.url.split('/')[-1] + '.html'])
        # parse artitle text.
        article_text = Selector(response).xpath(
            '//div[@id="article_details"]').extract()[0]

        parser = Selector(text=article_text)

        # parse artile title.
        article_title = parser.xpath(
            '//span[@class="link_title"]/a/text()').extract()[0]
        article_links = parser.xpath(
            '//a[re:test(@href, "[^/]+/article/details/\d+")]/@href').extract(
            )

        # replace links.
        article_text = article_text.replace(
            'http://static.blog.csdn.net/css/blog_detail.css',
            '/static.blog.csdn.net/css/blog_detail.css')
        for link in article_links:
            article_text = article_text.replace(
                link, '/blog.csdn.net' + link + '.html')

        item = ArticleItem()
        item['image_urls'] = [x for x in parser.xpath('//img/@src').extract()]
        # handle such image(with watermark) url:
        #    http://img.blog.csdn.net/20140917165912117?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvaWFpdGk=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast
        item['image_names'] = [
            (lambda k: k
             if '?' not in k else k.split('?')[0] + '.png')(x).split('/')[-1]
            for x in item['image_urls']
        ]

        # process image links.
        for url in item['image_urls']:
            article_text = article_text.replace(
                url, (lambda k: k
                      if '?' not in k else k.split('?')[0] + '.png')(url)[6:])

        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename, 'wb') as fp:
            fp.write(self.html_start_l + article_title.encode('utf-8') +
                     self.html_start_r +
                     article_text.encode('utf-8', 'ignore') + self.html_end)

        return item
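Both blog-backup examples put the collected image URLs into item['image_urls'], which is the field Scrapy's ImagesPipeline looks at. To actually download the images, the project settings would need roughly this (the storage path is a placeholder):

    # settings.py (sketch)
    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    IMAGES_STORE = 'root/images'    # placeholder directory for downloaded images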
Example #7
    def parse_downurl(self, response):
        try:
            antivirus1 = response.css("#static_antivirus").extract()[0]
            antivirus = Selector(response).css(
                "#static_antivirus").extract()[0]
            # from the "Static Analysis -> Antivirus" results page
            antiresult = re.findall(
                "((Microsoft|Kaspersky|ESET\-NOD32)</td>\n\s*<td>\n\s*<span class=\"text\-error\")",
                antivirus.encode("utf-8"), re.S)
            # if the returned list is empty, ESET, Kaspersky and Microsoft did not flag this sample: skip it and return without storing anything
            if antiresult == []:
                return
            # extract the download URL behind the download button
            url = response.xpath("//a[contains(@class,'btn-primary')]/@href"
                                 ).extract()[0].encode('utf-8')
            url = urlparse.urljoin("https://malwr.com", url)

            item = MalwrItem()
            item['file_urls'] = [url]
            return item
        except Exception:
            pass
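parse_downurl returns a MalwrItem whose file_urls field matches what Scrapy's FilesPipeline expects, so the samples themselves are fetched by the pipeline. A minimal sketch of the item and the related settings (storage path is a placeholder):

    import scrapy

    class MalwrItem(scrapy.Item):
        file_urls = scrapy.Field()    # filled in by parse_downurl(), read by FilesPipeline
        files = scrapy.Field()        # populated by FilesPipeline after download

    # settings.py (sketch)
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
    }
    FILES_STORE = '/tmp/malwr_samples'    # placeholder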
    def parse(self, response):
        description = response.xpath(
            "//table[@class='itemlist']/tr[not(re:test(@class, "
            "'(spacer)'))]").extract()
        row = self.get_default_row_dict()
        # print description
        for i, v in enumerate(description):
            index = i
            if not row['rank']:
                value = Selector(text=v).xpath(
                    '//td[1]/span[@class="rank"]/text()').extract_first()
                row['rank'] = int(value.replace('.', '')) if value else 0

            if not row['story_text']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/text()').extract_first()
                row['story_text'] = value.encode("utf8") if value else ''

            if not row['link_href']:
                value = Selector(text=v).xpath(
                    '//td[3]/a[@class="storylink"]/@href').extract_first()
                # print value
                row['link_href'] = value if value else ''

            if not row['hn_user']:
                value = Selector(text=v).xpath(
                    '//a[@class="hnuser"]/text()').extract_first()
                row['hn_user'] = value.encode("utf8") if value else ''

            if not row['age']:
                value = Selector(text=v).xpath(
                    '//span[@class="age"]/a/text()').extract_first()
                row['age'] = int(value.split(' ')[0]) if value else 0

            if not row['total_comments']:
                value = Selector(text=v).xpath(
                    '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
                ).extract_first()
                if value:
                    value = value.encode('ascii', 'ignore').replace(
                        'comments', '') if value else ''
                    value = value.encode('ascii', 'ignore').replace(
                        'comment', '') if value else ''
                    row['total_comments'] = int(value) if represents_int(
                        value) else 0

            if not row['score']:
                value = Selector(text=v).xpath(
                    '//span[@class="score"]/text()').extract_first()
                row['score'] = int(value.split(' ')[0]) if value else 0

            if not row['hn_id_code']:
                value = Selector(
                    text=v).xpath('//tr[@class="athing"]/@id').extract_first()
                row['hn_id_code'] = int(value) if represents_int(value) else 0

            # save the row once every field has been filled in
            if all(v is not None for v in row.values()):
                print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
                data = row.copy()
                row = self.get_default_row_dict()
                self.comment_url.append(
                    'https://news.ycombinator.com/item?id=15318440')
                news_id = data['hn_id_code']
                item = NewsBotItem(data)
                yield item
                request = scrapy.Request(
                    url='https://news.ycombinator.com/item?id=' + str(news_id),
                    callback=self.parse_comment)
                request.meta['item'] = item
                request.meta['news_id'] = int(news_id)
                yield request

            if index % 2:
                row = self.get_default_row_dict()
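The Hacker News parser uses two helpers that are not shown, get_default_row_dict and represents_int. A plausible sketch of both, with the field names taken from the code above (represents_int is called as a plain function, so it is sketched at module level):

    def get_default_row_dict(self):
        # one entry per field parse() fills in; None marks "not seen yet"
        return {
            'rank': None,
            'story_text': None,
            'link_href': None,
            'hn_user': None,
            'age': None,
            'total_comments': None,
            'score': None,
            'hn_id_code': None,
        }

def represents_int(value):
    # True if value can be parsed as an integer
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False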
Example #10
    title_id = [x for x in title_id if x != "0"]

    j = 0
    #    if (len(title_id)!=len(title)):
    #        if(title[j].encode('utf-8')==' '):

    for i in range(0, len(title_id)):

        if title[j].encode("utf-8") == " ":
            j = i + 1
        ocr_title.writerow(
            [
                title[j].encode("utf-8"),
                title_id[i].encode("utf-8"),
                section_id.encode("utf-8"),
                grade_id.encode("utf-8"),
                subject_id.encode("utf-8"),
                term_id.encode("utf-8"),
            ]
        )
        j = j + 1
    # section

    section = Selector(text=content).xpath("//h4/text()").extract()

    for i in range(0, len(section)):
        ocr_section.writerow(
            [
                section[i].encode("utf-8"),
                i,
                grade_id.encode("utf-8"),
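The OCR fragment above writes rows through ocr_title and ocr_section, which behave like csv writers opened elsewhere in the script. A sketch of how they might be set up (file names are assumptions):

    import csv

    # hypothetical writer setup for the fragment above (Python 2 csv: binary mode)
    ocr_title = csv.writer(open('ocr_title.csv', 'wb'))
    ocr_section = csv.writer(open('ocr_section.csv', 'wb'))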
def extract_from_row(html, xpath):
    content = Selector(text=html).xpath(xpath).extract()[0]
    return content.encode('ascii', 'ignore')
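extract_from_row takes a raw HTML string plus an XPath and returns the first match with any non-ASCII characters stripped. A quick usage example (values are illustrative):

    row_html = u'<tr><td class="price">\u20ac120</td></tr>'
    price = extract_from_row(row_html, '//td[@class="price"]/text()')
    # price == '120': the euro sign is dropped by the ascii/'ignore' encode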
Example #13
 def parse_getxinwen(self,response):
     item = response.meta['item']
     dates = {}
     day = []
     time0 = Selector(response).xpath('//*[@id="zwconttb"]/div[2]/text()').extract()[0][4:14] # publication date of this news article
     k = 0
     for key in item:
         if key == 'xinwen':
             break
         else:
             k = k + 1
     if k == len(item.keys()): # check whether item['xinwen'] already exists
         item['xinwen'] = {}
     k = 0
     for key in item['xinwen']:
         if key == time0: # this publication date already exists: append the article's title, author, content and comments under it
             content = ''
             i = 0
             while i < len(Selector(response).xpath('//p/text()').extract()) - 3:
                 data = Selector(response).xpath('//p/text()').extract()[i]
                 content = content + data.encode("UTF-8",'ignore')
                 i = i + 1
             item['xinwen'][time0].append({
                 'date':time0,
                 'title':Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0],
                 #'author':Selector(response).xpath('//*[@id="zwconttbn"]/strong/a/text()').extract()[0],
                 'content':content,
                 'comments':{} # initialise empty; comments are added later
             })
         else:
             k = k + 1
     if k == len(item['xinwen'].keys()): # the date does not exist yet
         if k == 30: # stop once 30 days' worth of news has been collected
             return item
         # crawl the article body
         content = ''
         i = 0
         while i < len(Selector(response).xpath('//p/text()').extract()) - 3:
             data = Selector(response).xpath('//p/text()').extract()[i]
             content = content + data.encode("UTF-8",'ignore')
             i = i + 1
         # fewer than 30 days so far: add a new key for this date
         item['xinwen'][time0] = [{
                 'date':time0,
                 'title':Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0],
                 #'author':Selector(response).xpath('//*[@id="zwconttbn"]/strong/a/text()').extract()[0],
                 'content':content,
                 'comments':{} # initialise empty; comments are added later
             }]
     # day = item['xinwen'][time0]
     # for i in range(0, len(day)): # iterate over the list stored under this publication date
     #     if day[i]['title'] == Selector(response).xpath('//*[@id="zwconttbt"]/text()').extract()[0]: # find the dict belonging to this article
     #         j = 1
     #         while Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']').extract(): # crawl the comments one by one
     #             time1 = Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']/div[3]/div/div[2]/text()').extract()[0][4:23] # comment timestamp
     #             if Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']/div[3]/div/div/span/a/text()').extract(): # commenter's name
     #                name = Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']/div[3]/div/div/span/a/text()').extract()[0]
     #             else:
     #                 name = Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']/div[3]/div/div/span/span/text()').extract()[0]
     #             comment = ''
     #             for data in Selector(response).xpath('//*[@id="zwlist"]/div['+str(j)+']/div[3]/div/div[3]/child::text()').extract(): # comment body
     #                 comment = comment + data
     #             day[i]['comments'][time1] = {
     #                 'name':name,
     #                 'comment':comment
     #             } # store the crawled content in the dict
     #             j = j + 1
     #         break
     # item['xinwen'][time0] = day
     num = 0
     for key in item['xinwen']:
         num = num + len(item['xinwen'][key])
     if item['numberxin'][1] < item['numberxin'][0] and num == item['numberxin'][1]: # check whether every news link on the current page has been crawled and whether we can move on to the next page
         url = "http://guba.eastmoney.com/list,"+item['_id']+",1,f_"+str(item['numberxin'][2]+1)+".html"
         return Request(url, meta={'item':item}, callback=self.parse_xinwen)
     else:
         return item
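parse_getxinwen receives its working item through response.meta and hands it back when it schedules the next listing page, so the listing callback (parse_xinwen, not shown) has to pass the same item along. A rough sketch of that counterpart, with a placeholder link selector:

    def parse_xinwen(self, response):
        item = response.meta['item']
        # placeholder selector; the real spider would target the article list entries
        for href in response.xpath('//a/@href').extract():
            url = urlparse.urljoin('http://guba.eastmoney.com/', href)
            yield Request(url, meta={'item': item}, callback=self.parse_getxinwen)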