Example #1
    def parse_forum(self, response):
        """
        解析评论内容
        """
        item = Item()
        itemloader = ItemLoader(item=item, selector=response)
        for field in self.forum_xpath:
            item.fields[field] = Field()
            if 'main_body' in field:
                content = re.compile(r'<html><body><.*?>(.*?)</body></html>',
                                     re.S | re.M)
                content = content.findall(response.text)
                content = re.sub(r'<script>.*?</script>', '', ''.join(content))
                content = re.sub(r'[\r\n]', '', content)
                content = re.sub(r'<div .*?>.*?</div>', '', content)
                content = re.sub(r'<style .*?>.*?</style>', '', content,
                                 flags=re.S | re.M)
                content = re.sub(r'&.*?;', '', content)
                content = re.sub(r'<.*?>', '', content, flags=re.M | re.I)
                content = re.sub('  ', '', content)
                itemloader.add_value(field, content)
            elif 'content_url' in field:
                itemloader.add_value(field, response.url)
            else:
                itemloader.add_xpath(field, self.forum_xpath[field])

        item = self.format_item(itemloader.load_item())

        yield item
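These loader snippets all lean on the same trick: Scrapy lets you register Field objects on a bare Item at runtime, so one spider can populate whatever fields its XPath map defines. A minimal self-contained sketch of that pattern, with an invented field_map standing in for self.forum_xpath:

    from scrapy import Field, Item
    from scrapy.loader import ItemLoader

    # Hypothetical field -> XPath map of the kind self.forum_xpath holds.
    field_map = {
        'title': '//h1/text()',
        'content_url': None,  # taken from response.url, not from an XPath
    }

    def load_dynamic_item(response):
        item = Item()
        loader = ItemLoader(item=item, selector=response)
        for field, xpath in field_map.items():
            item.fields[field] = Field()  # register the field on the fly
            if xpath is None:
                loader.add_value(field, response.url)
            else:
                loader.add_xpath(field, xpath)
        return loader.load_item()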
Example #2
    def parse_shop(self, response):
        """
        解析提取model数据
        :param response:
        """
        item = Item()
        item_loader = ItemLoader(item=item, selector=response)
        for field in self.model_xpath:
            item.fields[field] = Field()
            if 'model_url' in field:
                item_loader.add_value(field, response.url)
            else:
                item_loader.add_xpath(field, self.model_xpath[field])

        item = self.format_item(item_loader.load_item())

        yield item

        # Build the user review URLs
        user_url = response.xpath(self.user_url).extract()
        for uid in user_url:
            brand = uid.split('-')[0]
            model = uid.split('-')[1]
            yield Request(
                self.user_review_url.format(brand=brand, model=model),
                self.parse_comment)
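The review links apparently encode brand and model as a 'brand-model' slug, which the spider splits and feeds into a URL template. A toy illustration of that hand-off (the template and slug are invented):

    # Hypothetical template of the kind self.user_review_url would hold.
    user_review_url = 'https://example.com/reviews/{brand}/{model}/'

    uid = 'audi-a4'
    brand, model = uid.split('-')[:2]
    url = user_review_url.format(brand=brand, model=model)
    # url == 'https://example.com/reviews/audi/a4/'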
Example #3
def PlayerRow(**fields):
    item = Item()
    for field in all_fields:
        item.fields[field] = Field()
    for field, value in fields.items():
        item[field] = value
    return item
Example #4
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['author_url'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     md5 = hashlib.md5()
     l.add_value(u'url', url)
     md5.update(url)
     url_md5 = md5.hexdigest()
     l.add_value(u'url_md5', url_md5)
     l.add_xpath('title', "//h1/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath(
         'pubtime',
         "//p[@class='clearfix']//span/text() | //div[@class='titleLine-gY7DniPB']//span/text()",
         MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('content', "//div[@class='text-3zQ3cZD4']//text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath(
         'author',
         "//p[@class='clearfix']/a[2]/text() | //div[@class='titleLine-gY7DniPB']/p/a[2]/text()",
         MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('author_url',
                 "//div[@class='titleLine-gY7DniPB']/p/a[2]/@href",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_value("site_name", u"大风号")
     yield l.load_item()
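The snippet above is Python 2 (note unicode and the bare str passed to md5.update). Under Python 3, hashlib only accepts bytes, so the fingerprinting step needs an explicit encode; a minimal sketch:

    import hashlib

    def url_fingerprint(url):
        # Python 3: hash input must be bytes, so encode the URL first.
        md5 = hashlib.md5()
        md5.update(url.encode('utf-8'))
        return md5.hexdigest()   # 32-character hex digest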
Example #5
 def __init__(self):
     """
     将item中的值初始化为空
     """
     Item.__init__(self)
     self['coal'] = ''
     self['non_coal'] = ''
Example #6
 def parse(self, response):
     item = Item()
     fields = json.loads(self.conf.get("fields"))
     l = ItemLoader(item, response)
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     l.add_value("url", response.url)
     l.add_value("spider_jobid", self.spider_jobid)
     # Create Fields and XPath rules from the dynamically loaded config (approach 1)
     for k in self.keys:
         if fields.get("fields", "") == "":
             logging.error(u"No parsed content obtained!!!")
             return l.load_item()
         if fields.get("fields").get(k) is not None:
             item.fields[k] = Field()
             if fields.get("fields").get(k).keys()[0] == "xpath":
                 l.add_xpath(
                     k,
                     u"{}".format(fields.get("fields").get(k).get("xpath")),
                     MapCompose(unicode.strip))
             elif fields.get("fields").get(k).keys()[0] == "value":
                 if fields.get("fields").get(k).get("value") == u"{TODAY}":
                     l.add_value(k, u"{}".format(datetime.now()))
                 else:
                     l.add_value(
                         k,
                         u"{}".format(fields.get("fields").get(k).get("value")))
     return l.load_item()
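The branches above imply a specific shape for the JSON behind self.conf.get("fields"): a top-level "fields" object whose every entry is a one-key dict keyed by either "xpath" or "value". A plausible config, reconstructed from those accesses (field names are illustrative):

    fields_conf = {
        "fields": {
            "title": {"xpath": "//h1/text()"},     # handled by add_xpath
            "source": {"value": "example-site"},   # constant via add_value
            "pubdate": {"value": "{TODAY}"},       # swapped for datetime.now()
        }
    }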
Example #7
 def __setitem__(self, name, value):
     processed_tag = code_to_name(name)
     if processed_tag in self._values and isinstance(
             self._values[processed_tag], BaseField):
         self._values[processed_tag].set_value(value)
     else:
         Item.__setitem__(self, name, value)
Example #8
    def save_item(item: Item, spider):
        if isinstance(item, MessageItem):

            def message_is_unique(message_model: Message, limit=20) -> bool:
                subquery = session.query(Message) \
                    .order_by(Message.created_at.desc(), Message.id.desc()) \
                    .limit(limit) \
                    .subquery()

                alias = aliased(Message, subquery)

                return not session.query(
                    session.query(alias).filter(
                        alias.text == message_model.text, alias.image
                        == message_model.image).exists()).scalar()

            def message_fit_the_length(message_model: Message) -> bool:
                if message_model.image:
                    return len(remove_tags(message_model.text)) <= 1024
                return len(remove_tags(message_model.text)) <= 4096

            message = Message(text=item.get('text'),
                              image=item.get('image'),
                              url=item.get('url'))

            if message_is_unique(message) and message_fit_the_length(message):
                with session.begin():
                    session.add(message)
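The helpers lean on a SQLAlchemy Message model with at least id, text, image, url and created_at columns (plus remove_tags from w3lib.html for the length checks). A minimal sketch of such a model, inferred from the queries above rather than taken from the source project:

    from sqlalchemy import Column, DateTime, Integer, String, Text, func
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class Message(Base):
        __tablename__ = 'messages'

        id = Column(Integer, primary_key=True)
        text = Column(Text)        # message body, may contain HTML tags
        image = Column(String)     # image URL, or empty
        url = Column(String)       # source URL
        created_at = Column(DateTime, server_default=func.now())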
Example #9
    def parse_user(self, response):
        """
        根据返回的 response 进行数据解析
        :param response: scrapy 框架返回的响应
        """
        result = {
            'user_name': response.meta['user_name'],
            'title': response.meta['title'],
            'date': response.meta['date'],
            'main_body': response.meta['main_body'],
            'content_url': response.meta['content_url'],
            'brand': response.meta['brand']
        }

        for content in response.xpath(self.user_list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=content)
            for field in self.user_xpath:
                item.fields[field] = Field()
                if 'user_url' in field:
                    item_loader.add_value(field, response.url)
                else:
                    item_loader.add_xpath(field, self.user_xpath[field])

            result.update(item_loader.load_item())
            item = self.format_item(result)
            yield item

        # User comments
        user_comment = response.xpath('.//ul/li[@class="Comments"]/a/@href').extract()
        if user_comment:
            yield Request(self.url + user_comment[0], self.parse_comment)
Example #10
    def __init__(self):
        Item.__init__(self)
        # globally unique ID for this crawl
        self['crawlerid'] = ''
        # page URL
        self['url'] = ''
        # raw HTML source
        self['html_code'] = ''
        # page encoding
        self['encoding'] = ''
        # title
        self['title'] = ''
        # authors
        self['authors'] = []
        # body text
        self['content'] = ''
        # article time
        self['time'] = ''
        # source
        self['source'] = ''
        # editor
        self['editor'] = ''
        # channel category
        self['ctype'] = ''
        # channel subcategory
        self['subtype'] = ''
        # keywords
        self['keywords'] = []
        # abstract
        self['abstract'] = ''

        self['copyright'] = ''
        self['originality'] = ''
        self['type'] = 'text'
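Pre-populating defaults this way avoids KeyError later in pipelines, since a Scrapy Item raises on declared-but-unset fields rather than returning None. A minimal illustration (NewsItem is invented):

    from scrapy import Field, Item

    class NewsItem(Item):
        title = Field()

    item = NewsItem()
    # Reading a declared-but-unset field raises:
    #   item['title']  ->  KeyError: 'title'
    item['title'] = ''   # defaulting, as the __init__ above does
    assert item['title'] == ''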
Example #11
    def parse(self, response):
        """
        根据返回的 response 进行数据解析
        :param response: scrapy 框架返回的响应
        """
        keyword = response.meta['keyword']
        for complaint in response.xpath(self.list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=complaint)
            for field in self.content_xpath:
                item.fields[field] = Field()
                if 'content_url' in field:
                    item_loader.add_value(field, response.url)
                elif 'brand' in field:
                    item_loader.add_value(field, keyword)
                else:
                    item_loader.add_xpath(field, self.content_xpath[field])

            # User links
            user_id = complaint.xpath('.//span[@class="Author"]/a/@href').extract()
            for uid in user_id:
                yield Request(self.url + uid, self.parse_user, meta=dict(item_loader.load_item()))

        # Next page
        next_page = response.xpath('//div[@id="PagerBefore"]/a[last()]/@href').extract()
        if next_page:
            yield Request(self.url + next_page[0], meta={'keyword': keyword}, callback=self.parse)
Example #12
    def parse(self, response):
        """
        解析网页获取评论
        :param response:响应内容
        """
        search = response.meta['keyword']
        # Comment listing page
        for complaint in response.xpath(self.list_xpath):
            item = Item()
            item_loader = ItemLoader(item=item, selector=complaint)
            for field in self.content_xpath:
                item.fields[field] = Field()
                if 'content_url' in field:
                    item_loader.add_value(field, response.url)
                elif 'brand' in field:
                    item_loader.add_value(field, search)
                else:
                    item_loader.add_xpath(field, self.content_xpath[field])
            # Build the user_url link
            uid = complaint.xpath('.//tr/td[@class="small"]/a/@href').extract()
            yield Request(self.urls + uid[0],
                          self.parse_user,
                          meta=dict(item_loader.load_item()),
                          dont_filter=True)

        # Next page of content
        next_page = response.xpath(
            '//div[@class="pagelinks"]/a[last()]/@href').extract()
        for page in next_page:
            yield Request(self.urls + page,
                          self.parse,
                          meta={'keyword': search})
Example #14
 def second_parse(self, response):
     href_list = response.xpath('//ul[@class="wp100"]/li//div/p[@class="fs22"]/a/@href')
     print(len(href_list), response.url)
     for href in href_list:
         item = Item()
         next_url = 'http://www.sinyi.com.cn/' + href.extract().split('/', 1)[1].split('&cookieuid=')[0]
         item.fields['HouseUrl'] = Field()
         item['HouseUrl'] = next_url
         yield scrapy.Request(next_url, callback=self.third_parse, meta={'item': item})
Example #15
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     url = url.replace("http://rym.quwenge.com/baidu_tiaozhuan.php?url=",
                       "")
     md5 = hashlib.md5()
     l.add_value(u'url', url)
     md5.update(url)
     url_md5 = md5.hexdigest()
     l.add_value(u'url_md5', url_md5)
     l.add_xpath('title', "//h1/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('pubtime', "//span[@class='read']/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('content', "//div[@id='content']//text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('author', "//div[@class='name']/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_value("site_name", u"百家号")
     yield l.load_item()
Example #16
 def parse_item(self, response):
     item = Item()
     item.fields['url'] = Field()
     item.fields['url_md5'] = Field()
     item.fields['title'] = Field()
     item.fields['pubtime'] = Field()
     item.fields['content'] = Field()
     item.fields['author'] = Field()
     item.fields['site_name'] = Field()
     l = ItemLoader(item=item, response=response)
     url = response.url
     md5 = hashlib.md5()
     l.add_value(u'url', url)
     md5.update(url)
     url_md5 = md5.hexdigest()
     l.add_value(u'url_md5', url_md5)
     l.add_xpath('title', "//h2/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('pubtime', "//p[@class='time']/span[1]/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('content', "//div[@class='content']//text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_xpath('author', "//p[@class='time']/span[last()]/text()",
                 MapCompose(unicode.lstrip, unicode.rstrip))
     l.add_value("site_name", u"网易号")
     yield l.load_item()
Example #17
 def normalize(item: Item) -> School:
     return School(name=item.get('name'),
                   id='SH-{}'.format(item.get('Dienststellennummer')),
                   address=item.get('Strasse'),
                   zip=item.get("Postleitzahl"),
                   city=item.get("Ort"),
                   email=item.get('E-Mail'),
                   school_type=item.get('Schularten'),
                   fax=item.get('Fax'),
                   phone=item.get('Telefon'),
                   director=item.get('Schulleitung'))
Example #18
 def handle_1(self, response):
     item1 = response.meta.get("item1")
     req = response.meta.get("req")
     fragments = re.findall(r"url \+= '(.*?)'", response.text, re.S)
     detail_url = ''
     for j in fragments:
         detail_url += j
     item2 = Item()
     item2.fields["NewUrl"] = Field()
     item2["NewUrl"] = detail_url
     item2.update(item1)
     yield scrapy.Request(url=detail_url, callback=self.parse, meta={"req": req, "item2": item2, "last_page": True})
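The regex reassembles a detail URL that the target page builds piece by piece in inline JavaScript. A small self-contained check of that extraction (the sample markup is invented for illustration):

    import re

    # Hypothetical inline script of the kind the spider is parsing.
    sample = """
    <script>
    var url = '';
    url += 'https://example.com';
    url += '/news/123';
    url += '.html';
    </script>
    """

    fragments = re.findall(r"url \+= '(.*?)'", sample, re.S)
    detail_url = ''.join(fragments)
    # detail_url == 'https://example.com/news/123.html'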
Example #19
 def normalize(item: Item) -> School:
     return School(name=item.get('name'),
                   id='RP-{}'.format(item.get('id')),
                   address=item.get('Adresse'),
                   city=re.split(r'\d{5}',
                                 item.get('Ort').strip())[1].strip(),
                   zip=re.findall(r'\d{5}', item.get('Ort'))[0],
                   website=item.get('Internet'),
                   email=item.get('E-Mail'),
                   school_type=item.get('Schulform'),
                   fax=item.get('Fax'),
                   phone=item.get('Telefon'))
Example #20
 def start_requests(self):
     for kw in self.kw_list:
         start_page = 1
         for page in range(start_page, 11):
             item = Item()
             item.fields["SearchWord"] = Field()
             item.fields["Page"] = Field()
             item["SearchWord"] = kw
             item["Page"] = page
             start_url = 'https://weixin.sogou.com/weixin?query={}&type=2&page={}&ie=utf8'.format(
                 parse.quote(kw), str(page))
             yield scrapy.Request(url=start_url, callback=self.parse,
                                  meta={"start_url": start_url, "item": item})
Example #21
 def normalize(item: Item) -> School:
     tel = item.get('telefon')
     return School(name=item.get('name'),
                   phone=tel,
                   fax=item.get('telefax'),
                   website=item.get('homepage'),
                   email=item.get('e-mail'),
                   address=item.get('straße'),
                   city=item.get('ort'),
                   zip=item.get('plz'),
                   school_type=item.get('schultyp'),
                   director=item.get('schulleitung'),
                   id='SL-{}'.format(tel.replace(" ", "-")))
Example #22
def on_duplicate_sql(*args, item: scrapy.Item):
    if args and isinstance(item, scrapy.Item):
        dup_keys = list()

        for index, key in enumerate(args):
            if index == 0:
                update_str = '  ON DUPLICATE KEY UPDATE {}="{}"'.format(key, item.get(key))
            else:
                update_str = '{}="{}"'.format(key, item.get(key))
            dup_keys.append(update_str)
        return ', '.join(dup_keys)
    else:
        return ''
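A quick usage sketch for the helper above (field names and values invented): the first key carries the ON DUPLICATE KEY UPDATE prefix and the rest are joined on as further assignments.

    import scrapy

    item = scrapy.Item()
    item.fields['name'] = scrapy.Field()
    item.fields['price'] = scrapy.Field()
    item['name'], item['price'] = 'foo', '9.99'

    clause = on_duplicate_sql('name', 'price', item=item)
    # clause == '  ON DUPLICATE KEY UPDATE name="foo", price="9.99"'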
Example #23
 def normalize(self, item: Item) -> School:
     return School(name=item.get('name'),
                   id='RP-{}'.format(item.get('id')),
                   address=item.get('Adresse'),
                   city=item.get('Ort'),
                   website=item.get('Internet'),
                   email=item.get('E-Mail'),
                   school_type=item.get('Schulform'),
                   fax=item.get('Fax'),
                   phone=item.get('Telefon'))
Example #24
 def normalize(item: Item) -> School:
     return School(name=item.get('name'),
                   phone=item.get('telefon'),
                   fax=item.get('fax'),
                   website=item.get('homepage'),
                   address=item.get('straße'),
                   city=item.get('ort'),
                   zip=item.get('plz'),
                   school_type=item.get('schultyp'),
                   id='HE-{}'.format(item.get('id')))
Example #25
 def start_requests(self):
     for url in self.start_urls:
         # Create a fresh item per request so the requests don't share state.
         item = Item()
         url_split = url.split(":")
         site_name = url_split[0]
         url = ":".join(url_split[1:-1])
         item.fields["site_name"] = Field()
         item['site_name'] = site_name
         item.fields["source_url"] = Field()
         item['source_url'] = url_split[-1]
         yield scrapy.Request(url=url,
                              meta={"item": item},
                              dont_filter=True,
                              callback=self.parse_url)
Example #26
 def normalize(item: Item) -> School:
     city_parts = item.get('Ort').split()
     zip_code, city = city_parts[0], ' '.join(city_parts[1:])
     return School(name=item.get('Schulname'),
                   id='TH-{}'.format(item.get('Schulnummer')),
                   address=item.get('Straße'),
                   zip=zip_code,
                   city=city,
                   website=item.get('Internet'),
                   email=ThueringenSpider._deobfuscate_email(item.get('E-Mail')),
                   school_type=item.get('Schulart'),
                   provider=item.get('Schulträger'),
                   fax=item.get('Telefax'),
                   phone=item.get('Telefon'))
Example #27
    def parse(self, response):
        req = response.meta.get("req")
        item = response.meta.get("item")
        url_list_handle = Selector(text=response.text)

        url_list = url_list_handle.xpath("//div[@class='txt-box']/h3/a/@href").extract()
        for index, url in enumerate(url_list):
            item1 = Item()
            item1.fields["Located"] = Field()
            item1["Located"] = index
            url = self.base_url + url
            url = self.get_real_url_handle(url)
            item1.update(item)
            yield scrapy.Request(url=url, callback=self.handle_1, meta={"req": req, "item1": item1})
Example #28
 def __init__(self, *args, **kwargs):
     Item.__init__(self, *args, **kwargs)
     self['platform'] = kwargs.get("platform")
     self['keyword'] = kwargs.get("keyword")
     self['crawl_time'] = int(time.time())
     self['url'] = kwargs.get("url")
     self['real_url'] = kwargs.get("real_url")
     self['title'] = kwargs.get("title")
     self['source_url'] = kwargs.get("source_url")
     self['spider'] = kwargs.get("spider")
     self['skip_url'] = kwargs.get("skip_url")
     self['snapshot_url'] = kwargs.get("snapshot_url")
     self['show_url'] = kwargs.get("show_url")
     self['is_ad'] = kwargs.get("is_ad")
     self['content'] = kwargs.get("content")
Example #29
 def normalize(item: Item) -> School:
     city_parts = item.get('adresse_ort').split()
     zip_code, city = city_parts[0], city_parts[1:]
     return School(name=item.get('schulname'),
                   id='HH-{}'.format(item.get('schul_id')),
                   address=item.get('adresse_strasse_hausnr'),
                   address2='',
                   zip=zip_code,
                   city=' '.join(city),
                   website=item.get('schul_homepage'),
                   email=item.get('schul_email'),
                   school_type=item.get('schulform'),
                   fax=item.get('fax'),
                   phone=item.get('schul_telefonnr'),
                   director=item.get('name_schulleiter'))
Example #30
 def normalize(item: Item) -> School:
     zip_code, *city_parts = item.get('city').split()
     return School(name=item.get('name'),
                   phone=item.get('phone'),
                   fax=item.get('fax'),
                   website=item.get('web'),
                   address=item.get('street'),
                   city=' '.join(city_parts),
                   zip=zip_code,
                   school_type=item.get('school_type'),
                   legal_status=item.get('type'),
                   id='BY-{}'.format(item.get('number')))
Example #31
 def normalize(item: Item) -> School:
     return School(name=item.get('Name'),
                   id='SA-{}'.format(item.get('ID')),
                   address=re.split(r'\d{5}', item.get('Adresse').strip())[0].strip(),
                   zip=re.findall(r'\d{5}', item.get('Adresse').strip())[0],
                   city=re.split(r'\d{5}', item.get('Adresse').strip())[1].strip(),
                   website=item.get('Homepage'),
                   email=item.get('E-Mail'),
                   fax=item.get('Telefax'),
                   phone=item.get('Telefon'),
                   )
Example #32
def test_add_stats_item_scraped_count_by_item_type(spider):
    for _ in range(15):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item={"_type": "regular_dict"},
            response="",
            spider=spider,
        )

    for _ in range(20):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=Item(),
            response="",
            spider=spider,
        )

    for _ in range(25):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=TestItem(),
            response="",
            spider=spider,
        )

    stats = spider.crawler.stats.get_stats()

    assert stats.get("spidermon_item_scraped_count") == 60
    assert stats.get("spidermon_item_scraped_count/dict") == 15
    assert stats.get("spidermon_item_scraped_count/Item") == 20
    assert stats.get("spidermon_item_scraped_count/TestItem") == 25
Example #33
    def __getitem__(self, name):
        """Retrieve field's data

        Parameters
        ----------
        name : str
            Name or code of the field to be retrieved.

        Returns
        ----------
        Whatever the field contained.

        Notes
        ----------
        Data returned by method may be of any type depending what the field
        contained.

        Examples
        ----------
        >>> record = Record(title="I'm a title")
        >>> record['245 a']
        "I'm a title"

        >>> record['title']
        "I'm a title"

        >>> record['issn']
        None
        """
        processed_tag = code_to_name(name)
        if processed_tag in self._values and isinstance(
                self._values[processed_tag], BaseField):
            return self._values[processed_tag].get_value()
        else:
            try:
                return Item.__getitem__(self, name)
            except KeyError:
                return None
Example #34
def PlayerRow(**data):
    item = Item()
    for field, value in data.items():
        item.fields[field] = Field()
        item[field] = value
    return item
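A usage sketch for either PlayerRow variant (field names invented); each keyword argument becomes a dynamically registered field on the returned Item:

    row = PlayerRow(name='A. Iverson', team='PHI', points=31)
    assert row['points'] == 31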