def parse(self, item, *args, **kwargs):
    try:
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate'
        ]
        self.exporter.insert_query = (
            "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) "
            "VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', "
            "'{section}', '{pdate}');")
        soup = BeautifulSoup(item['body'], features="lxml")
        category = soup.select_one(
            'h1.a-size-large.a-spacing-medium.zg-margin-left-15.a-text-bold'
        ).extract().get_text()
        for i in soup.select('span.aok-inline-block.zg-item'):
            newitem = CommonItem()
            for field in ('body', 'date', 'domain', 'spider_name', 'url',
                          'uuid', 'section', 'pdate'):
                newitem.fields[field] = CommonField()
            # Parse each ranking entry once instead of re-parsing it per field.
            entry = BeautifulSoup(str(i), features="lxml")
            title = entry.select_one('div.p13n-sc-truncate').extract().get_text()
            price = entry.select_one('span').extract().get_text()
            img = entry.select_one('img').attrs['src']
            newbody = {
                'category': str(category),
                'title': str(title),
                'price': str(price),
                'image': str(img),
            }
            # Strip quote and brace characters that would break the raw
            # INSERT template above.
            newitem['body'] = re.escape(str(newbody)).replace("'", " ") \
                .replace(",", " ").replace('"', ' ') \
                .replace('{', ' ').replace('}', ' ')
            newitem['date'] = item['date']
            newitem['domain'] = item['domain']
            newitem['spider_name'] = item['spider_name']
            newitem['url'] = item['url']
            newitem['uuid'] = str(uuid.uuid1())
            newitem['section'] = item['section']
            newitem['pdate'] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            yield newitem
    except Exception as ex:
        print(ex)
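# Because the INSERT template above interpolates values directly, parse()
# blanks out quotes and braces from the serialized body. A quick, made-up
# illustration of what that sanitisation yields:
import re

payload = {'category': 'Books', 'title': "O'Reilly", 'price': '$10'}
sanitised = re.escape(str(payload)).replace("'", " ").replace(",", " ") \
    .replace('"', ' ').replace('{', ' ').replace('}', ' ')
# Note: re.escape backslash-escapes regex metacharacters such as '$' and
# '{' before the quotes and braces are blanked out, so stray backslashes
# remain in the stored body.
print(sanitised)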
def content_parse_100(self, response):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = self.section
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
def run(self):
    import msgpack

    if not self.exporter:
        raise Exception('parser needs an exporter defined')
    self.isruning = True
    idle_time = 0
    while self.isruning:
        self.handler.management_info['parser_opened'] = self.parser_opened
        if self.handler.get_queue_cnt() > 0:
            idle_time = 0
            if not self.parser_opened:
                self.open_parser()
            data = self.handler.dequeue()
            # Compare by value, not identity: the original `data is not b''`
            # only checks object identity and can wrongly pass for an
            # empty bytes object.
            if data not in (b'', None):
                u_msg = msgpack.unpackb(data, raw=False)
                # The last element of the message maps positional indexes
                # to field names; rebuild the item from that mapping.
                item = CommonItem()
                for k, v in u_msg[-1].items():
                    if v != 'fields_info':
                        item.fields[v] = CommonField()
                        item[v] = u_msg[int(k)]
                if self.exporter:
                    try:
                        parse_generator = self.parse(item)
                        if parse_generator:
                            for p in parse_generator:
                                self.exporter.export_item(p)
                                self.handler.management_info['export_count'] += 1
                    except Exception as ex:
                        self.handler.management_info['current_exception'] = str(ex)
                        self.handler.management_info['export_err_count'] += 1
        else:
            idle_time += 1
            if idle_time > 60 and self.parser_opened:
                self.close_parser()
        time.sleep(1)
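# A minimal sketch of the queue message format run() appears to expect,
# inferred from the unpack logic above: a msgpack-encoded list whose last
# element maps positional indexes (as strings) to field names. The field
# names and values below are illustrative, not taken from the real queue.
import msgpack

def pack_item_for_queue(values, field_names):
    # values[i] corresponds to field_names[i]; run() looks the values up
    # via u_msg[int(k)] using this mapping.
    fields_info = {str(i): name for i, name in enumerate(field_names)}
    return msgpack.packb(values + [fields_info], use_bin_type=True)

# Example (hypothetical handler API):
# handler.enqueue(pack_item_for_queue(
#     ['20240101120000', 'https://example.com', 'example.com'],
#     ['date', 'url', 'domain']))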
def parse_content(self, response, section, url):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        # Prefer the main content container when present; fall back to
        # the raw body otherwise.
        contents = response.css('div#webContents').getall()
        item["body"] = contents[0] if contents else response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception as ex:
        pass
def parse(self, response, recursive, section):
    # The original reused the name `ext` for both the tldextract result
    # and the link extractor; use distinct names.
    ext_domain = tldextract.extract(urlparse(response.url).netloc)
    domain = ext_domain.registered_domain
    link_extractor = DomainPatternLinkExtractor(domain,
                                                canonicalize=True,
                                                unique=True)
    urls = []
    if recursive:
        try:
            # Only follow links from HTML responses; skip binary
            # (application/*) content types.
            content_type = response.headers['Content-Type']
            if content_type and content_type.decode("utf-8").lower().find(
                    "application") == -1:
                urls = [link.url for link in link_extractor.extract_links(response)]
            else:
                return
        except Exception as ex:
            pass
    for url in urls:
        yield response.follow(url, self.parse, cb_kwargs={
            'recursive': recursive,
            'section': section
        })
    try:
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception as ex:
        pass
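# DomainPatternLinkExtractor is project-specific and not shown in this
# section. A minimal sketch of the behaviour the parse() above relies on,
# assuming it restricts scrapy's LinkExtractor to the registered domain;
# the real class may apply additional URL patterns.
from scrapy.linkextractors import LinkExtractor

class DomainPatternLinkExtractor:
    def __init__(self, domain, canonicalize=True, unique=True):
        # Only follow links that stay on the crawled domain.
        self._extractor = LinkExtractor(allow_domains=[domain],
                                        canonicalize=canonicalize,
                                        unique=unique)

    def extract_links(self, response):
        return self._extractor.extract_links(response)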
def parse(self, response):
    ext_domain = tldextract.extract(urlparse(response.url).netloc)
    try:
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        # `section` was an undefined name in this signature; self.section
        # is assumed here, matching the content_parse_* callbacks.
        item["section"] = self.section
        yield item
    except Exception as ex:
        pass
def content_parse_50(self, response):
    try:
        # Queue the second page (?pg=2) before emitting this page's item.
        query_string = urllib.parse.urlencode({'pg': 2})
        yield response.follow(url=response.url + "?" + query_string,
                              callback=self.content_parse_100)
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = self.section
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
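# content_parse_50 appends "?pg=2" with plain string concatenation, which
# produces a malformed URL when response.url already carries a query
# string. A hedged alternative using only the standard library; the
# helper name is illustrative, not part of this codebase:
from urllib.parse import urlencode, urlsplit, urlunsplit, parse_qsl

def with_query_param(url, **params):
    # Merge new parameters into any existing query string.
    parts = urlsplit(url)
    query = dict(parse_qsl(parts.query))
    query.update({k: str(v) for k, v in params.items()})
    return urlunsplit(parts._replace(query=urlencode(query)))

# Example: with_query_param('https://example.com/list?sort=asc', pg=2)
# -> 'https://example.com/list?sort=asc&pg=2'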
def parse(cls, response):
    ext = tldextract.extract(urlparse(response.url).netloc)
    item = CommonItem()
    item.setdefault('date', datetime.datetime.now().strftime("%Y%m%d%H%M"))
    item.setdefault('url', response.url)
    item.setdefault('domain', ext.registered_domain)
    item.setdefault('body', response.body)
    item.setdefault('encoding', response.encoding)
    yield item
def parse(self, item, *args, **kwargs):
    self.exporter.fields_to_export = [
        'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section',
        'pdate'
    ]
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()
        # Drop configured stop words before scoring.
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')
        tm_result = tm.get(text)
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            for word in tm_result[1]:
                # Build one item and declare every field on it. The
                # original re-created CommonItem between declarations,
                # discarding all previously declared fields; 'encoding'
                # is also declared here since it is assigned below.
                new_item = CommonItem()
                for field in ('uuid', 'domain', 'url', 'word', 'word_point',
                              'date', 'section', 'pdate', 'encoding'):
                    new_item.fields[field] = CommonField()
                new_item["encoding"] = item["encoding"]
                new_item["uuid"] = item["uuid"]
                new_item["domain"] = item["domain"]
                new_item["url"] = item["url"]
                new_item["word"] = word[0]
                new_item["word_point"] = str(word[1])
                new_item["date"] = item["date"]
                new_item["section"] = item["section"]
                new_item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
                if self.start_pdate is None:
                    self.start_pdate = new_item["pdate"]
                yield new_item
    except Exception as ex:
        print(ex)
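# The textmine dependency is external to this snippet. From the usage
# above, tm.get(text) is assumed to return a sequence whose second
# element is an iterable of (word, score) pairs; this toy stand-in only
# illustrates that shape, not the real scoring logic.
class textmine:
    def get(self, text):
        # Naive frequency count standing in for the real keyword scorer.
        counts = {}
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return [text, ranked]

# Example: textmine().get("crawl crawl parse")
# -> ["crawl crawl parse", [("crawl", 2), ("parse", 1)]]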