def parse(self, item, *args, **kwargs):
    try:
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate'
        ]
        self.exporter.insert_query = (
            "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) "
            "VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', "
            "'{section}', '{pdate}');")
        soup = BeautifulSoup(item['body'], features="lxml")
        category = soup.select_one(
            'h1.a-size-large.a-spacing-medium.zg-margin-left-15.a-text-bold'
        ).extract().get_text()
        for i in soup.select('span.aok-inline-block.zg-item'):
            newitem = CommonItem()
            for field in ('body', 'date', 'domain', 'spider_name', 'url',
                          'uuid', 'section', 'pdate'):
                newitem.fields[field] = CommonField()
            # Parse each ranking entry once instead of re-parsing it per field.
            entry = BeautifulSoup(str(i), features="lxml")
            title = entry.select_one('div.p13n-sc-truncate').extract().get_text()
            price = entry.select_one('span').extract().get_text()
            img = entry.select_one('img').attrs['src']
            newbody = {
                'category': str(category),
                'title': str(title),
                'price': str(price),
                'image': str(img),
            }
            # Strip quote and brace characters that would break the raw
            # INSERT template above.
            newitem['body'] = re.escape(str(newbody)).replace("'", " ") \
                .replace(",", " ").replace('"', ' ') \
                .replace('{', ' ').replace('}', ' ')
            newitem['date'] = item['date']
            newitem['domain'] = item['domain']
            newitem['spider_name'] = item['spider_name']
            newitem['url'] = item['url']
            newitem['uuid'] = str(uuid.uuid1())
            newitem['section'] = item['section']
            newitem['pdate'] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            yield newitem
    except Exception as ex:
        print(ex)
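# Because the INSERT template above interpolates values directly, parse()
# blanks out quotes and braces from the serialized body. A quick, made-up
# illustration of what that sanitisation yields:
import re

payload = {'category': 'Books', 'title': "O'Reilly", 'price': '$10'}
sanitised = re.escape(str(payload)).replace("'", " ").replace(",", " ") \
    .replace('"', ' ').replace('{', ' ').replace('}', ' ')
# Note: re.escape backslash-escapes regex metacharacters such as '$' and
# '{' before the quotes and braces are blanked out, so stray backslashes
# remain in the stored body.
print(sanitised)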
def content_parse_100(self, response):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = self.section
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
def run(self):
    import msgpack

    if not self.exporter:
        raise Exception('parser needs an exporter defined')
    self.isruning = True
    idle_time = 0
    while self.isruning:
        self.handler.management_info['parser_opened'] = self.parser_opened
        if self.handler.get_queue_cnt() > 0:
            idle_time = 0
            if not self.parser_opened:
                self.open_parser()
            data = self.handler.dequeue()
            # Compare by value, not identity: the original `data is not b''`
            # only checks object identity and can wrongly pass for an
            # empty bytes object.
            if data not in (b'', None):
                u_msg = msgpack.unpackb(data, raw=False)
                # The last element of the message maps positional indexes
                # to field names; rebuild the item from that mapping.
                item = CommonItem()
                for k, v in u_msg[-1].items():
                    if v != 'fields_info':
                        item.fields[v] = CommonField()
                        item[v] = u_msg[int(k)]
                if self.exporter:
                    try:
                        parse_generator = self.parse(item)
                        if parse_generator:
                            for p in parse_generator:
                                self.exporter.export_item(p)
                                self.handler.management_info['export_count'] += 1
                    except Exception as ex:
                        self.handler.management_info['current_exception'] = str(ex)
                        self.handler.management_info['export_err_count'] += 1
        else:
            idle_time += 1
            if idle_time > 60 and self.parser_opened:
                self.close_parser()
        time.sleep(1)
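# A minimal sketch of the queue message format run() appears to expect,
# inferred from the unpack logic above: a msgpack-encoded list whose last
# element maps positional indexes (as strings) to field names. The field
# names and values below are illustrative, not taken from the real queue.
import msgpack

def pack_item_for_queue(values, field_names):
    # values[i] corresponds to field_names[i]; run() looks the values up
    # via u_msg[int(k)] using this mapping.
    fields_info = {str(i): name for i, name in enumerate(field_names)}
    return msgpack.packb(values + [fields_info], use_bin_type=True)

# Example (hypothetical handler API):
# handler.enqueue(pack_item_for_queue(
#     ['20240101120000', 'https://example.com', 'example.com'],
#     ['date', 'url', 'domain']))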
def parse_content(self, response, section, url):
    try:
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        # Prefer the main content container when present; fall back to
        # the raw body otherwise.
        contents = response.css('div#webContents').getall()
        item["body"] = contents[0] if contents else response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception as ex:
        pass
def parse(self, response, recursive, section):
    # The original reused the name `ext` for both the tldextract result
    # and the link extractor; use distinct names.
    ext_domain = tldextract.extract(urlparse(response.url).netloc)
    domain = ext_domain.registered_domain
    link_extractor = DomainPatternLinkExtractor(domain,
                                                canonicalize=True,
                                                unique=True)
    urls = []
    if recursive:
        try:
            # Only follow links from HTML responses; skip binary
            # (application/*) content types.
            content_type = response.headers['Content-Type']
            if content_type and content_type.decode("utf-8").lower().find(
                    "application") == -1:
                urls = [link.url for link in link_extractor.extract_links(response)]
            else:
                return
        except Exception as ex:
            pass
    for url in urls:
        yield response.follow(url, self.parse, cb_kwargs={
            'recursive': recursive,
            'section': section
        })
    try:
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = section
        yield item
    except Exception as ex:
        pass
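# DomainPatternLinkExtractor is project-specific and not shown in this
# section. A minimal sketch of the behaviour the parse() above relies on,
# assuming it restricts scrapy's LinkExtractor to the registered domain;
# the real class may apply additional URL patterns.
from scrapy.linkextractors import LinkExtractor

class DomainPatternLinkExtractor:
    def __init__(self, domain, canonicalize=True, unique=True):
        # Only follow links that stay on the crawled domain.
        self._extractor = LinkExtractor(allow_domains=[domain],
                                        canonicalize=canonicalize,
                                        unique=unique)

    def extract_links(self, response):
        return self._extractor.extract_links(response)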
def parse(self, response):
    ext_domain = tldextract.extract(urlparse(response.url).netloc)
    try:
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        # `section` was an undefined name in this signature; self.section
        # is assumed here, matching the content_parse_* callbacks.
        item["section"] = self.section
        yield item
    except Exception as ex:
        pass
def content_parse_50(self, response):
    try:
        # Queue the second page (?pg=2) before emitting this page's item.
        query_string = urllib.parse.urlencode({'pg': 2})
        yield response.follow(url=response.url + "?" + query_string,
                              callback=self.content_parse_100)
        ext_domain = tldextract.extract(urlparse(response.url).netloc)
        item = CommonItem()
        item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        item["url"] = response.url
        item["domain"] = ext_domain.registered_domain
        item["body"] = response.body
        item.fields["section"] = CommonField()
        item["section"] = self.section
        yield item
    except Exception as ex:
        self.handler.management_info['current_exception'] = str(ex)
        self.handler.management_info['spider_err_count'] += 1
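# content_parse_50 appends "?pg=2" with plain string concatenation, which
# produces a malformed URL when response.url already carries a query
# string. A hedged alternative using only the standard library; the
# helper name is illustrative, not part of this codebase:
from urllib.parse import urlencode, urlsplit, urlunsplit, parse_qsl

def with_query_param(url, **params):
    # Merge new parameters into any existing query string.
    parts = urlsplit(url)
    query = dict(parse_qsl(parts.query))
    query.update({k: str(v) for k, v in params.items()})
    return urlunsplit(parts._replace(query=urlencode(query)))

# Example: with_query_param('https://example.com/list?sort=asc', pg=2)
# -> 'https://example.com/list?sort=asc&pg=2'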
def parse(cls, response):
    ext = tldextract.extract(urlparse(response.url).netloc)
    item = CommonItem()
    item.setdefault('date', datetime.datetime.now().strftime("%Y%m%d%H%M"))
    item.setdefault('url', response.url)
    item.setdefault('domain', ext.registered_domain)
    item.setdefault('body', response.body)
    item.setdefault('encoding', response.encoding)
    yield item
def parse(self, item, *args, **kwargs):
    self.exporter.fields_to_export = [
        'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section',
        'pdate'
    ]
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()
        # Drop configured stop words before scoring.
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')
        tm_result = tm.get(text)
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            for word in tm_result[1]:
                # Build one item and declare every field on it. The
                # original re-created CommonItem between declarations,
                # discarding all previously declared fields; 'encoding'
                # is also declared here since it is assigned below.
                new_item = CommonItem()
                for field in ('uuid', 'domain', 'url', 'word', 'word_point',
                              'date', 'section', 'pdate', 'encoding'):
                    new_item.fields[field] = CommonField()
                new_item["encoding"] = item["encoding"]
                new_item["uuid"] = item["uuid"]
                new_item["domain"] = item["domain"]
                new_item["url"] = item["url"]
                new_item["word"] = word[0]
                new_item["word_point"] = str(word[1])
                new_item["date"] = item["date"]
                new_item["section"] = item["section"]
                new_item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
                if self.start_pdate is None:
                    self.start_pdate = new_item["pdate"]
                yield new_item
    except Exception as ex:
        print(ex)
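# The textmine dependency is external to this snippet. From the usage
# above, tm.get(text) is assumed to return a sequence whose second
# element is an iterable of (word, score) pairs; this toy stand-in only
# illustrates that shape, not the real scoring logic.
class textmine:
    def get(self, text):
        # Naive frequency count standing in for the real keyword scorer.
        counts = {}
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return [text, ranked]

# Example: textmine().get("crawl crawl parse")
# -> ["crawl crawl parse", [("crawl", 2), ("parse", 1)]]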