Example #1
    def parse(self, item, *args, **kwargs):
        try:
            tm = textmine.textmine()
            soup = BeautifulSoup(item['body'], 'html.parser')
            # .extract() on the root soup is a no-op; get_text() alone
            # yields the visible text.
            text = soup.get_text()

            if self.conf.has_option(self.section, 'exclude_keywords'):
                exclude_keywords = ast.literal_eval(
                    self.conf.get(self.section, 'exclude_keywords'))
                for ex_word in exclude_keywords:
                    text = text.replace(ex_word, '')

            tm_result = tm.get(text)

            # Declare the text-mining output fields before assignment.
            item.fields["top_sentence"] = CommonField()
            item.fields["top_word"] = CommonField()
            item.fields["sentences"] = CommonField()
            item.fields["words"] = CommonField()

            # tm.get() appears to return [ranked_sentences, ranked_words];
            # each sentence entry carries its text at index 2, each word
            # entry its token at index 0.
            if len(tm_result) > 0 and len(tm_result[0]) > 0:
                item["top_sentence"] = str(tm_result[0][0][2]).replace(
                    '\n', ' ').strip()

            if len(tm_result) > 1 and len(tm_result[1]) > 0:
                item["top_word"] = str(tm_result[1][0][0]).replace(
                    '\n', ' ').strip()

            if len(tm_result) > 0:
                item["sentences"] = str(tm_result[0]).replace('\n',
                                                              ' ').strip()
            if len(tm_result) > 1:
                item["words"] = str(tm_result[1]).replace('\n', ' ').strip()

            self.exporter.fields_to_export = [
                'uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word',
                'sentences', 'words', 'date', 'section'
            ]

            yield item

        except Exception as ex:
            print(ex)
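
Both this parser and the one in Example #2 index tm_result the same way. A minimal sketch of the shape that indexing implies; the exact structure of textmine's output is an assumption here, and every value below is illustrative:

    # Assumed shape, inferred from tm_result[0][0][2] (sentence text)
    # and tm_result[1][0][0] (word token) above.
    tm_result = [
        [(0, 0.91, "Highest-ranked sentence."), (1, 0.47, "Next one.")],  # sentences
        [("crawler", 12.4), ("keyword", 9.8)],                            # (word, score)
    ]

    top_sentence = tm_result[0][0][2]  # "Highest-ranked sentence."
    top_word = tm_result[1][0][0]      # "crawler"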
Example #2
    def parse(self, item, *args, **kwargs):
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section',
            'pdate'
        ]
        try:
            tm = textmine.textmine()
            soup = BeautifulSoup(item['body'], 'html.parser')
            text = soup.get_text()

            if self.conf.has_option(self.section, 'exclude_keywords'):
                exclude_keywords = ast.literal_eval(
                    self.conf.get(self.section, 'exclude_keywords'))
                for ex_word in exclude_keywords:
                    text = text.replace(ex_word, '')

            tm_result = tm.get(text)

            if len(tm_result) > 1 and len(tm_result[1]) > 0:
                for word in tm_result[1]:
                    # Build one item per extracted word; every field must be
                    # declared on the same instance before assignment.
                    new_item = CommonItem()
                    for field in ('uuid', 'domain', 'url', 'word', 'word_point',
                                  'date', 'section', 'pdate', 'encoding'):
                        new_item.fields[field] = CommonField()
                    new_item["encoding"] = item["encoding"]
                    new_item["uuid"] = item["uuid"]
                    new_item["domain"] = item["domain"]
                    new_item["url"] = item["url"]
                    new_item["word"] = word[0]
                    new_item["word_point"] = str(word[1])
                    new_item["date"] = item["date"]
                    new_item["section"] = item["section"]
                    new_item["pdate"] = datetime.datetime.now().strftime(
                        '%Y%m%d%H%M00')

                    if self.start_pdate is None:
                        self.start_pdate = new_item["pdate"]

                    yield new_item
        except Exception as ex:
            print(ex)
Example #3
    def process_item(self, item, spider):
        item.fields['fields_info'] = CommonField()
        item.fields['uuid'] = CommonField()
        item.fields['spider_name'] = CommonField()
        # Map positional index -> field name so the queue consumer can
        # rebuild the item from the packed value list (see the run() loop
        # in Example #7).
        fields_info = {}
        for idx, val in enumerate(item.fields):
            fields_info.setdefault(str(idx), val)

        item['fields_info'] = fields_info
        item['uuid'] = str(uuid.uuid1())
        item['spider_name'] = str(spider.name)

        self.exporter.fields_to_export = list(item.fields.keys())
        try:
            self.exporter.export_item(item)
        except Exception as ex:
            logger.error("QueueWriterPipeline Exception : %s", str(ex))

        return item
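
To picture what fields_info holds, here is a small illustration; the field set is hypothetical, since any real item carries its own fields:

    fields = ['body', 'date', 'fields_info', 'uuid', 'spider_name']  # assumed order
    fields_info = {str(idx): val for idx, val in enumerate(fields)}
    # {'0': 'body', '1': 'date', '2': 'fields_info', '3': 'uuid', '4': 'spider_name'}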
Example #4
	def content_parse(self, response, keyword):
		try:
			ext_domain = tldextract.extract(urlparse(response.url).netloc)
			item = CommonItem()
			item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
			item["url"] = response.url
			item["domain"] = ext_domain.registered_domain
			item["body"] = response.body
			item["encoding"] = response.encoding
			item.fields["section"] = CommonField()
			item["section"] = self.section

			item.fields["keyword"] = CommonField()
			item["keyword"] = keyword

			yield item
		except Exception as ex:
			self.handler.management_info['current_exception'] = str(ex)
			self.handler.management_info['spider_err_count'] += 1
Example #5
	def parse(self, item, *args, **kwargs):
		try:
			self.exporter.fields_to_export = ['uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate']
			self.exporter.insert_query = "INSERT INTO TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');"

			soup = BeautifulSoup(item['body'], features="lxml")

			category = soup.select_one('h1.a-size-large.a-spacing-medium.zg-margin-left-15.a-text-bold').get_text()

			cont_items = soup.select('span.aok-inline-block.zg-item')

			for i in cont_items:
				newitem = CommonItem()
				newitem.fields['body'] = CommonField()
				newitem.fields['date'] = CommonField()
				newitem.fields['domain'] = CommonField()
				newitem.fields['spider_name'] = CommonField()
				newitem.fields['url'] = CommonField()
				newitem.fields['uuid'] = CommonField()
				newitem.fields['section'] = CommonField()
				newitem.fields["pdate"] = CommonField()

				title = i.select_one('div.p13n-sc-truncate').get_text()
				# Re-parsing the block and selecting 'span' matches the outer
				# zg-item span itself, so this captures the block's full text.
				price = BeautifulSoup(str(i), features="lxml").select_one('span').get_text()
				img = i.select_one('img').attrs['src']

				newbody = {}
				newbody['category'] = str(category)
				newbody['title'] = str(title)
				newbody['price'] = str(price)
				newbody['image'] = str(img)

				newitem['body'] = re.escape(str(newbody)).replace("'", " ").replace(",", " ").replace('"', ' ').replace('{', ' ').replace('}', ' ')
				newitem['date'] = item['date']
				newitem['domain'] = item['domain']
				newitem['spider_name'] = item['spider_name']
				newitem['url'] = item['url']
				newitem['uuid'] = str(uuid.uuid1())
				newitem['section'] = item['section']
				newitem['pdate'] = datetime.datetime.now().strftime('%Y%m%d%H%M00')

				yield newitem

		except Exception as ex:
			print(ex)
Example #6
    def parse(self, item, *args, **kwargs):
        try:
            item.fields["pdate"] = CommonField()
            item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            # Crude escaping so the body can be inlined into the SQL
            # template below.
            item['body'] = re.escape(item['body']).replace("'", "''").replace(
                ",", " ").replace('\n', ' ')

            self.exporter.fields_to_export = [
                'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate'
            ]
            self.exporter.insert_query = "insert into tb_crawling (uuid,domain,url,body,date,section,pdate) values ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');"

            yield item

        except Exception as ex:
            print(ex)
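
The {uuid}-style placeholders suggest the exporter expands this template with str.format. A hedged sketch of that expansion, since the exporter's internals are not shown in these examples:

    # Presumed expansion inside the exporter (a sketch, not the actual code):
    query = insert_query.format(
        uuid=item['uuid'], domain=item['domain'], url=item['url'],
        body=item['body'], date=item['date'], section=item['section'],
        pdate=item['pdate'])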
Example #7
    def run(self):
        import msgpack  # hoisted out of the dequeue loop

        if not self.exporter:
            raise Exception('parser needs an exporter to be defined')

        self.isruning = True
        idle_time = 0
        while self.isruning:
            self.handler.management_info['parser_opened'] = self.parser_opened
            if self.handler.get_queue_cnt() > 0:
                idle_time = 0
                if not self.parser_opened:
                    self.open_parser()
                data = self.handler.dequeue()
                if data not in (b'', None):
                    # The last element of the message maps positional index
                    # (as a string) to field name; the earlier elements hold
                    # the field values in that order.
                    u_msg = msgpack.unpackb(data, raw=False)
                    item = CommonItem()
                    for k, v in u_msg[-1].items():
                        if 'fields_info' != v:
                            item.fields[v] = CommonField()
                            item[v] = u_msg[int(k)]
                    if self.exporter:
                        try:
                            parse_generator = self.parse(item)
                            if parse_generator:
                                for p in parse_generator:
                                    self.exporter.export_item(p)
                                    self.handler.management_info['export_count'] += 1

                        except Exception as ex:
                            self.handler.management_info['current_exception'] = str(ex)
                            self.handler.management_info['export_err_count'] += 1

            else:
                # Close the parser after roughly 60s of an empty queue.
                idle_time += 1
                if idle_time > 60 and self.parser_opened:
                    self.close_parser()
                time.sleep(1)
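
A hedged sketch of the producer side this consumer implies: field values are packed positionally, with the index-to-name map (what Example #3 stores as fields_info) as the final element. The payload below is illustrative only:

    import msgpack

    # The 'fields_info' slot itself is skipped when the item is rebuilt.
    fields_info = {'0': 'uuid', '1': 'url', '2': 'fields_info'}
    payload = ['c0ffee-1234', 'https://example.com', fields_info]
    data = msgpack.packb(payload, use_bin_type=True)
    # msgpack.unpackb(data, raw=False) round-trips to the same payload.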
Example #8
    def parse_content(self, response, section, url):
        try:
            ext_domain = tldextract.extract(urlparse(response.url).netloc)
            item = CommonItem()
            item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            item["url"] = response.url
            item["domain"] = ext_domain.registered_domain

            item["body"] = response.css('div#webContents').getall()[0] if len(
                response.css(
                    'div#webContents').getall()) > 0 else response.body

            item.fields["section"] = CommonField()
            item["section"] = section

            yield item
        except Exception as ex:
            pass
Example #9
    def parse(self, response, recursive, section):
        ext = tldextract.extract(urlparse(response.url).netloc)
        domain = ext.registered_domain
        link_extractor = DomainPatternLinkExtractor(domain,
                                                    canonicalize=True,
                                                    unique=True)
        urls = []

        if recursive:
            try:
                # Skip binary payloads (e.g. application/*) before
                # extracting links.
                content_type = response.headers.get('Content-Type')
                if content_type and content_type.decode("utf-8").lower().find("application") == -1:
                    urls = [link.url for link in link_extractor.extract_links(response)]
                else:
                    return
            except Exception as ex:
                pass
            for url in urls:
                yield response.follow(url,
                                      self.parse,
                                      cb_kwargs={
                                          'recursive': recursive,
                                          'section': section
                                      })

        try:
            item = CommonItem()
            item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            item["url"] = response.url
            item["domain"] = domain
            item["body"] = response.body

            item.fields["section"] = CommonField()
            item["section"] = section

            yield item
        except Exception as ex:
            pass
Example #10
    def parse(self, response):
        ext = tldextract.extract(urlparse(response.url).netloc)
        domain = ext.registered_domain

        try:
            item = CommonItem()
            item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            item["url"] = response.url
            item["domain"] = domain
            item["body"] = response.body

            item.fields["section"] = CommonField()
            # Assumes the spider stores its section on self, as in the
            # sibling spiders (Examples #4 and #11).
            item["section"] = self.section

            yield item
        except Exception as ex:
            pass
Example #11
    def content_parse_50(self, response):
        try:
            # Queue the next results page (pg=2) before yielding this
            # page's item.
            params = {'pg': 2}
            query_string = urllib.parse.urlencode(params)
            yield response.follow(url=response.url + "?" + query_string,
                                  callback=self.content_parse_100)
            ext_domain = tldextract.extract(urlparse(response.url).netloc)
            item = CommonItem()
            item["date"] = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            item["url"] = response.url
            item["domain"] = ext_domain.registered_domain
            item["body"] = response.body

            item.fields["section"] = CommonField()
            item["section"] = self.section
            yield item

        except Exception as ex:
            self.handler.management_info['current_exception'] = str(ex)
            self.handler.management_info['spider_err_count'] += 1
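
For reference, the query string built above is plain urlencode over a single key; the base URL below is hypothetical:

    import urllib.parse

    query_string = urllib.parse.urlencode({'pg': 2})   # 'pg=2'
    # 'https://example.com/bestsellers' + '?' + query_string
    #   -> 'https://example.com/bestsellers?pg=2'
    # Note: if response.url already carries a query string, this naive
    # concatenation would add a second '?'.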
Example #12
class kitaItem(CommonItem):
    section = CommonField()
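
A minimal usage sketch, assuming CommonItem behaves like a scrapy.Item subclass; the value is illustrative:

    item = kitaItem()
    item['section'] = 'news'   # 'section' is declared by the CommonField above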
Example #13
    def parse(self, item, *args, **kwargs):
        try:
            item.fields["pdate"] = CommonField()
            item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            self.data_reg_dt = datetime.datetime.now().strftime('%Y%m%d%H%M00')
            self.exporter.fields_to_export = [
                'uuid', 'domain', 'url', 'body', 'date', 'section', 'pdate'
            ]
            self.exporter.insert_query = "INSERT INTO	TB_CRAWLING (UUID,DOMAIN,URL,BODY,DATE,SECTION,PDATE) VALUES ('{uuid}', '{domain}', '{url}', '{body}', '{date}', '{section}', '{pdate}');"

            # Buffer each response body until both the GDP and population
            # feeds have arrived, then join them.
            if self.gdp_url == item['url']:
                self.gdp_res = item['body']
            elif self.pop_url == item['url']:
                self.pop_res = item['body']

            if self.pop_res and self.gdp_res:

                pop_dict = {}

                # Index population values by a country key and base year.
                for dicts in json.loads(self.pop_res)[1]:
                    NAT_CD = dicts['country']['id'].strip()
                    NAT_NAME = dicts['country']['value'].strip()
                    ISO_WD3_NAT_CD = dicts['countryiso3code'].strip()
                    BASE_YR = dicts['date'].strip()
                    POPLTN_VAL = dicts['value']

                    pop_dict.setdefault(
                        '|'.join(map(str, [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD])),
                        {}).setdefault(BASE_YR, POPLTN_VAL)

                # Walk the GDP records, pairing each with its buffered
                # population value for the same country and year.
                for dicts in json.loads(self.gdp_res)[1]:
                    NAT_CD = dicts['country']['id'].strip()
                    NAT_NAME = dicts['country']['value'].strip()
                    ISO_WD3_NAT_CD = dicts['countryiso3code'].strip()
                    BASE_YR = dicts['date'].strip()
                    GDP_VAL = dicts['value']

                    try:
                        POPLTN_VAL = pop_dict['|'.join(
                            map(str,
                                [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD]))][BASE_YR]
                    except KeyError:
                        POPLTN_VAL = ''

                    res_line = [
                        NAT_CD, NAT_NAME, ISO_WD3_NAT_CD, BASE_YR, GDP_VAL,
                        POPLTN_VAL, self.data_reg_dt
                    ]

                    item['body'] = '|^|'.join(map(str, res_line))
                    yield item
                    # Drop the consumed entry so only population-only
                    # records remain for the loop below.
                    del pop_dict['|'.join(
                        map(str, [NAT_CD, NAT_NAME, ISO_WD3_NAT_CD]))][BASE_YR]

                # Emit the population-only records that had no matching
                # GDP entry.
                for dicts in pop_dict:
                    for BASE_YR in pop_dict[dicts]:
                        POPLTN_VAL = pop_dict[dicts][BASE_YR]
                        NAT_CD, NAT_NAME, ISO_WD3_NAT_CD = dicts.split('|')
                        GDP_VAL = ''

                        res_line = [
                            NAT_CD, NAT_NAME, ISO_WD3_NAT_CD, BASE_YR, GDP_VAL,
                            POPLTN_VAL, self.data_reg_dt
                        ]
                        item['body'] = '|^|'.join(map(str, res_line))
                        yield item

                self.gdp_res = None
                self.pop_res = None

        except Exception as ex:
            print(ex)
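
The field access above implies the usual World Bank API response shape: element 0 is paging metadata and element 1 is the record list. A sketch with illustrative values (treat the exact metadata keys as an assumption):

    sample = [
        {"page": 1, "pages": 1, "total": 1},   # paging metadata (assumed keys)
        [
            {
                "country": {"id": "KR", "value": "Korea, Rep."},
                "countryiso3code": "KOR",
                "date": "2020",
                "value": 51836239,              # indicator value for that year
            }
        ],
    ]
    # One emitted body line then looks like (illustrative):
    # KR|^|Korea, Rep.|^|KOR|^|2020|^|<GDP_VAL>|^|51836239|^|<pdate>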