    def __init__(self, root_url, num_threads=1):
        root_dir = os.path.abspath(__file__)
        root_dir = '/'.join(root_dir.split('/')[-2:])
        root_dir = root_dir.replace('crawler-', '').replace('.py', '')

        verbose('root directory for storing data is {}'.format(root_dir))
        
        super().__init__(root_url = root_url,
                         root_dir = root_dir,
                         num_threads = num_threads)

        self.month_alias = MonthAlias()

    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]): 
            script.extract()

        content = soup.find(class_='db-contentScn')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            verbose(' Content:=')
            verbose('  size: {}'.format(len(content)))        
            year, month = self.extract_year_month(url, soup)
            log.info('year, month = {}, {}'.format(year, month))

            verbose('  year/month: {}/{}'.format(year, month))
            name = '___'.join(
                url.split('?')[0].split('/')[-2:]
            ).replace('.html', '')

            log.debug(content)
            paras = content.findAll('p')
            log.debug(pformat(paras))

            path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)

            for d in self.SUBDIRS:
                mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

            
            page_content  = '\n'.join(p.text for p in paras)
            page_abstract = paras[0].text.strip()
            title         = soup.find('h1')

            # 'breadCrums' is this site's own (misspelled) CSS class name
            breadcrumbs  = soup.find(class_='breadCrums').findAll('a')
            breadcrumbs  = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in breadcrumbs])

            tags = soup.find(class_='tglst').findAll('a')
            tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in tags])
                                
            log.info(title.text)
            log.info(breadcrumbs)
            log.info(tags)

            record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url, title.text.strip(),
                                breadcrumbs, tags)

            return (path_suffix,
                    record, 
                    {
                        self.ARTICLES_DIR : page_content
                        , self.ABSTRACTS_DIR: page_abstract
                    }
            )
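Both the path_suffix and the mkdir calls above go through self.month_alias[month], but the MonthAlias class itself is not part of these snippets. A minimal sketch of what it might look like, assuming month arrives as a 1-12 number, a digit string, or an English month name (hypothetical, not the examples' actual implementation):

import calendar

class MonthAlias:
    # Hypothetical sketch: maps a month (3, '3', or 'March') to a
    # short directory alias such as 'mar'.
    def __init__(self):
        # calendar.month_abbr is ['', 'Jan', ..., 'Dec']
        self._by_number = {i: calendar.month_abbr[i].lower()
                           for i in range(1, 13)}
        self._by_name = {calendar.month_name[i].lower(): self._by_number[i]
                         for i in range(1, 13)}

    def __getitem__(self, month):
        if isinstance(month, int):
            return self._by_number[month]
        key = str(month).strip().lower()
        if key.isdigit():
            return self._by_number[int(key)]
        return self._by_name[key]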
Code example #3
    def process_page(self, page_name, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]): 
            script.extract()

        content = soup.find(class_='article')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(page_name))
            raise Exception('content extraction failed: {}'.format(page_name))
        else:
            verbose(' Content:=')
            verbose('  size: {}'.format(len(content)))        
            year, month = self.extract_year_month(page_name, soup)
            log.info('year, month = {}, {}'.format(year, month))

            verbose('  year/month: {}/{}'.format(year, month))
            m = re.search(r'{}/.*/([^/]+)\.html'.format(self.ROOT_URL), page_name)
            if m:
                log.debug(pformat(m))
                name = m.group(1)
            else:
                uid_ += 1
                name = '{}'.format(uid_)

            log.debug(content)
            paras = content.findAll('p')
            log.debug(pformat(paras))

            path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)

            for d in self.SUBDIRS:
                mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

            
            page_content  = '\n'.join(p.text for p in paras)
            page_abstract = paras[0].text.strip()
            title         = soup.find(class_='headline')
            record        = '{}|{}'.format(path_suffix.strip(), title.text.strip())

            log.info(title.text)

            return (path_suffix,
                    record, 
                    {
                        self.ARTICLES_DIR : page_content
                        , self.ABSTRACTS_DIR: page_abstract
                    }
            )
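Every variant calls self.extract_year_month(url, soup), which is defined outside these snippets. One plausible implementation, assuming the target sites embed the publish date in the URL path as /YYYY/MM/ and expose a standard meta tag as a fallback (both assumptions):

import re

def extract_year_month(self, url, soup):
    # Hypothetical sketch: recover (year, month) from a URL like
    # https://example.com/news/2019/07/some-story.html.
    m = re.search(r'/(\d{4})/(\d{1,2})/', url)
    if m:
        return m.group(1), int(m.group(2))
    # Fallback assumption: an article:published_time meta tag carrying
    # an ISO date, e.g. content="2019-07-21T08:15:00+05:30".
    meta = soup.find('meta', attrs={'property': 'article:published_time'})
    if meta and meta.get('content'):
        year, month = meta['content'][:7].split('-')
        return year, int(month)
    raise ValueError('no date found in {}'.format(url))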
Code example #4
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style", "iframe"]):
            script.extract()

        content = soup.find(class_='_picCon _disable_copy _munchDiscuss')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(url.split('?')[0].split('/')[-2:]).replace(
                    '.html', '')

                log.debug(content)
                # paras = content.findAll('p')
                # log.debug(pformat(paras))

                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month],
                                                    name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                # page_content = '\n'.join(p.text for p in paras)
                page_content = content.text.replace(u'\xa0', '')
                page_abstract = soup.find(class_='small_intro').text.strip()
                title = soup.find(class_='arH LineHiet')

                breadcrumbs = soup.find(class_='breadcrumbs').findAll('a')
                breadcrumbs = ','.join([
                    b.text.replace('\n', '').replace('\r', '')
                    for b in breadcrumbs
                ])

                tags = soup.find(class_='_tag pb-0 pb-md-3').findAll('a')
                tags = ','.join(
                    [b.text.replace('\n', '').replace('\r', '') for b in tags])
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                                 title.text.strip(),
                                                 breadcrumbs, tags)
                return (path_suffix, record, {
                    self.ARTICLES_DIR: page_content,
                    self.ABSTRACTS_DIR: page_abstract
                })
            except Exception as e:
                log.error('error while processing {}'.format(url))
                verbose('Error while processing: {}'.format(e))
Code example #5
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]):
            script.extract()

        # guard the chained lookup so the `if not content` check below can fire
        article = soup.find(class_='article')
        content = article.find('arttextxml') if article else None

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(url.split('?')[0].split('/')[-5:-2]).replace(
                    '.html', '')

                log.debug(content)
                # paras = content.findAll('p')
                # log.debug(pformat(paras))

                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month],
                                                    name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                page_content = content.text
                page_abstract = soup.find(class_='artsyn').text
                title = soup.find(class_='leftmain').findAll('h1')[0]
                verbose(title)
                breadcrumbs = soup.find(class_='breadcrumb').findAll('li')
                breadcrumbs = ','.join([
                    b.text.replace('\n', '').replace('\r', '')
                    for b in breadcrumbs
                ])

                tags = soup.find(class_='keyinfo').findAll('a')
                tags = ','.join([
                    b.text.replace('\n', '').replace('\r', '').replace('|', '')
                    for b in tags
                ])
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                                 title.text.strip(),
                                                 breadcrumbs, tags)

                return (path_suffix, record, {
                    self.ARTICLES_DIR: page_content,
                    self.ABSTRACTS_DIR: page_abstract
                })
            except Exception as e:
                log.error('error while processing {}'.format(url))
                verbose('Error while processing: {}'.format(e))
Code example #6
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]):
            script.extract()

        content = soup.find(class_='rightsec')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')                
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(
                    url.split('?')[0].split('/')[-2:]
                ).replace('.html', '')

                log.debug(content)
                paras = content.findAll('p')
                log.debug(pformat(paras))
                
                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month], name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                page_content = '\n'.join(p.text for p in paras)
                page_abstract = soup.find(class_='synopsis').text
                title = soup.find(class_='storytop').find('h1')
                
                breadcrumbs = soup.find_all('div', class_=['breadcrumb', 'MT30'])[0].find('ul').find_all('li')
                breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '').replace('»', '').strip()
                                        for b in breadcrumbs])
                # tags = soup.find(class_='tag-list').findAll('a')
                # tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                #                 for b in tags])
                tags = "" #No tags
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url, title.text.strip(),
                                                breadcrumbs, tags)
                return (path_suffix,
                        record,
                        {
                            self.ARTICLES_DIR: page_content, self.ABSTRACTS_DIR: page_abstract
                        }
                        )
            except Exception as e:
                log.error('error while processing {}'.format(url))
                verbose('Error while processing: {}'.format(e))