def __init__(self, root_url, num_threads=1):
    # Derive the data directory from this source file's path: keep the
    # last two path components, then strip the 'crawler-' prefix and the
    # '.py' suffix (so 'sites/crawler-foo.py' becomes 'sites/foo').
    root_dir = os.path.abspath(__file__)
    root_dir = '/'.join(root_dir.split('/')[-2:])
    root_dir = root_dir.replace('crawler-', '').replace('.py', '')
    verbose('root directory for storing data is {}'.format(root_dir))
    super().__init__(root_url=root_url,
                     root_dir=root_dir,
                     num_threads=num_threads)
    self.month_alias = MonthAlias()
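# The helpers used throughout these crawlers -- log, verbose(), mkdir()
# and MonthAlias -- are defined elsewhere in the repo. A minimal sketch
# of what they are assumed to look like, for reference only; the real
# implementations may differ:

import logging
import os

log = logging.getLogger(__name__)

def verbose(msg):
    # Assumed: a thin wrapper over print/logging for progress output.
    print(msg)

def mkdir(path):
    # Assumed: create the directory tree, tolerating existing directories.
    os.makedirs(path, exist_ok=True)

class MonthAlias:
    # Assumed: maps a month (number or name) to a canonical directory
    # name, e.g. month_alias[1] -> 'jan'; __getitem__ matches the
    # self.month_alias[month] usage in process_page() below.
    _NAMES = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

    def __getitem__(self, month):
        if isinstance(month, int):
            return self._NAMES[month - 1]
        return str(month).strip().lower()[:3]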
def process_page(self, url, soup):
    # Remove all javascript and stylesheet code.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='db-contentScn')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        raise Exception('content extraction failed: {}'.format(url))

    verbose(' Content:=')
    verbose(' size: {}'.format(len(content)))
    year, month = self.extract_year_month(url, soup)
    log.info('year, month = {}, {}'.format(year, month))
    verbose(' year/month: {}/{}'.format(year, month))

    # File name: last two URL path components, query string stripped.
    name = '___'.join(url.split('?')[0].split('/')[-2:]).replace('.html', '')

    log.debug(content)
    paras = content.findAll('p')
    log.debug(pformat(paras))

    path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
    for d in self.SUBDIRS:
        mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

    page_content = '\n'.join(p.text for p in paras)
    page_abstract = paras[0].text.strip()
    title = soup.find('h1')
    breadcrumbs = soup.find(class_='breadCrums').findAll('a')
    breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '')
                            for b in breadcrumbs])
    tags = soup.find(class_='tglst').findAll('a')
    tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                     for b in tags])

    log.info(title.text)
    log.info(breadcrumbs)
    log.info(tags)

    record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                     title.text.strip(), breadcrumbs, tags)
    return (path_suffix, record,
            {self.ARTICLES_DIR: page_content,
             self.ABSTRACTS_DIR: page_abstract})
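# Note on the shared contract: every process_page() variant in this
# section returns (path_suffix, record, payload), where path_suffix is
# the relative output path 'YYYY/mon/name.txt', record is a
# pipe-delimited index line, and payload maps an output subdirectory
# (self.ARTICLES_DIR or self.ABSTRACTS_DIR) to the text to write there.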
def process_page(self, page_name, soup):
    global uid_
    # Remove all javascript and stylesheet code.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='article')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(page_name))
        raise Exception('content extraction failed: {}'.format(page_name))

    verbose(' Content:=')
    verbose(' size: {}'.format(len(content)))
    year, month = self.extract_year_month(page_name, soup)
    log.info('year, month = {}, {}'.format(year, month))
    verbose(' year/month: {}/{}'.format(year, month))

    # Name the file after the last URL path component; fall back to a
    # global counter when the URL does not match the expected pattern.
    # The dot in '.html' is escaped and ROOT_URL is re.escape()d so that
    # dots in the domain are matched literally.
    m = re.search(r'{}/.*/([^/]+)\.html'.format(re.escape(self.ROOT_URL)),
                  page_name)
    if m:
        log.debug(pformat(m))
        name = m.group(1)
    else:
        uid_ += 1
        name = '{}'.format(uid_)

    log.debug(content)
    paras = content.findAll('p')
    log.debug(pformat(paras))

    path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
    for d in self.SUBDIRS:
        mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

    page_content = '\n'.join(p.text for p in paras)
    page_abstract = paras[0].text.strip()
    title = soup.find(class_='headline')
    record = '{}|{}'.format(path_suffix.strip(), title.text.strip())
    log.info(title.text)

    return (path_suffix, record,
            {self.ARTICLES_DIR: page_content,
             self.ABSTRACTS_DIR: page_abstract})
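# extract_year_month() is called by every process_page() here but is
# defined elsewhere in the repo. A hypothetical sketch of one plausible
# implementation, assuming the URL embeds the date as .../YYYY/MM/...;
# the real version may instead read a date element out of the page
# (which would explain the soup argument):

def extract_year_month_sketch(url, soup):
    import re
    m = re.search(r'/(\d{4})/(\d{1,2})(?:/|$)', url)
    if m:
        return int(m.group(1)), int(m.group(2))
    raise ValueError('no year/month found in url: {}'.format(url))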
def process_page(self, url, soup):
    # Remove all javascript, stylesheet and iframe code.
    for script in soup(["script", "style", "iframe"]):
        script.extract()

    content = soup.find(class_='_picCon _disable_copy _munchDiscuss')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        verbose(' size: {}'.format(len(content)))
        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        name = '___'.join(url.split('?')[0].split('/')[-2:]).replace('.html', '')
        log.debug(content)

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        # Whole-container text instead of per-paragraph extraction;
        # drop non-breaking spaces.
        page_content = content.text.replace(u'\xa0', '')
        page_abstract = soup.find(class_='small_intro').text.strip()
        title = soup.find(class_='arH LineHiet')
        breadcrumbs = soup.find(class_='breadcrumbs').findAll('a')
        breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in breadcrumbs])
        tags = soup.find(class_='_tag pb-0 pb-md-3').findAll('a')
        tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                         for b in tags])

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs, tags)
        return (path_suffix, record,
                {self.ARTICLES_DIR: page_content,
                 self.ABSTRACTS_DIR: page_abstract})
    except Exception as e:
        # Swallow and report: a failed page should not kill the crawl.
        verbose('Error while processing: {}'.format(e))
def process_page(self, url, soup):
    # Remove all javascript and stylesheet code.
    for script in soup(["script", "style"]):
        script.extract()

    # Guard the nested lookup: find() returns None when the class is absent.
    article = soup.find(class_='article')
    content = article.find('arttextxml') if article else None
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        verbose(' size: {}'.format(len(content)))
        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        # File name from the URL path components preceding the article id.
        name = '___'.join(url.split('?')[0].split('/')[-5:-2]).replace('.html', '')
        log.debug(content)

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        page_content = content.text
        page_abstract = soup.find(class_='artsyn').text
        title = soup.find(class_='leftmain').findAll('h1')[0]
        verbose(title)
        breadcrumbs = soup.find(class_='breadcrumb').findAll('li')
        breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in breadcrumbs])
        tags = soup.find(class_='keyinfo').findAll('a')
        tags = ','.join([b.text.replace('\n', '').replace('\r', '').replace('|', '')
                         for b in tags])

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs, tags)
        return (path_suffix, record,
                {self.ARTICLES_DIR: page_content,
                 self.ABSTRACTS_DIR: page_abstract})
    except Exception as e:
        verbose('Error while processing: {}'.format(e))
def process_page(self, url, soup):
    # Remove all javascript and stylesheet code.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='rightsec')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        verbose(' size: {}'.format(len(content)))
        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        name = '___'.join(url.split('?')[0].split('/')[-2:]).replace('.html', '')
        log.debug(content)
        paras = content.findAll('p')
        log.debug(pformat(paras))

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        page_content = '\n'.join(p.text for p in paras)
        page_abstract = soup.find(class_='synopsis').text
        title = soup.find(class_='storytop').find('h1')
        breadcrumbs = (soup.find_all('div', class_=['breadcrumb', 'MT30'])[0]
                       .find('ul').find_all('li'))
        breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '')
                                .replace('»', '').strip()
                                for b in breadcrumbs])
        tags = ''  # this site exposes no tag list

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs, tags)
        return (path_suffix, record,
                {self.ARTICLES_DIR: page_content,
                 self.ABSTRACTS_DIR: page_abstract})
    except Exception as e:
        verbose('Error while processing: {}'.format(e))
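# A minimal end-to-end sketch of how the base crawler is assumed to drive
# one of these process_page() methods: fetch, parse, process, write. The
# function name, URL handling and output layout here are illustrative
# placeholders, not the repo's actual driver.

def crawl_one(crawler, url):
    import requests
    from bs4 import BeautifulSoup

    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    result = crawler.process_page(url, soup)
    if result is None:  # some variants swallow errors and return nothing
        return None
    path_suffix, record, payload = result
    # payload maps an output subdirectory to the text destined for it;
    # process_page() has already created the year/month directories.
    for subdir, text in payload.items():
        with open('{}/{}'.format(subdir, path_suffix), 'w') as f:
            f.write(text)
    return record  # caller appends this to the pipe-delimited index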