def parse_article(self, html):
    # Do some preprocessing
    soup = super(SageSource, self).parse_article(html)
    if not soup:
        return False

    # To download tables, we need the content URL and the number of tables
    content_url = soup.find('meta', {'name': 'citation_public_url'})['content']
    n_tables = len(soup.find_all('span', class_='table-label'))

    # Now download each table and parse it
    tables = []
    for i in range(n_tables):
        t_num = i + 1
        url = '%s/T%d.expansion.html' % (content_url, t_num)
        table_html = scrape.get_url(url)
        table_html = self.decode_html_entities(table_html)
        table_soup = BeautifulSoup(table_html)
        tc = table_soup.find(class_='table-expansion')
        t = tc.find('table', {'id': 'table-%d' % t_num})
        t = self.parse_table(t)
        if t:
            t.number = t_num
            t.title = tc.find(class_='table-label').text
            try:
                t.caption = tc.find(class_='table-caption').get_text()
            except:
                pass
            try:
                t.notes = tc.find(class_='table-footnotes').get_text()
            except:
                pass
            tables.append(t)

    self.article.tables = tables
    return self.article
def FindJobs(jobTitle: str, jobType: str, filterPattern: str):
    base_url = "https://www.indeed.fr"
    url = scrape.get_url(base_url, jobTitle, jobType)
    df = scrape.get_offers(base_url, url)
    # Filter the offers using the caller's regex pattern
    df_pr = process.process_offers(df, filterPattern)
    df_pr.to_csv('offers.csv')
    return df_pr
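# A minimal usage sketch for FindJobs (the search terms and regex below are
# hypothetical; assumes the project's own 'scrape' and 'process' helper modules
# are importable in this file):
#
#     offers = FindJobs('data analyst', 'cdi', r'python|sql')
#     print(offers.head())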
def parse_article(self, html):
    soup = super(JournalOfCognitiveNeuroscienceSource, self).parse_article(html)
    if not soup:
        return False

    # To download tables, we need the DOI and the number of tables
    # (the pattern matches any 10.1162/jocn DOI in the dx.doi.org meta tag)
    m = re.search(r'<meta.*content="http://dx\.doi\.org/(10\.1162/jocn[^"\s]+)["\s]+', html)
    doi = m.group(1)
    pattern = re.compile(r'^T\d+$')
    n_tables = len(soup.find_all('table', {'id': pattern}))

    tables = []
    # Now download each table and parse it
    for i in range(n_tables):
        url = 'http://www.mitpressjournals.org/action/showPopup?citid=citart1&id=T%d&doi=%s' % (i + 1, doi)
        table_html = scrape.get_url(url)
        table_html = self.decode_html_entities(table_html)
        table_soup = BeautifulSoup(table_html)
        t = table_soup.find('table').find('table')  # JCogNeuro nests tables 2-deep
        t = self.parse_table(t)
        if t:
            tables.append(t)

    self.article.tables = tables
    return self.article
def _download_table(self, url):
    ''' For Sources that have tables in separate files, a helper for
    downloading and extracting the table data. Also saves to file if
    desired. '''
    delay = self.delay if hasattr(self, 'delay') else 0
    if self.table_dir is not None:
        # Cache the table HTML on disk, keyed by a filesystem-safe version of the URL
        filename = '%s/%s' % (self.table_dir, url.replace('/', '_'))
        if os.path.exists(filename):
            table_html = open(filename).read().decode('utf-8')
        else:
            table_html = scrape.get_url(url, delay=delay)
            open(filename, 'w').write(table_html.encode('utf-8'))
    else:
        table_html = scrape.get_url(url, delay=delay)
    table_html = self.decode_html_entities(table_html)
    return BeautifulSoup(table_html)
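# Sketch of how a Source subclass might delegate table downloads to
# _download_table instead of calling scrape.get_url directly; the Sage-style
# expansion URL is only an illustration reusing the scheme shown above:
#
#     table_soup = self._download_table('%s/T%d.expansion.html' % (content_url, t_num))
#     tc = table_soup.find(class_='table-expansion')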