def _do_portal_acm(self, source, page):
    """
    Search *page* for a link to the BibTeX reference of an ACM Portal
    entry and then retrieve that reference.

    Returns a tuple (reference_text, format); (None, None) when the
    reference cannot be located or fetched.
    """
    log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
    ref = (None, None)
    # The BibTeX popup is wired through an inline 'onclick' handler.
    anchor = page.find('a', {'onclick': re.compile('popBibTex.cfm')})
    if not anchor:
        return ref
    # The handler looks like: window.open('<relative url>', ...).
    # Note: the original stripped '\(\)' / '\'' — spurious backslash
    # escapes; the intent is to strip parentheses and quotes only.
    jscript = anchor['onclick'].replace('window.open', '').strip('()')
    ref_url = jscript.split(',')[0].strip("'")
    ref_url = source + '/' + ref_url
    try:
        page = BeautifulSoup(self._browser.get_page(ref_url))
    except BrowserError:
        log.error('Browse error while retrieving entry page') #@UndefinedVariable
        return ref
    pre = page.find('pre')
    if not pre:
        return ref
    # As the wrapper has been hardcoded, we already know what will be the
    # format of the reference
    return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)
def test_get_text_from_non_leaf(self):
    """Text nodes inside child elements are found from the parent div."""
    markup = ('<html><body>'
              '<div>'
              '<span>Text 01</span>'
              '<span>Text 02</span>'
              '</div>'
              '</html></body>')
    document = BeautifulSoup(markup)
    found = document.findAll('div', text=True)
    self.failUnless(len(found) == 2)
    self.failUnless(found[0] == u'Text 01')
def check_result_url(self, url, check_string):
    """
    Check whether the page at *url* contains *check_string*, caching
    the outcome per URL so each page is fetched at most once.

    Returns the list of matching elements, or None when an error
    occurred while fetching/parsing the page.
    """
    # Membership test directly on the dict — no need for .keys().
    if url in self.results_cache:
        return self.results_cache[url]
    elements = None
    try:
        # Throttle requests to avoid hammering the remote server.
        time.sleep(5)
        page = self.browser.get_page(url)
        page = self._clean_content(page)
        page = BeautifulSoup(page)
        elements = page.findAll(True, text=re.compile(check_string.lower()))
    except BrowserError as e:
        print('ERROR: Browser error: %s' % e)
    except Exception as e:
        print('ERROR: Error checking error: %s' % e)
    # The original computed 'elements' but never stored or returned it,
    # which defeated the cache; remember and return the outcome.
    self.results_cache[url] = elements
    return elements
def setUp(self):
    """Build a GoogleSearch and parse the HTML fixture it runs against."""
    self.gs = GoogleSearch('query text')
    fixture_path = normpath(
        join(dirname(__file__), ('../../../../tests/'
                                 'fixtures/search/googleSearch.html')))
    self.fixture = open(fixture_path)
    try:
        self.page = BeautifulSoup(self.fixture.read())
    finally:
        # Close the handle once parsed so it does not leak (the sibling
        # _get_soup helper already closes its file).
        self.fixture.close()
def setUp(self):
    """Build a ScholarSearch, parse its HTML fixture and extract results."""
    self.ss = ScholarSearch('query text')
    fixture_path = normpath(
        join(dirname(__file__), ('../../../../tests/'
                                 'fixtures/search/scholarSearch.html')))
    self.fixture = open(fixture_path)
    try:
        self.page = BeautifulSoup(self.fixture.read())
    finally:
        # Close the handle once parsed so it does not leak (the sibling
        # _get_soup helper already closes its file).
        self.fixture.close()
    self.results = self.ss._extract_raw_results_list(self.page)
def _get_soup(self, file_name):
    """
    Parse the fixture *file_name* (relative to tests/fixtures/wrappers/)
    and return it as a BeautifulSoup tree.
    """
    file_path = normpath(
        join(dirname(__file__), ('../../../../tests/'
                                 'fixtures/wrappers/' + file_name)))
    # 'with' guarantees the handle is closed even if read/parse fails,
    # and avoids shadowing the 'file' builtin as the original did.
    with open(file_path) as fixture:
        return BeautifulSoup(fixture.read())
def test_apply_no_sibling(self):
    """The rule extracts the paragraph when the path has no sibling step."""
    markup = ('<html><body><div id="01" class="div01"><span>'
              'Some text</span><p>Paragraph</p></div>'
              '</body></html>')
    soup = BeautifulSoup(markup)
    pattern = ['.*', {}, (u'div', {u'class': u'div01'}, 1), (u'p', {}, -1)]
    self.rule.pattern = pattern
    outcome = self.rule.apply(soup)
    self.failIf(not outcome)
    self.failUnless(outcome[0] == "Paragraph")
def clean_content(self, content):
    """
    Normalise raw HTML *content* before parsing: replace a fixed set of
    character sequences, collapse whitespace, and strip comments and
    non-content tags.

    Returns a BeautifulSoup tree, or None for empty/None input.
    """
    if not content:
        return None
    # NOTE(review): this replacement table looks mangled — several keys
    # are duplicated and map a character to itself (e.g. '&' -> '&',
    # '"' -> '"'). The keys were presumably HTML entities such as
    # '&amp;', '&quot;' or '&nbsp;' that got unescaped at some point;
    # confirm against version history before relying on it.
    to_replace = { '\n': ' ', '\r': '', '\t': '', '<br>': ' ', '<br/>': ' ', '&': '&', '&': '&', '"': '"', '"': '"', '’': "'", ''': "'", ''': "'", '–': '-', ' ': ' ' }
    for key in to_replace:
        content = content.replace(key, to_replace[key])
    # Remove consecutive whitespaces
    content = re.sub(' {2,}', ' ', content)
    content = re.sub('>( *)<', '><', content)
    content = BeautifulSoup(content)
    # Remove comments
    comments = content.findAll(text=lambda text: isinstance(text, Comment))
    [element.extract() for element in comments]
    # Remove unnecessary HTML elements
    for tag in ['meta', 'link', 'style', 'script']:
        elements = content.findAll(tag)
        [element.extract() for element in elements]
    return content
def setUp(self):
    """Prepare an IEController plus fixture pages and search results."""
    factory = UtilFactory()
    self.iec = IEController(factory, ReferenceFormat.BIBTEX)
    acm_result = SearchResult(
        'result01',
        'http://portal.acm.org/citation.cfm?id=507338.507355')
    springer_result = SearchResult(
        'result01',
        'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
    self.top_results = [acm_result, springer_result]
    self.empty_page = BeautifulSoup("<html><head/><body/></html>")
    self.page = self._get_soup('acm01.html')
    self.text = 'ss'
def clean_content(self, content):
    """
    Normalise raw HTML *content*: apply a fixed table of string
    replacements, collapse runs of whitespace, then parse and drop
    comments plus meta/link/style/script elements.

    Returns a BeautifulSoup tree, or None for empty/None input.
    """
    if not content:
        return None
    # NOTE(review): several entries in this table are duplicated keys or
    # identity mappings ('&' -> '&', '"' -> '"'); they look like HTML
    # entities ('&amp;', '&quot;', '&nbsp;', ...) that were unescaped in
    # the source at some point — verify before relying on this table.
    to_replace = {'\n':' ', '\r':'', '\t':'', '<br>':' ', '<br/>':' ', '&':'&', '&':'&', '"':'"', '"':'"', '’':"'", ''':"'", ''':"'", '–':'-', ' ':' '}
    for key in to_replace:
        content = content.replace(key, to_replace[key])
    # Remove consecutive whitespaces
    content = re.sub(' {2,}', ' ', content)
    content = re.sub('>( *)<', '><', content)
    content = BeautifulSoup(content)
    # Remove comments
    comments = content.findAll(text=lambda text:isinstance(text, Comment))
    [element.extract() for element in comments]
    # Remove unnecessary HTML elements
    for tag in ['meta', 'link', 'style', 'script']:
        elements = content.findAll(tag)
        [element.extract() for element in elements]
    return content
def extract(self, input_file):
    """
    Run the external PDF extraction tool on *input_file* and build a
    Document from its HTML output (first two pages, ASCII7 encoding).

    Raises ExtractionError when the tool cannot be run or produces no
    output (e.g. a corrupted file).
    """
    input_file = self._check_input_file(input_file)
    # Extraction command and its options. They may be parametrized in the
    # future
    command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
               '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
    try:
        pop = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError:
        # The original merely logged here and then crashed with a
        # NameError on the unbound 'pop' below; fail explicitly instead.
        # (A CalledProcessError handler was removed: Popen() never
        # raises it — only the check_* helpers do.)
        log.error('PDF extraction tool not found') #@UndefinedVariable
        raise ExtractionError('PDF extraction tool not found')
    stdout = pop.communicate()[0]
    if not stdout:
        raise ExtractionError('Corrupted file')
    parser = BeautifulSoup(stdout)
    document = Document()
    self._extract_metadata(parser, document)
    self._extract_content(parser, document)
    return document
def test_create_soup_from_empty_string(self):
    """BeautifulSoup must accept an empty document without raising."""
    try:
        soup = BeautifulSoup('')
        self.failIf(soup is None)
    except Exception:
        # Narrowed from a bare 'except:', which would also swallow
        # SystemExit and KeyboardInterrupt.
        self.fail("Soup of empty string shouldn't raise an exception")
class Searcher(object):
    """ Base class for searching with a search engine.

    Subclasses provide 'search_engine_url' (a format string with
    'query', 'start' and 'num' slots) and the _extract_info /
    _extract_results hooks; this class drives pagination.
    """

    # Supported engine identifiers
    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()
        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        """Reset pagination state before (re)running a query."""
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        # Changing the query invalidates any pagination state.
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        """Total number of results, fetching the first page if needed."""
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        # Renamed from '_set_results_par_page' (typo); private, only
        # referenced through the property below.
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page,
                                _set_results_per_page)

    def get_results(self):
        """ Gets a page of results """
        if self.eor:
            return []
        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        # Same starting offset as last time means the engine is looping:
        # treat it as the end of the result stream.
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        # Only surface errors when running in debug mode.
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page }

    def _get_results_page(self):
        safe_url = self._get_safe_url()
        # Wait a random time between 0.5 and 1,5 seconds before doing the
        # search
        #time_to_wait = random.randrange(5, 15, 2) / 10.0
        #log.debug('Waiting %g before searching %s' % (time_to_wait, safe_url))
        #time.sleep(time_to_wait)
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            # 'as' form replaces the py2-only 'except BrowserError, e'.
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)
def test_get_invalid_content_element(self):
    """An empty soup should yield no content elements for the example."""
    empty_soup = BeautifulSoup('')
    example = Example(value='random text', content=empty_soup)
    elements = self.ruler._get_content_elements(example.value,
                                                example.content)
    self.failIf(elements)