def _do_portal_acm(self, source, page):
     """
     Searches the page for a link to the reference, and then retrieves the
     reference.
     Returns a tuple with the full reference and its format.
     """ 
     log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
     ref = (None, None)
     anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
     if not anchor:
         return ref
     # strip() takes a set of characters, not a regex, so no escaping is needed
     jscript = anchor['onclick'].replace('window.open', '').strip('()')
     ref_url = jscript.split(',')[0].strip("'")
     ref_url = source + '/' + ref_url
     
     try:
         page = BeautifulSoup(self._browser.get_page(ref_url))
     except BrowserError:
         log.error('Browser error while retrieving entry page') #@UndefinedVariable
         return ref
     
     pre = page.find('pre')
     if not pre:
         return ref
     
     # As the wrapper has been hardcoded, we already know what the format of
     # the reference will be
     return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)
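The string munging above assumes ACM embeds the BibTeX URL in an onclick handler. A standalone sketch of the same extraction on made-up markup (the sample anchor is illustrative, not captured from the live portal):

import re
from BeautifulSoup import BeautifulSoup

html = """<a onclick="window.open('popBibTex.cfm?id=123', 'BibTex')">BibTeX</a>"""
page = BeautifulSoup(html)
anchor = page.find('a', {'onclick': re.compile('popBibTex.cfm')})
jscript = anchor['onclick'].replace('window.open', '').strip('()')
print jscript.split(',')[0].strip("'")
# -> popBibTex.cfm?id=123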
Example #2
 def test_get_text_from_non_leaf(self):
     soup = BeautifulSoup('<html><body>'
                          '<div>'
                          '<span>Text 01</span>'
                          '<span>Text 02</span>'
                          '</div>'
                          '</body></html>')
     # With a truthy text argument, BeautifulSoup 3 ignores the tag name and
     # searches the NavigableStrings, so this returns the two text nodes
     text = soup.findAll('div', text=True)
     self.failUnless(len(text) == 2)
     self.failUnless(text[0] == u'Text 01')
Example #4
 def check_result_url(self, url, check_string):
     if url in self.results_cache:
         return self.results_cache[url]
     elements = None
     try:
         # Be polite: wait before hitting the server
         time.sleep(5)
         page = self.browser.get_page(url)
         page = self._clean_content(page)
         page = BeautifulSoup(page)
         elements = page.findAll(True,
                                 text=re.compile(check_string.lower()))
     except BrowserError as e:
         print 'ERROR: Browser error: %s' % e
     except Exception as e:
         print 'ERROR: Error while checking result URL: %s' % e
     # Cache and return whether the check string was found so repeated
     # checks of the same URL skip the network round trip
     found = bool(elements)
     self.results_cache[url] = found
     return found
Example #6
 def setUp(self):
     self.gs = GoogleSearch('query text')
     fixture_path = normpath(
         join(dirname(__file__), ('../../../../tests/'
                                  'fixtures/search/googleSearch.html')))
     self.fixture = open(fixture_path)
     self.page = BeautifulSoup(self.fixture.read())
Example #7
 def setUp(self):
     self.ss = ScholarSearch('query text')
     fixture_path = normpath(
         join(dirname(__file__), ('../../../../tests/'
                                  'fixtures/search/scholarSearch.html')))
     self.fixture = open(fixture_path)
     self.page = BeautifulSoup(self.fixture.read())
     self.results = self.ss._extract_raw_results_list(self.page)
Example #8
 def _get_soup(self, file_name):
     file_path = normpath(
         join(dirname(__file__), ('../../../../tests/'
                                  'fixtures/wrappers/' + file_name)))
     fixture_file = open(file_path)
     soup = BeautifulSoup(fixture_file.read())
     fixture_file.close()
     return soup
Example #9
    def test_apply_no_sibling(self):
        html = BeautifulSoup('<html><body><div id="01" class="div01"><span>'
                             'Some text</span><p>Paragraph</p></div>'
                             '</body></html>')

        path = ['.*', {}, (u'div', {u'class': u'div01'}, 1), (u'p', {}, -1)]

        self.rule.pattern = path
        result = self.rule.apply(html)
        self.failIf(not result)
        self.failUnless(result[0] == "Paragraph")
Example #10
    def clean_content(self, content):
        if not content:
            return None

        to_replace = {
            '\n': ' ',
            '\r': '',
            '\t': '',
            '<br>': ' ',
            '<br/>': ' ',
            '&amp;': '&',
            '&#38;': '&',
            '&#34;': '"',
            '&quot;': '"',
            '&rsquo;': "'",
            '&#39;': "'",
            '&apos;': "'",
            '&#x2013;': '-',
            '&nbsp;': ' '
        }
        for key in to_replace:
            content = content.replace(key, to_replace[key])

        # Remove consecutive whitespaces
        content = re.sub(' {2,}', ' ', content)
        content = re.sub('>( *)<', '><', content)

        content = BeautifulSoup(content)

        # Remove comments
        for comment in content.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove unnecessary HTML elements
        for tag in ['meta', 'link', 'style', 'script']:
            for element in content.findAll(tag):
                element.extract()

        return content
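For reference, here is a minimal standalone sketch of the comment- and tag-stripping idiom used above (BeautifulSoup 3 API; demo_clean and the sample markup are illustrative, not part of the project):

from BeautifulSoup import BeautifulSoup, Comment

def demo_clean(html):
    soup = BeautifulSoup(html)
    # Comments parse as NavigableStrings, so a text= predicate finds them
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Drop non-content tags entirely
    for tag in ['meta', 'link', 'style', 'script']:
        for element in soup.findAll(tag):
            element.extract()
    return soup

print demo_clean('<html><!-- ad --><head><script>x()</script></head>'
                 '<body>Kept text</body></html>')
# Expected output, roughly: <html><head></head><body>Kept text</body></html>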
Example #11
 def setUp(self):
     factory = UtilFactory()
     self.iec = IEController(factory, ReferenceFormat.BIBTEX)
     self.top_results = [
         SearchResult(
             'result01',
             'http://portal.acm.org/citation.cfm?id=507338.507355'),
         SearchResult(
             'result02',
             'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
     ]
     self.empty_page = BeautifulSoup("<html><head/><body/></html>")
     self.page = self._get_soup('acm01.html')
     self.text = 'ss'
Example #12
    def extract(self, input_file):
        input_file = self._check_input_file(input_file)
        # Extraction command and its options. They may be parametrized in the
        # future
        command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
                   '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
        try:
            pop = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError:
            # Popen raises OSError when the tool cannot be found; re-raise so
            # we don't fall through to communicate() with pop undefined
            log.error('PDF extraction tool not found') #@UndefinedVariable
            raise ExtractionError('PDF extraction tool not found')

        stdout = pop.communicate()[0]
        if pop.returncode:
            # Popen never raises CalledProcessError (that belongs to
            # check_call/check_output), so inspect the return code instead
            log.error('Error executing PDF text extraction tool. Return code: ' #@UndefinedVariable
                      + repr(pop.returncode))
            raise ExtractionError('PDF text extraction failed')
        if not stdout:
            raise ExtractionError('Corrupted file')

        parser = BeautifulSoup(stdout)
        document = Document()
        self._extract_metadata(parser, document)
        self._extract_content(parser, document)

        return document
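For context, the flag set above matches the pdftotext command line (-f/-l limit the page range, -enc sets the output encoding, -htmlmeta wraps the text in HTML carrying the PDF's metadata, and '-' writes to stdout). A hedged standalone sketch of the same invocation, assuming pdftotext is on the PATH and a local paper.pdf exists:

import subprocess
from BeautifulSoup import BeautifulSoup

command = ['pdftotext', '-q', '-f', '1', '-l', '2',
           '-enc', 'ASCII7', '-htmlmeta', 'paper.pdf', '-']
stdout = subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0]
soup = BeautifulSoup(stdout)
# With -htmlmeta, the PDF's title (if any) ends up in a <title> element
title = soup.find('title')
print title.find(text=True) if title else 'no title found'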
Example #14
 def test_create_soup_from_empty_string(self):
     try:
         soup = BeautifulSoup('')
         self.failIf(soup is None)
     except:
         self.fail("Soup of empty string shouldn't raise an exception")
Example #15
class Searcher(object):
    """
    Base class for searching with a search engine
    """
    GOOGLE = 0
    SCHOLAR = 1
    BING = 2
    YAHOO = 3

    def __init__(self, query='', random_agent=False, debug=False):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.prepare()

        if random_agent:
            self.browser.set_random_user_agent()

    def prepare(self):
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._results_per_page = 30
        self._last_from = 0

    def get_query(self):
        return self.__query

    def set_query(self, value):
        self.__query = value
        self.prepare()

    query = property(get_query, set_query)

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def search_engine_url(self):
        raise NotImplementedError()

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """ Gets a page of results """
        if self.eor:
            return []

        page = self._get_results_page()
        search_info = self._extract_info(page)
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        results = self._extract_results(page)
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_safe_url(self):
        return self.search_engine_url % {
            'query': urllib.quote_plus(self.query),
            'start': self._page * self._results_per_page,
            'num': self._results_per_page
        }

    def _get_results_page(self):
        safe_url = self._get_safe_url()

        # Wait a random time between 0.5 and 1.5 seconds before doing the
        # search
        #time_to_wait = random.randrange(5, 15, 2) / 10.0
        #log.debug('Waiting %g before searching %s' % (time_to_wait, safe_url))
        #time.sleep(time_to_wait)

        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)
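Searcher defers the engine-specific pieces (search_engine_url, _extract_info, _extract_results) to subclasses such as the GoogleSearch and ScholarSearch seen earlier. A minimal hypothetical subclass, just to illustrate the contract; the URL template keys match _get_safe_url, and the parsing bodies below are placeholders, not the project's real logic:

class ExampleSearch(Searcher):

    @property
    def search_engine_url(self):
        # _get_safe_url() substitutes query, start and num into this template
        return 'http://search.example.com/?q=%(query)s&start=%(start)d&num=%(num)d'

    def _extract_info(self, page):
        # Parse the "showing X to Y of Z results" banner out of the soup and
        # return the dict shape that get_results() relies on
        return {'from': 0, 'to': 30, 'total': 30}  # placeholder values

    def _extract_results(self, page):
        # Return one entry per hit; here, just every anchor's target
        return [anchor['href'] for anchor in page.findAll('a', href=True)]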
Example #16
 def test_get_invalid_content_element(self):
     example = Example(value='random text', content=BeautifulSoup(''))
     elements = self.ruler._get_content_elements(example.value,
                                                 example.content)
     self.failIf(elements)