class ContextResolverTest(unittest.TestCase):
    """Unit tests for ContextResolver: context extraction, merging,
    cleaning, top-string selection and context checking."""

    def setUp(self):
        self.cr = ContextResolver()
        # `html` is a module-level fixture string; parse it fresh per test.
        self.soup = ContentCleaner().clean_content(html)
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        pass

    def test_get_context(self):
        context = self.cr.get_context(self.element01)
        # assertEqual replaces the deprecated failUnless alias and gives
        # a useful diff on failure.
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        # Shared keys are summed; disjoint keys are carried over.
        self.assertEqual(merged, {u'Field 02:': 1,
                                  u'Field 01:': 4,
                                  u'Field 03:': 4})

    def test_clean_context(self):
        context = {'a': 2,
                   'b': 3,
                   'c': 1,
                   'this string is quite long. yes indeed': 4}
        result = self.cr.clean_context(context)
        # Low-count and over-long keys are expected to be dropped.
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)
        # An empty reference context should always match.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
def get_soup(file_name):
    """Load a wrapper fixture file and return its cleaned soup.

    :param file_name: name of a file under tests/fixtures/wrappers/.
    :returns: the fixture contents passed through ContentCleaner.
    """
    file_path = normpath(join(dirname(__file__),
                              ('../../../../tests/'
                               'fixtures/wrappers/' + file_name)))
    # The context manager guarantees the handle is closed even if read()
    # raises; the original also shadowed the `file` builtin.
    with open(file_path) as fixture:
        contents = fixture.read()
    return ContentCleaner().clean_content(contents)
# NOTE(review): this class duplicates the ContextResolverTest defined
# earlier in the file and shadows it at import time, so only one copy of
# these tests actually runs -- confirm whether the duplicate can be removed.
class ContextResolverTest(unittest.TestCase):
    """Unit tests for ContextResolver: context extraction, merging,
    cleaning, top-string selection and context checking."""

    def setUp(self):
        self.cr = ContextResolver()
        # `html` is a module-level fixture string; parse it fresh per test.
        self.soup = ContentCleaner().clean_content(html)
        self.element01 = self.soup.find('td', text='Value 01').parent
        self.element02 = self.soup.find('td', text='Value 03').parent

    def tearDown(self):
        pass

    def test_get_context(self):
        context = self.cr.get_context(self.element01)
        # assertEqual replaces the deprecated failUnless alias and gives
        # a useful diff on failure.
        self.assertEqual(context[u'Field 01:'], 1)

    def test_get_tree_context(self):
        context = self.cr.get_context(self.element02)
        self.assertEqual(context[u'Field 03'], 1)
        self.assertEqual(context[u'33'], 1)

    def test_merge_contexts(self):
        context01 = {u'Field 01:': 1}
        context02 = {u'Field 01:': 3, u'Field 02:': 1, u'Field 03:': 4}
        merged = self.cr.merge_context(context01, context02)
        # Shared keys are summed; disjoint keys are carried over.
        self.assertEqual(merged, {u'Field 02:': 1,
                                  u'Field 01:': 4,
                                  u'Field 03:': 4})

    def test_clean_context(self):
        context = {'a': 2,
                   'b': 3,
                   'c': 1,
                   'this string is quite long. yes indeed': 4}
        result = self.cr.clean_context(context)
        # Low-count and over-long keys are expected to be dropped.
        self.assertEqual(result, {'a': 2, 'b': 3})

    def test_get_top_words(self):
        context = {u'a': 3, 'b': 5, 'c': 1, u'd': 2, 'e': 4}
        expected = ['b', 'e', u'a']
        result = self.cr.get_top_strings(context, 3)
        self.assertEqual(result, expected)

    def test_check_context(self):
        context01 = {'a': 3, 'b': 5, 'c': 1, 'd': 2, 'e': 4}
        context02 = {'a': 1, 'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
        context02 = {'x': 3}
        result = self.cr.check_context(context01, context02)
        self.assertFalse(result)
        # An empty reference context should always match.
        context01 = {}
        result = self.cr.check_context(context01, context02)
        self.assertTrue(result)
def _get_content(self, url):
    """
    Fetch and clean the content of an example's URL.

    In order not to overload the server, sleeps so that at least
    ``self.seconds_between_requests`` seconds elapse between requests.
    Updates ``self.last_request`` on every call.

    :returns: the cleaned page content, or None if retrieval failed.
    """
    # total_seconds() instead of .seconds: .seconds is only the seconds
    # *component* of the timedelta (it ignores whole days), which would
    # compute a bogus sleep after the tracker has been idle for >24h.
    elapsed = (datetime.now() - self.last_request).total_seconds()
    time_to_sleep = self.seconds_between_requests - elapsed
    if time_to_sleep > 0:
        sleep(time_to_sleep)
    content = None
    try:
        content = Browser().get_page(url)
        content = ContentCleaner().clean_content(content)
    except BrowserError as e:
        # Best-effort: log the failure and fall through returning None.
        log.error('Error retrieving page %s: %s' % (url,  # @UndefinedVariable
                                                    e.error))
    self.last_request = datetime.now()
    return content
def extract_reference(self, top_results, raw_text):
    """
    Returns a list of References if they can be extracted or an empty
    list otherwise, together with the search result whose page was used
    (None when *top_results* is empty).

    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    log.info('Using %d top results' % len(top_results))  # @UndefinedVariable
    page = None
    references = []
    # Initialized so the final return does not raise NameError when
    # top_results is empty (the loop variable would be unbound).
    result = None
    for result in top_results:
        try:
            log.debug('Retrieving page for result %s' % result.url)  # @UndefinedVariable
            page = self.browser.get_page(result.url)
        except BrowserError as e:
            # Skip unreachable pages and try the next result.
            log.error('Error retrieving page %s: %s' % (result.url,  # @UndefinedVariable
                                                        e.error))
            continue
        page = ContentCleaner().clean_content(page)
        # Reference wrappers are preferred; rule wrappers are the fallback.
        references = self._use_reference_wrappers(result.base_url, page,
                                                  raw_text)
        if not references:
            references = self._use_rule_wrappers(result.base_url, page,
                                                 raw_text)
        if references:
            break
    # Convert to target format, if necessary
    for reference in references:
        self._format_reference(reference)
    # Return the extracted reference and the result that has been used
    return (references, result)
def setUp(self):
    """Build the shared fixture: a resolver plus two table-row elements
    located by the text of their 'td' cells."""
    self.cr = ContextResolver()
    self.soup = ContentCleaner().clean_content(html)

    def row_for(label):
        # Parent of the matching cell is the enclosing table row.
        return self.soup.find('td', text=label).parent

    self.element01 = row_for('Value 01')
    self.element02 = row_for('Value 03')