def __init__(self):
    """
    Sets up the configuration values and the controllers used by this
    object: the example count, the base path, the field names, the
    libraries to process, and the IE and references controllers (both
    built on the same UtilFactory instance).
    """
    self.info = dict()
    self.nexamples = 4
    self.base_path = '/home/rxuriguera/benchmark/pages/'
    # NOTE(review): 'addres' may be a typo for 'address'; left untouched
    # because other code may depend on this exact key -- confirm.
    self.fields = [
        'addres',
        'author',
        'isbn',
        'issn',
        'journal',
        'number',
        'pages',
        'publisher',
        'title',
        'volume',
        'year',
    ]
    # Only one library is currently enabled. The previous, longer list was:
    # ['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas',
    #  'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
    self.libraries = ['informaworld']

    # A single factory is shared by both controllers.
    self.factory = UtilFactory()
    self.iec = IEController(
        self.factory,
        secs_between_reqs=0,
        wrapper_gen_examples=self.nexamples)
    self.rec = ReferencesController(self.factory)
def setUp(self):
    """
    Builds the fixtures: an IEController targeting BibTeX, two search
    results, an empty page, a page loaded from 'acm01.html' and a short
    text string.
    """
    self.iec = IEController(UtilFactory(), ReferenceFormat.BIBTEX)

    # Both results intentionally keep the same name as in the original
    # fixture; only their URLs differ.
    acm_result = SearchResult(
        'result01',
        'http://portal.acm.org/citation.cfm?id=507338.507355')
    springer_result = SearchResult(
        'result01',
        'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
    self.top_results = [acm_result, springer_result]

    empty_markup = "<html><head/><body/></html>"
    self.empty_page = BeautifulSoup(empty_markup)
    self.page = self._get_soup('acm01.html')
    self.text = 'ss'
def make_reference(self, file, target_format):
    """
    Runs the complete pipeline for a single file: extracts its content,
    derives query strings from it, asks the IR controller for top results,
    extracts the reference entries from those results and validates them.

    An Extraction object is always returned. When a stage yields nothing
    (no raw text, no query strings or no top results) the extraction is
    returned as filled in so far.
    """
    extraction = Extraction()
    extraction.file_path = file
    extraction.target_format = target_format

    start_msg = "Making reference for file: %s" % file
    log.info(start_msg) #@UndefinedVariable

    # Stage 1: raw content of the file.
    content_controller = RCEController(self.factory)
    raw_text = content_controller.extract_content(file, FileFormat.TXT)
    if not raw_text:
        return extraction

    # Stage 2: query strings obtained from the raw content.
    extraction.query_strings = content_controller.get_query_strings(raw_text)
    if not extraction.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return extraction
    queries_msg = "Query strings %s" % str(extraction.query_strings)
    log.debug(queries_msg) #@UndefinedVariable

    # Stage 3: top results for the query strings.
    retrieval_controller = IRController(self.factory)
    found = retrieval_controller.get_top_results(extraction.query_strings)
    extraction.top_results, extraction.used_query = found
    if not extraction.top_results:
        log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                  'after trying %d queries' % len(extraction.query_strings))
        return extraction

    # The query that was used is taken out of the remaining query strings.
    # NOTE(review): list.remove raises ValueError when the item is absent;
    # this presumably relies on get_top_results returning one of the given
    # queries -- confirm.
    extraction.query_strings.remove(extraction.used_query)
    used_query_msg = "Used query %s" % str(extraction.used_query)
    log.debug(used_query_msg) #@UndefinedVariable
    results_msg = "Query returned %d top results" % len(extraction.top_results)
    log.debug(results_msg) #@UndefinedVariable

    # Stage 4: reference entries extracted from the top results.
    extraction_controller = IEController(self.factory, target_format)
    extracted = extraction_controller.extract_reference(
        extraction.top_results, raw_text)
    extraction.entries, extraction.used_result = extracted

    # Likewise, the result that was used is removed from the top results.
    # NOTE(review): same ValueError caveat as above if used_result is not
    # one of top_results -- confirm against extract_reference.
    extraction.top_results.remove(extraction.used_result)
    used_result_msg = "Used result: %s" % str(extraction.used_result)
    log.info(used_result_msg) #@UndefinedVariable

    # Stage 5: every extracted entry is validated against the raw text.
    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in extraction.entries:
        validator.validate(entry, raw_text)

    return extraction
def __init__(self, url):
    """
    Initializes the base class, then stores the name, the given url, a
    UtilFactory and an IEController built on that factory.
    """
    # The base initializer runs first; the name is assigned afterwards,
    # exactly as in the original ordering.
    super(WrapperGenerator, self).__init__()
    self.name = 'WrapTrainer'
    self.url = url

    self.factory = UtilFactory()
    self.ie_controller = IEController(self.factory)