Example #1
0
    def __init__(self):
        """Set up the benchmark settings and the controllers built on them."""
        # Starts out empty; nothing in this constructor populates it.
        self.info = dict()

        # Forwarded to IEController below as ``wrapper_gen_examples``.
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        # NOTE(review): 'addres' looks like a misspelling of 'address' --
        # confirm against whatever consumes these names before changing it.
        self.fields = [
            'addres', 'author', 'isbn', 'issn', 'journal', 'number',
            'pages', 'publisher', 'title', 'volume', 'year',
        ]
        # Only one library is enabled. The full list kept here for reference:
        # ['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas',
        #  'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
        self.libraries = ['informaworld']

        # Both controllers share the same factory instance.
        self.factory = UtilFactory()
        self.iec = IEController(
            self.factory,
            secs_between_reqs=0,
            wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)
Example #2
0
 def setUp(self):
     """Create the IE controller and the fixtures used by the tests."""
     util_factory = UtilFactory()
     self.iec = IEController(util_factory, ReferenceFormat.BIBTEX)

     # Two search results pointing at different sites.
     # NOTE(review): both carry the label 'result01' -- confirm whether the
     # second one was meant to be named differently.
     acm_result = SearchResult(
         'result01',
         'http://portal.acm.org/citation.cfm?id=507338.507355')
     springer_result = SearchResult(
         'result01',
         'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
     self.top_results = [acm_result, springer_result]

     # A document with empty head and body, plus a page loaded through the
     # ``_get_soup`` helper from 'acm01.html'.
     self.empty_page = BeautifulSoup("<html><head/><body/></html>")
     self.page = self._get_soup('acm01.html')
     self.text = 'ss'
    def make_reference(self, file, target_format):
        """
        Run the whole extraction pipeline for one file.

        The controllers are used in sequence to: extract the text content of
        the file, derive query strings from that text, retrieve top results
        for those queries, extract reference entries from the results, and
        validate every entry against the text.

        The Extraction object is returned in every case. It is returned early,
        only partially filled, when no text, no query strings or no top
        results are obtained.
        """
        outcome = Extraction()
        outcome.file_path = file
        outcome.target_format = target_format

        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        # Step 1: text content of the file.
        content_ctrl = RCEController(self.factory)
        raw_text = content_ctrl.extract_content(file, FileFormat.TXT)
        if not raw_text:
            return outcome

        # Step 2: query strings derived from the text.
        queries = content_ctrl.get_query_strings(raw_text)
        outcome.query_strings = queries
        if not queries:
            log.error('No query strings extracted') #@UndefinedVariable
            return outcome
        log.debug("Query strings %s" % str(queries)) #@UndefinedVariable

        # Step 3: top results, together with the query that produced them.
        search_ctrl = IRController(self.factory)
        top_results, used_query = search_ctrl.get_top_results(queries)
        outcome.top_results = top_results
        outcome.used_query = used_query
        if not top_results:
            log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                      'after trying %d queries' %
                      len(queries))
            return outcome
        # From here on ``query_strings`` holds only the queries not used.
        # NOTE(review): ``remove`` raises ValueError if the used query is not
        # in the list -- confirm get_top_results always returns one of them.
        queries.remove(used_query)

        log.debug("Used query %s" % str(used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(top_results)) #@UndefinedVariable

        # Step 4: reference entries, together with the result they came from.
        extract_ctrl = IEController(self.factory, target_format)
        entries, used_result = extract_ctrl.extract_reference(top_results,
                                                              raw_text)
        outcome.entries = entries
        outcome.used_result = used_result
        # Likewise, ``top_results`` keeps only the results that were not used.
        top_results.remove(used_result)
        log.info("Used result: %s" % str(used_result)) #@UndefinedVariable

        # Step 5: validate each extracted entry against the text.
        checker = ReferenceValidator(FIELD_WEIGHTS)
        for entry in entries:
            checker.validate(entry, raw_text)

        return outcome
Example #4
0
 def __init__(self, url):
     """Store the target URL and build the IE controller for it.

     ``url`` is kept as-is on the instance; nothing here fetches it.
     """
     super(WrapperGenerator, self).__init__()
     # Assigned after the base initialiser has run, so this is the value
     # that remains on the instance.
     self.name = 'WrapTrainer'
     self.url = url

     # The controller is built on the same factory the instance keeps.
     util_factory = UtilFactory()
     self.factory = util_factory
     self.ie_controller = IEController(util_factory)