Esempio n. 1
0
    def make_reference(self, file, target_format):
        """
        Run the whole extraction pipeline for a single file.

        The stages are: extract the file's content as text, derive query
        strings from that text, fetch top results for those queries, extract
        the reference entries from the results and validate every entry
        against the text.

        Returns the Extraction object. When a stage produces nothing the
        object is returned early, populated only up to that stage.
        """
        outcome = Extraction()
        outcome.file_path = file
        outcome.target_format = target_format

        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        # Stage 1: raw content extraction.
        content_ctrl = RCEController(self.factory)
        text = content_ctrl.extract_content(file, FileFormat.TXT)
        if not text:
            return outcome

        # Stage 2: query strings built from the extracted text.
        outcome.query_strings = content_ctrl.get_query_strings(text)
        if not outcome.query_strings:
            log.error('No query strings extracted') #@UndefinedVariable
            return outcome
        log.debug("Query strings %s" % str(outcome.query_strings)) #@UndefinedVariable

        # Stage 3: search with the query strings.
        search_ctrl = IRController(self.factory)
        search_outcome = search_ctrl.get_top_results(outcome.query_strings)
        outcome.top_results, outcome.used_query = search_outcome
        if not outcome.top_results:
            attempts = len(outcome.query_strings)
            log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                      'after trying %d queries' % attempts)
            return outcome
        # Keep only the queries that were not the one used.
        outcome.query_strings.remove(outcome.used_query)

        log.debug("Used query %s" % str(outcome.used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(outcome.top_results)) #@UndefinedVariable

        # Stage 4: reference extraction from the top results.
        extract_ctrl = IEController(self.factory, target_format)
        extracted = extract_ctrl.extract_reference(outcome.top_results, text)
        outcome.entries, outcome.used_result = extracted
        # Keep only the results that were not the one used.
        outcome.top_results.remove(outcome.used_result)
        log.info("Used result: %s" % str(outcome.used_result)) #@UndefinedVariable

        # Stage 5: validate each extracted entry against the text.
        checker = ReferenceValidator(FIELD_WEIGHTS)
        for item in outcome.entries:
            checker.validate(item, text)

        return outcome
class TestRCEController(unittest.TestCase):
    """Tests for RCEController: content extraction and query strings."""

    # Sample article text (journal header, title, authors, start of the
    # abstract) passed to get_query_strings.
    some_text = """Neurocomputing 35 (2000) 3}26

        Class separability estimation and incremental learning using 
        boundary methods Jose-Luis Sancho *, William E. Pierson , 
        Batu Ulug , H AnmH bal R. Figueiras-Vidal , Stanley C. Ahalt 
        ATSC-DI, Escuela Politecnica Superior. Universidad Carlos III 
        Leganes-Madrid, Spain & & Department of Electrical Engineering, 
        he Ohio State University Columbus, OH 43210, USA Received 7 
        January 1999; revised 5 April 1999; accepted 10 April 2000

        Abstract In this paper we discuss the use of boundary methods 
        (BMs) for distribution analysis. We view these methods as tools 
        which can be used to extract useful information from sample 
        distributions. We believe that the information thus extracted has 
        utility for a number of applications, but in particular we discuss 
        the use of BMs as a mechanism for class separability estimation and 
        as an aid to constructing robust and e$cient neural networks (NNs) 
        to solve classi"cation problems. In the "rst case, BMs can 
        establish the utili...
    """

    def setUp(self):
        """Create the controller under test and resolve the PDF fixture."""
        factory = UtilFactory()
        self.rcec = RCEController(factory)
        # The fixture path is relative to this test module's directory.
        self.pdf = normpath(join(dirname(__file__), ('../../../../tests/'
                                     'fixtures/extraction/article.pdf')))

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_extract_content_from_non_existent_file(self):
        """A path that does not exist yields None."""
        content = self.rcec.extract_content('somefile.pdf', FileFormat.TXT)
        self.assertIsNone(content)

    def test_extract_content_to_invalid_target_format(self):
        """An unknown target format yields None."""
        content = self.rcec.extract_content(self.pdf, 'invalid format')
        self.assertIsNone(content)

    def test_extract_content_from_pdf(self):
        """Extracting the PDF fixture as text yields some content."""
        content = self.rcec.extract_content(self.pdf, FileFormat.TXT)
        self.assertIsNotNone(content)

    def test_get_query_strings(self):
        """At least one query string is produced from the sample text."""
        strings = self.rcec.get_query_strings(self.some_text)
        self.assertGreater(len(strings), 0)
class TestRCEController(unittest.TestCase):
    """Tests for RCEController: content extraction and query strings."""

    # Sample article text (journal header, title, authors, start of the
    # abstract) passed to get_query_strings.
    some_text = """Neurocomputing 35 (2000) 3}26

        Class separability estimation and incremental learning using 
        boundary methods Jose-Luis Sancho *, William E. Pierson , 
        Batu Ulug , H AnmH bal R. Figueiras-Vidal , Stanley C. Ahalt 
        ATSC-DI, Escuela Politecnica Superior. Universidad Carlos III 
        Leganes-Madrid, Spain & & Department of Electrical Engineering, 
        he Ohio State University Columbus, OH 43210, USA Received 7 
        January 1999; revised 5 April 1999; accepted 10 April 2000

        Abstract In this paper we discuss the use of boundary methods 
        (BMs) for distribution analysis. We view these methods as tools 
        which can be used to extract useful information from sample 
        distributions. We believe that the information thus extracted has 
        utility for a number of applications, but in particular we discuss 
        the use of BMs as a mechanism for class separability estimation and 
        as an aid to constructing robust and e$cient neural networks (NNs) 
        to solve classi"cation problems. In the "rst case, BMs can 
        establish the utili...
    """

    def setUp(self):
        """Create the controller under test and resolve the PDF fixture."""
        factory = UtilFactory()
        self.rcec = RCEController(factory)
        # The fixture path is relative to this test module's directory.
        self.pdf = normpath(join(dirname(__file__), ("../../../../tests/" "fixtures/extraction/article.pdf")))

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_extract_content_from_non_existent_file(self):
        """A path that does not exist yields None."""
        content = self.rcec.extract_content("somefile.pdf", FileFormat.TXT)
        self.assertIsNone(content)

    def test_extract_content_to_invalid_target_format(self):
        """An unknown target format yields None."""
        content = self.rcec.extract_content(self.pdf, "invalid format")
        self.assertIsNone(content)

    def test_extract_content_from_pdf(self):
        """Extracting the PDF fixture as text yields some content."""
        content = self.rcec.extract_content(self.pdf, FileFormat.TXT)
        self.assertIsNotNone(content)

    def test_get_query_strings(self):
        """At least one query string is produced from the sample text."""
        strings = self.rcec.get_query_strings(self.some_text)
        self.assertGreater(len(strings), 0)
 def setUp(self):
     """Create the controller under test and resolve the PDF fixture path."""
     self.rcec = RCEController(UtilFactory())
     # The fixture is located relative to the directory of this module.
     module_dir = dirname(__file__)
     fixture_rel = "../../../../tests/" "fixtures/extraction/article.pdf"
     self.pdf = normpath(join(module_dir, fixture_rel))
 def setUp(self):
     """Build the RCEController and compute the normalized fixture path."""
     util_factory = UtilFactory()
     controller = RCEController(util_factory)
     self.rcec = controller
     # Join the relative fixture location onto this module's directory,
     # then normalize the result.
     joined = join(dirname(__file__),
                   '../../../../tests/' 'fixtures/extraction/article.pdf')
     self.pdf = normpath(joined)