Example #1
0
 def __init__(self, format=ReferenceFormat.BIBTEX):
     super(ReferenceImporter, self).__init__()
     self.name = 'Importer'
     self.format = format
     self.util_factory = UtilFactory()
     self.ref_controller = ReferencesController(self.util_factory,
                                                self.format)
     self.path = ''
class TestIEController(unittest.TestCase):
        
    def setUp(self):
        factory = UtilFactory()
        self.rec = ReferencesController(factory, ReferenceFormat.BIBTEX)
        self.path = normpath(join(dirname(__file__), ('../../../../tests/'
                                     'fixtures/references/bibtex/import.bib')))
        
    def tearDown(self):
        pass

    def test_persist_file_references(self):
        self.rec.persist_file_references(self.path)
class TestIEController(unittest.TestCase):
    def setUp(self):
        factory = UtilFactory()
        self.rec = ReferencesController(factory, ReferenceFormat.BIBTEX)
        self.path = normpath(
            join(dirname(__file__), ('../../../../tests/'
                                     'fixtures/references/bibtex/import.bib')))

    def tearDown(self):
        pass

    def test_persist_file_references(self):
        self.rec.persist_file_references(self.path)
Example #4
0
    def __init__(self):

        self.info = {}
        
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        self.fields = ['addres', 'author', 'isbn', 'issn', 'journal', 'number', 'pages', 'publisher', 'title', 'volume', 'year']
        self.libraries = ['informaworld']#['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas', 'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
        
        self.factory = UtilFactory()
        self.iec = IEController(self.factory, secs_between_reqs=0,
                                wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)
Example #5
0
class ReferenceImporter(threading.Thread):
    def __init__(self, format=ReferenceFormat.BIBTEX):
        super(ReferenceImporter, self).__init__()
        self.name = 'Importer'
        self.format = format
        self.util_factory = UtilFactory()
        self.ref_controller = ReferencesController(self.util_factory,
                                                   self.format)
        self.path = ''

    def get_path(self):
        return self.__path

    def set_path(self, value):
        self.__path = value

    def import_references(self, path):
        log.info('Importing references from %s' % path) #@UndefinedVariable
        references = self.ref_controller.persist_file_references(path)
        return len(references)
    
    def run(self):
        if not self.path:
            return
        self.import_references(self.path)
        
    path = property(get_path, set_path)
Example #6
0
class ReferenceImporter(threading.Thread):
    def __init__(self, format=ReferenceFormat.BIBTEX):
        super(ReferenceImporter, self).__init__()
        self.name = 'Importer'
        self.format = format
        self.util_factory = UtilFactory()
        self.ref_controller = ReferencesController(self.util_factory,
                                                   self.format)
        self.path = ''

    def get_path(self):
        return self.__path

    def set_path(self, value):
        self.__path = value

    def import_references(self, path):
        log.info('Importing references from %s' % path)  #@UndefinedVariable
        references = self.ref_controller.persist_file_references(path)
        return len(references)

    def run(self):
        if not self.path:
            return
        self.import_references(self.path)

    path = property(get_path, set_path)
Example #7
0
 def __init__(self, format=ReferenceFormat.BIBTEX):
     super(ReferenceImporter, self).__init__()
     self.name = 'Importer'
     self.format = format
     self.util_factory = UtilFactory()
     self.ref_controller = ReferencesController(self.util_factory,
                                                self.format)
     self.path = ''
 def setUp(self):
     factory = UtilFactory()
     self.rec = ReferencesController(factory, ReferenceFormat.BIBTEX)
     self.path = normpath(join(dirname(__file__), ('../../../../tests/'
                                  'fixtures/references/bibtex/import.bib')))
 def setUp(self):
     factory = UtilFactory()
     self.rec = ReferencesController(factory, ReferenceFormat.BIBTEX)
     self.path = normpath(
         join(dirname(__file__), ('../../../../tests/'
                                  'fixtures/references/bibtex/import.bib')))
Example #10
0
class ExtractionStats(object):
    def __init__(self):

        self.info = {}
        
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        self.fields = ['addres', 'author', 'isbn', 'issn', 'journal', 'number', 'pages', 'publisher', 'title', 'volume', 'year']
        self.libraries = ['informaworld']#['acm', 'citeulike', 'computerorg', 'econpapers', 'ideas', 'informaworld', 'sciencedirect', 'scientificcommons', 'springer']
        
        self.factory = UtilFactory()
        self.iec = IEController(self.factory, secs_between_reqs=0,
                                wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)

    def save_msg(self, msg):
        print msg
        self.file.write(''.join([msg, '\n']))
        self.file.flush()
        
    def run(self):
        self.info = {}

        for library in self.libraries:
            lib_info = self.info.setdefault(library, []) #@UnusedVariable
            
            self.run_library(library)
        
        
        
    def run_library(self, library):
        self.file = open(''.join([self.base_path, library, '/extraction-results-', str(self.nexamples), '-corrected.csv']), 'w')
        self.session = create_session(''.join(['sqlite:///', self.base_path, '/', library, '/extraction-stats-', library, '-', str(self.nexamples), '-corrected.db']), debug=True)
        #self.session = create_session('sqlite:///:memory:', debug=True)
        self.wg = gateways.WrapperGateway(self.session)
        self.eg = gateways.ExtractionGateway(self.session)

        
        
        self.save_msg('Extraction results for library: %s' % library)
        
        files = open(''.join([self.base_path, library, '/', 'filelist.txt']), 'r')
        html_url, text_file = files.readline().split(' ', 1)
        files.seek(0)
        url = html_url.rsplit('/', 1)[0] #@UnusedVariable
        
        
        #self.import_generate(library, url)
        

        
        references = []
        for line in files.readlines():
            line = line.strip()
            html_url, text_file = line.split(' ', 1)
            
            text_file = open(text_file, 'r')
            text = text_file.read()
            text_file.close()
            
            top_results = [SearchResult('Some result', html_url)]
            print html_url

            refs, result = self.iec.extract_reference(top_results, text) #@UnusedVariable
            
            if refs:
                references.append(refs[0])
            else:
                references.append(None)
            
        # Load control references
        control_file = ''.join([self.base_path, library, '/extraction-results-control.bib'])
        control = self.rec._parse_entries_file(control_file)
         
        for control, extracted in zip(control, references):
            if not extracted:
                continue
            
            self.save_msg(extracted.entry)
            self.save_msg('\n')
            
            total_control_fields = 0
            total_extracted_fields = 0
            
            correct = 0
            parcial = 0
            error = 0
            
            valid = 0
            invalid = 0
            
            for field in control.fields:
                if field in ['url', 'reference_type', 'reference_id']:
                    continue
                
                control_value = control.get_field(field)
                total_control_fields += 1
                
                extracted_value = extracted.get_field(field)
                if not extracted_value:
                    continue
                
                if extracted_value.valid:
                    valid += 1
                else:
                    invalid += 1
                
                control_value = control_value.value
                extracted_value = extracted_value.value
                
                if type(control_value) is list:
                    self.save_msg('Comparing field %s values:\n\t%s\n\t%s' % (field, simplejson.dumps(control_value), simplejson.dumps(extracted_value)))
                    self.save_msg('\tCHECK MANUALLY')
                    continue
                
                control_value = control_value.strip()
                extracted_value = extracted_value.strip()
                
                control_regex = re.escape(control_value)
                extracted_regex = re.escape(extracted_value) #@UnusedVariable
                
                self.save_msg('Comparing field %s values:\n\t%s\n\t%s' % (field, control_value, extracted_value))
                
                if control_value == extracted_value:
                    correct += 1
                    self.save_msg('\tCorrect')
                elif re.search(control_regex, extracted_value): #or re.match(extracted_regex, control_value):
                    parcial += 1
                    self.save_msg('\tParcial')
                else:
                    error += 1
                    self.save_msg('\tIncorrect')
                
                total_extracted_fields += 1
            
            self.save_msg('')
            self.save_msg('Marked Valid;Marked invalid')
            self.save_msg('%d;%d' % (valid, invalid))
            
            self.save_msg('Total available;Total extracted;Incorrect;Parcial;Correct')
            self.save_msg('%d;%d;%d;%d;%d' % (total_control_fields, total_extracted_fields, error, parcial, correct))
            
            self.save_msg('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n')
        self.file.close()
        
        
    def import_generate(self, library, url):
        # Import references
        importer = ReferenceImporter()
        importer.import_references(''.join([self.base_path, library, '/', library, '-', str(self.nexamples), '.bib']))
        
        # Generate wrappers
        generator = WrapperGenerator(url)
        generator.set_wrapper_gen_examples(self.nexamples)
        generator.generate_wrappers()