def __init__(self):
    """Configure the benchmark and create the controllers it relies on.

    Takes no arguments: the page directory, the field names, the enabled
    libraries and the number of wrapper-generation examples are all fixed
    here.
    """
    example_count = 4

    self.info = {}
    self.nexamples = example_count
    self.base_path = '/home/rxuriguera/benchmark/pages/'
    # NOTE(review): 'addres' is spelled exactly as before; confirm it matches
    # the field name used by the rest of the project before "fixing" it.
    self.fields = [
        'addres', 'author', 'isbn', 'issn', 'journal', 'number',
        'pages', 'publisher', 'title', 'volume', 'year',
    ]
    # Only 'informaworld' is enabled. Other library names that were listed
    # next to it: acm, citeulike, computerorg, econpapers, ideas,
    # sciencedirect, scientificcommons, springer.
    self.libraries = ['informaworld']

    # One factory instance is shared by both controllers.
    shared_factory = UtilFactory()
    self.factory = shared_factory
    self.iec = IEController(
        shared_factory,
        secs_between_reqs=0,
        wrapper_gen_examples=example_count
    )
    self.rec = ReferencesController(shared_factory)
def setUp(self):
    """Create the controller under test and the fixtures the tests share."""
    self.iec = IEController(UtilFactory(), ReferenceFormat.BIBTEX)

    # Both search results carry the title 'result01'; only the URL differs.
    result_urls = (
        'http://portal.acm.org/citation.cfm?id=507338.507355',
        'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf',
    )
    self.top_results = [SearchResult('result01', url) for url in result_urls]

    # A document with an empty head and body, plus a real fixture page.
    self.empty_page = BeautifulSoup("<html><head/><body/></html>")
    self.page = self._get_soup('acm01.html')
    self.text = 'ss'
def make_reference(self, file, target_format):
    """
    Run the whole extraction pipeline for one file and return the outcome.

    The steps are, in order: extract the raw text of ``file``, derive query
    strings from it, fetch the top search results for those queries,
    extract the reference entries from the results and validate every
    entry against the raw text.

    The returned ``Extraction`` always carries ``file_path`` and
    ``target_format``. When a step produces nothing (no text, no query
    strings, no top results) the object is returned as filled so far.
    """
    outcome = Extraction()
    outcome.file_path = file
    outcome.target_format = target_format

    log.info("Making reference for file: %s" % file) #@UndefinedVariable

    # Step 1: raw content of the file.
    content_controller = RCEController(self.factory)
    raw_text = content_controller.extract_content(file, FileFormat.TXT)
    if not raw_text:
        return outcome

    # Step 2: query strings derived from that content.
    outcome.query_strings = content_controller.get_query_strings(raw_text)
    if not outcome.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return outcome
    log.debug("Query strings %s" % str(outcome.query_strings)) #@UndefinedVariable

    # Step 3: search results for the queries.
    search_controller = IRController(self.factory)
    found = search_controller.get_top_results(outcome.query_strings)
    outcome.top_results, outcome.used_query = found
    if not outcome.top_results:
        log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                  'after trying %d queries' % len(outcome.query_strings))
        return outcome

    # The query that produced the results is kept apart from the others.
    outcome.query_strings.remove(outcome.used_query)
    log.debug("Used query %s" % str(outcome.used_query)) #@UndefinedVariable
    log.debug("Query returned %d top results" % len(outcome.top_results)) #@UndefinedVariable

    # Step 4: reference entries extracted from the results.
    extraction_controller = IEController(self.factory, target_format)
    extracted = extraction_controller.extract_reference(outcome.top_results,
                                                        raw_text)
    outcome.entries, outcome.used_result = extracted
    # Likewise, the result that was used is taken out of the remaining ones.
    outcome.top_results.remove(outcome.used_result)
    log.info("Used result: %s" % str(outcome.used_result)) #@UndefinedVariable

    # Step 5: validate each entry against the raw text.
    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in outcome.entries:
        validator.validate(entry, raw_text)

    return outcome
class WrapperGenerator(threading.Thread):
    """Thread that asks an ``IEController`` to generate wrappers for a URL."""

    def __init__(self, url):
        """Store ``url`` and create the factory and controller to use."""
        super(WrapperGenerator, self).__init__()
        self.name = 'WrapTrainer'
        self.url = url
        util_factory = UtilFactory()
        self.factory = util_factory
        self.ie_controller = IEController(util_factory)

    def run(self):
        """Thread entry point: generate the wrappers."""
        self.generate_wrappers()

    def set_wrapper_gen_examples(self, num_examples):
        """Set ``wrapper_gen_examples`` on the controller."""
        controller = self.ie_controller
        controller.wrapper_gen_examples = num_examples

    def generate_wrappers(self):
        """Delegate wrapper generation for ``self.url`` to the controller."""
        controller = self.ie_controller
        controller.generate_wrappers(self.url)
def setUp(self):
    """Prepare the controller, search results and pages used by the tests."""
    util_factory = UtilFactory()
    self.iec = IEController(util_factory, ReferenceFormat.BIBTEX)

    # Two search results with the same title and different URLs.
    acm_result = SearchResult(
        'result01',
        'http://portal.acm.org/citation.cfm?id=507338.507355')
    springer_result = SearchResult(
        'result01',
        'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf')
    self.top_results = [acm_result, springer_result]

    # An empty document and a page loaded from the fixtures.
    self.empty_page = BeautifulSoup("<html><head/><body/></html>")
    self.page = self._get_soup('acm01.html')
    self.text = 'ss'
class TestIEController(unittest.TestCase):
    """Unit tests for ``IEController``.

    Methods whose name starts with ``xtest_`` do not match unittest's
    ``test`` prefix, so they are not collected; they are kept as disabled
    tests.
    """

    def setUp(self):
        """Create the controller under test and the shared fixtures."""
        factory = UtilFactory()
        self.iec = IEController(factory, ReferenceFormat.BIBTEX)
        self.top_results = [
            SearchResult(
                'result01',
                'http://portal.acm.org/citation.cfm?id=507338.507355'),
            SearchResult(
                'result01',
                'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf'),
        ]
        self.empty_page = BeautifulSoup("<html><head/><body/></html>")
        self.page = self._get_soup('acm01.html')
        self.text = 'ss'

    def test_use_reference_wrappers_page_with_no_wrapper(self):
        """An empty page from an unknown source yields no references."""
        references = self.iec._use_reference_wrappers(
            'some_source', self.empty_page, self.text)
        self.assertEqual(len(references), 0)

    def xtest_use_reference_wrappers_page_with_wrapper(self):
        """Disabled: expects one reference from the ACM fixture page."""
        references = self.iec._use_reference_wrappers(
            'http://portal.acm.org', self.page, self.text)
        self.assertEqual(len(references), 1)

    def test_format_reference_same_format(self):
        """A reference already in the target format keeps its entry."""
        ref = Reference(format=ReferenceFormat.BIBTEX,
                        entry='formatted entry')
        self.iec._format_reference(ref)
        self.assertEqual(ref.get_entry(), 'formatted entry')

    def test_format_reference_different_format(self):
        """A reference without a format is converted to the controller's."""
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')
        self.iec._format_reference(ref)
        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)

    def xtest_use_rule_wrappers(self):
        """Disabled: expects one reference with three fields."""
        references = self.iec._use_rule_wrappers(
            u'some_source', 'test 2007 content', '')
        self.assertEqual(len(references), 1)
        self.assertEqual(len(references[0].fields), 3)

    def test_validate_reference_fields(self):
        """The title is marked valid and the year '32' invalid."""
        ref = Reference()
        ref.set_field('title', 'Some article title')
        ref.set_field('year', '32')
        raw_text = "Some article title and something else"
        self.iec._validate_reference_fields(ref, raw_text)
        # assertEqual keeps the original ``== True`` / ``== False`` checks.
        self.assertEqual(ref.get_field('title').valid, True)
        self.assertEqual(ref.get_field('year').valid, False)

    def _get_soup(self, file_name):
        """Parse the wrapper fixture ``file_name`` and return the soup.

        The fixture is looked up under ``tests/fixtures/wrappers/``, four
        directory levels above this module.
        """
        fixture_path = normpath(join(
            dirname(__file__),
            '../../../../tests/fixtures/wrappers/' + file_name))
        # ``with`` closes the handle even if reading or parsing raises.
        with open(fixture_path) as fixture:
            return BeautifulSoup(fixture.read())

    def test_set_value_guides(self):
        """The controller exposes five value guides."""
        value_guides = self.iec.value_guides
        self.assertEqual(len(value_guides), 5)
def __init__(self, url):
    """Initialise the thread, remember ``url`` and build the controller."""
    super(WrapperGenerator, self).__init__()
    self.name = 'WrapTrainer'
    self.url = url
    # The factory is stored on the instance and handed to the controller.
    util_factory = UtilFactory()
    self.factory = util_factory
    self.ie_controller = IEController(util_factory)
class ExtractionStats(object):
    """Benchmark that scores extracted references against control entries.

    For every configured library, the pages listed in ``filelist.txt`` are
    passed to the IE controller. The first reference extracted for each page
    is compared field by field with the entry at the same position in
    ``extraction-results-control.bib``. Every message is printed and also
    written to a per-library results file.
    """

    # Control-entry fields that are never compared.
    _IGNORED_FIELDS = ('url', 'reference_type', 'reference_id')

    def __init__(self):
        """Set the fixed benchmark configuration and create the controllers."""
        self.info = {}
        self.nexamples = 4
        self.base_path = '/home/rxuriguera/benchmark/pages/'
        self.fields = ['addres', 'author', 'isbn', 'issn', 'journal',
                       'number', 'pages', 'publisher', 'title', 'volume',
                       'year']
        # Only 'informaworld' is enabled. Other library names that were
        # listed next to it: acm, citeulike, computerorg, econpapers, ideas,
        # sciencedirect, scientificcommons, springer.
        self.libraries = ['informaworld']
        self.factory = UtilFactory()
        self.iec = IEController(self.factory, secs_between_reqs=0,
                                wrapper_gen_examples=self.nexamples)
        self.rec = ReferencesController(self.factory)

    def save_msg(self, msg):
        """Print ``msg`` and append it, plus a newline, to the results file.

        ``self.file`` must already be open; ``run_library`` opens it.
        """
        # Parenthesised form prints the same text under Python 2 and 3.
        print(msg)
        self.file.write(''.join([msg, '\n']))
        self.file.flush()

    def run(self):
        """Reset ``self.info`` and process every configured library."""
        self.info = {}
        for library in self.libraries:
            self.info.setdefault(library, [])
            self.run_library(library)

    def run_library(self, library):
        """Extract and score the references of one library.

        Opens the results file and the statistics database session for
        ``library``, extracts a reference for every listed page and compares
        each one with its control entry. Pages for which nothing was
        extracted are skipped. The results file is closed on exit, even if
        a step raises.
        """
        self.file = open(''.join([self.base_path, library,
                                  '/extraction-results-',
                                  str(self.nexamples),
                                  '-corrected.csv']), 'w')
        try:
            self.session = create_session(
                ''.join(['sqlite:///', self.base_path, '/', library,
                         '/extraction-stats-', library, '-',
                         str(self.nexamples), '-corrected.db']),
                debug=True)
            self.wg = gateways.WrapperGateway(self.session)
            self.eg = gateways.ExtractionGateway(self.session)

            self.save_msg('Extraction results for library: %s' % library)

            # NOTE: wrappers are not regenerated here. To do so, call
            # self.import_generate(library, url) first, where url is the
            # first listed page URL without its last path component.
            references = self._extract_references(library)

            # Load control references
            control_file = ''.join([self.base_path, library,
                                    '/extraction-results-control.bib'])
            control_entries = self.rec._parse_entries_file(control_file)

            # Control entries and extracted references are paired by position.
            for control, extracted in zip(control_entries, references):
                if not extracted:
                    continue
                self._compare_reference(control, extracted)
        finally:
            self.file.close()

    def _extract_references(self, library):
        """Return one extracted reference (or ``None``) per listed page.

        Each non-blank line of the library's ``filelist.txt`` holds a page
        URL and the path of its text file, separated by the first space.
        """
        list_path = ''.join([self.base_path, library, '/', 'filelist.txt'])
        references = []
        with open(list_path, 'r') as listing:
            for line in listing:
                line = line.strip()
                if not line:
                    # A blank line would otherwise fail the unpacking below.
                    continue
                html_url, text_path = line.split(' ', 1)
                with open(text_path, 'r') as text_file:
                    text = text_file.read()

                top_results = [SearchResult('Some result', html_url)]
                print(html_url)
                refs, _used_result = self.iec.extract_reference(top_results,
                                                                text)
                # Keep a placeholder so positions still match the controls.
                references.append(refs[0] if refs else None)
        return references

    def _compare_reference(self, control, extracted):
        """Compare one extracted reference with its control entry and log it.

        Logs the outcome for every compared field, followed by two summary
        tables: valid/invalid marks and available/extracted/incorrect/
        partial/correct counts.
        """
        self.save_msg(extracted.entry)
        self.save_msg('\n')

        total_control_fields = 0
        total_extracted_fields = 0
        correct = 0
        parcial = 0
        error = 0
        valid = 0
        invalid = 0

        for field in control.fields:
            if field in self._IGNORED_FIELDS:
                continue

            control_field = control.get_field(field)
            total_control_fields += 1
            extracted_field = extracted.get_field(field)
            if not extracted_field:
                continue

            if extracted_field.valid:
                valid += 1
            else:
                invalid += 1

            control_value = control_field.value
            extracted_value = extracted_field.value

            # List values are only dumped; they are not scored.
            if isinstance(control_value, list):
                self.save_msg('Comparing field %s values:\n\t%s\n\t%s' %
                              (field,
                               simplejson.dumps(control_value),
                               simplejson.dumps(extracted_value)))
                self.save_msg('\tCHECK MANUALLY')
                continue

            control_value = control_value.strip()
            extracted_value = extracted_value.strip()
            self.save_msg('Comparing field %s values:\n\t%s\n\t%s' %
                          (field, control_value, extracted_value))

            if control_value == extracted_value:
                correct += 1
                self.save_msg('\tCorrect')
            elif re.search(re.escape(control_value), extracted_value):
                # The control value appears literally inside the extracted one.
                parcial += 1
                self.save_msg('\tParcial')
            else:
                error += 1
                self.save_msg('\tIncorrect')
            total_extracted_fields += 1

        self.save_msg('')
        self.save_msg('Marked Valid;Marked invalid')
        self.save_msg('%d;%d' % (valid, invalid))
        self.save_msg('Total available;Total extracted;Incorrect;Parcial;Correct')
        self.save_msg('%d;%d;%d;%d;%d' % (total_control_fields,
                                          total_extracted_fields,
                                          error, parcial, correct))
        self.save_msg('\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n')

    def import_generate(self, library, url):
        """Import the library's ``.bib`` references and generate wrappers.

        The references are read from ``<library>-<nexamples>.bib`` inside
        the library directory; wrappers are then generated for ``url`` with
        ``nexamples`` examples.
        """
        # Import references
        importer = ReferenceImporter()
        importer.import_references(''.join([self.base_path, library, '/',
                                            library, '-',
                                            str(self.nexamples), '.bib']))

        # Generate wrappers
        generator = WrapperGenerator(url)
        generator.set_wrapper_gen_examples(self.nexamples)
        generator.generate_wrappers()
class TestIEController(unittest.TestCase):
    """Unit tests for ``IEController``.

    Methods whose name starts with ``xtest_`` do not match unittest's
    ``test`` prefix, so they are not collected; they are kept as disabled
    tests.
    """

    def setUp(self):
        """Create the controller under test and the shared fixtures."""
        factory = UtilFactory()
        self.iec = IEController(factory, ReferenceFormat.BIBTEX)
        self.top_results = [
            SearchResult(
                'result01',
                'http://portal.acm.org/citation.cfm?id=507338.507355'),
            SearchResult(
                'result01',
                'http://www.springerlink.com/index/D7X7KX6772HQ2135.pdf'),
        ]
        self.empty_page = BeautifulSoup("<html><head/><body/></html>")
        self.page = self._get_soup('acm01.html')
        self.text = 'ss'

    def test_use_reference_wrappers_page_with_no_wrapper(self):
        """An empty page from an unknown source yields no references."""
        references = self.iec._use_reference_wrappers(
            'some_source', self.empty_page, self.text)
        self.assertEqual(len(references), 0)

    def xtest_use_reference_wrappers_page_with_wrapper(self):
        """Disabled: expects one reference from the ACM fixture page."""
        references = self.iec._use_reference_wrappers(
            'http://portal.acm.org', self.page, self.text)
        self.assertEqual(len(references), 1)

    def test_format_reference_same_format(self):
        """A reference already in the target format keeps its entry."""
        ref = Reference(format=ReferenceFormat.BIBTEX,
                        entry='formatted entry')
        self.iec._format_reference(ref)
        self.assertEqual(ref.get_entry(), 'formatted entry')

    def test_format_reference_different_format(self):
        """A reference without a format is converted to the controller's."""
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')
        self.iec._format_reference(ref)
        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)

    def xtest_use_rule_wrappers(self):
        """Disabled: expects one reference with three fields."""
        references = self.iec._use_rule_wrappers(
            u'some_source', 'test 2007 content', '')
        self.assertEqual(len(references), 1)
        self.assertEqual(len(references[0].fields), 3)

    def test_validate_reference_fields(self):
        """The title is marked valid and the year '32' invalid."""
        ref = Reference()
        ref.set_field('title', 'Some article title')
        ref.set_field('year', '32')
        raw_text = "Some article title and something else"
        self.iec._validate_reference_fields(ref, raw_text)
        # assertEqual keeps the original ``== True`` / ``== False`` checks.
        self.assertEqual(ref.get_field('title').valid, True)
        self.assertEqual(ref.get_field('year').valid, False)

    def _get_soup(self, file_name):
        """Parse the wrapper fixture ``file_name`` and return the soup.

        The fixture is looked up under ``tests/fixtures/wrappers/``, four
        directory levels above this module.
        """
        fixture_path = normpath(join(
            dirname(__file__),
            '../../../../tests/fixtures/wrappers/' + file_name))
        # ``with`` closes the handle even if reading or parsing raises.
        with open(fixture_path) as fixture:
            return BeautifulSoup(fixture.read())

    def test_set_value_guides(self):
        """The controller exposes five value guides."""
        value_guides = self.iec.value_guides
        self.assertEqual(len(value_guides), 5)