def find_reference_by_id(self, id):
    """
    Loads the stored reference that has the given id and rebuilds it as a
    Reference object.

    The id, validity and every stored field are copied over. Authors and
    editors are converted with their to_name_dict() method and stored under
    the 'author' and 'editor' fields, but only when there is at least one.

    Returns the populated Reference, or None if the query yields no row.
    Raises ValueError if 'id' is empty.
    """
    if not id:
        raise ValueError('A reference id is required')

    log.debug('Querying the database. Reference with id %s' % str(id)) #@UndefinedVariable
    # first() gives back None when nothing matches. Assuming self.session is
    # a SQLAlchemy session, the one() call used before raised in that case,
    # which made the 'not found' branch below unreachable.
    m_reference = (self.session.query(mappers.Reference).
                   filter(mappers.Reference.id == id).first())
    if not m_reference:
        return None

    log.debug('Creating new reference') #@UndefinedVariable
    reference = Reference()
    reference.id = m_reference.id
    reference.validity = m_reference.validity

    log.debug('Adding fields') #@UndefinedVariable
    for m_field in m_reference.fields:
        reference.set_field(m_field.name, m_field.value, m_field.valid)

    log.debug('Adding authors') #@UndefinedVariable
    authors = [m_author.to_name_dict() for m_author in m_reference.authors]
    if authors:
        reference.set_field(u'author', authors, True)

    log.debug('Adding editors') #@UndefinedVariable
    editors = [m_editor.to_name_dict() for m_editor in m_reference.editors]
    if editors:
        reference.set_field(u'editor', editors, True)

    return reference
def test_validate_incorrect_reference(self):
    """
    A reference whose title is flagged as not valid must end up with a
    validity below 0.5 once self.rv has validated it.
    """
    incorrect_ref = Reference()
    incorrect_ref.set_field('title', 'some arbitrary text', False)
    incorrect_ref.set_field('author', [{'first_name': 'Jose-Luis',
                                        'last_name': 'Sancho',
                                        'middle_name': ''}], True)
    self.rv.validate(incorrect_ref)
    # assertTrue instead of the deprecated failUnless alias
    self.assertTrue(incorrect_ref.validity < 0.5)
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source.

    For every wrapper collection found for 'source', the wrappers are
    applied to 'page' in the order the gateway returns them. Each extracted
    value is stored in the reference and, when a validator is registered for
    the field, checked against 'raw_text'. Wrappers are upvoted or downvoted
    depending on the outcome, and the first valid value ends the search for
    that field.

    Returns a one element list with the Reference if at least one field was
    set, otherwise an empty list.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                        len(wrappers)))

        # Get field validator. Fields without one are always accepted.
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to carry a list of values
        accepts_list = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)

            # we expect 'info' to be a string
            if isinstance(info, list) and not accepts_list:
                continue

            # Wrapped in a tuple so a tuple valued 'info' cannot break the
            # string formatting
            log.debug('Info extracted by wrapper: %s' % (info,)) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True

            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if valid:
                log.debug('The extracted information is valid. ' #@UndefinedVariable
                          'Upvoting wrapper')
                wrapper.upvotes += 1
            else:
                log.debug('The extracted information is not valid. ' #@UndefinedVariable
                          'Downvoting wrapper.')
                wrapper.downvotes += 1
            wrapper_manager.update_wrapper(wrapper)

            if valid:
                # No need to try the remaining wrappers for this field
                break

    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]

    log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
    return []
def test_validate_correct_reference(self):
    """
    A reference whose author and title are both flagged as valid must get
    a validity of exactly 1.0 once self.rv has validated it.
    """
    correct_ref = Reference()
    correct_ref.set_field('author', [{'first_name': 'Jose-Luis',
                                      'last_name': 'Sancho',
                                      'middle_name': ''}], True)
    correct_ref.set_field('title', ('Class separability estimation and '
                                    'incremental learning using boundary methods'), True)
    self.rv.validate(correct_ref)
    # assertEqual instead of the deprecated failUnless alias
    self.assertEqual(correct_ref.validity, 1.0)
def test_validate_reference_fields(self):
    """
    After validating against the raw text, the title field must be marked
    as valid and the year field ('32') as not valid.
    """
    ref = Reference()
    ref.set_field('title', 'Some article title')
    ref.set_field('year', '32')
    raw_text = "Some article title and something else"

    self.iec._validate_reference_fields(ref, raw_text)

    # assertEqual keeps the original '== True' / '== False' comparisons
    # while dropping the deprecated failUnless alias
    self.assertEqual(ref.get_field('title').valid, True)
    self.assertEqual(ref.get_field('year').valid, False)
def setUp(self):
    """
    Prepares the fixture: a sample reference with six fields, a
    ReferenceFormatter and a BibtexGenerator.
    """
    author = {
        'first_name': 'Lars',
        'last_name': 'Madsen',
        'middle_name': 'Lithen',
    }
    # (name, value) pairs, applied in this order
    sample_fields = (
        ('reference_id', 'Lmadsen99'),
        ('author', [author]),
        ('title', 'Some article title'),
        ('pages', '133--144'),
        ('journal', 'Some journal'),
        ('year', '1999'),
    )

    self.ref = Reference()
    for name, value in sample_fields:
        self.ref.set_field(name, value)

    self.ref_formatter = ReferenceFormatter()
    self.format_generator = BibtexGenerator()
def test_format_reference_different_format(self):
    """
    Formatting a reference that only has fields must generate an entry
    starting with '@article{Lmadsen99,' and set the reference format to
    the one configured in self.iec.
    """
    ref = Reference()
    ref.set_field('reference_id', 'Lmadsen99')
    ref.set_field('title', 'Some article title')

    self.iec._format_reference(ref)

    # assertTrue / assertEqual instead of the deprecated failUnless alias
    self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
    self.assertEqual(ref.get_format(), self.iec.format)
def test_validate_incorrect_reference(self):
    """
    A reference whose title is flagged as not valid must end up with a
    validity below 0.5 once self.rv has validated it.
    """
    incorrect_ref = Reference()
    incorrect_ref.set_field('title', 'some arbitrary text', False)
    incorrect_ref.set_field('author', [{
        'first_name': 'Jose-Luis',
        'last_name': 'Sancho',
        'middle_name': ''
    }], True)
    self.rv.validate(incorrect_ref)
    # assertTrue instead of the deprecated failUnless alias
    self.assertTrue(incorrect_ref.validity < 0.5)
def test_validate_correct_reference(self):
    """
    A reference whose author and title are both flagged as valid must get
    a validity of exactly 1.0 once self.rv has validated it.
    """
    correct_ref = Reference()
    correct_ref.set_field('author', [{
        'first_name': 'Jose-Luis',
        'last_name': 'Sancho',
        'middle_name': ''
    }], True)
    correct_ref.set_field('title', ('Class separability estimation and '
                                    'incremental learning using boundary methods'), True)
    self.rv.validate(correct_ref)
    # assertEqual instead of the deprecated failUnless alias
    self.assertEqual(correct_ref.validity, 1.0)
def _use_reference_wrappers(self, source, page, raw_text):
    """
    Use a reference wrapper to get the reference from a given page.
    Returns a list of References with the full entry, format and a
    structure with the different fields.
    A single publication may need more than a reference (e.g: inproceedings
    and its proceedings)

    The list is empty when no entry is found, when no parser can be created
    for the detected format or when the entry is not in that format. If
    parsing fails half way, the error is logged and the references built so
    far are returned.
    """
    log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
    references = []
    entry, entry_format = ReferenceWrapper().extract_info(source, page)
    if not entry:
        log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
        return references

    # Create a parser for the given reference format
    try:
        parser = self.util_factory.create_parser(entry_format)
    except UtilCreationError as e:
        log.error('Could not create a parser for %s: %s' % (entry_format, #@UndefinedVariable
                                                            e.args))
        return references

    if not parser.check_format(entry):
        log.error('Given entry is not in %s' % entry_format) #@UndefinedVariable
        return references

    # There may be more than one entry for the same file.
    log.debug('Parsing extracted entries') #@UndefinedVariable
    try:
        for single_entry in parser.split_source(entry):
            fields = parser.parse_entry(single_entry)
            reference = Reference(fields, entry_format, single_entry)
            self._validate_reference_fields(reference, raw_text)
            references.append(reference)
    except Exception as e:
        # Best effort: keep whatever was parsed before the failure
        log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable

    # Without this the method returned None on the main path instead of
    # the list it promises
    return references
def test_format_reference_same_format(self):
    """
    A reference created with the BIBTEX format and an existing entry must
    still have that same entry after being formatted by self.iec.
    """
    ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
    self.iec._format_reference(ref)
    # assertEqual instead of the deprecated failUnless alias
    self.assertEqual(ref.get_entry(), 'formatted entry')
if not content: log.info('Empty entries file') #@UndefinedVariable return references if not self.parser.check_format(content): log.error('Given entry is not in %s' % format) #@UndefinedVariable return references # There may be more than one entry for the same file. log.debug('Parsing entries') #@UndefinedVariable entries = self.parser.split_source(content) for entry in entries: fields = self.parser.parse_entry(entry) reference = Reference(fields, format, entry) reference.validity = 1.0 references.append(reference) return references def persist_file_references(self, file_path): """ Parses references from a file and stores them to the database """ extraction_gw = ExtractionGateway() references = self._parse_entries_file(file_path) extractions = [] for reference, index in zip(references, range(len(references))): extraction = Extraction()
def _use_rule_wrappers(self, source, page, raw_text):
    """
    Look if there is any wrapper in the database for the given source.

    For every wrapper collection found for 'source', the wrappers are
    applied to 'page' in the order the gateway returns them. Each extracted
    value is stored in the reference and, when a validator is registered for
    the field, checked against 'raw_text'. Wrappers are upvoted or downvoted
    depending on the outcome, and the first valid value ends the search for
    that field.

    Returns a one element list with the Reference if at least one field was
    set, otherwise an empty list.
    """
    log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
    reference = Reference()
    wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
    wrapper_field_collections = wrapper_manager.find_wrapper_collections(
        source)

    for collection in wrapper_field_collections:
        # Get the wrappers for the current collection
        url, field = collection.url, collection.field
        wrappers = wrapper_manager.get_wrappers(url, field)
        log.debug('Collection %s:%s has %d wrappers' % ( #@UndefinedVariable
            url, field, len(wrappers)))

        # Get field validator. Fields without one are always accepted.
        try:
            validator = self.field_validation[field][1]
        except KeyError:
            validator = None

        # Only these fields are allowed to carry a list of values
        accepts_list = field in ('author', 'editor')

        # Extract information using the wrappers we have
        for wrapper in wrappers:
            info = wrapper.extract_info(page)

            # we expect 'info' to be a string
            if isinstance(info, list) and not accepts_list:
                continue

            # Wrapped in a tuple so a tuple valued 'info' cannot break the
            # string formatting
            log.debug('Info extracted by wrapper: %s' % (info,)) #@UndefinedVariable

            valid = validator.validate(info, raw_text) if validator else True

            # Save the extracted info even if it's not correct. It will
            # be overwritten afterwards if necessary
            reference.set_field(field, info, valid)

            if valid:
                log.debug(
                    'The extracted information is valid. ' #@UndefinedVariable
                    'Upvoting wrapper')
                wrapper.upvotes += 1
            else:
                log.debug(
                    'The extracted information is not valid. ' #@UndefinedVariable
                    'Downvoting wrapper.')
                wrapper.downvotes += 1
            wrapper_manager.update_wrapper(wrapper)

            if valid:
                # No need to try the remaining wrappers for this field
                break

    if len(reference.fields) > 0:
        log.info('Extracted reference') #@UndefinedVariable
        return [reference]

    log.info('Could not extract reference using ruled wrappers') #@UndefinedVariable
    return []
class TestReference(unittest.TestCase):
    """
    Checks the field and entry accessors of Reference.

    The assertions use assertEqual rather than the deprecated failUnless
    alias, so a failure also reports the values that were compared.
    """

    def setUp(self):
        self.ref = Reference()

    def tearDown(self):
        pass

    def test_set_and_get_field(self):
        """A field that has been set is returned with the same value."""
        self.ref.set_field('random_field', 'random_value')
        self.assertEqual(self.ref.get_field('random_field').value,
                         'random_value')

    def test_get_fields(self):
        """get_fields() lists the names of the fields that were set."""
        self.ref.set_field('rf01', 'rv01')
        self.ref.set_field('rf02', 'rv02')
        self.ref.set_field('rf03', 'rv04')
        self.assertEqual(len(self.ref.get_fields()), 3)
        self.assertEqual(self.ref.get_fields(), ['rf01', 'rf02', 'rf03'])

    def test_set_field_to_none(self):
        """A field set to None is reported as not valid."""
        self.ref.set_field('some_field', None)
        field = self.ref.get_field('some_field')
        self.assertEqual(field.valid, False)

    def test_set_and_get_entry(self):
        """An entry that has been set is returned unchanged."""
        self.ref.set_entry('This is an entry')
        self.assertEqual(self.ref.get_entry(), 'This is an entry')
class TestReferenceFormatter(unittest.TestCase):
    """
    Checks that ReferenceFormatter, driven by a BibtexGenerator, writes the
    expected entry and format into a Reference.
    """

    def setUp(self):
        self.ref = Reference()
        self.ref.set_field('reference_id', 'Lmadsen99')
        self.ref.set_field('author', [{
            'first_name': 'Lars',
            'last_name': 'Madsen',
            'middle_name': 'Lithen'
        }])
        self.ref.set_field('title', 'Some article title')
        self.ref.set_field('pages', '133--144')
        self.ref.set_field('journal', 'Some journal')
        self.ref.set_field('year', '1999')
        self.ref_formatter = ReferenceFormatter()
        self.format_generator = BibtexGenerator()

    def tearDown(self):
        pass

    def test_formatter(self):
        """
        The formatted entry must match the expected text line by line and
        the reference must take the generator's format.
        """
        self.ref_formatter.format_reference(self.ref, self.format_generator)
        entry = self.ref.get_entry()

        # Every line, including the closing brace, ends with os.linesep
        expected_lines = [
            '@article{Lmadsen99,',
            'title = {Some article title},',
            'author = {Madsen, Lithen, Lars},',
            'year = 1999,',
            'journal = {Some journal},',
            'pages = {133--144}',
            '}',
        ]
        expected_entry = os.linesep.join(expected_lines) + os.linesep

        # assertEqual instead of the deprecated failUnless alias
        self.assertEqual(entry, expected_entry)
        self.assertEqual(self.ref.format, self.format_generator.format)
def setUp(self):
    """Gives every test its own, freshly created Reference in self.ref."""
    fresh_reference = Reference()
    self.ref = fresh_reference