예제 #1
0
 def find_reference_by_id(self, id):
     """
     Look up a stored reference by its identifier and rebuild it as a
     ``Reference`` object.

     Parameters:
         id: identifier of the reference to load; must be truthy.

     Returns:
         A ``Reference`` carrying the stored id, validity and fields, plus
         'author' / 'editor' fields when the stored reference has any, or
         ``None`` when no stored reference matches ``id``.

     Raises:
         ValueError: if ``id`` is empty/falsy.
     """
     if not id:
         raise ValueError('A reference id is required')

     log.debug('Querying the database. Reference with id %s' % str(id)) #@UndefinedVariable
     # first() gives back None when nothing matches. The previous one()
     # call raises in that case, so the "not found" branch below could
     # never run.
     m_reference = (self.session.query(mappers.Reference).
                    filter(mappers.Reference.id == id).first())

     if m_reference is None:
         return None

     log.debug('Creating new reference') #@UndefinedVariable
     reference = Reference()
     reference.id = m_reference.id
     reference.validity = m_reference.validity

     log.debug('Adding fields') #@UndefinedVariable
     for m_field in m_reference.fields:
         reference.set_field(m_field.name, m_field.value, m_field.valid)

     # Authors and editors are stored apart from the plain fields and are
     # only added to the result when at least one exists.
     log.debug('Adding authors') #@UndefinedVariable
     authors = [m_author.to_name_dict() for m_author in m_reference.authors]
     if authors:
         reference.set_field(u'author', authors, True)

     log.debug('Adding editors') #@UndefinedVariable
     editors = [m_editor.to_name_dict() for m_editor in m_reference.editors]
     if editors:
         reference.set_field(u'editor', editors, True)

     return reference
 def test_validate_incorrect_reference(self):
     """
     A reference whose title is set with a ``False`` third argument must
     end up with a validity below 0.5 after ``self.rv.validate``.
     """
     incorrect_ref = Reference()
     incorrect_ref.set_field('title', 'some arbitrary text', False)
     incorrect_ref.set_field('author', [{'first_name': 'Jose-Luis',
                                         'last_name': 'Sancho',
                                         'middle_name': ''}], True)
     self.rv.validate(incorrect_ref)
     # assertTrue replaces failUnless, a deprecated alias that was removed
     # in Python 3.12.
     self.assertTrue(incorrect_ref.validity < 0.5)
예제 #3
0
 def _use_rule_wrappers(self, source, page, raw_text):
     """
     Look if there is any wrapper in the database for the given source and
     use the ones found to build a reference.

     For each wrapper collection found for ``source``, its wrappers are
     tried in the order the gateway returns them. Every extracted value is
     stored in the reference together with its validation result, and the
     wrapper is upvoted or downvoted accordingly. The first valid value
     stops the search for that collection.

     Parameters:
         source: key used to look up the wrapper collections.
         page: content handed to each wrapper's ``extract_info``.
         raw_text: text handed to the field validators.

     Returns:
         A one-element list holding the extracted ``Reference``, or an
         empty list when the reference ended up with no fields.
     """
     log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
     reference = Reference()
     wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
     wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)

     for collection in wrapper_field_collections:
         # Get the wrappers for the current collection
         url, field = collection.url, collection.field
         wrappers = wrapper_manager.get_wrappers(url, field)
         log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                         len(wrappers)))

         # Fields without a registered validator are accepted as they come
         try:
             validator = self.field_validation[field][1]
         except KeyError:
             validator = None

         # Only these two fields are allowed to carry a list value
         allows_list = field in ('author', 'editor')

         # Extract information using the wrappers we have
         for wrapper in wrappers:
             info = wrapper.extract_info(page)
             if isinstance(info, list) and not allows_list:
                 continue
             # Wrap in a tuple so a tuple-valued 'info' cannot break the
             # % formatting
             log.debug('Info extracted by wrapper: %s' % (info,)) #@UndefinedVariable

             valid = validator.validate(info, raw_text) if validator else True
             # Save the extracted info even if it's not correct. It will
             # be overwritten afterwards if necessary
             reference.set_field(field, info, valid)

             if valid:
                 log.debug('The extracted information is valid. ' #@UndefinedVariable
                           'Upvoting wrapper')
                 wrapper.upvotes += 1
             else:
                 log.debug('The extracted information is not valid. ' #@UndefinedVariable
                           'Downvoting wrapper.')
                 wrapper.downvotes += 1
             wrapper_manager.update_wrapper(wrapper)

             if valid:
                 # A valid value was found; skip the remaining wrappers
                 break

     if not reference.fields:
         log.info('Could not extract reference using ruled wrappers')  #@UndefinedVariable
         return []

     log.info('Extracted reference')  #@UndefinedVariable
     return [reference]
 def test_validate_correct_reference(self):
     """
     A reference whose author and title are both set with a ``True`` third
     argument must get a validity of exactly 1.0 from ``self.rv.validate``.
     """
     correct_ref = Reference()
     correct_ref.set_field('author', [{'first_name': 'Jose-Luis',
                                       'last_name': 'Sancho',
                                       'middle_name': ''}], True)
     correct_ref.set_field('title', ('Class separability estimation and '
         'incremental learning using boundary methods'), True)

     self.rv.validate(correct_ref)
     # assertEqual replaces failUnless(a == b); failUnless is a deprecated
     # alias that was removed in Python 3.12.
     self.assertEqual(correct_ref.validity, 1.0)
예제 #5
0
 def test_validate_reference_fields(self):
     """
     After ``self.iec._validate_reference_fields`` runs, the title (which
     occurs in the raw text) must be flagged valid and the year '32' must
     be flagged not valid.
     """
     ref = Reference()
     ref.set_field('title', 'Some article title')
     ref.set_field('year', '32')
     raw_text = "Some article title and something else"
     self.iec._validate_reference_fields(ref, raw_text)
     # assertEqual replaces failUnless(a == b); failUnless is a deprecated
     # alias that was removed in Python 3.12.
     self.assertEqual(ref.get_field('title').valid, True)
     self.assertEqual(ref.get_field('year').valid, False)
    def setUp(self):
        """
        Prepare the fixtures: a reference with id, author, title, pages,
        journal and year fields, a ReferenceFormatter and a
        BibtexGenerator.
        """
        sample = Reference()
        sample.set_field('reference_id', 'Lmadsen99')

        lars_madsen = {
            'first_name': 'Lars',
            'last_name': 'Madsen',
            'middle_name': 'Lithen',
        }
        sample.set_field('author', [lars_madsen])

        # Remaining plain-text fields, set in the same order as before
        plain_fields = (
            ('title', 'Some article title'),
            ('pages', '133--144'),
            ('journal', 'Some journal'),
            ('year', '1999'),
        )
        for field_name, field_value in plain_fields:
            sample.set_field(field_name, field_value)

        self.ref = sample
        self.ref_formatter = ReferenceFormatter()
        self.format_generator = BibtexGenerator()
예제 #7
0
    def test_format_reference_different_format(self):
        """
        A reference created without an entry must, after
        ``self.iec._format_reference``, have an entry starting with
        '@article{Lmadsen99,' and a format equal to ``self.iec.format``.
        """
        ref = Reference()
        ref.set_field('reference_id', 'Lmadsen99')
        ref.set_field('title', 'Some article title')

        self.iec._format_reference(ref)

        # assertTrue / assertEqual replace failUnless, a deprecated alias
        # that was removed in Python 3.12.
        self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
        self.assertEqual(ref.get_format(), self.iec.format)
 def test_validate_reference_fields(self):
     """
     After ``self.iec._validate_reference_fields`` runs, the title (which
     occurs in the raw text) must be flagged valid and the year '32' must
     be flagged not valid.
     """
     ref = Reference()
     ref.set_field('title', 'Some article title')
     ref.set_field('year', '32')
     raw_text = "Some article title and something else"
     self.iec._validate_reference_fields(ref, raw_text)
     # assertEqual replaces failUnless(a == b); failUnless is a deprecated
     # alias that was removed in Python 3.12.
     self.assertEqual(ref.get_field('title').valid, True)
     self.assertEqual(ref.get_field('year').valid, False)
 def test_format_reference_different_format(self):
     """
     A reference created without an entry must, after
     ``self.iec._format_reference``, have an entry starting with
     '@article{Lmadsen99,' and a format equal to ``self.iec.format``.
     """
     ref = Reference()
     ref.set_field('reference_id', 'Lmadsen99')
     ref.set_field('title', 'Some article title')

     self.iec._format_reference(ref)

     # assertTrue / assertEqual replace failUnless, a deprecated alias
     # that was removed in Python 3.12.
     self.assertTrue(ref.get_entry().startswith('@article{Lmadsen99,'))
     self.assertEqual(ref.get_format(), self.iec.format)
 def test_validate_incorrect_reference(self):
     """
     A reference whose title is set with a ``False`` third argument must
     end up with a validity below 0.5 after ``self.rv.validate``.
     """
     incorrect_ref = Reference()
     incorrect_ref.set_field('title', 'some arbitrary text', False)
     incorrect_ref.set_field('author', [{
         'first_name': 'Jose-Luis',
         'last_name': 'Sancho',
         'middle_name': ''
     }], True)
     self.rv.validate(incorrect_ref)
     # assertTrue replaces failUnless, a deprecated alias that was removed
     # in Python 3.12.
     self.assertTrue(incorrect_ref.validity < 0.5)
    def test_validate_correct_reference(self):
        """
        A reference whose author and title are both set with a ``True``
        third argument must get a validity of exactly 1.0 from
        ``self.rv.validate``.
        """
        correct_ref = Reference()
        correct_ref.set_field('author', [{
            'first_name': 'Jose-Luis',
            'last_name': 'Sancho',
            'middle_name': ''
        }], True)
        correct_ref.set_field('title',
                              ('Class separability estimation and '
                               'incremental learning using boundary methods'),
                              True)

        self.rv.validate(correct_ref)
        # assertEqual replaces failUnless(a == b); failUnless is a
        # deprecated alias that was removed in Python 3.12.
        self.assertEqual(correct_ref.validity, 1.0)
예제 #12
0
    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.

        A single publication may need more than one reference (e.g. an
        inproceedings entry and its proceedings), so the result is a list.

        Parameters:
            source: passed to ``ReferenceWrapper.extract_info`` together
                with ``page``.
            page: content the entry is extracted from.
            raw_text: text used to validate the fields of each reference.

        Returns:
            A list of ``Reference`` objects, each built from the parsed
            fields, the detected format and the entry text. The list is
            empty when no entry is found, when no parser can be created
            for the format, or when the entry does not match the format.
            If parsing fails part-way, the references built before the
            failure are returned.
        """
        log.info('Attempting to extract reference with a reference wrapper')  #@UndefinedVariable
        references = []
        entry, entry_format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper')  #@UndefinedVariable
            return references

        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(entry_format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (  #@UndefinedVariable
                entry_format, e.args))
            return references

        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % entry_format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')  #@UndefinedVariable
        try:
            for single_entry in parser.split_source(entry):
                fields = parser.parse_entry(single_entry)
                reference = Reference(fields, entry_format, single_entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            # Deliberately broad, as before: a parsing failure is logged and
            # does not propagate to the caller.
            log.error('Error parsing extracted entry: %s ' % e)  #@UndefinedVariable

        # Previously the function fell off the end here and returned None
        return references
 def test_format_reference_same_format(self):
     """
     A reference created with the BIBTEX format and an entry must still
     return that same entry after ``self.iec._format_reference``.
     """
     ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
     self.iec._format_reference(ref)
     # assertEqual replaces failUnless(a == b); failUnless is a deprecated
     # alias that was removed in Python 3.12.
     self.assertEqual(ref.get_entry(), 'formatted entry')
예제 #14
0
     if not content:
         log.info('Empty entries file') #@UndefinedVariable
         return references
     
     
     if not self.parser.check_format(content):
         log.error('Given entry is not in %s' % format) #@UndefinedVariable
         return references
     
     # There may be more than one entry for the same file.
     log.debug('Parsing entries') #@UndefinedVariable
     
     entries = self.parser.split_source(content)
     for entry in entries:
         fields = self.parser.parse_entry(entry)
         reference = Reference(fields, format, entry)
         reference.validity = 1.0
         references.append(reference)
     return references
     
 def persist_file_references(self, file_path):
     """
     Parses references from a file and stores them to the database
     """
     extraction_gw = ExtractionGateway()
     references = self._parse_entries_file(file_path)
     extractions = []
     
     for reference, index in zip(references, range(len(references))):
         
         extraction = Extraction()
예제 #15
0
    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Look if there is any wrapper in the database for the given source
        and use the ones found to build a reference.

        For each wrapper collection found for ``source``, its wrappers are
        tried in the order the gateway returns them. Every extracted value
        is stored in the reference together with its validation result, and
        the wrapper is upvoted or downvoted accordingly. The first valid
        value stops the search for that collection.

        Parameters:
            source: key used to look up the wrapper collections.
            page: content handed to each wrapper's ``extract_info``.
            raw_text: text handed to the field validators.

        Returns:
            A one-element list holding the extracted ``Reference``, or an
            empty list when the reference ended up with no fields.
        """
        log.info('Attempting to extract reference with ruled wrappers')  #@UndefinedVariable
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)

        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (  #@UndefinedVariable
                url, field, len(wrappers)))

            # Fields without a registered validator are accepted as they
            # come
            try:
                validator = self.field_validation[field][1]
            except KeyError:
                validator = None

            # Only these two fields are allowed to carry a list value
            allows_list = field in ('author', 'editor')

            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                if isinstance(info, list) and not allows_list:
                    continue
                # Wrap in a tuple so a tuple-valued 'info' cannot break the
                # % formatting
                log.debug('Info extracted by wrapper: %s' % (info,))  #@UndefinedVariable

                valid = validator.validate(info,
                                           raw_text) if validator else True
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)

                if valid:
                    log.debug('The extracted information is valid. '  #@UndefinedVariable
                              'Upvoting wrapper')
                    wrapper.upvotes += 1
                else:
                    log.debug('The extracted information is not valid. '  #@UndefinedVariable
                              'Downvoting wrapper.')
                    wrapper.downvotes += 1
                wrapper_manager.update_wrapper(wrapper)

                if valid:
                    # A valid value was found; skip the remaining wrappers
                    break

        if not reference.fields:
            log.info('Could not extract reference using ruled wrappers')  #@UndefinedVariable
            return []

        log.info('Extracted reference')  #@UndefinedVariable
        return [reference]
예제 #16
0
        if not content:
            log.info('Empty entries file')  #@UndefinedVariable
            return references

        if not self.parser.check_format(content):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing entries')  #@UndefinedVariable

        entries = self.parser.split_source(content)
        for entry in entries:
            fields = self.parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            reference.validity = 1.0
            references.append(reference)
        return references

    def persist_file_references(self, file_path):
        """
        Parses references from a file and stores them to the database
        """
        extraction_gw = ExtractionGateway()
        references = self._parse_entries_file(file_path)
        extractions = []

        for reference, index in zip(references, range(len(references))):

            extraction = Extraction()
class TestReference(unittest.TestCase):
    """
    Checks for the field and entry accessors of ``Reference``.

    Assertions use assertEqual instead of failUnless, a deprecated alias
    that was removed in Python 3.12.
    """

    def setUp(self):
        """Start every test from a freshly created Reference."""
        self.ref = Reference()

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_set_and_get_field(self):
        """A value stored with set_field is readable through get_field."""
        self.ref.set_field('random_field', 'random_value')
        self.assertEqual(self.ref.get_field('random_field').value,
                         'random_value')

    def test_get_fields(self):
        """get_fields lists the three field names in the order they were set."""
        self.ref.set_field('rf01', 'rv01')
        self.ref.set_field('rf02', 'rv02')
        self.ref.set_field('rf03', 'rv04')
        self.assertEqual(len(self.ref.get_fields()), 3)
        self.assertEqual(self.ref.get_fields(), ['rf01', 'rf02', 'rf03'])

    def test_set_field_to_none(self):
        """A field whose value is None is flagged as not valid."""
        self.ref.set_field('some_field', None)
        field = self.ref.get_field('some_field')
        self.assertEqual(field.valid, False)

    def test_set_and_get_entry(self):
        """An entry stored with set_entry is returned by get_entry."""
        self.ref.set_entry('This is an entry')
        self.assertEqual(self.ref.get_entry(), 'This is an entry')
class TestReferenceFormatter(unittest.TestCase):
    """
    Checks that ``ReferenceFormatter`` renders a reference through a
    ``BibtexGenerator``.

    Assertions use assertEqual instead of failUnless, a deprecated alias
    that was removed in Python 3.12.
    """

    def setUp(self):
        """
        Build a reference with id, author, title, pages, journal and year
        fields, plus the formatter and generator used by the test.
        """
        self.ref = Reference()
        self.ref.set_field('reference_id', 'Lmadsen99')
        self.ref.set_field('author', [{
            'first_name': 'Lars',
            'last_name': 'Madsen',
            'middle_name': 'Lithen'
        }])
        self.ref.set_field('title', 'Some article title')
        self.ref.set_field('pages', '133--144')
        self.ref.set_field('journal', 'Some journal')
        self.ref.set_field('year', '1999')

        self.ref_formatter = ReferenceFormatter()
        self.format_generator = BibtexGenerator()

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_formatter(self):
        """
        Formatting stores the expected entry text in the reference and
        sets the reference's format to the generator's format.
        """
        self.ref_formatter.format_reference(self.ref, self.format_generator)
        entry = self.ref.get_entry()

        # Every line, including the closing brace, ends with os.linesep;
        # the trailing empty string produces the final separator.
        expected_entry = os.linesep.join([
            '@article{Lmadsen99,',
            'title = {Some article title},',
            'author = {Madsen, Lithen, Lars},',
            'year = 1999,',
            'journal = {Some journal},',
            'pages = {133--144}',
            '}',
            '',
        ])
        self.assertEqual(entry, expected_entry)

        self.assertEqual(self.ref.format, self.format_generator.format)
 def setUp(self):
     """Give every test its own freshly created Reference."""
     fresh_reference = Reference()
     self.ref = fresh_reference
예제 #20
0
 def test_format_reference_same_format(self):
     """
     A reference created with the BIBTEX format and an entry must still
     return that same entry after ``self.iec._format_reference``.
     """
     ref = Reference(format=ReferenceFormat.BIBTEX, entry='formatted entry')
     self.iec._format_reference(ref)
     # assertEqual replaces failUnless(a == b); failUnless is a deprecated
     # alias that was removed in Python 3.12.
     self.assertEqual(ref.get_entry(), 'formatted entry')