Code example #1
 def finish(self):
     self.progressBar.setMaximum(1)
     self.progressBar.setValue(1)
     log.info('Finished importing. Results can be found in the Manage page'
              )  #@UndefinedVariable
     log.removeHandler(self.guihandler)  #@UndefinedVariable
     self.empty_edit.setText('Done!')
Code example #2
 def finish(self):
     log.info('Finished extracting. Results can be found in the Manage ' #@UndefinedVariable
              'page') 
     # Stop the thread before jumping to next page
     self.thread.exiting = True        
     log.removeHandler(self.guihandler) #@UndefinedVariable
     self.empty_edit.setText('Done!')
Code example #3
 def persist_file_references(self, file_path):
     """
     Parses references from a file and stores them to the database
     """
     extraction_gw = ExtractionGateway()
     references = self._parse_entries_file(file_path)
     extractions = []
     
     for reference, index in zip(references, range(len(references))):
         
         extraction = Extraction()
         
         # Clean fields that we don't want
         reference.remove_field('reference_id')
         reference.remove_field('abstract')
         reference.remove_field('reference_type')
         
         url = reference.remove_field('url')
         if not url:
             url = file_path
         else:
             url = url.value
         
         extraction.used_result = SearchResult('', url)
         text = unicode('Reference %d from %s' % (index,
                             file_path.rsplit('/', 1)[-1]))
         extraction.file_path = text
         extraction.entries.append(reference)
         extractions.append(extraction)
         extraction_gw.persist_extraction(extraction)
         log.info(''.join(['Imported ', text.lower()])) #@UndefinedVariable
     
     return extractions
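
A side note on the loop in persist_file_references: pairing each reference with its index via zip(references, range(len(references))) is equivalent to the built-in enumerate. A minimal, self-contained sketch of that equivalence (toy data, not the project's code):

    # Toy list standing in for the parsed references
    references = ['ref-a', 'ref-b', 'ref-c']

    # zip(items, range(len(items))) yields (item, index) pairs;
    # enumerate yields (index, item) pairs with the same values.
    pairs_via_zip = list(zip(references, range(len(references))))
    pairs_via_enumerate = [(r, i) for i, r in enumerate(references)]
    assert pairs_via_zip == pairs_via_enumerate

    for index, reference in enumerate(references):
        print('Reference %d: %s' % (index, reference))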
Code example #4
 def _do_portal_acm(self, source, page):
     """
     Searches the page for a link to the reference, and then retrieves the
     reference.
     Returns a tuple with the full reference and its format.
     """ 
     log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
     ref = (None, None)
     anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
     if not anchor:
         return ref
     jscript = anchor['onclick'].replace('window.open', '').strip('\(\)')
     ref_url = jscript.split(',')[0].strip('\'')
     ref_url = source + '/' + ref_url
     
     try:
         page = BeautifulSoup(self._browser.get_page(ref_url))
     except BrowserError:
         log.error('Browse error while retrieving entry page') #@UndefinedVariable
         return ref
     
     pre = page.find('pre')
     if not pre:
         return ref
     
     # As the wrapper has been hardcoded, we already know what the format
     # of the reference will be
     return (pre.find(text=True).strip(), ReferenceFormat.BIBTEX)
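
To make the string handling in _do_portal_acm concrete, here is a small standalone sketch of how an onclick value of the kind the wrapper looks for is reduced to a reference URL. The sample onclick string and source URL are made up for illustration; only the replace/strip/split steps mirror the method above:

    # Hypothetical onclick attribute of a 'popBibTex.cfm' link
    onclick = "window.open('popBibTex.cfm?id=12345&ids=SERIES123.12345', 'BibTeX')"
    source = 'http://portal.acm.org'   # assumed base URL for the example

    jscript = onclick.replace('window.open', '').strip('\(\)')
    ref_url = jscript.split(',')[0].strip('\'')
    ref_url = source + '/' + ref_url

    print(ref_url)  # -> http://portal.acm.org/popBibTex.cfm?id=12345&ids=SERIES123.12345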
Code example #5
    def persist_file_references(self, file_path):
        """
        Parses references from a file and stores them to the database
        """
        extraction_gw = ExtractionGateway()
        references = self._parse_entries_file(file_path)
        extractions = []

        for reference, index in zip(references, range(len(references))):

            extraction = Extraction()

            # Clean fields that we don't want
            reference.remove_field('reference_id')
            reference.remove_field('abstract')
            reference.remove_field('reference_type')

            url = reference.remove_field('url')
            if not url:
                url = file_path
            else:
                url = url.value

            extraction.used_result = SearchResult('', url)
            text = unicode('Reference %d from %s' %
                           (index, file_path.rsplit('/', 1)[-1]))
            extraction.file_path = text
            extraction.entries.append(reference)
            extractions.append(extraction)
            extraction_gw.persist_extraction(extraction)
            log.info(''.join(['Imported ', text.lower()]))  #@UndefinedVariable

        return extractions
Code example #6
 def generate_wrappers(self, url):
     wrapper_manager = WrapperGateway()
     example_manager = ExampleGateway(max_examples=self.max_examples,
                                      max_examples_from_db=
                                      self.max_examples_from_db,
                                      seconds_between_requests=
                                      self.secs_between_reqs)
     example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                 url, self.min_validity)
     
     rulers = []
     for set in example_sets:
         log.info('Starting wrapper training for set "%s"' % set) #@UndefinedVariable
         
         if set == 'author' or set == 'editor':
             rulers = [MultiValuePathRuler(),
                       SeparatorsRegexRuler(),
                       ElementsRegexRuler(),
                       PersonRuler()]
         else:
             try:
                 value_guide = self.value_guides[set]
             except KeyError:
                 value_guide = '.*'
             rulers = [PathRuler(value_guide), RegexRuler()] 
     
         trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
         try:
             wrappers = trainer.train(example_sets[set])
             wrappers = self._prune_wrappers(wrappers)
             wrapper_manager.persist_wrappers(url, set, wrappers)
             log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
         except Exception, e:
             log.error('Error training wrapper for set "%s": %s' % (set, e)) #@UndefinedVariable
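
The try/except KeyError fallback for value_guide in generate_wrappers can also be written with dict.get. A tiny self-contained sketch of that equivalent (the guide patterns here are hypothetical, not the project's configuration):

    # Hypothetical value guides keyed by field set
    value_guides = {'year': r'\d{4}', 'pages': r'\d+(--?\d+)?'}

    for field_set in ('author', 'year', 'journal'):
        # Same fallback as the try/except KeyError above
        value_guide = value_guides.get(field_set, '.*')
        print('%s -> %s' % (field_set, value_guide))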
Code example #7
    def initializePage(self):
        log.addHandler(self.guihandler)  # @UndefinedVariable

        url = self.field("url").toPyObject()
        log.info("Starting training for URL: %s" % url)  # @UndefinedVariable

        self.thread = WrapperTrainingThread(self, url)
        # Connect thread signals
        self.connect(self.thread, QtCore.SIGNAL("finished()"), self.finish)
        self.connect(self.thread, QtCore.SIGNAL("terminated()"), self.finish)
        self.thread.start()
Code example #8
    def initializePage(self):
        log.addHandler(self.guihandler)  #@UndefinedVariable

        url = self.field('url').toPyObject()
        log.info("Starting training for URL: %s" % url)  #@UndefinedVariable

        self.thread = WrapperTrainingThread(self, url)
        # Connect thread signals
        self.connect(self.thread, QtCore.SIGNAL("finished()"), self.finish)
        self.connect(self.thread, QtCore.SIGNAL("terminated()"), self.finish)
        self.thread.start()
Code example #9
    def make_reference(self, file, target_format):
        """
        Uses the controllers to extract the content of a file, get some query
        strings, retrieve results from a search engine, and extract the
        reference.
        """
        extraction = Extraction()
        
        extraction.file_path = file
        extraction.target_format = target_format
        
        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        rce = RCEController(self.factory)
        raw_text = rce.extract_content(file, FileFormat.TXT)
        if not raw_text:
            return extraction
        
        extraction.query_strings = rce.get_query_strings(raw_text)
        if not extraction.query_strings:
            log.error('No query strings extracted') #@UndefinedVariable
            return extraction
        log.debug("Query strings %s" % str(extraction.query_strings)) #@UndefinedVariable
        
        ir = IRController(self.factory)
        extraction.top_results, extraction.used_query = (
            ir.get_top_results(extraction.query_strings))
        if not extraction.top_results:
            log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                      'after trying %d queries' % 
                      len(extraction.query_strings))
            return extraction
        extraction.query_strings.remove(extraction.used_query)
        
        log.debug("Used query %s" % str(extraction.used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(extraction.top_results)) #@UndefinedVariable
        
        ie = IEController(self.factory, target_format)
        extraction.entries, extraction.used_result = (
            ie.extract_reference(extraction.top_results, raw_text))
        extraction.top_results.remove(extraction.used_result)
        log.info("Used result: %s" % str(extraction.used_result)) #@UndefinedVariable
        
        validator = ReferenceValidator(FIELD_WEIGHTS)
        for entry in extraction.entries:
            validator.validate(entry, raw_text)
        
        return extraction
Code example #10
 def _do_citeseerx(self, source, page):
     """
     Searches the page for a link to the reference, and then retrieves the
     reference.
     Returns a tuple with the full reference and its format.
     """ 
     log.info('Using CiteSeerX reference wrapper') #@UndefinedVariable
     ref = (None, None)
     
     try:
         ref_element = page.find('div', {'class':'content'},
                                 text=re.compile('@\w*{'))
         ref_element = ref_element.parent.findAll(text=True)
         reference = ''.join(ref_element)
     except Exception, e:
         log.warn('Could not find reference in citeseerx page: %s' % e) #@UndefinedVariable
         return ref
Code example #11
File: entry.py  Project: rxuriguera/bibtexIndexMaker
    def run(self):
        log.debug("Start running index maker") #@UndefinedVariable

        # Run threads
        self.thread_runner = ThreadRunner(self.trhead_class,
                                          self._in_queue, self._out_queue)
        self.thread_runner.start()
        
        while not (self.thread_runner.finished and self._out_queue.empty()):
            extraction = self._out_queue.get()
            log.info('Persisting extraction results') #@UndefinedVariable
            # Persist the extraction
            ExtractionGateway().persist_extraction(extraction)
            self.processed.append(extraction)

        # Commit changes to the database
        flush_changes()
        log.debug("Total processed: %d" % len(self.processed)) #@UndefinedVariable
Code example #12
    def run(self):
        log.debug("Start running index maker")  #@UndefinedVariable

        # Run threads
        self.thread_runner = ThreadRunner(self.trhead_class, self._in_queue,
                                          self._out_queue)
        self.thread_runner.start()

        while not (self.thread_runner.finished and self._out_queue.empty()):
            extraction = self._out_queue.get()
            log.info('Persisting extraction results')  #@UndefinedVariable
            # Persist the extraction
            ExtractionGateway().persist_extraction(extraction)
            self.processed.append(extraction)

        # Commit changes to the database
        flush_changes()
        log.debug("Total processed: %d" %
                  len(self.processed))  #@UndefinedVariable
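
One detail worth noting in run(): the check of thread_runner.finished and the blocking _out_queue.get() are two separate steps, so if the producer finishes right between them the consumer can block forever. A common way to make such a loop robust is a timeout plus Empty handling; a minimal self-contained sketch of the pattern with the standard-library Queue (not the project's code):

    import Queue  # named 'queue' on Python 3

    out_queue = Queue.Queue()
    out_queue.put('extraction-1')
    finished = True          # pretend the producer thread has already ended

    processed = []
    while not (finished and out_queue.empty()):
        try:
            item = out_queue.get(timeout=1.0)
        except Queue.Empty:
            continue
        processed.append(item)

    print(processed)  # -> ['extraction-1']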
Code example #13
    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, format and a 
        structure with the different fields.
        A single publication may need more than one reference (e.g. an
        inproceedings entry and its proceedings)
        """
        log.info('Attempting to extract reference with a reference wrapper'
                 )  #@UndefinedVariable
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper'
                      )  #@UndefinedVariable
            return references

        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (
                format,  #@UndefinedVariable
                e.args))
            return references

        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')  #@UndefinedVariable
        try:
            entries = parser.split_source(entry)
            for entry in entries:
                fields = parser.parse_entry(entry)
                reference = Reference(fields, format, entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception, e:
            log.error('Error parsing extracted entry: %s ' %
                      e)  #@UndefinedVariable

        return references
Code example #14
    def get_top_results(self, query_strings, engine=ENGINE):
        """
        Returns a list of search results.
        """
        results = []

        # Get a searcher
        try:
            searcher = self.util_factory.create_searcher(engine)
        except UtilCreationError as e:
            log.error('Could not create a searcher: %s' %
                      e.args)  #@UndefinedVariable
            return results

        # Search the query strings
        for query in query_strings:
            searcher.set_query(query)
            try:
                log.debug('Searching query %s' % (query))  #@UndefinedVariable
                results = searcher.get_results()
            except SearchError, e:
                log.error(e.error)  #@UndefinedVariable
                break

            if searcher.num_results >= self.too_many_results:
                log.debug(
                    'Search with query %s yielded too many results '  #@UndefinedVariable
                    '(%d or more)' % (query, self.too_many_results))
                results = []
                continue

            if results:
                log.info(
                    'Searcher yielded the following results using '  #@UndefinedVariable
                    'query %s' % (query))
                for result in results:
                    log.info('    %s' % result.url[:120])  #@UndefinedVariable
                results = self._sort_results(results)

            if results:
                break
Code example #15
    def extract_reference(self, top_results, raw_text):
        """
        Returns a list of References if they can be extracted or an empty 
        list otherwise.
        A single publication may need more than one reference (e.g. an
        inproceedings entry and its proceedings)
        """

        log.info('Using %d top results' %
                 len(top_results))  #@UndefinedVariable
        page = None
        references = []
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' %
                          result.url)  #@UndefinedVariable
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (
                    result.url,  #@UndefinedVariable
                    e.error))
                continue

            page = ContentCleaner().clean_content(page)

            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)

            if references:
                break

        # Convert to target format, if necessary
        for reference in references:
            self._format_reference(reference)

        # Return the extracted reference and the result that has been used
        return (references, result)
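
A small caveat about extract_reference: result is only bound inside the for loop, so if top_results is empty the final return (references, result) fails with an unbound-name error. The self-contained snippet below just demonstrates that edge case with plain data; the function name and stand-in logic are illustrative, not a patch to the project:

    def tail_of_extract_reference(top_results):
        references = []
        for result in top_results:
            references.append(result)   # stand-in for the real per-result work
        return (references, result)     # 'result' is unbound if the loop never ran

    print(tail_of_extract_reference(['only-result']))  # -> (['only-result'], 'only-result')
    try:
        tail_of_extract_reference([])
    except UnboundLocalError as e:
        print('empty top_results: %s' % e)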
Code example #16
    def generate_wrappers(self, url):
        wrapper_manager = WrapperGateway()
        example_manager = ExampleGateway(
            max_examples=self.max_examples,
            max_examples_from_db=self.max_examples_from_db,
            seconds_between_requests=self.secs_between_reqs)
        example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                    url, self.min_validity)

        rulers = []
        for set in example_sets:
            log.info('Starting wrapper training for set "%s"' %
                     set)  #@UndefinedVariable

            if set == 'author' or set == 'editor':
                rulers = [
                    MultiValuePathRuler(),
                    SeparatorsRegexRuler(),
                    ElementsRegexRuler(),
                    PersonRuler()
                ]
            else:
                try:
                    value_guide = self.value_guides[set]
                except KeyError:
                    value_guide = '.*'
                rulers = [PathRuler(value_guide), RegexRuler()]

            trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
            try:
                wrappers = trainer.train(example_sets[set])
                wrappers = self._prune_wrappers(wrappers)
                wrapper_manager.persist_wrappers(url, set, wrappers)
                log.info('Trainer generated %d wrappers' %
                         len(wrappers))  #@UndefinedVariable
            except Exception, e:
                log.error('Error training wrapper for set "%s": %s' %
                          (set, e))  #@UndefinedVariable
Code example #17
 def _use_reference_wrappers(self, source, page, raw_text):
     """
     Use a reference wrapper to get the reference from a given page.
     Returns a list of References with the full entry, format and a 
     structure with the different fields.
     A single publication may need more than one reference (e.g. an
     inproceedings entry and its proceedings)
     """
     log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
     references = []
     entry, format = ReferenceWrapper().extract_info(source, page)
     if not entry:
         log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
         return references
     
     # Create a parser for the given reference format
     try:
         parser = self.util_factory.create_parser(format)
     except UtilCreationError as e:
         log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                             e.args))
         return references
     
     if not parser.check_format(entry):
         log.error('Given entry is not in %s' % format) #@UndefinedVariable
         return references
     
     # There may be more than one entry for the same file.
     log.debug('Parsing extracted entries') #@UndefinedVariable
     try:
         entries = parser.split_source(entry)
         for entry in entries:
             fields = parser.parse_entry(entry)
             reference = Reference(fields, format, entry)
             self._validate_reference_fields(reference, raw_text)
             references.append(reference)
     except Exception, e:
         log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable

     return references
Code example #18
 def get_top_results(self, query_strings, engine=ENGINE):
     """
     Returns a list of search results.
     """
     results = []
     
     # Get a searcher
     try:
         searcher = self.util_factory.create_searcher(engine)
     except UtilCreationError as e:
         log.error('Could not create a searcher: %s' % e.args) #@UndefinedVariable
         return results
 
     # Search the query strings       
     for query in query_strings:
         searcher.set_query(query)
         try:
             log.debug('Searching query %s' % (query)) #@UndefinedVariable
             results = searcher.get_results()
         except SearchError, e:
             log.error(e.error) #@UndefinedVariable
             break
         
         if searcher.num_results >= self.too_many_results:
             log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                       '(%d or more)' % (query, self.too_many_results)) 
             results = []
             continue
         
         if results:
             log.info('Searcher yielded the following results using ' #@UndefinedVariable
                      'query %s' % (query)) 
             for result in results:
                 log.info('    %s' % result.url[:120]) #@UndefinedVariable
             results = self._sort_results(results)
             
         if results:
             break
Code example #19
 def extract_reference(self, top_results, raw_text):
     """
     Returns a list of References if they can be extracted or an empty 
     list otherwise.
     A single publication may need more than one reference (e.g. an
     inproceedings entry and its proceedings)
     """
     
     log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
     page = None
     references = []
     for result in top_results:
         try:
             log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
             page = self.browser.get_page(result.url)
         except BrowserError as e:
             log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                         e.error))
             continue
         
         page = ContentCleaner().clean_content(page)
         
         references = self._use_reference_wrappers(result.base_url, page,
                                                   raw_text)
         if not references:
             references = self._use_rule_wrappers(result.base_url, page,
                                                  raw_text)
             
         if references:
             break
     
     # Convert to target format, if necessary
     for reference in references:
         self._format_reference(reference)
     
     # Return the extracted reference and the result that has been used
     return (references, result)
Code example #20
 def _use_rule_wrappers(self, source, page, raw_text):
     """
     Check whether there is any wrapper in the database for the given source.
     """
     log.info('Attempting to extract reference with ruled wrappers') #@UndefinedVariable
     fields = {}
     reference = Reference()
     wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
     wrapper_field_collections = wrapper_manager.find_wrapper_collections(source)
     
     for collection in wrapper_field_collections:
         # Get the wrappers for the current collection
         url, field = collection.url, collection.field
         wrappers = wrapper_manager.get_wrappers(url, field)
         log.debug('Collection %s:%s has %d wrappers' % (url, field, #@UndefinedVariable
                                                         len(wrappers)))
         
         # Get field validator
         try:
             validator = self.field_validation[collection.field][1]
         except KeyError:
             validator = None
         
         # Extract information using the wrappers we have
         for wrapper in wrappers:
             info = wrapper.extract_info(page)
             # we expect 'info' to be a string
             if type(info) == list and not (collection.field == 'author' 
                  or collection.field == 'editor'):
                 continue 
             log.debug('Info extracted by wrapper: %s' % info) #@UndefinedVariable
             
             valid = validator.validate(info, raw_text) if validator else True
             # Save the extracted info even if it's not correct. It will
             # be overwritten afterwards if necessary
             reference.set_field(field, info, valid)
             
             if not valid: 
                 log.debug('The extracted information is not valid. ' #@UndefinedVariable
                           'Downvoting wrapper.') 
                 wrapper.downvotes += 1
                 wrapper_manager.update_wrapper(wrapper)
             else:
                 log.debug('The extracted information is valid. ' #@UndefinedVariable
                           'Upvoting wrapper') 
                 wrapper.upvotes += 1
                 wrapper_manager.update_wrapper(wrapper)
                 fields[field] = info
                 break
             
     if len(reference.fields) > 0:
         log.info('Extracted reference')  #@UndefinedVariable
         return [reference]
     else:
         log.info('Could not extract reference using ruled wrappers')  #@UndefinedVariable
         return []
Code example #21
 def _check_still_valid(self, mapper, content, min_validity):
     """
     Checks whether the information to be extracted is really present within
     the contents. If it is not, the database is updated so the
     corresponding records won't be used again.
     """
     
     # In case the content could not be extracted, don't update the database
     if not content:
         return False
     
     # For each piece of information, check if it exists in the contents.
     # At this point, we don't care about its location, but if it
     # can be found.
     not_found = 0.0
     for field in mapper.fields:
         found = content.find(text=re.compile(re.escape(field.value)))
         if not found:
             log.info('Field %s with value %s cannot be found anymore in %d' #@UndefinedVariable
                      % (field.name, field.value, mapper.id))
             field.valid = False
             not_found += 1
     
     # Recompute validity
     if len(mapper.fields):
         validity = 1 - (not_found / len(mapper.fields))
     else:
         validity = 1
          
     if validity < min_validity:
         log.info('Reference "%d" marked as invalid from now on.' % #@UndefinedVariable
                   mapper.id)
         mapper.validity = validity
         return False
 
     return True
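
To make the validity computation in _check_still_valid concrete: with five mapped fields of which two can no longer be found, validity becomes 1 - 2/5 = 0.6, which a min_validity threshold of 0.7 would reject. A tiny self-contained sketch of just that arithmetic (the numbers are hypothetical):

    total_fields = 5
    not_found = 2.0           # kept as a float, as in the method above
    min_validity = 0.7        # hypothetical threshold

    validity = 1 - (not_found / total_fields) if total_fields else 1
    print(validity)                   # -> 0.6
    print(validity < min_validity)    # -> True: the mapper would be invalidated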
Code example #22
 def finish(self):
     self.progressBar.setMaximum(1)
     self.progressBar.setValue(1)
     log.info('Finished importing. Results can be found in the Manage page') #@UndefinedVariable
     log.removeHandler(self.guihandler) #@UndefinedVariable
     self.empty_edit.setText('Done!')
Code example #23
 def import_references(self, path):
     log.info('Importing references from %s' % path)  #@UndefinedVariable
     references = self.ref_controller.persist_file_references(path)
     return len(references)
Code example #24
 try:
     self.parser = self.util_factory.create_parser(self.format)
 except UtilCreationError, e:
     log.error('Error creating parser for format %s: %s' % #@UndefinedVariable 
               (str(self.format), str(e)))
     return references
 
 try:
     file = open(file_path, 'r')
     content = file.read()
 except Exception, e:
     log.error('Error reading entries file %s: %s' % #@UndefinedVariable
               (file_path, str(e)))
     return references
 
 if not content:
     log.info('Empty entries file') #@UndefinedVariable
     return references
 
 
 if not self.parser.check_format(content):
     log.error('Given entry is not in %s' % format) #@UndefinedVariable
     return references
 
 # There may be more than one entry for the same file.
 log.debug('Parsing entries') #@UndefinedVariable
 
 entries = self.parser.split_source(content)
 for entry in entries:
     fields = self.parser.parse_entry(entry)
     reference = Reference(fields, format, entry)
     reference.validity = 1.0
     references.append(reference)
Code example #25
    def _use_rule_wrappers(self, source, page, raw_text):
        """
        Check whether there is any wrapper in the database for the given source.
        """
        log.info('Attempting to extract reference with ruled wrappers'
                 )  #@UndefinedVariable
        fields = {}
        reference = Reference()
        wrapper_manager = WrapperGateway(max_wrappers=self.max_wrappers)
        wrapper_field_collections = wrapper_manager.find_wrapper_collections(
            source)

        for collection in wrapper_field_collections:
            # Get the wrappers for the current collection
            url, field = collection.url, collection.field
            wrappers = wrapper_manager.get_wrappers(url, field)
            log.debug('Collection %s:%s has %d wrappers' % (
                url,
                field,  #@UndefinedVariable
                len(wrappers)))

            # Get field validator
            try:
                validator = self.field_validation[collection.field][1]
            except KeyError:
                validator = None

            # Extract information using the wrappers we have
            for wrapper in wrappers:
                info = wrapper.extract_info(page)
                # we expect 'info' to be a string
                if type(info) == list and not (collection.field == 'author' or
                                               collection.field == 'editor'):
                    continue
                log.debug('Info extracted by wrapper: %s' %
                          info)  #@UndefinedVariable

                valid = validator.validate(info,
                                           raw_text) if validator else True
                # Save the extracted info even if it's not correct. It will
                # be overwritten afterwards if necessary
                reference.set_field(field, info, valid)

                if not valid:
                    log.debug(
                        'The extracted information is not valid. '  #@UndefinedVariable
                        'Downvoting wrapper.')
                    wrapper.downvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                else:
                    log.debug(
                        'The extracted information is valid. '  #@UndefinedVariable
                        'Upvoting wrapper')
                    wrapper.upvotes += 1
                    wrapper_manager.update_wrapper(wrapper)
                    fields[field] = info
                    break

        if len(reference.fields) > 0:
            log.info('Extracted reference')  #@UndefinedVariable
            return [reference]
        else:
            log.info('Could not extract reference using ruled wrappers'
                     )  #@UndefinedVariable
            return []
Code example #26
File: entry.py  Project: rxuriguera/bibtexIndexMaker
 def import_references(self, path):
     log.info('Importing references from %s' % path) #@UndefinedVariable
     references = self.ref_controller.persist_file_references(path)
     return len(references)
Code example #27
        try:
            self.parser = self.util_factory.create_parser(self.format)
        except UtilCreationError, e:
            log.error('Error creating parser for format %s: %s' %  #@UndefinedVariable
                      (str(self.format), str(e)))
            return references

        try:
            file = open(file_path, 'r')
            content = file.read()
        except Exception, e:
            log.error(
                'Error reading entries file %s: %s' %  #@UndefinedVariable
                (file_path, str(e)))
            return references

        if not content:
            log.info('Empty entries file')  #@UndefinedVariable
            return references

        if not self.parser.check_format(content):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing entries')  #@UndefinedVariable

        entries = self.parser.split_source(content)
        for entry in entries:
            fields = self.parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            reference.validity = 1.0
            references.append(reference)