Ejemplo n.º 1
0
 def _update_people(self, tree, people, add_method):
     """
     Updates the lists of authors or editors from a reference. The one
     that gets updated is decided depending on the people and add_method
     parameteres.
     """
     for index in range(tree.topLevelItemCount()):
         item = tree.topLevelItem(index)
         
         if not (item.text(0) or item.text(1) or item.text(2)):
             if(len(people) > index):
                 people.pop(index)
             continue
         
         log.debug("Index: %d Number of fields %d" % (index , len(people))) #@UndefinedVariable
         
         try:
             first_name = unicode(item.text(0))
             middle_name = unicode(item.text(1))
             last_name = unicode(item.text(2))
         except TypeError, e:
             log.error("Type error when casting to store to database %s" % str(e)) #@UndefinedVariable
             continue
         
         if(len(people) > index):
             people[index].name = first_name
             people[index].value = middle_name
             people[index].valid = last_name
         else:
             add_method(first_name, middle_name, last_name)
Ejemplo n.º 2
0
 def _update_fields(self, reference):
     """
     Copies the rows of the editor's fields tree into the fields of a
     reference.

     Row N of the tree is matched with the N-th field the reference had
     when the method was called. A row with any empty column removes its
     matching field. Other rows are only used when their third column is
     'True' or 'False': they update the matching field or, when there is
     none, are added with reference.add_field(name, value, valid).
     """
     log.debug('Updating reference') #@UndefinedVariable
     tree = self.editor.fields
     # Number of fields popped so far. Removals shift the remaining fields
     # to the left, so the field for a row lives at 'index - removed'.
     removed = 0
     for index in range(tree.topLevelItemCount()):
         item = tree.topLevelItem(index)
         position = index - removed

         # Remove empty items. The row is finished once its field is gone;
         # falling through would write it over the next field.
         if ((len(reference.fields) > position) and
             not (item.text(0) and item.text(1) and item.text(2))):
             reference.fields.pop(position)
             removed += 1
             continue

         # Skip non-empty items that have an invalid status
         if not (item.text(2) == 'True' or item.text(2) == 'False'):
             continue

         log.debug('Index: %d Number of fields %d' % (index , len(reference.fields))) #@UndefinedVariable

         try:
             name = unicode(item.text(0))
             value = unicode(item.text(1))
             valid = str(item.text(2)) == "True"
         except TypeError as e:
             log.error('Type error when casting to store to database %s' % str(e)) #@UndefinedVariable
             continue

         if len(reference.fields) > position:
             reference.fields[position].name = name
             reference.fields[position].value = value
             reference.fields[position].valid = valid
         else:
             reference.add_field(name, value, valid)
 def _do_portal_acm(self, source, page):
     """
     Searches the page for a link to the reference, and then retrieves the
     reference.

     source -- base URL that the extracted link is appended to.
     page -- parsed page; must support find(name, attrs).

     Returns a tuple with the full reference and its format, or
     (None, None) when the link, the entry page or its text is missing.
     """
     log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
     ref = (None, None)
     anchor = page.find('a', {'onclick':re.compile('popBibTex.cfm')})
     if not anchor:
         return ref

     # Presumably the handler has the form window.open('<url>', ...): drop
     # the call name and the surrounding parentheses, then keep the first
     # argument without its quotes. The raw string holds the same
     # characters as before without relying on invalid escape sequences.
     jscript = anchor['onclick'].replace('window.open', '').strip(r'\(\)')
     ref_url = jscript.split(',')[0].strip('\'')
     ref_url = source + '/' + ref_url

     try:
         entry_page = BeautifulSoup(self._browser.get_page(ref_url))
     except BrowserError:
         log.error('Browse error while retrieving entry page') #@UndefinedVariable
         return ref

     pre = entry_page.find('pre')
     if not pre:
         return ref

     # A <pre> element with no text makes find() return None
     entry_text = pre.find(text=True)
     if entry_text is None:
         return ref

     # As the wrapper has been hardcoded, we already know what will be the
     # format of the reference
     return (entry_text.strip(), ReferenceFormat.BIBTEX)
Ejemplo n.º 4
0
 def _update_rules(self, wrapper):
     """
     Copies the rows of the rules tree into the rules of a wrapper.

     Row N of the tree is matched with the N-th rule the wrapper had when
     the method was called. A row with any empty column removes its
     matching rule. Other rows update the matching rule or, when there is
     none, are added with wrapper.add_rule_by_info(). Rows whose order is
     not an integer or whose pattern is not valid JSON are skipped.
     The last row of the tree is never processed.
     """
     # Number of rules popped so far. Removals shift the remaining rules to
     # the left, so the rule for a row lives at 'index - removed'.
     removed = 0
     for index in range(self.ui.rules.topLevelItemCount() - 1):
         item = self.ui.rules.topLevelItem(index)
         position = index - removed

         log.debug('Updating rule %d' % index) #@UndefinedVariable

         # Remove empty items
         if ((len(wrapper.rules) > position) and
             not (item.text(0) and item.text(1) and item.text(2))):
             wrapper.rules.pop(position)
             removed += 1
             continue

         # Skip items whose values cannot be converted
         try:
             rule_type = str(item.text(0))
             pattern = str(item.text(1))
             order = int(str(item.text(2)))
         except (TypeError, ValueError):
             log.error('Error when casting to store to database') #@UndefinedVariable
             continue

         # Check that the pattern can be converted to a python object. Only
         # the validation matters; the decoded value is not used.
         try:
             simplejson.loads(pattern)
         except ValueError:
             log.debug('Cannot convert pattern %s to Python objects' % pattern) #@UndefinedVariable
             continue

         # Update or append the rules
         if len(wrapper.rules) > position:
             wrapper.rules[position].rule_type = rule_type
             wrapper.rules[position].pattern = pattern
             wrapper.rules[position].order = order
         else:
             wrapper.add_rule_by_info(rule_type, pattern, order)
Ejemplo n.º 5
0
 def run(self):
     """
     Runs until the stop event is set.
     Takes files from the input queue and supplies them to a
     'ReferenceMaker' object.
     Whatever make_reference() returns is put in the output queue. If the
     extraction raises, the error is logged and an empty 'Extraction' is
     put in the output queue instead.
     """
     log.debug("Running thread", extra={'threadname':self.getName()}) #@UndefinedVariable
     while not self.stop_event.isSet():
         # Wait briefly for work instead of polling empty() in a tight
         # loop, which keeps a CPU core busy while the queue is empty. The
         # timeout bounds how long it takes to notice the stop event.
         try:
             file_path = self.in_queue.get(True, 0.1)
         except Queue.Empty:
             continue

         # Falsy entries are taken from the queue but not processed
         if not file_path:
             continue

         log.debug("Processing file %s" % file_path) #@UndefinedVariable
         try:
             reference = ReferenceMaker().make_reference(file_path,
                                                     self.target_format)
             self.out_queue.put(reference)
         except Exception as e:
             log.error('Unexpected exception while extracting reference' #@UndefinedVariable
                       ' for file %s: %s' % (file_path, str(e)))
             self.out_queue.put(Extraction())
Ejemplo n.º 6
0
 def run(self):
     """
     Runs until the stop event is set.
     Takes files from the input queue and supplies them to a
     'ReferenceMaker' object.
     Whatever make_reference() returns is put in the output queue. If the
     extraction raises, the error is logged and an empty 'Extraction' is
     put in the output queue instead.
     """
     log.debug("Running thread",
               extra={'threadname': self.getName()})  #@UndefinedVariable
     while not self.stop_event.isSet():
         # Wait briefly for work instead of polling empty() in a tight
         # loop, which keeps a CPU core busy while the queue is empty. The
         # timeout bounds how long it takes to notice the stop event.
         try:
             file_path = self.in_queue.get(True, 0.1)
         except Queue.Empty:
             continue

         # Falsy entries are taken from the queue but not processed
         if not file_path:
             continue

         log.debug("Processing file %s" % file_path)  #@UndefinedVariable
         try:
             reference = ReferenceMaker().make_reference(
                 file_path, self.target_format)
             self.out_queue.put(reference)
         except Exception as e:
             log.error(  #@UndefinedVariable
                 'Unexpected exception while extracting reference'
                 ' for file %s: %s' % (file_path, str(e)))
             self.out_queue.put(Extraction())
Ejemplo n.º 7
0
 def generate_wrappers(self, url):
     """
     Trains and persists wrappers for every example set obtained for a URL.

     The 'author' and 'editor' sets are trained with the multi-value
     rulers. Every other set uses a PathRuler built with the entry of
     'self.value_guides' for that set ('.*' when there is none) and a
     RegexRuler. Errors while training or persisting a set are logged and
     the remaining sets are still processed.
     """
     wrapper_manager = WrapperGateway()
     example_manager = ExampleGateway(max_examples=self.max_examples,
                                      max_examples_from_db=
                                      self.max_examples_from_db,
                                      seconds_between_requests=
                                      self.secs_between_reqs)
     example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                 url, self.min_validity)

     for set_name in example_sets:
         log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

         if set_name in ('author', 'editor'):
             rulers = [MultiValuePathRuler(),
                       SeparatorsRegexRuler(),
                       ElementsRegexRuler(),
                       PersonRuler()]
         else:
             try:
                 value_guide = self.value_guides[set_name]
             except KeyError:
                 value_guide = '.*'
             rulers = [PathRuler(value_guide), RegexRuler()]

         trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
         try:
             wrappers = trainer.train(example_sets[set_name])
             wrappers = self._prune_wrappers(wrappers)
             wrapper_manager.persist_wrappers(url, set_name, wrappers)
             log.info('Trainer generated %d wrappers' % len(wrappers)) #@UndefinedVariable
         except Exception as e:
             log.error('Error training wrapper for set "%s": %s' % (set_name, e)) #@UndefinedVariable
Ejemplo n.º 8
0
 def _get_content_elements(self, value, content):
     """
     Looks in the content to find the elements that contain the desired
     value. Raises a ValueError exception if the content does not contain
     the value.

     value -- text to look for; it is compiled as a regular expression.
     content -- object searched with findAll(True, text=...).
     """
     try:
         elements = content.findAll(True, text=re.compile(value))
     except NameError as e:
         # A NameError here is reported as content that is not an HTML
         # document, and the search yields no elements
         log.error("Example's content is not an HTML document: %s" % e) #@UndefinedVariable
         elements = []
Ejemplo n.º 9
0
    def _apply_single_input(self, input):
        """
        Searches the stripped input for the first match of 'self.pattern'.
        Returns an empty string if compiling the pattern or searching
        raises an exception.
        """
        log.debug('Applying RegexRule with pattern %s' % self.pattern) #@UndefinedVariable

        try:
            regex = re.compile(self.pattern)
            input = input.strip()
            matches = regex.search(input)
        except Exception as e:
            log.error('Exception applying RegexRule with pattern %s: %s'  #@UndefinedVariable
                      % (self.pattern, e))
            return ''
 def done(self, status):
     """
     Asks for a destination file and saves the text of the entries editor
     to it. Nothing is written if the dialog returns no path. Errors while
     writing are logged rather than raised.

     status -- not used by this method.
     """
     path = QtGui.QFileDialog.getSaveFileName(self,
         caption='Save references to file', filter='BibTeX (*.bib)')
     if not path:
         return
     log.debug('Saving to file: %s' % path) #@UndefinedVariable
     try:
         # The context manager closes the file even when write() fails.
         # NOTE(review): the file is opened without an explicit encoding,
         # so non-ASCII text may fail to be written - confirm the encoding
         # that saved files are expected to use.
         with open(path, 'w') as output:
             output.write(unicode(self.page01.ui.entriesEdit.toPlainText()))
     except Exception as e:
         log.error('Error saving references to %s: %s' % (path, e)) #@UndefinedVariable
Ejemplo n.º 11
0
 def _parse_entries_file(self, file_path):
     """
     Reads the file described by 'file_path' and parses all the references
     that it may contain.
     Returns a list of Reference instances with the extracted instances.
     The list is empty when no parser can be created for 'self.format'.
     """
     references = []
     try:
         self.parser = self.util_factory.create_parser(self.format)
     except UtilCreationError as e:
         log.error('Error creating parser for format %s: %s' % #@UndefinedVariable
                   (str(self.format), str(e)))
         return references
Ejemplo n.º 12
0
 def _parse_entries_file(self, file_path):
     """
     Reads the file described by 'file_path' and parses all the references
     that it may contain.
     Returns a list of Reference instances with the extracted instances.
     The list is empty when no parser can be created for 'self.format'.
     """
     references = []
     try:
         self.parser = self.util_factory.create_parser(self.format)
     except UtilCreationError as e:
         log.error(  #@UndefinedVariable
             'Error creating parser for format %s: %s' %
             (str(self.format), str(e)))
         return references
Ejemplo n.º 13
0
    def make_reference(self, file, target_format):
        """
        Builds an Extraction for a file by chaining the controllers:
        content extraction, query string generation, search and reference
        extraction. The extracted entries are validated against the text of
        the file.

        Returns the Extraction. It is returned partially filled as soon as
        a step yields no text, no query strings or no top results.
        """
        outcome = Extraction()
        outcome.file_path = file
        outcome.target_format = target_format

        log.info("Making reference for file: %s" % file) #@UndefinedVariable

        # Step 1: raw text of the file
        content_controller = RCEController(self.factory)
        raw_text = content_controller.extract_content(file, FileFormat.TXT)
        if not raw_text:
            return outcome

        # Step 2: query strings taken from the text
        outcome.query_strings = content_controller.get_query_strings(raw_text)
        if not outcome.query_strings:
            log.error('No query strings extracted') #@UndefinedVariable
            return outcome
        log.debug("Query strings %s" % str(outcome.query_strings)) #@UndefinedVariable

        # Step 3: search results for the query strings
        search_controller = IRController(self.factory)
        found = search_controller.get_top_results(outcome.query_strings)
        outcome.top_results, outcome.used_query = found
        if not outcome.top_results:
            log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                      'after trying %d queries' %
                      len(outcome.query_strings))
            return outcome
        # Only the queries that were not used stay in the list
        outcome.query_strings.remove(outcome.used_query)

        log.debug("Used query %s" % str(outcome.used_query)) #@UndefinedVariable
        log.debug("Query returned %d top results" % len(outcome.top_results)) #@UndefinedVariable

        # Step 4: reference entries from the results
        extraction_controller = IEController(self.factory, target_format)
        extracted = extraction_controller.extract_reference(
            outcome.top_results, raw_text)
        outcome.entries, outcome.used_result = extracted
        # Only the results that were not used stay in the list
        outcome.top_results.remove(outcome.used_result)
        log.info("Used result: %s" % str(outcome.used_result)) #@UndefinedVariable

        # Step 5: validation of every entry
        checker = ReferenceValidator(FIELD_WEIGHTS)
        for entry in outcome.entries:
            checker.validate(entry, raw_text)

        return outcome
Ejemplo n.º 14
0
 def _format_reference(self, reference):
     """
     Formats a reference with the target format.
     """
     if reference.format == self.format:
         return
     
     # Create a formatter and a generator
     formatter = ReferenceFormatter()
     try:
         generator = self.util_factory.create_generator(self.format)
     except UtilCreationError as e:
         log.error('Could not create a formatter for %s: %s' % #@UndefinedVariable
                   (self.format, e.args))
         return
     
     formatter.format_reference(reference, generator)
Ejemplo n.º 15
0
    def _format_reference(self, reference):
        """
        Formats a reference with the target format.
        """
        if reference.format == self.format:
            return

        # Create a formatter and a generator
        formatter = ReferenceFormatter()
        try:
            generator = self.util_factory.create_generator(self.format)
        except UtilCreationError as e:
            log.error('Could not create a formatter for %s: %s'
                      %  #@UndefinedVariable
                      (self.format, e.args))
            return

        formatter.format_reference(reference, generator)
Ejemplo n.º 16
0
 def _get_content(self, url):
     """
     This method looks for the content of an example's URL. In order not to
     overload the server, it sleeps for some time between multiple calls.

     Returns the cleaned content of the page, or None if the page could
     not be retrieved. 'self.last_request' is set to the current time
     after every call.
     """
     elapsed = datetime.now() - self.last_request
     # timedelta.seconds is only the seconds component (0 to 86399) and
     # ignores whole days, so days are added explicitly. A negative
     # interval (clock moved backwards) is treated as no time elapsed so
     # that the wait never exceeds the configured interval.
     elapsed_seconds = max(0, elapsed.days * 86400 + elapsed.seconds)
     time_to_sleep = self.seconds_between_requests - elapsed_seconds
     if time_to_sleep > 0:
         sleep(time_to_sleep)

     content = None
     try:
         content = Browser().get_page(url)
         content = ContentCleaner().clean_content(content)
     except BrowserError as e:
         log.error('Error retrieving page %s: %s' % (url, #@UndefinedVariable
                                                     e.error))
     self.last_request = datetime.now()
     return content
Ejemplo n.º 17
0
 def format_reference(self, reference_id):
     """
     Loads the reference with the given id and formats it with a generator
     for 'self.format'.
     Returns the entry of the reference, or None if the reference cannot
     be found or the generator cannot be created.
     """
     log.debug('Retrieving reference from the database') #@UndefinedVariable
     reference = self.reference_gw.find_reference_by_id(reference_id)
     if not reference:
         message = 'Reference with id %d could not be retrieved' % reference_id
         log.error(message) #@UndefinedVariable
         return None

     formatter = ReferenceFormatter()
     try:
         generator = self.util_factory.create_generator(self.format)
     except UtilCreationError as e:
         details = (self.format, e.args)
         log.error('Could not create a formatter for %s: %s' % details) #@UndefinedVariable
         return None
     else:
         log.debug('Starting to format') #@UndefinedVariable
         formatter.format_reference(reference, generator)

     return reference.entry
Ejemplo n.º 18
0
    def get_top_results(self, query_strings, engine=ENGINE):
        """
        Runs the query strings through a searcher until one of them yields
        usable results.

        Queries are tried in order. The search stops at the first query
        that still has results after sorting, or at the first SearchError.
        Queries whose number of results reaches 'self.too_many_results'
        are discarded.

        If the searcher cannot be created an empty list is returned.
        """
        results = []

        # Get a searcher
        try:
            searcher = self.util_factory.create_searcher(engine)
        except UtilCreationError as e:
            # Format the exception itself: applying '%' directly to e.args
            # raises TypeError unless the tuple has exactly one element
            log.error('Could not create a searcher: %s' % (e,))  #@UndefinedVariable
            return results

        # Search the query strings
        for query in query_strings:
            searcher.set_query(query)
            try:
                log.debug('Searching query %s' % (query))  #@UndefinedVariable
                results = searcher.get_results()
            except SearchError as e:
                log.error(e.error)  #@UndefinedVariable
                break

            if searcher.num_results >= self.too_many_results:
                log.debug(  #@UndefinedVariable
                    'Search with query %s yielded too many results '
                    '(%d or more)' % (query, self.too_many_results))
                results = []
                continue

            if results:
                log.info(  #@UndefinedVariable
                    'Searcher yielded the following results using '
                    'query %s' % (query))
                for result in results:
                    log.info('    %s' % result.url[:120])  #@UndefinedVariable
                results = self._sort_results(results)

            # Checked again because sorting replaces the list
            if results:
                break
Ejemplo n.º 19
0
    def extract_reference(self, top_results, raw_text):
        """
        Returns a list of References if they can be extracted or an empty
        list otherwise.
        A single publication may need more than a reference (e.g: inproceedings
        and its proceedings)

        Every result is tried in order, first with the reference wrappers
        and then with the rule wrappers, until one of them yields
        references. Results whose page cannot be retrieved are skipped.

        Returns a tuple (references, result) where 'result' is the last
        result that was tried, or None when 'top_results' is empty.
        """

        log.info('Using %d top results' %
                 len(top_results))  #@UndefinedVariable
        references = []
        # Defined up front so that the final return does not fail when
        # there are no results to iterate over
        result = None
        for result in top_results:
            try:
                log.debug('Retrieving page for result %s' %  #@UndefinedVariable
                          result.url)
                page = self.browser.get_page(result.url)
            except BrowserError as e:
                log.error('Error retrieving page %s: %s' % (  #@UndefinedVariable
                    result.url,
                    e.error))
                continue

            page = ContentCleaner().clean_content(page)

            references = self._use_reference_wrappers(result.base_url, page,
                                                      raw_text)
            if not references:
                references = self._use_rule_wrappers(result.base_url, page,
                                                     raw_text)

            if references:
                break

        # Convert to target format, if necessary
        for reference in references:
            self._format_reference(reference)

        # Return the extracted reference and the result that has been used
        return (references, result)
Ejemplo n.º 20
0
    def generate_wrappers(self, url):
        """
        Trains and persists wrappers for every example set obtained for a
        URL.

        The 'author' and 'editor' sets are trained with the multi-value
        rulers. Every other set uses a PathRuler built with the entry of
        'self.value_guides' for that set ('.*' when there is none) and a
        RegexRuler. Errors while training or persisting a set are logged
        and the remaining sets are still processed.
        """
        wrapper_manager = WrapperGateway()
        example_manager = ExampleGateway(
            max_examples=self.max_examples,
            max_examples_from_db=self.max_examples_from_db,
            seconds_between_requests=self.secs_between_reqs)
        example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                    url, self.min_validity)

        for set_name in example_sets:
            log.info('Starting wrapper training for set "%s"' %  #@UndefinedVariable
                     set_name)

            if set_name in ('author', 'editor'):
                rulers = [
                    MultiValuePathRuler(),
                    SeparatorsRegexRuler(),
                    ElementsRegexRuler(),
                    PersonRuler()
                ]
            else:
                try:
                    value_guide = self.value_guides[set_name]
                except KeyError:
                    value_guide = '.*'
                rulers = [PathRuler(value_guide), RegexRuler()]

            trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
            try:
                wrappers = trainer.train(example_sets[set_name])
                wrappers = self._prune_wrappers(wrappers)
                wrapper_manager.persist_wrappers(url, set_name, wrappers)
                log.info('Trainer generated %d wrappers' %  #@UndefinedVariable
                         len(wrappers))
            except Exception as e:
                log.error('Error training wrapper for set "%s": %s' %  #@UndefinedVariable
                          (set_name, e))
Ejemplo n.º 21
0
 def get_top_results(self, query_strings, engine=ENGINE):
     """
     Runs the query strings through a searcher until one of them yields
     usable results.

     Queries are tried in order. The search stops at the first query that
     still has results after sorting, or at the first SearchError. Queries
     whose number of results reaches 'self.too_many_results' are
     discarded.

     If the searcher cannot be created an empty list is returned.
     """
     results = []

     # Get a searcher
     try:
         searcher = self.util_factory.create_searcher(engine)
     except UtilCreationError as e:
         # Format the exception itself: applying '%' directly to e.args
         # raises TypeError unless the tuple has exactly one element
         log.error('Could not create a searcher: %s' % (e,)) #@UndefinedVariable
         return results

     # Search the query strings
     for query in query_strings:
         searcher.set_query(query)
         try:
             log.debug('Searching query %s' % (query)) #@UndefinedVariable
             results = searcher.get_results()
         except SearchError as e:
             log.error(e.error) #@UndefinedVariable
             break

         if searcher.num_results >= self.too_many_results:
             log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                       '(%d or more)' % (query, self.too_many_results))
             results = []
             continue

         if results:
             log.info('Searcher yielded the following results using ' #@UndefinedVariable
                      'query %s' % (query))
             for result in results:
                 log.info('    %s' % result.url[:120]) #@UndefinedVariable
             results = self._sort_results(results)

         # Checked again because sorting replaces the list
         if results:
             break
Ejemplo n.º 22
0
    def format_reference(self, reference_id):
        """
        Loads the reference with the given id and formats it with a
        generator for 'self.format'.
        Returns the entry of the reference, or None if the reference cannot
        be found or the generator cannot be created.
        """
        log.debug('Retrieving reference from the database')  #@UndefinedVariable
        reference = self.reference_gw.find_reference_by_id(reference_id)
        if not reference:
            not_found = ('Reference with id %d could not be retrieved'
                         % reference_id)
            log.error(not_found)  #@UndefinedVariable
            return None

        formatter = ReferenceFormatter()
        generator = None
        try:
            generator = self.util_factory.create_generator(self.format)
        except UtilCreationError as e:
            no_generator = ('Could not create a formatter for %s: %s' %
                            (self.format, e.args))
            log.error(no_generator)  #@UndefinedVariable
            return None

        log.debug('Starting to format')  #@UndefinedVariable
        formatter.format_reference(reference, generator)
        return reference.entry
Ejemplo n.º 23
0
 def extract_reference(self, top_results, raw_text):
     """
     Returns a list of References if they can be extracted or an empty
     list otherwise.
     A single publication may need more than a reference (e.g: inproceedings
     and its proceedings)

     Every result is tried in order, first with the reference wrappers and
     then with the rule wrappers, until one of them yields references.
     Results whose page cannot be retrieved are skipped.

     Returns a tuple (references, result) where 'result' is the last
     result that was tried, or None when 'top_results' is empty.
     """

     log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
     references = []
     # Defined up front so that the final return does not fail when there
     # are no results to iterate over
     result = None
     for result in top_results:
         try:
             log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
             page = self.browser.get_page(result.url)
         except BrowserError as e:
             log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                         e.error))
             continue

         page = ContentCleaner().clean_content(page)

         references = self._use_reference_wrappers(result.base_url, page,
                                                   raw_text)
         if not references:
             references = self._use_rule_wrappers(result.base_url, page,
                                                  raw_text)

         if references:
             break

     # Convert to target format, if necessary
     for reference in references:
         self._format_reference(reference)

     # Return the extracted reference and the result that has been used
     return (references, result)
Ejemplo n.º 24
0
    def _update_rules(self, wrapper):
        """
        Copies the rows of the rules tree into the rules of a wrapper.

        Row N of the tree is matched with the N-th rule the wrapper had
        when the method was called. A row with any empty column removes its
        matching rule. Other rows update the matching rule or, when there
        is none, are added with wrapper.add_rule_by_info(). Rows whose
        order is not an integer or whose pattern is not valid JSON are
        skipped. The last row of the tree is never processed.
        """
        # Number of rules popped so far. Removals shift the remaining rules
        # to the left, so the rule for a row lives at 'index - removed'.
        removed = 0
        for index in range(self.ui.rules.topLevelItemCount() - 1):
            item = self.ui.rules.topLevelItem(index)
            position = index - removed

            log.debug('Updating rule %d' % index)  #@UndefinedVariable

            # Remove empty items
            if ((len(wrapper.rules) > position)
                    and not (item.text(0) and item.text(1) and item.text(2))):
                wrapper.rules.pop(position)
                removed += 1
                continue

            # Skip items whose values cannot be converted
            try:
                rule_type = str(item.text(0))
                pattern = str(item.text(1))
                order = int(str(item.text(2)))
            except (TypeError, ValueError):
                log.error('Error when casting to store to database')  #@UndefinedVariable
                continue

            # Check that the pattern can be converted to a python object.
            # Only the validation matters; the decoded value is not used.
            try:
                simplejson.loads(pattern)
            except ValueError:
                log.debug('Cannot convert pattern %s to Python objects' %  #@UndefinedVariable
                          pattern)
                continue

            # Update or append the rules
            if len(wrapper.rules) > position:
                wrapper.rules[position].rule_type = rule_type
                wrapper.rules[position].pattern = pattern
                wrapper.rules[position].order = order
            else:
                wrapper.add_rule_by_info(rule_type, pattern, order)
Ejemplo n.º 25
0
    def extract(self, input_file):
        """
        Extracts the metadata and content of a file with the external tool
        configured in 'self._pdf_extraction_tool'.

        The output of the tool is read from its standard output, parsed
        with BeautifulSoup and used to fill a new Document.

        Returns the Document.
        Raises ExtractionError if the tool cannot be started or if it
        produces no output.
        """
        input_file = self._check_input_file(input_file)
        # Extraction command and its options. They may be parametrized in the
        # future.
        # NOTE(review): '-f 1 -l 2' presumably limits the extraction to the
        # first two pages - confirm against the tool's documentation.
        command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
                   '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
        try:
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError:
            # Popen signals a tool that cannot be executed with OSError; it
            # does not raise CalledProcessError. There is no process to
            # read from, so fail with the error type this method already
            # uses instead of continuing with an undefined variable.
            log.error ('PDF extraction tool not found') #@UndefinedVariable
            raise ExtractionError('PDF extraction tool not found')

        stdout = process.communicate()[0]
        if not stdout:
            raise ExtractionError('Corrupted file')

        parser = BeautifulSoup(stdout)
        document = Document()
        self._extract_metadata(parser, document)
        self._extract_content(parser, document)

        return document
Ejemplo n.º 26
0
class ReferencesController(Controller):
    """
    Controller that reads reference entries written in a configurable
    format (ReferenceFormat.BIBTEX by default).
    """

    def __init__(self, factory, format=ReferenceFormat.BIBTEX):
        super(ReferencesController, self).__init__(factory)
        self.format = format

    def get_format(self):
        """Returns the reference format currently configured."""
        return self.__format

    def set_format(self, value):
        """Sets the reference format used when creating the parser."""
        self.__format = value

    format = property(get_format, set_format)

    def _parse_entries_file(self, file_path):
        """
        Reads the file described by 'file_path' and parses all the references
        that it may contain.
        Returns a list of Reference instances with the extracted instances.
        If the parser cannot be created or the file cannot be read, the error
        is logged and the empty list is returned.
        """
        references = []
        try:
            self.parser = self.util_factory.create_parser(self.format)
        except UtilCreationError as e:
            log.error('Error creating parser for format %s: %s'
                      %  #@UndefinedVariable 
                      (str(self.format), str(e)))
            return references

        # The 'with' block closes the handle even when read() fails, and the
        # local name no longer shadows the 'file' builtin.
        try:
            with open(file_path, 'r') as entries_file:
                content = entries_file.read()
        except Exception as e:
            log.error(
                'Error reading entries file %s: %s' %  #@UndefinedVariable
                (file_path, str(e)))
            return references
Ejemplo n.º 27
0
    def _use_reference_wrappers(self, source, page, raw_text):
        """
        Use a reference wrapper to get the reference from a given page.
        Returns a list of References with the full entry, format and a 
        structure with the different fields.
        A single publication may need more than a reference (e.g: inproceedings
        and its proceedings)

        The list is empty when no entry is found, when no parser can be
        created for the detected format or when the entry is not in that
        format. If parsing fails midway, the references built so far are
        returned.
        """
        log.info('Attempting to extract reference with a reference wrapper'
                 )  #@UndefinedVariable
        references = []
        entry, format = ReferenceWrapper().extract_info(source, page)
        if not entry:
            log.debug('Could not find any entry using a reference wrapper'
                      )  #@UndefinedVariable
            return references

        # Create a parser for the given reference format
        try:
            parser = self.util_factory.create_parser(format)
        except UtilCreationError as e:
            log.error('Could not create a parser for %s: %s' % (
                format,  #@UndefinedVariable
                e.args))
            return references

        if not parser.check_format(entry):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing extracted entries')  #@UndefinedVariable
        try:
            # A separate loop name keeps the full extracted 'entry' intact
            # instead of rebinding it to each individual piece.
            for single_entry in parser.split_source(entry):
                fields = parser.parse_entry(single_entry)
                reference = Reference(fields, format, single_entry)
                self._validate_reference_fields(reference, raw_text)
                references.append(reference)
        except Exception as e:
            # Best effort: log the failure and keep what was parsed so far.
            log.error('Error parsing extracted entry: %s ' %
                      e)  #@UndefinedVariable

        # Every path hands back the list, as the early exits already do.
        return references
Ejemplo n.º 28
0
    def extract_content(self, file, target_format):
        """
        Extracts the content of 'file' converted to 'target_format'.

        The source format is obtained from FileFormat and an extractor for
        the (source, target) pair is requested from the utility factory.
        Returns the 'content' attribute of the extraction result, or None if
        no extractor could be created or the extraction failed. Failures are
        logged, not raised.
        """
        content = None
        source_format = FileFormat().get_format(file)
        
        # Get an extractor
        try:
            extractor = self.util_factory.create_extractor(source_format,
                                                           target_format)
        except UtilCreationError as e:
            # Format the exception itself: using e.args as the '%' operand
            # raises TypeError unless it holds exactly one element.
            log.error('Could not create an extractor: %s' % e) #@UndefinedVariable
            return content

        # Extract content
        try:
            content = extractor.extract(file).content
        except ExtractionError:
            log.error('Could not extract content for file: %s' % file) #@UndefinedVariable
        except IOError as e:
            # IOError usually carries (errno, strerror); see the note above.
            log.error('Error extracting file content: %s' % e) #@UndefinedVariable
        return content
Ejemplo n.º 29
0
 def _use_reference_wrappers(self, source, page, raw_text):
     """
     Use a reference wrapper to get the reference from a given page.
     Returns a list of References with the full entry, format and a 
     structure with the different fields.
     A single publication may need more than a reference (e.g: inproceedings
     and its proceedings)

     The list is empty when no entry is found, when no parser can be
     created for the detected format or when the entry is not in that
     format. If parsing fails midway, the references built so far are
     returned.
     """
     log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
     references = []
     entry, format = ReferenceWrapper().extract_info(source, page)
     if not entry:
         log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
         return references
     
     # Create a parser for the given reference format
     try:
         parser = self.util_factory.create_parser(format)
     except UtilCreationError as e:
         log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                             e.args))
         return references
     
     if not parser.check_format(entry):
         log.error('Given entry is not in %s' % format) #@UndefinedVariable
         return references
     
     # There may be more than one entry for the same file.
     log.debug('Parsing extracted entries') #@UndefinedVariable
     try:
         # A separate loop name keeps the full extracted 'entry' intact
         # instead of rebinding it to each individual piece.
         for single_entry in parser.split_source(entry):
             fields = parser.parse_entry(single_entry)
             reference = Reference(fields, format, single_entry)
             self._validate_reference_fields(reference, raw_text)
             references.append(reference)
     except Exception as e:
         # Best effort: log the failure and keep what was parsed so far.
         log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable

     # Every path hands back the list, as the early exits already do.
     return references
Ejemplo n.º 30
0
    def extract_content(self, file, target_format):
        """
        Extracts the content of 'file' converted to 'target_format'.

        The source format is obtained from FileFormat and an extractor for
        the (source, target) pair is requested from the utility factory.
        Returns the 'content' attribute of the extraction result, or None if
        no extractor could be created or the extraction failed. Failures are
        logged, not raised.
        """
        content = None
        source_format = FileFormat().get_format(file)

        # Get an extractor
        try:
            extractor = self.util_factory.create_extractor(
                source_format, target_format)
        except UtilCreationError as e:
            # Format the exception itself: using e.args as the '%' operand
            # raises TypeError unless it holds exactly one element.
            log.error('Could not create an extractor: %s' %
                      e)  #@UndefinedVariable
            return content

        # Extract content
        try:
            content = extractor.extract(file).content
        except ExtractionError:
            log.error('Could not extract content for file: %s' %
                      file)  #@UndefinedVariable
        except IOError as e:
            # IOError usually carries (errno, strerror); see the note above.
            log.error('Error extracting file content: %s' %
                      e)  #@UndefinedVariable
        return content
Ejemplo n.º 31
0
        try:
            file = open(file_path, 'r')
            content = file.read()
        except Exception, e:
            log.error(
                'Error reading entries file %s: %s' %  #@UndefinedVariable
                (file_path, str(e)))
            return references

        if not content:
            log.info('Empty entries file')  #@UndefinedVariable
            return references

        if not self.parser.check_format(content):
            log.error('Given entry is not in %s' % format)  #@UndefinedVariable
            return references

        # There may be more than one entry for the same file.
        log.debug('Parsing entries')  #@UndefinedVariable

        entries = self.parser.split_source(content)
        for entry in entries:
            fields = self.parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            reference.validity = 1.0
            references.append(reference)
        return references

    def persist_file_references(self, file_path):
        """
Ejemplo n.º 32
0
     
     try:
         file = open(file_path, 'r')
         content = file.read()
     except Exception, e:
         log.error('Error reading entries file %s: %s' % #@UndefinedVariable
                   (file_path, str(e)))
         return references
     
     if not content:
         log.info('Empty entries file') #@UndefinedVariable
         return references
     
     
     if not self.parser.check_format(content):
         log.error('Given entry is not in %s' % format) #@UndefinedVariable
         return references
     
     # There may be more than one entry for the same file.
     log.debug('Parsing entries') #@UndefinedVariable
     
     entries = self.parser.split_source(content)
     for entry in entries:
         fields = self.parser.parse_entry(entry)
         reference = Reference(fields, format, entry)
         reference.validity = 1.0
         references.append(reference)
     return references
     
 def persist_file_references(self, file_path):
     """