def _update_people(self, tree, people, add_method):
    """
    Updates the list of authors or editors of a reference from the rows of
    a tree widget. The list that gets updated is decided by the 'people'
    and 'add_method' parameters.

    tree: widget exposing topLevelItemCount() and topLevelItem(index). The
        first, middle and last name are read from columns 0, 1 and 2.
    people: list updated in place. A row whose three columns are all empty
        removes the matching entry.
    add_method: callable(first_name, middle_name, last_name) used for rows
        that have no matching entry in 'people'.
    """
    # Number of entries removed so far. Each removal shifts the remaining
    # entries one position to the left with respect to the tree rows, so
    # the list position must be corrected to keep rows and entries aligned.
    removed = 0
    for index in range(tree.topLevelItemCount()):
        item = tree.topLevelItem(index)
        position = index - removed

        # Rows with no text at all delete their entry (if they have one)
        if not (item.text(0) or item.text(1) or item.text(2)):
            if len(people) > position:
                people.pop(position)
                removed += 1
            continue

        log.debug("Index: %d Number of fields %d" % (index , len(people))) #@UndefinedVariable
        try:
            first_name = unicode(item.text(0))
            middle_name = unicode(item.text(1))
            last_name = unicode(item.text(2))
        except TypeError as e:
            log.error("Type error when casting to store to database %s" % str(e)) #@UndefinedVariable
            continue

        if len(people) > position:
            # NOTE(review): these attribute names mirror the ones used for
            # fields (name/value/valid); confirm that they are the right
            # ones for a person and not first/middle/last name attributes.
            people[position].name = first_name
            people[position].value = middle_name
            people[position].valid = last_name
        else:
            add_method(first_name, middle_name, last_name)
def _update_fields(self, reference):
    """
    Updates the fields of a reference from the rows of the editor's field
    tree. Columns 0, 1 and 2 hold the name, the value and the validity
    ('True' or 'False') of each field.

    Rows with an empty column remove the matching field, rows whose
    validity is neither 'True' nor 'False' are skipped, and rows without a
    matching field are appended with reference.add_field.
    """
    log.debug('Updating reference') #@UndefinedVariable
    tree = self.editor.fields
    # Number of fields removed so far. Each removal shifts the remaining
    # fields one position to the left with respect to the tree rows.
    removed = 0
    for index in range(tree.topLevelItemCount()):
        item = tree.topLevelItem(index)
        position = index - removed

        # Remove empty items. The row must not be processed any further,
        # otherwise it would overwrite the field that took its position.
        if ((len(reference.fields) > position) and
            not (item.text(0) and item.text(1) and item.text(2))):
            reference.fields.pop(position)
            removed += 1
            continue

        # Skip items that have an invalid status
        if not (item.text(2) == 'True' or item.text(2) == 'False'):
            continue

        log.debug('Index: %d Number of fields %d' % (index , len(reference.fields))) #@UndefinedVariable
        try:
            name = unicode(item.text(0))
            value = unicode(item.text(1))
            valid = str(item.text(2)) == "True"
        except TypeError as e:
            log.error('Type error when casting to store to database %s' % str(e)) #@UndefinedVariable
            continue

        if len(reference.fields) > position:
            reference.fields[position].name = name
            reference.fields[position].value = value
            reference.fields[position].valid = valid
        else:
            reference.add_field(name, value, valid)
def _do_portal_acm(self, source, page):
    """
    Searches the page for a link to the reference, and then retrieves the
    reference. Returns a tuple with the full reference and its format, or
    (None, None) if the reference cannot be obtained.
    """
    log.info('Using ACM Portal reference wrapper') #@UndefinedVariable
    ref = (None, None)
    anchor = page.find('a', {'onclick': re.compile('popBibTex.cfm')})
    if not anchor:
        return ref

    # Keep the first argument of the 'window.open' call, without the
    # surrounding parentheses, backslashes and single quotes
    jscript = anchor['onclick'].replace('window.open', '').strip('\\()')
    ref_url = jscript.split(',')[0].strip('\'')
    ref_url = source + '/' + ref_url

    try:
        page = BeautifulSoup(self._browser.get_page(ref_url))
    except BrowserError:
        log.error('Browse error while retrieving entry page') #@UndefinedVariable
        return ref

    pre = page.find('pre')
    if not pre:
        return ref

    # A <pre> element without any text yields None
    entry = pre.find(text=True)
    if entry is None:
        return ref

    # As the wrapper has been hardcoded, we already know what will be the
    # format of the reference
    return (entry.strip(), ReferenceFormat.BIBTEX)
def _update_rules(self, wrapper):
    """
    Updates the rules of a wrapper from the rows of the rules tree.
    Columns 0, 1 and 2 hold the rule type, the pattern (a JSON document)
    and the order (an integer).

    Rows with an empty column remove the matching rule, rows that cannot
    be converted are skipped, and rows without a matching rule are added
    with wrapper.add_rule_by_info.
    """
    # NOTE(review): the last row of the tree is never processed; confirm
    # that this is intended (e.g. a placeholder row).
    # Number of rules removed so far. Each removal shifts the remaining
    # rules one position to the left with respect to the tree rows.
    removed = 0
    for index in range(self.ui.rules.topLevelItemCount() - 1):
        item = self.ui.rules.topLevelItem(index)
        position = index - removed
        log.debug('Updating rule %d' % index) #@UndefinedVariable

        # Remove empty items
        if ((len(wrapper.rules) > position) and
            not (item.text(0) and item.text(1) and item.text(2))):
            wrapper.rules.pop(position)
            removed += 1
            continue

        # Skip non-empty items that have an invalid status
        try:
            rule_type = str(item.text(0))
            pattern = str(item.text(1))
            order = int(str(item.text(2)))
        except (TypeError, ValueError):
            log.error('Error when casting to store to database') #@UndefinedVariable
            continue

        # Check that the pattern can be converted to a python object. The
        # result is discarded, the pattern is stored as text.
        try:
            simplejson.loads(pattern)
        except ValueError:
            log.debug('Cannot convert pattern %s to Python objects' % pattern) #@UndefinedVariable
            continue

        # Update or append the rules
        if len(wrapper.rules) > position:
            wrapper.rules[position].rule_type = rule_type
            wrapper.rules[position].pattern = pattern
            wrapper.rules[position].order = order
        else:
            wrapper.add_rule_by_info(rule_type, pattern, order)
def run(self):
    """
    Runs until the stop event is set. Takes files from the input queue and
    supplies them to a 'ReferenceMaker' object. The result of each file is
    put in the output queue; if the extraction fails unexpectedly, an
    empty Extraction is put instead.
    """
    log.debug("Running thread", extra={'threadname':self.getName()}) #@UndefinedVariable
    while not self.stop_event.isSet():
        try:
            # Wait briefly for a file instead of polling the queue in a
            # tight loop, which would keep a CPU busy while idle. The
            # timeout bounds how long it takes to notice the stop event.
            file_path = self.in_queue.get(True, 0.1)
        except Queue.Empty:
            continue

        if not file_path:
            continue

        log.debug("Processing file %s" % file_path) #@UndefinedVariable
        try:
            reference = ReferenceMaker().make_reference(file_path,
                                                        self.target_format)
            self.out_queue.put(reference)
        except Exception as e:
            # Consumers expect one result per file, even on failure
            log.error('Unexpected exception while extracting reference' #@UndefinedVariable
                      ' for file %s: %s' % (file_path, str(e)))
            self.out_queue.put(Extraction())
def run(self):
    """
    Runs until the stop event is set. Takes files from the input queue and
    supplies them to a 'ReferenceMaker' object. The result of each file is
    put in the output queue; if the extraction fails unexpectedly, an
    empty Extraction is put instead.
    """
    log.debug("Running thread", extra={'threadname': self.getName()}) #@UndefinedVariable
    while not self.stop_event.isSet():
        try:
            # Wait briefly for a file instead of polling the queue in a
            # tight loop, which would keep a CPU busy while idle. The
            # timeout bounds how long it takes to notice the stop event.
            file_path = self.in_queue.get(True, 0.1)
        except Queue.Empty:
            continue

        if not file_path:
            continue

        log.debug("Processing file %s" % file_path) #@UndefinedVariable
        try:
            reference = ReferenceMaker().make_reference(file_path,
                                                        self.target_format)
            self.out_queue.put(reference)
        except Exception as e:
            # Consumers expect one result per file, even on failure
            log.error('Unexpected exception while extracting reference' #@UndefinedVariable
                      ' for file %s: %s' % (file_path, str(e)))
            self.out_queue.put(Extraction())
def generate_wrappers(self, url):
    """
    Trains wrappers for every example set obtained for 'url' and persists
    them. Errors raised while training or persisting a set are logged and
    do not stop the processing of the remaining sets.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        # Sets holding people use their own rulers
        if set_name in ('author', 'editor'):
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Sets without a specific guide accept any value
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            trained = trainer.train(example_sets[set_name])
            trained = self._prune_wrappers(trained)
            wrapper_manager.persist_wrappers(url, set_name, trained)
            log.info('Trainer generated %d wrappers' % len(trained)) #@UndefinedVariable
        except Exception as error:
            log.error('Error training wrapper for set "%s": %s' % (set_name, error)) #@UndefinedVariable
def _get_content_elements(self, value, content):
    """
    Looks in the content to find the elements that contain the desired
    value. Raises a ValueError exception if the content does not contain
    the value.
    """
    # NOTE(review): only the lookup is visible here; the code that raises
    # ValueError and returns the elements is not part of this excerpt.
    try:
        # 'value' is compiled as a regular expression as it is given
        elements = content.findAll(True, text=re.compile(value))
    except NameError, e:
        log.error("Example's content is not an HTML document: %s" % e) #@UndefinedVariable
        elements = []
def _apply_single_input(self, input): log.debug('Applying RegexRule with pattern %s' % self.pattern) #@UndefinedVariable try: regex = re.compile(self.pattern) input = input.strip() matches = re.search(regex, input) except Exception, e: log.error('Exception applying RegexRule with pattern %s: %s' #@UndefinedVariable % (self.pattern, e)) return ''
def done(self, status):
    """
    Asks for a destination file and saves the text of the entries editor
    to it. Nothing is saved if no file is chosen. Errors raised while
    saving are logged.

    status: not used by this method.
    """
    path = QtGui.QFileDialog.getSaveFileName(self,
                                             caption='Save references to file',
                                             filter='BibTeX (*.bib)')
    if not path:
        return

    log.debug('Saving to file: %s' % path) #@UndefinedVariable
    try:
        # The context manager closes the file even if the write fails
        with open(path, 'w') as output:
            output.write(unicode(self.page01.ui.entriesEdit.toPlainText()))
    except Exception as e:
        log.error('Error saving references to %s' % e) #@UndefinedVariable
def _parse_entries_file(self, file_path):
    """
    Reads the file described by 'file_path' and parses all the references
    that it may contain. Returns a list of Reference instances with the
    extracted instances.
    """
    # NOTE(review): only the creation of the parser is visible here; the
    # reading and parsing of the file are not part of this excerpt.
    references = []

    # The parser is kept in the instance. An empty list is returned if it
    # cannot be created for the current format.
    try:
        self.parser = self.util_factory.create_parser(self.format)
    except UtilCreationError, e:
        log.error('Error creating parser for format %s: %s' % #@UndefinedVariable
                  (str(self.format), str(e)))
        return references
def make_reference(self, file, target_format):
    """
    Builds an Extraction for a file in three steps: the content of the
    file is extracted to get some query strings, the queries are used to
    retrieve results from a search engine, and the references are
    extracted from those results and validated.

    The Extraction is returned as it is at the point where a step yields
    nothing to continue with.
    """
    extraction = Extraction()
    extraction.file_path = file
    extraction.target_format = target_format
    log.info("Making reference for file: %s" % file) #@UndefinedVariable

    # Step 1: content and query strings
    content_controller = RCEController(self.factory)
    raw_text = content_controller.extract_content(file, FileFormat.TXT)
    if not raw_text:
        return extraction

    extraction.query_strings = content_controller.get_query_strings(raw_text)
    if not extraction.query_strings:
        log.error('No query strings extracted') #@UndefinedVariable
        return extraction
    log.debug("Query strings %s" % str(extraction.query_strings)) #@UndefinedVariable

    # Step 2: search results
    search_controller = IRController(self.factory)
    found = search_controller.get_top_results(extraction.query_strings)
    extraction.top_results, extraction.used_query = found
    if not extraction.top_results:
        log.error('No top results to use with the available wrappers ' #@UndefinedVariable
                  'after trying %d queries' % len(extraction.query_strings))
        return extraction

    # The query that has been used is not kept among the pending ones
    extraction.query_strings.remove(extraction.used_query)
    log.debug("Used query %s" % str(extraction.used_query)) #@UndefinedVariable
    log.debug("Query returned %d top results" % len(extraction.top_results)) #@UndefinedVariable

    # Step 3: reference extraction
    extraction_controller = IEController(self.factory, target_format)
    extracted = extraction_controller.extract_reference(
        extraction.top_results, raw_text)
    extraction.entries, extraction.used_result = extracted

    # The result that has been used is not kept among the pending ones
    extraction.top_results.remove(extraction.used_result)
    log.info("Used result: %s" % str(extraction.used_result)) #@UndefinedVariable

    validator = ReferenceValidator(FIELD_WEIGHTS)
    for entry in extraction.entries:
        validator.validate(entry, raw_text)

    return extraction
def _format_reference(self, reference):
    """
    Converts a reference to the target format. References that already
    are in the target format are left untouched. If no generator can be
    created for the target format, the error is logged and the reference
    is not modified.
    """
    if reference.format == self.format:
        return

    # A formatter and a generator are needed for the conversion
    formatter = ReferenceFormatter()
    try:
        generator = self.util_factory.create_generator(self.format)
    except UtilCreationError as error:
        log.error('Could not create a formatter for %s: %s' % #@UndefinedVariable
                  (self.format, error.args))
    else:
        formatter.format_reference(reference, generator)
def _get_content(self, url):
    """
    Retrieves and cleans the content of an example's URL. In order not to
    overload the server, it sleeps until 'seconds_between_requests' have
    passed since the last request.

    Returns the cleaned content, or None if the page cannot be retrieved.
    """
    # The 'seconds' attribute of a timedelta does not include its days nor
    # its microseconds, so the full elapsed time is computed from all the
    # components.
    elapsed = datetime.now() - self.last_request
    elapsed_seconds = (elapsed.days * 86400 + elapsed.seconds +
                       elapsed.microseconds / 1000000.0)
    time_to_sleep = self.seconds_between_requests - elapsed_seconds

    if time_to_sleep > 0:
        sleep(time_to_sleep)

    content = None
    try:
        content = Browser().get_page(url)
        content = ContentCleaner().clean_content(content)
    except BrowserError as e:
        log.error('Error retrieving page %s: %s' % (url, #@UndefinedVariable
                                                    e.error))

    # Failed requests also count for the waiting time
    self.last_request = datetime.now()
    return content
def format_reference(self, reference_id):
    """
    Retrieves the reference with the given id and formats it with the
    format of the controller.

    Returns the entry of the reference, or None if the reference cannot be
    retrieved or a generator cannot be created for the format.
    """
    log.debug('Retrieving reference from the database') #@UndefinedVariable
    reference = self.reference_gw.find_reference_by_id(reference_id)
    if not reference:
        log.error('Reference with id %d could not be retrieved' #@UndefinedVariable
                  % reference_id)
        return None

    formatter = ReferenceFormatter()
    try:
        generator = self.util_factory.create_generator(self.format)
    except UtilCreationError as error:
        log.error('Could not create a formatter for %s: %s' % #@UndefinedVariable
                  (self.format, error.args))
        return None
    else:
        log.debug('Starting to format') #@UndefinedVariable
        formatter.format_reference(reference, generator)

    return reference.entry
def get_top_results(self, query_strings, engine=ENGINE):
    """
    Returns a list of search results.

    The query strings are searched in order until one of them yields
    results. Queries that yield 'too_many_results' results or more are
    discarded.
    """
    # NOTE(review): the end of the method (its return statement) is not
    # part of this excerpt, and the nesting of the statements below has
    # been reconstructed from a flattened line; check it against the file.
    results = []

    # Get a searcher
    try:
        searcher = self.util_factory.create_searcher(engine)
    except UtilCreationError as e:
        # NOTE(review): formatting with 'e.args' fails unless the tuple
        # has exactly one element.
        log.error('Could not create a searcher: %s' % e.args) #@UndefinedVariable
        return results

    # Search the query strings
    for query in query_strings:
        searcher.set_query(query)
        try:
            log.debug('Searching query %s' % (query)) #@UndefinedVariable
            results = searcher.get_results()
        except SearchError, e:
            # A search error stops the processing of the remaining queries
            log.error(e.error) #@UndefinedVariable
            break

        if searcher.num_results >= self.too_many_results:
            log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                      '(%d or more)' % (query, self.too_many_results))
            results = []
            continue

        if results:
            log.info('Searcher yielded the following results using ' #@UndefinedVariable
                     'query %s' % (query))
            for result in results:
                log.info(' %s' % result.url[:120]) #@UndefinedVariable

        results = self._sort_results(results)

        # Stop at the first query that leaves results after sorting
        if results:
            break
def extract_reference(self, top_results, raw_text):
    """
    Tries to extract the references of a publication from the pages of
    the top results. A single publication may need more than a reference
    (e.g: inproceedings and its proceedings).

    Results are tried in order, first with the reference wrappers and then
    with the rule wrappers, until one of them yields references.

    Returns a tuple (references, result) with the list of References
    (empty if they cannot be extracted) and the last result that has been
    tried (None if there are no top results).
    """
    log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
    references = []
    # Defined beforehand so it can be returned when there are no results
    result = None

    for result in top_results:
        try:
            log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
            page = self.browser.get_page(result.url)
        except BrowserError as e:
            log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                        e.error))
            continue

        page = ContentCleaner().clean_content(page)

        references = self._use_reference_wrappers(result.base_url, page,
                                                  raw_text)
        if not references:
            references = self._use_rule_wrappers(result.base_url, page,
                                                 raw_text)
        if references:
            break

    # Convert to target format, if necessary
    for reference in references:
        self._format_reference(reference)

    # Return the extracted reference and the result that has been used
    return (references, result)
def generate_wrappers(self, url):
    """
    Trains wrappers for every example set obtained for 'url' and persists
    them. Errors raised while training or persisting a set are logged and
    do not stop the processing of the remaining sets.
    """
    wrapper_manager = WrapperGateway()
    example_manager = ExampleGateway(
        max_examples=self.max_examples,
        max_examples_from_db=self.max_examples_from_db,
        seconds_between_requests=self.secs_between_reqs)
    example_sets = example_manager.get_examples(self.wrapper_gen_examples,
                                                url, self.min_validity)

    for set_name in example_sets:
        log.info('Starting wrapper training for set "%s"' % set_name) #@UndefinedVariable

        # Sets holding people use their own rulers
        if set_name in ('author', 'editor'):
            rulers = [MultiValuePathRuler(),
                      SeparatorsRegexRuler(),
                      ElementsRegexRuler(),
                      PersonRuler()]
        else:
            # Sets without a specific guide accept any value
            try:
                value_guide = self.value_guides[set_name]
            except KeyError:
                value_guide = '.*'
            rulers = [PathRuler(value_guide), RegexRuler()]

        trainer = WrapperTrainer(rulers, self.wrapper_gen_examples)
        try:
            trained = trainer.train(example_sets[set_name])
            trained = self._prune_wrappers(trained)
            wrapper_manager.persist_wrappers(url, set_name, trained)
            log.info('Trainer generated %d wrappers' % len(trained)) #@UndefinedVariable
        except Exception as error:
            log.error('Error training wrapper for set "%s": %s' % (set_name, error)) #@UndefinedVariable
def get_top_results(self, query_strings, engine=ENGINE):
    """
    Returns a list of search results.

    The query strings are searched in order until one of them yields
    results. Queries that yield 'too_many_results' results or more are
    discarded.
    """
    # NOTE(review): the end of the method (its return statement) is not
    # part of this excerpt, and the nesting of the statements below has
    # been reconstructed from a flattened line; check it against the file.
    results = []

    # Get a searcher
    try:
        searcher = self.util_factory.create_searcher(engine)
    except UtilCreationError as e:
        # NOTE(review): formatting with 'e.args' fails unless the tuple
        # has exactly one element.
        log.error('Could not create a searcher: %s' % e.args) #@UndefinedVariable
        return results

    # Search the query strings
    for query in query_strings:
        searcher.set_query(query)
        try:
            log.debug('Searching query %s' % (query)) #@UndefinedVariable
            results = searcher.get_results()
        except SearchError, e:
            # A search error stops the processing of the remaining queries
            log.error(e.error) #@UndefinedVariable
            break

        if searcher.num_results >= self.too_many_results:
            log.debug('Search with query %s yielded too many results ' #@UndefinedVariable
                      '(%d or more)' % (query, self.too_many_results))
            results = []
            continue

        if results:
            log.info('Searcher yielded the following results using ' #@UndefinedVariable
                     'query %s' % (query))
            for result in results:
                log.info(' %s' % result.url[:120]) #@UndefinedVariable

        results = self._sort_results(results)

        # Stop at the first query that leaves results after sorting
        if results:
            break
def format_reference(self, reference_id):
    """
    Retrieves the reference with the given id and formats it with the
    format of the controller.

    Returns the entry of the reference, or None if the reference cannot be
    retrieved or a generator cannot be created for the format.
    """
    log.debug('Retrieving reference from the database') #@UndefinedVariable
    reference = self.reference_gw.find_reference_by_id(reference_id)
    if not reference:
        log.error('Reference with id %d could not be retrieved' #@UndefinedVariable
                  % reference_id)
        return None

    formatter = ReferenceFormatter()
    try:
        generator = self.util_factory.create_generator(self.format)
    except UtilCreationError as error:
        log.error('Could not create a formatter for %s: %s' % #@UndefinedVariable
                  (self.format, error.args))
        return None
    else:
        log.debug('Starting to format') #@UndefinedVariable
        formatter.format_reference(reference, generator)

    return reference.entry
def extract_reference(self, top_results, raw_text):
    """
    Tries to extract the references of a publication from the pages of
    the top results. A single publication may need more than a reference
    (e.g: inproceedings and its proceedings).

    Results are tried in order, first with the reference wrappers and then
    with the rule wrappers, until one of them yields references.

    Returns a tuple (references, result) with the list of References
    (empty if they cannot be extracted) and the last result that has been
    tried (None if there are no top results).
    """
    log.info('Using %d top results' % len(top_results)) #@UndefinedVariable
    references = []
    # Defined beforehand so it can be returned when there are no results
    result = None

    for result in top_results:
        try:
            log.debug('Retrieving page for result %s' % result.url) #@UndefinedVariable
            page = self.browser.get_page(result.url)
        except BrowserError as e:
            log.error('Error retrieving page %s: %s' % (result.url, #@UndefinedVariable
                                                        e.error))
            continue

        page = ContentCleaner().clean_content(page)

        references = self._use_reference_wrappers(result.base_url, page,
                                                  raw_text)
        if not references:
            references = self._use_rule_wrappers(result.base_url, page,
                                                 raw_text)
        if references:
            break

    # Convert to target format, if necessary
    for reference in references:
        self._format_reference(reference)

    # Return the extracted reference and the result that has been used
    return (references, result)
def _update_rules(self, wrapper):
    """
    Updates the rules of a wrapper from the rows of the rules tree.
    Columns 0, 1 and 2 hold the rule type, the pattern (a JSON document)
    and the order (an integer).

    Rows with an empty column remove the matching rule, rows that cannot
    be converted are skipped, and rows without a matching rule are added
    with wrapper.add_rule_by_info.
    """
    # NOTE(review): the last row of the tree is never processed; confirm
    # that this is intended (e.g. a placeholder row).
    # Number of rules removed so far. Each removal shifts the remaining
    # rules one position to the left with respect to the tree rows.
    removed = 0
    for index in range(self.ui.rules.topLevelItemCount() - 1):
        item = self.ui.rules.topLevelItem(index)
        position = index - removed
        log.debug('Updating rule %d' % index) #@UndefinedVariable

        # Remove empty items
        if ((len(wrapper.rules) > position) and
            not (item.text(0) and item.text(1) and item.text(2))):
            wrapper.rules.pop(position)
            removed += 1
            continue

        # Skip non-empty items that have an invalid status
        try:
            rule_type = str(item.text(0))
            pattern = str(item.text(1))
            order = int(str(item.text(2)))
        except (TypeError, ValueError):
            log.error('Error when casting to store to database') #@UndefinedVariable
            continue

        # Check that the pattern can be converted to a python object. The
        # result is discarded, the pattern is stored as text.
        try:
            simplejson.loads(pattern)
        except ValueError:
            log.debug('Cannot convert pattern %s to Python objects' % pattern) #@UndefinedVariable
            continue

        # Update or append the rules
        if len(wrapper.rules) > position:
            wrapper.rules[position].rule_type = rule_type
            wrapper.rules[position].pattern = pattern
            wrapper.rules[position].order = order
        else:
            wrapper.add_rule_by_info(rule_type, pattern, order)
def extract(self, input_file):
    """
    Extracts the metadata and the content of the first two pages of a PDF
    file using the external extraction tool.

    Returns a Document filled with the extracted information.
    Raises ExtractionError if the tool cannot be executed or if it does
    not produce any output.
    """
    input_file = self._check_input_file(input_file)

    # Extraction command and its options. They may be parametrized in the
    # future
    command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
               '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
    try:
        pop = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError:
        # Popen reports a tool that cannot be launched with OSError (it
        # never raises CalledProcessError). There is no process to read
        # from, so the failure is reported to the caller.
        log.error ('PDF extraction tool not found') #@UndefinedVariable
        raise ExtractionError('PDF extraction tool not found')

    stdout = pop.communicate()[0]
    if not stdout:
        raise ExtractionError('Corrupted file')

    parser = BeautifulSoup(stdout)
    document = Document()
    self._extract_metadata(parser, document)
    self._extract_content(parser, document)

    return document
class ReferencesController(Controller):
    """
    Controller for the references, working with a given reference format
    (BibTeX by default).
    """
    # NOTE(review): the class is cut off in this excerpt; the last visible
    # method stops after reading the entries file.

    def __init__(self, factory, format=ReferenceFormat.BIBTEX):
        super(ReferencesController, self).__init__(factory)
        self.format = format

    def get_format(self):
        return self.__format

    def set_format(self, value):
        self.__format = value

    # Format of the references, stored in a private attribute
    format = property(get_format, set_format)

    def _parse_entries_file(self, file_path):
        """
        Reads the file described by 'file_path' and parses all the
        references that it may contain. Returns a list of Reference
        instances with the extracted instances.
        """
        references = []

        # The parser is kept in the instance. An empty list is returned if
        # it cannot be created for the current format.
        try:
            self.parser = self.util_factory.create_parser(self.format)
        except UtilCreationError, e:
            log.error('Error creating parser for format %s: %s' % #@UndefinedVariable
                      (str(self.format), str(e)))
            return references

        # NOTE(review): the file is not closed in the visible code.
        try:
            file = open(file_path, 'r')
            content = file.read()
        except Exception, e:
            log.error('Error reading entries file %s: %s' % #@UndefinedVariable
                      (file_path, str(e)))
            return references
def _use_reference_wrappers(self, source, page, raw_text):
    """
    Use a reference wrapper to get the reference from a given page.
    Returns a list of References with the full entry, format and a
    structure with the different fields.
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    # NOTE(review): the end of the method (the return of the references
    # after parsing them) is not part of this excerpt.
    log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
    references = []
    entry, format = ReferenceWrapper().extract_info(source, page)
    if not entry:
        log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
        return references

    # Create a parser for the given reference format
    try:
        parser = self.util_factory.create_parser(format)
    except UtilCreationError as e:
        log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                            e.args))
        return references

    if not parser.check_format(entry):
        log.error('Given entry is not in %s' % format) #@UndefinedVariable
        return references

    # There may be more than one entry for the same file.
    log.debug('Parsing extracted entries') #@UndefinedVariable
    try:
        entries = parser.split_source(entry)
        # 'entry' is reused for each one of the split entries
        for entry in entries:
            fields = parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            self._validate_reference_fields(reference, raw_text)
            references.append(reference)
    except Exception, e:
        # References parsed before the error are kept in the list
        log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable
def extract_content(self, file, target_format):
    """
    Extracts the content of a file in the target format, using an
    extractor for the format of the file.

    Returns the extracted content, or None if the extractor cannot be
    created or the extraction fails.
    """
    content = None
    source_format = FileFormat().get_format(file)

    # Get an extractor
    try:
        extractor = self.util_factory.create_extractor(source_format,
                                                       target_format)
    except UtilCreationError as e:
        # The exception is formatted instead of its 'args' tuple, as the
        # latter fails unless the tuple has exactly one element
        log.error('Could not create an extractor: %s' % (e,)) #@UndefinedVariable
        return content

    # Extract content
    try:
        content = extractor.extract(file).content
    except ExtractionError:
        log.error('Could not extract content for file: %s' % file) #@UndefinedVariable
    except IOError as e:
        # IOError usually carries (errno, message), which cannot be used
        # with a single '%s'
        log.error('Error extracting file content: %s' % (e,)) #@UndefinedVariable

    return content
def _use_reference_wrappers(self, source, page, raw_text):
    """
    Use a reference wrapper to get the reference from a given page.
    Returns a list of References with the full entry, format and a
    structure with the different fields.
    A single publication may need more than a reference (e.g:
    inproceedings and its proceedings)
    """
    # NOTE(review): the end of the method (the return of the references
    # after parsing them) is not part of this excerpt.
    log.info('Attempting to extract reference with a reference wrapper') #@UndefinedVariable
    references = []
    entry, format = ReferenceWrapper().extract_info(source, page)
    if not entry:
        log.debug('Could not find any entry using a reference wrapper') #@UndefinedVariable
        return references

    # Create a parser for the given reference format
    try:
        parser = self.util_factory.create_parser(format)
    except UtilCreationError as e:
        log.error('Could not create a parser for %s: %s' % (format, #@UndefinedVariable
                                                            e.args))
        return references

    if not parser.check_format(entry):
        log.error('Given entry is not in %s' % format) #@UndefinedVariable
        return references

    # There may be more than one entry for the same file.
    log.debug('Parsing extracted entries') #@UndefinedVariable
    try:
        entries = parser.split_source(entry)
        # 'entry' is reused for each one of the split entries
        for entry in entries:
            fields = parser.parse_entry(entry)
            reference = Reference(fields, format, entry)
            self._validate_reference_fields(reference, raw_text)
            references.append(reference)
    except Exception, e:
        # References parsed before the error are kept in the list
        log.error('Error parsing extracted entry: %s ' % e) #@UndefinedVariable
def extract_content(self, file, target_format):
    """
    Extracts the content of a file in the target format, using an
    extractor for the format of the file.

    Returns the extracted content, or None if the extractor cannot be
    created or the extraction fails.
    """
    content = None
    source_format = FileFormat().get_format(file)

    # Get an extractor
    try:
        extractor = self.util_factory.create_extractor(source_format,
                                                       target_format)
    except UtilCreationError as e:
        # The exception is formatted instead of its 'args' tuple, as the
        # latter fails unless the tuple has exactly one element
        log.error('Could not create an extractor: %s' % (e,)) #@UndefinedVariable
        return content

    # Extract content
    try:
        content = extractor.extract(file).content
    except ExtractionError:
        log.error('Could not extract content for file: %s' % file) #@UndefinedVariable
    except IOError as e:
        # IOError usually carries (errno, message), which cannot be used
        # with a single '%s'
        log.error('Error extracting file content: %s' % (e,)) #@UndefinedVariable

    return content
try: file = open(file_path, 'r') content = file.read() except Exception, e: log.error( 'Error reading entries file %s: %s' % #@UndefinedVariable (file_path, str(e))) return references if not content: log.info('Empty entries file') #@UndefinedVariable return references if not self.parser.check_format(content): log.error('Given entry is not in %s' % format) #@UndefinedVariable return references # There may be more than one entry for the same file. log.debug('Parsing entries') #@UndefinedVariable entries = self.parser.split_source(content) for entry in entries: fields = self.parser.parse_entry(entry) reference = Reference(fields, format, entry) reference.validity = 1.0 references.append(reference) return references def persist_file_references(self, file_path): """
try: file = open(file_path, 'r') content = file.read() except Exception, e: log.error('Error reading entries file %s: %s' % #@UndefinedVariable (file_path, str(e))) return references if not content: log.info('Empty entries file') #@UndefinedVariable return references if not self.parser.check_format(content): log.error('Given entry is not in %s' % format) #@UndefinedVariable return references # There may be more than one entry for the same file. log.debug('Parsing entries') #@UndefinedVariable entries = self.parser.split_source(content) for entry in entries: fields = self.parser.parse_entry(entry) reference = Reference(fields, format, entry) reference.validity = 1.0 references.append(reference) return references def persist_file_references(self, file_path): """