def extract_text(self, version=None, perform_ocr=False, ln='en'):
    """
    Try what is necessary to extract the textual information of a document.

    The text is looked for first among the original or OCRed files of the
    requested version; if none of them is usable, every file of that
    version is considered.  If still no file is usable, an empty text
    cache file is written and the method returns without raising.

    @param version: the version of the document for which text is required.
        If not specified the text will be retrieved from the last version.
    @type version: integer
    @param perform_ocr: whether to perform OCR.
    @type perform_ocr: bool
    @param ln: a two letter language code to give as a hint to the OCR
        procedure.
    @type ln: string
    @raise InvenioBibDocFileError: in case of error during the conversion.
    @note: the text is extracted and cached (in C{.text;<version>} under
        C{self.basedir}) for later use. Use L{get_text} to retrieve it.
    @note: the C{text_extraction_date} column of C{bibdoc} is refreshed
        only when the extracted version is the latest one.
    """
    from invenio.legacy.websubmit.file_converter import (
        get_best_format_to_extract_text_from,
        convert_file,
        InvenioWebSubmitFileConverterError)
    if version is None:
        version = self.get_latest_version()
    docfiles = self.list_version_files(version)
    # Location of the cached text for this version.
    text_path = os.path.join(self.basedir, '.text;%i' % version)
    ## We try to extract text only from original or OCRed documents.
    filenames = [docfile.get_full_path() for docfile in docfiles
                 if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags]
    try:
        filename = get_best_format_to_extract_text_from(filenames)
    except InvenioWebSubmitFileConverterError:
        ## We fall back on considering all the documents
        filenames = [docfile.get_full_path() for docfile in docfiles]
        try:
            filename = get_best_format_to_extract_text_from(filenames)
        except InvenioWebSubmitFileConverterError:
            # Nothing to extract from: leave an empty cache file behind.
            # The context manager guarantees the handle is closed.
            with open(text_path, 'w') as text_file:
                text_file.write('')
            return
    try:
        convert_file(filename, text_path, '.txt', perform_ocr=perform_ocr, ln=ln)
        if version == self.get_latest_version():
            run_sql("UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s", (self.id, ))
    except InvenioWebSubmitFileConverterError as e:
        register_exception(alert_admin=True, prefix="Error in extracting text from bibdoc %i, version %i" % (self.id, version))
        # Call form of raise: valid on both Python 2 and Python 3.
        raise InvenioBibDocFileError(str(e))
def createRelatedFormats(fullpath, overwrite=True, debug=False):
    """Convert a file into every additional format it is still missing.

    C{fullpath} is decomposed into directory, base name and extension; all
    files in that directory whose name starts with the base name are
    handed to C{get_missing_formats}, and each missing format it reports
    is produced with C{convert_file} next to the original file.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats (accepted
        for interface compatibility; this implementation does not
        consult it)
    @param debug: (bool) when True the file converter logger is switched
        to DEBUG for the duration of the call and progress is traced on
        stderr
    @return: list of the paths to the converted files; conversions failing
        with InvenioWebSubmitFileConverterError are reported through
        C{register_exception} and left out of the list
    """
    def trace(template, *values):
        # Formatting is deferred so nothing is rendered unless debugging.
        if debug:
            print(template % values, file=sys.stderr)

    converter_logger = get_file_converter_logger()
    saved_level = converter_logger.getEffectiveLevel()
    if debug:
        converter_logger.setLevel(DEBUG)
    try:
        createdpaths = []
        basedir, filename, extension = decompose_file(fullpath)
        extension = extension.lower()
        trace("basedir: %s, filename: %s, extension: %s", basedir, filename, extension)
        siblings = glob.glob(os.path.join(basedir, '%s*' % filename))
        trace("filelist: %s", siblings)
        missing_formats = get_missing_formats(siblings)
        trace("missing_formats: %s", missing_formats)
        for source_path, wanted_formats in iteritems(missing_formats):
            trace("... path: %s, formats: %s", source_path, wanted_formats)
            for aformat in wanted_formats:
                trace("...... aformat: %s", aformat)
                target_path = os.path.join(basedir, filename + aformat)
                trace("...... newpath: %s", target_path)
                try:
                    convert_file(source_path, target_path)
                except InvenioWebSubmitFileConverterError as msg:
                    trace("...... Exception: %s", msg)
                    register_exception(alert_admin=True)
                else:
                    createdpaths.append(target_path)
    finally:
        # Only touch the logger level again if we changed it above.
        if debug:
            converter_logger.setLevel(saved_level)
    return createdpaths
def extract_text(self, version=None, perform_ocr=False, ln='en'):
    """
    Try what is necessary to extract the textual information of a document.

    The text is looked for first among the original or OCRed files of the
    requested version; if none of them is usable, every file of that
    version is considered.  If still no file is usable, an empty text
    cache file is written and the method returns without raising.

    @param version: the version of the document for which text is required.
        If not specified the text will be retrieved from the last version.
    @type version: integer
    @param perform_ocr: whether to perform OCR.
    @type perform_ocr: bool
    @param ln: a two letter language code to give as a hint to the OCR
        procedure.
    @type ln: string
    @raise InvenioBibDocFileError: in case of error during the conversion.
    @note: the text is extracted and cached (in C{.text;<version>} under
        C{self.basedir}) for later use. Use L{get_text} to retrieve it.
    @note: the C{text_extraction_date} column of C{bibdoc} is refreshed
        only when the extracted version is the latest one.
    """
    from invenio.legacy.websubmit.file_converter import (
        get_best_format_to_extract_text_from,
        convert_file,
        InvenioWebSubmitFileConverterError)
    if version is None:
        version = self.get_latest_version()
    docfiles = self.list_version_files(version)
    # Location of the cached text for this version.
    text_path = os.path.join(self.basedir, '.text;%i' % version)
    ## We try to extract text only from original or OCRed documents.
    filenames = [docfile.get_full_path() for docfile in docfiles
                 if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags]
    try:
        filename = get_best_format_to_extract_text_from(filenames)
    except InvenioWebSubmitFileConverterError:
        ## We fall back on considering all the documents
        filenames = [docfile.get_full_path() for docfile in docfiles]
        try:
            filename = get_best_format_to_extract_text_from(filenames)
        except InvenioWebSubmitFileConverterError:
            # Nothing to extract from: leave an empty cache file behind.
            # The context manager guarantees the handle is closed.
            with open(text_path, 'w') as text_file:
                text_file.write('')
            return
    try:
        convert_file(filename, text_path, '.txt', perform_ocr=perform_ocr, ln=ln)
        if version == self.get_latest_version():
            run_sql("UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s", (self.id, ))
    except InvenioWebSubmitFileConverterError as e:
        register_exception(alert_admin=True, prefix="Error in extracting text from bibdoc %i, version %i" % (self.id, version))
        # Call form of raise: valid on both Python 2 and Python 3.
        raise InvenioBibDocFileError(str(e))
def _run_test(self):
    """Convert C{self.input_file} to C{self.to_format} and check the result.

    The test fails (through C{self.fail}) when the conversion raises, when
    the returned output file does not exist, or when entries other than
    the output file have appeared in CFG_TMPDIR during the conversion.
    """
    from invenio.legacy.websubmit.file_converter import InvenioWebSubmitFileConverterError, convert_file
    try:
        # Snapshot the temporary directory before and after the conversion
        # so that leftover temporary files can be detected.
        tmpdir_snapshot1 = set(os.listdir(cfg['CFG_TMPDIR']))
        output_file = convert_file(self.input_file, output_format=self.to_format)
        tmpdir_snapshot2 = set(os.listdir(cfg['CFG_TMPDIR']))
        # The output file itself is expected and must not count as a leftover.
        tmpdir_snapshot2.discard(os.path.basename(output_file))
        if not os.path.exists(output_file):
            raise InvenioWebSubmitFileConverterError("output_file %s was not correctly created" % output_file)
        if tmpdir_snapshot2 - tmpdir_snapshot1:
            raise InvenioWebSubmitFileConverterError("Some temporary files were left over: %s" % (tmpdir_snapshot2 - tmpdir_snapshot1))
    except Exception as err:
        register_exception(alert_admin=True)
        self.fail("ERROR: when converting from %s to %s: %s, the log contained: %s" % (self.from_format, self.to_format, err, self.log.getvalue()))
def _run_test(self):
    """Convert C{self.input_file} to C{self.to_format} and check the result.

    The test fails (through C{self.fail}) when the conversion raises, when
    the returned output file does not exist, or when entries other than
    the output file have appeared in CFG_TMPDIR during the conversion.
    """
    from invenio.legacy.websubmit.file_converter import InvenioWebSubmitFileConverterError, convert_file
    try:
        # Snapshot the temporary directory before and after the conversion
        # so that leftover temporary files can be detected.
        tmpdir_snapshot1 = set(os.listdir(cfg['CFG_TMPDIR']))
        output_file = convert_file(self.input_file, output_format=self.to_format)
        tmpdir_snapshot2 = set(os.listdir(cfg['CFG_TMPDIR']))
        # The output file itself is expected and must not count as a leftover.
        tmpdir_snapshot2.discard(os.path.basename(output_file))
        if not os.path.exists(output_file):
            raise InvenioWebSubmitFileConverterError(
                "output_file %s was not correctly created" % output_file)
        if tmpdir_snapshot2 - tmpdir_snapshot1:
            raise InvenioWebSubmitFileConverterError(
                "Some temporary files were left over: %s" %
                (tmpdir_snapshot2 - tmpdir_snapshot1))
    except Exception as err:
        register_exception(alert_admin=True)
        self.fail(
            "ERROR: when converting from %s to %s: %s, the log contained: %s"
            % (self.from_format, self.to_format, err, self.log.getvalue()))
def createRelatedFormats(fullpath, overwrite=True, debug=False, consider_version=False):
    """Given a fullpath, this function extracts the file's extension and
    finds in which additional format the file can be converted and converts
    it.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats; when True
        the missing formats are computed from C{fullpath} alone, otherwise
        from the files already present next to it
    @param debug: (bool) when True the file converter logger is switched
        to DEBUG for the duration of the call and progress is traced on
        stderr
    @param consider_version: (bool) if True, consider the version info in
        C{fullpath} to find missing format for that specific version, if
        C{fullpath} contains version info
    @return: list of the paths to the converted files; conversions failing
        with InvenioWebSubmitFileConverterError are reported through
        C{register_exception} and left out of the list
    """
    file_converter_logger = get_file_converter_logger()
    old_logging_level = file_converter_logger.getEffectiveLevel()
    if debug:
        file_converter_logger.setLevel(DEBUG)
    try:
        createdpaths = []
        if consider_version:
            try:
                basedir, filename, extension, version = decompose_file_with_version(fullpath)
            except Exception:
                # No usable version info in the path: fall back to the
                # plain decomposition.  Deliberately not a bare except, so
                # KeyboardInterrupt/SystemExit are not swallowed.
                basedir, filename, extension = decompose_file(fullpath)
                version = 0
        else:
            basedir, filename, extension = decompose_file(fullpath)
            version = 0
        extension = extension.lower()
        if debug:
            print("basedir: %s, filename: %s, extension: %s" % (basedir, filename, extension), file=sys.stderr)
        if overwrite:
            missing_formats = get_missing_formats([fullpath])
        else:
            if version:
                # Restrict the siblings to the files of that version.
                filelist = glob.glob(os.path.join(basedir, '%s*;%s' % (filename, version)))
            else:
                filelist = glob.glob(os.path.join(basedir, '%s*' % filename))
            if debug:
                print("filelist: %s" % filelist, file=sys.stderr)
            missing_formats = get_missing_formats(filelist)
        if debug:
            print("missing_formats: %s" % missing_formats, file=sys.stderr)
        for path, formats in iteritems(missing_formats):
            if debug:
                print("... path: %s, formats: %s" % (path, formats), file=sys.stderr)
            for aformat in formats:
                if debug:
                    print("...... aformat: %s" % aformat, file=sys.stderr)
                newpath = os.path.join(basedir, filename + aformat)
                if debug:
                    print("...... newpath: %s" % newpath, file=sys.stderr)
                try:
                    if CFG_BIBDOCFILE_FILEDIR in basedir:
                        # We should create the new files in a temporary location, not
                        # directly inside the BibDoc directory.
                        newpath = convert_file(path, output_format=aformat)
                    else:
                        convert_file(path, newpath)
                    createdpaths.append(newpath)
                except InvenioWebSubmitFileConverterError as msg:
                    if debug:
                        print("...... Exception: %s" % msg, file=sys.stderr)
                    register_exception(alert_admin=True)
    finally:
        # Only touch the logger level again if we changed it above.
        if debug:
            file_converter_logger.setLevel(old_logging_level)
    return createdpaths
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return the words found in the fulltext document(s) behind a URL.

    URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file
    or an URL to a splash page that presents the links to be indexed.

    Internal documents (as recognised by C{bibdocfile_url_p}) are read
    through their BibDoc.  External URLs are skipped when
    CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY is set.  Otherwise, if the
    URL matches a splash pattern of CFG_BIBINDEX_SPLASH_PAGES, the page is
    fetched and the links matching the associated pattern are indexed in
    its place.  Every URL to index is downloaded and converted to text.

    When the 'fulltext' index uses a non-native indexer, the text is
    dispatched to that external system (SOLR or XAPIAN) and no words are
    returned from here.

    @param url_direct_or_indirect: URL of the document or of a splash page
    @type url_direct_or_indirect: string
    @return: list of words; an empty list when indexing is delegated,
        skipped, or when an error occurred (errors are registered through
        C{register_exception} and reported on stderr)
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per records
                    if recid not in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        text = bibrecdocs.get_text()
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    # Close the HTTP response explicitly instead of leaving
                    # it to the garbage collector.
                    splash_page = urllib2.urlopen(url_direct_or_indirect)
                    try:
                        html = splash_page.read()
                    finally:
                        splash_page.close()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page (or no matching link): index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            # Dictionary used as a set of distinct words.
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        try:
                            with open(tmptext) as text_file:
                                text = text_file.read()
                        finally:
                            # Remove the converted file even if reading failed.
                            os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict.fromkeys(tmpwords, 1))
                    except Exception as e:
                        # A failure on one URL must not prevent the others
                        # from being indexed.
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return list(words)
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []
def createRelatedFormats(fullpath, overwrite=True, debug=False, consider_version=False):
    """Given a fullpath, this function extracts the file's extension and
    finds in which additional format the file can be converted and converts
    it.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats; when True
        the missing formats are computed from C{fullpath} alone, otherwise
        from the files already present next to it
    @param debug: (bool) when True the file converter logger is switched
        to DEBUG for the duration of the call and progress is traced on
        stderr
    @param consider_version: (bool) if True, consider the version info in
        C{fullpath} to find missing format for that specific version, if
        C{fullpath} contains version info
    @return: list of the paths to the converted files; conversions failing
        with InvenioWebSubmitFileConverterError are reported through
        C{register_exception} and left out of the list
    """
    file_converter_logger = get_file_converter_logger()
    old_logging_level = file_converter_logger.getEffectiveLevel()
    if debug:
        file_converter_logger.setLevel(DEBUG)
    try:
        createdpaths = []
        if consider_version:
            try:
                basedir, filename, extension, version = decompose_file_with_version(fullpath)
            except Exception:
                # No usable version info in the path: fall back to the
                # plain decomposition.  Deliberately not a bare except, so
                # KeyboardInterrupt/SystemExit are not swallowed.
                basedir, filename, extension = decompose_file(fullpath)
                version = 0
        else:
            basedir, filename, extension = decompose_file(fullpath)
            version = 0
        extension = extension.lower()
        if debug:
            print("basedir: %s, filename: %s, extension: %s" % (basedir, filename, extension), file=sys.stderr)
        if overwrite:
            missing_formats = get_missing_formats([fullpath])
        else:
            if version:
                # Restrict the siblings to the files of that version.
                filelist = glob.glob(os.path.join(basedir, '%s*;%s' % (filename, version)))
            else:
                filelist = glob.glob(os.path.join(basedir, '%s*' % filename))
            if debug:
                print("filelist: %s" % filelist, file=sys.stderr)
            missing_formats = get_missing_formats(filelist)
        if debug:
            print("missing_formats: %s" % missing_formats, file=sys.stderr)
        for path, formats in iteritems(missing_formats):
            if debug:
                print("... path: %s, formats: %s" % (path, formats), file=sys.stderr)
            for aformat in formats:
                if debug:
                    print("...... aformat: %s" % aformat, file=sys.stderr)
                newpath = os.path.join(basedir, filename + aformat)
                if debug:
                    print("...... newpath: %s" % newpath, file=sys.stderr)
                try:
                    if CFG_BIBDOCFILE_FILEDIR in basedir:
                        # We should create the new files in a temporary location, not
                        # directly inside the BibDoc directory.
                        newpath = convert_file(path, output_format=aformat)
                    else:
                        convert_file(path, newpath)
                    createdpaths.append(newpath)
                except InvenioWebSubmitFileConverterError as msg:
                    if debug:
                        print("...... Exception: %s" % msg, file=sys.stderr)
                    register_exception(alert_admin=True)
    finally:
        # Only touch the logger level again if we changed it above.
        if debug:
            file_converter_logger.setLevel(old_logging_level)
    return createdpaths
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return the words found in the fulltext document(s) behind a URL.

    URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file
    or an URL to a splash page that presents the links to be indexed.

    Internal documents (as recognised by C{bibdocfile_url_p}) are read
    through their BibDoc; an internal URL that cannot be resolved to a
    BibDoc (InvenioBibDocFileError) yields no words.  External URLs are
    skipped when CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY is set.
    Otherwise, if the URL matches a splash pattern of
    CFG_BIBINDEX_SPLASH_PAGES, the page is fetched and the links matching
    the associated pattern are indexed in its place.  Every URL to index
    is downloaded and converted to text.

    When the 'fulltext' index uses a non-native indexer, the text is
    dispatched to that external system (SOLR or XAPIAN) and no words are
    returned from here.

    @param url_direct_or_indirect: URL of the document or of a splash page
    @type url_direct_or_indirect: string
    @return: list of words; an empty list when indexing is delegated,
        skipped, or when an error occurred (errors are registered through
        C{register_exception} and reported on stderr)
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            try:
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            except InvenioBibDocFileError:
                # Outdated 8564 tag
                return []
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per records
                    if recid not in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        try:
                            text = bibrecdocs.get_text()
                        except InvenioBibDocFileError:
                            # Invalid PDF
                            continue
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    # Close the HTTP response explicitly instead of leaving
                    # it to the garbage collector.
                    splash_page = urllib2.urlopen(url_direct_or_indirect)
                    try:
                        html = splash_page.read()
                    finally:
                        splash_page.close()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page (or no matching link): index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            # Dictionary used as a set of distinct words.
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        try:
                            with open(tmptext) as text_file:
                                text = text_file.read()
                        finally:
                            # Remove the converted file even if reading failed.
                            os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict.fromkeys(tmpwords, 1))
                    except Exception as e:
                        # A failure on one URL must not prevent the others
                        # from being indexed.
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return list(words)
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []