def Move_Files_Archive(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead.

    Synchronise the documents attached to a record with the files found in
    the ``files/MainFiles`` and ``files/AdditionalFiles`` sub-directories of
    the submission directory.

    For each of the two directories that exists:
      - every existing bibdoc of the corresponding doctype ('Main' or
        'Additional') whose docname has no matching file is deleted;
      - every file found is added via ``add_new_file(..., never_fail=True)``.

    @param parameters: not used by this function
    @param curdir: [string] path of the current submission directory
    @param form: not used by this function
    @param user_info: not used by this function
    @return: [string] always the empty string
    """
    watched_dirs = {
        'Main': "%s/files/MainFiles" % curdir,
        'Additional': "%s/files/AdditionalFiles" % curdir,
    }
    for doctype, directory in watched_dirs.items():
        if not os.path.exists(directory):
            continue
        # Group the normalized formats found on disk by document name.
        formats = {}
        for entry in sorted(os.listdir(directory)):
            dummy, docname, extension = decompose_file(entry)
            formats.setdefault(docname, []).append(normalize_format(extension))
        # first delete all missing files
        # NOTE(review): `sysno` is not defined in this block; it is expected
        # to be provided by the enclosing module/global scope -- confirm.
        bibarchive = BibRecDocs(sysno)
        for existing_bibdoc in bibarchive.list_bibdocs(doctype):
            if existing_bibdoc.get_docname() not in formats:
                existing_bibdoc.delete()
        # then create/update the new ones
        for docname, docformats in formats.items():
            # One call per format: interpolating the whole list into the
            # path (as was done before) produced names like "doc['.pdf']".
            for docformat in docformats:
                bibarchive.add_new_file('%s/%s%s' % (directory, docname, docformat),
                                        doctype=doctype, never_fail=True)
    return ""
def _getfile_py(req, recid=0, docid=0, version="", name="", docformat="", ln=CFG_SITE_LANG):
    """Redirect a legacy file request to the record's ``/files/`` URL.

    @param req: the request object handed to warning_page/redirect_to_url
    @param recid: the record identifier; when false it is looked up from docid
    @param docid: the document identifier; used to find recid or name
    @param version: optional document version, forwarded as a query argument
    @param name: the document name; when empty (and recid and docid are
        given) it is looked up from docid
    @param docformat: the requested format; passed through normalize_format
    @param ln: the interface language, forwarded as the ``ln`` argument
    @return: a warning page when the document cannot be identified,
        otherwise nothing is returned after redirect_to_url is called with
        apache.HTTP_MOVED_PERMANENTLY
    """
    if not recid:
        ## Let's obtain the recid from the docid
        if not docid:
            return warning_page(_("Not enough information to retrieve the document"), req, ln)
        try:
            bibdoc = BibDoc(docid=docid)
            recid = bibdoc.bibrec_links[0]["recid"]
        except InvenioBibDocFileError:
            return warning_page(_("An error has happened in trying to retrieve the requested file."), req, ln)
    else:
        brd = BibRecDocs(recid)
        if not name and docid:
            ## Let's obtain the name from the docid
            try:
                name = brd.get_docname(docid)
            except InvenioBibDocFileError:
                return warning_page(_("An error has happened in trying to retrieving the requested file."), req, ln)

    docformat = normalize_format(docformat)

    # The version argument must be separated from "ln" by "&"; without it
    # the language value was silently corrupted (e.g. "ln=enversion=2").
    if version:
        version_argument = "&version=%s" % version
    else:
        version_argument = ""

    redirect_to_url(
        req,
        "%s/%s/%s/files/%s%s?ln=%s%s"
        % (CFG_SITE_URL, CFG_SITE_RECORD, recid, name, docformat, ln, version_argument),
        apache.HTTP_MOVED_PERMANENTLY,
    )
def Move_Files_Archive(parameters, curdir, form, user_info=None):
    """DEPRECATED: Use FFT instead.

    Synchronise the documents attached to a record with the files found in
    the ``files/MainFiles`` and ``files/AdditionalFiles`` sub-directories of
    the submission directory.

    For each of the two directories that exists:
      - every existing bibdoc of the corresponding doctype ('Main' or
        'Additional') whose docname has no matching file is deleted;
      - every file found is added via ``add_new_file(..., never_fail=True)``.

    @param parameters: not used by this function
    @param curdir: [string] path of the current submission directory
    @param form: not used by this function
    @param user_info: not used by this function
    @return: [string] always the empty string
    """
    watched_dirs = {
        'Main': "%s/files/MainFiles" % curdir,
        'Additional': "%s/files/AdditionalFiles" % curdir,
    }
    for doctype, directory in watched_dirs.items():
        if not os.path.exists(directory):
            continue
        # Group the normalized formats found on disk by document name.
        formats = {}
        for entry in sorted(os.listdir(directory)):
            dummy, docname, extension = decompose_file(entry)
            formats.setdefault(docname, []).append(normalize_format(extension))
        # first delete all missing files
        # NOTE(review): `sysno` is not defined in this block; it is expected
        # to be provided by the enclosing module/global scope -- confirm.
        bibarchive = BibRecDocs(sysno)
        for existing_bibdoc in bibarchive.list_bibdocs(doctype):
            if existing_bibdoc.get_docname() not in formats:
                existing_bibdoc.delete()
        # then create/update the new ones
        for docname, docformats in formats.items():
            # One call per format: interpolating the whole list into the
            # path (as was done before) produced names like "doc['.pdf']".
            for docformat in docformats:
                bibarchive.add_new_file('%s/%s%s' % (directory, docname, docformat),
                                        doctype=doctype, never_fail=True)
    return ""
def main_cli(): """ main function when the library behaves as a normal CLI tool. """ from invenio.bibdocfile import normalize_format parser = OptionParser() parser.add_option("-c", "--convert", dest="input_name", help="convert the specified FILE", metavar="FILE") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="Enable debug information") parser.add_option("--special-pdf2hocr2pdf", dest="ocrize", help="convert the given scanned PDF into a PDF with OCRed text", metavar="FILE") parser.add_option("-f", "--format", dest="output_format", help="the desired output format", metavar="FORMAT") parser.add_option("-o", "--output", dest="output_name", help="the desired output FILE (if not specified a new file will be generated with the desired output format)") parser.add_option("--without-pdfa", action="store_false", dest="pdf_a", default=True, help="don't force creation of PDF/A PDFs") parser.add_option("--without-pdfopt", action="store_false", dest="pdfopt", default=True, help="don't force optimization of PDFs files") parser.add_option("--without-ocr", action="store_false", dest="ocr", default=True, help="don't force OCR") parser.add_option("--can-convert", dest="can_convert", help="display all the possible format that is possible to generate from the given format", metavar="FORMAT") parser.add_option("--is-ocr-needed", dest="check_ocr_is_needed", help="check if OCR is needed for the FILE specified", metavar="FILE") parser.add_option("-t", "--title", dest="title", help="specify the title (used when creating PDFs)", metavar="TITLE") parser.add_option("-l", "--language", dest="ln", help="specify the language (used when performing OCR, e.g. 
en, it, fr...)", metavar="LN", default='en') (options, dummy) = parser.parse_args() if options.debug: getLogger().setLevel(DEBUG) if options.can_convert: if options.can_convert: input_format = normalize_format(options.can_convert) if input_format == '.pdf': if can_pdfopt(): print "PDF linearization supported" else: print "No PDF linearization support" if can_pdfa(): print "PDF/A generation supported" else: print "No PDF/A generation support" if can_perform_ocr(): print "OCR supported" else: print "OCR not supported" print 'Can convert from "%s" to:' % input_format[1:], for output_format in __CONVERSION_MAP: if can_convert(input_format, output_format): print '"%s"' % output_format[1:], print elif options.check_ocr_is_needed: print "Checking if OCR is needed on %s..." % options.check_ocr_is_needed, sys.stdout.flush() if guess_is_OCR_needed(options.check_ocr_is_needed): print "needed." else: print "not needed." elif options.ocrize: try: output = pdf2hocr2pdf(options.ocrize, output_file=options.output_name, title=options.title, ln=options.ln) print "Output stored in %s" % output except InvenioWebSubmitFileConverterError, err: print "ERROR: %s" % err sys.exit(1)
def can_convert(input_format, output_format, max_intermediate_conversions=2):
    """Return the chain of conversion to transform input_format into
    output_format, if any.

    A direct conversion is preferred. Otherwise every format reachable from
    input_format is tried as an intermediate step (recursively, with the
    remaining budget decreased by one) and the shortest resulting chain is
    returned; on ties the first one found is kept.

    @param input_format: the source format (normalized internally)
    @param output_format: the target format (normalized internally)
    @param max_intermediate_conversions: [int] remaining search depth; when
        it is <= 0 the search stops
    @return: [list] the entries of the conversion map to apply in order, or
        an empty list when no chain is found within the allowed depth
    """
    from invenio.bibdocfile import normalize_format
    if max_intermediate_conversions <= 0:
        return []
    input_format = normalize_format(input_format)
    output_format = normalize_format(output_format)
    if input_format not in __CONVERSION_MAP:
        return []
    reachable = __CONVERSION_MAP[input_format]
    if output_format in reachable:
        return [reachable[output_format]]
    best_res = []
    best_intermediate = ''
    for intermediate_format in reachable:
        res = can_convert(intermediate_format, output_format, max_intermediate_conversions - 1)
        # Compare lengths: the previous `len(res) < best_res` compared an int
        # with a list, which is always true in Python 2, so the last chain
        # found used to win regardless of its length.
        if res and (not best_res or len(res) < len(best_res)):
            best_res = res
            best_intermediate = intermediate_format
    if best_res:
        return [reachable[best_intermediate]] + best_res
    return []
def get_best_format_to_extract_text_from(filelist, best_formats=CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM):
    """
    Pick, among the given files, the one whose format is the most suitable
    for text extraction.

    @param filelist: the candidate file names
    @param best_formats: the preferred formats, most preferred first; only
        those that can_convert reports as convertible to '.txt' are kept
    @return: the first file of filelist whose extension ends with the most
        preferred usable format
    @raise InvenioWebSubmitFileConverterError: when no file matches any
        usable format
    """
    from invenio.bibdocfile import decompose_file, normalize_format
    usable_formats = []
    for candidate in best_formats:
        if can_convert(candidate, '.txt'):
            usable_formats.append(normalize_format(candidate))
    # Preference order wins over file order: outer loop is on the formats.
    for wanted_format in usable_formats:
        for path in filelist:
            extension = decompose_file(path, skip_version=True)[2]
            if extension.endswith(wanted_format):
                return path
    raise InvenioWebSubmitFileConverterError("It's not possible to extract valuable text from any of the proposed files.")
def prepare_io(input_file, output_file=None, output_ext=None, need_working_dir=True): """Clean input_file and the output_file.""" from invenio.bibdocfile import decompose_file, normalize_format output_ext = normalize_format(output_ext) debug('Preparing IO for input=%s, output=%s, output_ext=%s' % (input_file, output_file, output_ext)) if output_ext is None: if output_file is None: output_ext = '.tmp' else: output_ext = decompose_file(output_file, skip_version=True)[2] if output_file is None: try: (fd, output_file) = tempfile.mkstemp(suffix=output_ext, dir=CFG_TMPDIR) os.close(fd) except IOError, err: raise InvenioWebSubmitFileConverterError("It's impossible to create a temporary file: %s" % err)
def _getfile_py(req, recid=0, docid=0, version="", name="", docformat="", ln=CFG_SITE_LANG):
    """Redirect a legacy file request to the record's ``/files/`` URL.

    @param req: the request object handed to warning_page/redirect_to_url
    @param recid: the record identifier; when false it is looked up from docid
    @param docid: the document identifier; used to find recid or name
    @param version: optional document version, forwarded as a query argument
    @param name: the document name; when empty (and recid and docid are
        given) it is looked up from docid
    @param docformat: the requested format; passed through normalize_format
    @param ln: the interface language, forwarded as the ``ln`` argument
    @return: a warning page when the document cannot be identified,
        otherwise nothing is returned after redirect_to_url is called with
        apache.HTTP_MOVED_PERMANENTLY
    """
    if not recid:
        ## Let's obtain the recid from the docid
        if not docid:
            return warning_page(
                _('Not enough information to retrieve the document'), req, ln)
        try:
            bibdoc = BibDoc(docid=docid)
            recid = bibdoc.bibrec_links[0]["recid"]
        except InvenioBibDocFileError:
            return warning_page(
                _("An error has happened in trying to retrieve the requested file."),
                req, ln)
    else:
        brd = BibRecDocs(recid)
        if not name and docid:
            ## Let's obtain the name from the docid
            try:
                name = brd.get_docname(docid)
            except InvenioBibDocFileError:
                return warning_page(
                    _("An error has happened in trying to retrieving the requested file."),
                    req, ln)

    docformat = normalize_format(docformat)

    # The version argument must be separated from "ln" by "&"; without it
    # the language value was silently corrupted (e.g. "ln=enversion=2").
    if version:
        version_argument = '&version=%s' % version
    else:
        version_argument = ''

    redirect_to_url(
        req,
        '%s/%s/%s/files/%s%s?ln=%s%s'
        % (CFG_SITE_URL, CFG_SITE_RECORD, recid, name, docformat, ln, version_argument),
        apache.HTTP_MOVED_PERMANENTLY)
def convert_file(input_file, output_file=None, output_format=None, **params):
    """
    Convert files from one format to another, running every step of the
    conversion chain computed by can_convert.

    @param input_file [string] the path to an existing file
    @param output_file [string] the path to the desired output.
        (if None a temporary file is generated)
    @param output_format [string] the desired format (if None it is taken
        from output_file)
    @param params other parameters to pass to the particular converter;
        they override the defaults stored with each step of the chain
    @return [string] the final output_file, as returned by the last
        converter of the chain; None when can_convert finds no chain
    @raise ValueError: when neither output_file nor output_format is given
    @raise InvenioWebSubmitFileConverterError: when a converter fails
    """
    import os
    from invenio.bibdocfile import decompose_file, normalize_format
    if output_format is None:
        if output_file is None:
            raise ValueError("At least output_file or format should be specified.")
        output_ext = decompose_file(output_file, skip_version=True)[2]
    else:
        output_ext = normalize_format(output_format)
    input_ext = decompose_file(input_file, skip_version=True)[2]
    conversion_chain = can_convert(input_ext, output_ext)
    if conversion_chain:
        current_input = input_file
        current_output = None
        last_step = len(conversion_chain) - 1
        for step, conversion in enumerate(conversion_chain):
            # Only the last step writes to the requested output; passing
            # None lets intermediate steps choose their own output path.
            if step == last_step:
                current_output = output_file
            else:
                current_output = None
            converter = conversion[0]
            final_params = dict(conversion[1])
            final_params.update(params)
            try:
                # Keep going after each step: returning here (as was done
                # before) stopped multi-step chains after the first step.
                current_output = converter(current_input, current_output, **final_params)
            except InvenioWebSubmitFileConverterError as err:
                raise InvenioWebSubmitFileConverterError("Error when converting from %s to %s: %s" % (input_file, output_ext, err))
            except Exception as err:
                register_exception()
                raise InvenioWebSubmitFileConverterError("Unexpected error when converting from %s to %s (%s): %s" % (input_file, output_ext, type(err), err))
            # Intermediate results are no longer needed once consumed; the
            # caller's own input file is never removed.
            if current_input != input_file:
                os.remove(current_input)
            current_input = current_output
        return current_output
def _getfile_py(req, recid=0, docid=0, version="", name="", format="", ln=CFG_SITE_LANG):
    """Redirect a legacy file request to the record's ``/files/`` URL.

    @param req: the request object handed to warning_page/redirect_to_url
    @param recid: the record identifier; when false it is looked up from docid
    @param docid: the document identifier; used to find recid or name
    @param version: optional document version, forwarded as a query argument
    @param name: the document name; when empty (and recid and docid are
        given) it is looked up from docid
    @param format: the requested format; passed through normalize_format
    @param ln: the interface language, forwarded as the ``ln`` argument
    @return: a warning page when the document cannot be identified,
        otherwise nothing is returned after redirect_to_url is called with
        apache.HTTP_MOVED_PERMANENTLY
    """
    if not recid:
        ## Let's obtain the recid from the docid
        if not docid:
            return warning_page(_('Not enough information to retrieve the document'), req, ln)
        try:
            bibdoc = BibDoc(docid=docid)
            recid = bibdoc.get_recid()
        except InvenioBibDocFileError:
            return warning_page(_("An error has happened in trying to retrieve the requested file."), req, ln)
    elif not name and docid:
        ## Let's obtain the name from the docid
        try:
            bibdoc = BibDoc(docid)
            name = bibdoc.get_docname()
        except InvenioBibDocFileError:
            return warning_page(_("An error has happened in trying to retrieving the requested file."), req, ln)

    format = normalize_format(format)

    # The version argument must be separated from "ln" by "&"; without it
    # the language value was silently corrupted (e.g. "ln=enversion=2").
    if version:
        version_argument = '&version=%s' % version
    else:
        version_argument = ''

    redirect_to_url(
        req,
        '%s/%s/%s/files/%s%s?ln=%s%s'
        % (CFG_SITE_URL, CFG_SITE_RECORD, recid, name, format, ln, version_argument),
        apache.HTTP_MOVED_PERMANENTLY)
def normalize_desired_conversion():
    """Return a copy of desired_conversion in which every key and every
    format listed in the values has been passed through normalize_format."""
    normalized = {}
    for source_format, target_formats in desired_conversion.iteritems():
        normalized_targets = []
        for target_format in target_formats:
            normalized_targets.append(normalize_format(target_format))
        normalized[normalize_format(source_format)] = normalized_targets
    return normalized
def test_base(self):
    """normalize_format returns a plain '.format' string unchanged."""
    normalized = bibdocfile.normalize_format(".format")
    self.assertEqual(".format", normalized)
def test_ampersand(self):
    """normalize_format drops the ampersand and what follows it."""
    normalized = bibdocfile.normalize_format(".format&suffix")
    self.assertEqual(".format", normalized)
def unoconv(input_file, output_file=None, output_format='txt', pdfopt=True, **dummy):
    """Use unoconv to convert among OpenOffice understood documents.

    The conversion is run through sudo as CFG_OPENOFFICE_USER into a
    temporary file under CFG_OPENOFFICE_TMPDIR; the result is then copied
    (or, for PDFs when pdfopt is true, passed through pdf2pdfopt) to
    output_file and the temporary file is removed, again through sudo.

    @param input_file: the path of the document to convert
    @param output_file: the desired output path (handled by prepare_io)
    @param output_format: the desired format; 'txt' is passed to unoconv
        as 'text'
    @param pdfopt: whether PDF output is passed through pdf2pdfopt
    @param dummy: extra keyword arguments are accepted and ignored
    @return: the path of the output file
    @raise InvenioWebSubmitFileConverterError: when the OpenOffice temporary
        directory check fails, when the command fails twice, or when no
        output is generated
    """
    from invenio.bibdocfile import normalize_format
    try:
        check_openoffice_tmpdir()
    except InvenioWebSubmitFileConverterError as err:
        register_exception(alert_admin=True, prefix='ERROR: it\'s impossible to properly execute OpenOffice.org conversions: %s' % err)
        raise
    input_file, output_file, dummy = prepare_io(input_file, output_file, output_format, need_working_dir=False)
    if output_format == 'txt':
        unoconv_format = 'text'
    else:
        unoconv_format = output_format
    # Only a name is reserved here: the file itself is created by the
    # command below, which runs as a different user.
    tmpfile = tempfile.mktemp(dir=CFG_OPENOFFICE_TMPDIR, suffix=normalize_format(output_format))
    unoconv_command = (
        'sudo', '-u', CFG_OPENOFFICE_USER, CFG_PATH_OPENOFFICE_PYTHON,
        os.path.join(CFG_PYLIBDIR, 'invenio', 'unoconv.py'),
        '-v', '-s', CFG_OPENOFFICE_SERVER_HOST,
        '-p', str(CFG_OPENOFFICE_SERVER_PORT),
        '--outputfile', tmpfile, '-f', unoconv_format, input_file)
    try:
        execute_command(*unoconv_command)
    except InvenioWebSubmitFileConverterError:
        # Retry exactly once after a short pause.
        time.sleep(5)
        execute_command(*unoconv_command)
    if not os.path.exists(tmpfile):
        raise InvenioWebSubmitFileConverterError('No output was generated by OpenOffice')
    output_format = normalize_format(output_format)
    if output_format == '.pdf' and pdfopt:
        pdf2pdfopt(tmpfile, output_file)
    else:
        shutil.copy(tmpfile, output_file)
    # The temporary file belongs to the OpenOffice user, hence the sudo.
    execute_command('sudo', '-u', CFG_OPENOFFICE_USER, CFG_PATH_OPENOFFICE_PYTHON, '-c', 'import os; os.remove(%s)' % repr(tmpfile))
    # Converters are expected to hand back the path of what they produced.
    return output_file
def test_unicode(self):
    """A unicode format comes back as its UTF-8 encoded byte string."""
    normalized = bibdocfile.normalize_format(u".ʬ")
    self.assertEqual(".\xca\xac", normalized)
def test_unicode_subformat(self):
    """A unicode subformat (after ';') comes back UTF-8 encoded as well."""
    normalized = bibdocfile.normalize_format(u".format;ʬ")
    self.assertEqual(".format;\xca\xac", normalized)
import re
from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, compose_format
from invenio.messages import gettext_set_language
from invenio.config import CFG_SITE_URL, CFG_BASE_URL, CFG_CERN_SITE, CFG_SITE_RECORD, \
    CFG_BIBFORMAT_HIDDEN_FILE_FORMATS
from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE
from invenio.urlutils import get_relative_url
from cgi import escape, parse_qs
from urlparse import urlparse
from os.path import basename
import urllib

# The configured hidden file formats, each passed through normalize_format
# once at import time and stored in a set.
_CFG_NORMALIZED_BIBFORMAT_HIDDEN_FILE_FORMATS = set(
    normalize_format(fmt) for fmt in CFG_BIBFORMAT_HIDDEN_FILE_FORMATS)

# Hidden doctypes: 'Plot' is always listed, 'arXiv' only when CFG_CERN_SITE
# is set.
_CFG_BIBFORMAT_HIDDEN_DOCTYPES = ['Plot']
if CFG_CERN_SITE:
    _CFG_BIBFORMAT_HIDDEN_DOCTYPES.append('arXiv')

# NOTE(review): "cs", "math" and "q-alg" each appear twice in this list;
# presumably harmless if it is only used for membership tests -- confirm.
cern_arxiv_categories = [
    "astro-ph", "chao-dyn", "cond-mat", "gr-qc", "hep-ex", "hep-lat",
    "hep-ph", "hep-th", "math-ph", "math", "nucl-ex", "nucl-th", "out",
    "physics", "quant-ph", "q-alg", "cs", "adap-org", "comp-gas", "chem-ph",
    "cs", "math", "neuro-sys", "patt-sol", "solv-int", "acc-phys", "alg-geom",
    "ao-sci", "atom-ph", "cmp-lg", "dg-ga", "funct-an", "mtrl-th", "plasm-ph",
    "q-alg", "supr-con"
]
"""BibFormat element - Prints a links to fulltext """ __revision__ = "$Id$" import re from invenio.bibdocfile import BibRecDocs, file_strip_ext, normalize_format, compose_format from invenio.messages import gettext_set_language from invenio.config import CFG_SITE_URL, CFG_CERN_SITE, CFG_SITE_RECORD, \ CFG_BIBFORMAT_HIDDEN_FILE_FORMATS from invenio.bibdocfile_config import CFG_BIBDOCFILE_ICON_SUBFORMAT_RE from cgi import escape, parse_qs from urlparse import urlparse from os.path import basename import urllib _CFG_NORMALIZED_BIBFORMAT_HIDDEN_FILE_FORMATS = set(normalize_format(fmt) for fmt in CFG_BIBFORMAT_HIDDEN_FILE_FORMATS) cern_arxiv_categories = ["astro-ph", "chao-dyn", "cond-mat", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th", "math-ph", "math", "nucl-ex", "nucl-th", "out", "physics", "quant-ph", "q-alg", "cs", "adap-org", "comp-gas", "chem-ph", "cs", "math", "neuro-sys", "patt-sol", "solv-int", "acc-phys", "alg-geom", "ao-sci", "atom-ph", "cmp-lg", "dg-ga", "funct-an", "mtrl-th", "plasm-ph", "q-alg", "supr-con"] def format_element(bfo, style, separator='; ', show_icons='no', focus_on_main_file='no', show_subformat_icons='no'): """ This is the default format for formatting fulltext links. When possible, it returns only the main file(s) (+ link to
def unoconv(input_file, output_file=None, output_format='txt', pdfopt=True, **dummy):
    """Use unoconv to convert among OpenOffice understood documents.

    The conversion is run through sudo as CFG_OPENOFFICE_USER into a
    temporary file under CFG_OPENOFFICE_TMPDIR; the result is then copied
    (or, for PDFs when pdfopt is true, passed through pdf2pdfopt) to
    output_file and the temporary file is removed, again through sudo.

    @param input_file: the path of the document to convert
    @param output_file: the desired output path (handled by prepare_io)
    @param output_format: the desired format; 'txt' is passed to unoconv
        as 'text'
    @param pdfopt: whether PDF output is passed through pdf2pdfopt
    @param dummy: extra keyword arguments are accepted and ignored
    @return: the path of the output file
    @raise InvenioWebSubmitFileConverterError: when the OpenOffice temporary
        directory check fails, when the command fails twice, or when no
        output is generated
    """
    from invenio.bibdocfile import normalize_format
    try:
        check_openoffice_tmpdir()
    except InvenioWebSubmitFileConverterError as err:
        register_exception(alert_admin=True, prefix='ERROR: it\'s impossible to properly execute OpenOffice.org conversions: %s' % err)
        raise
    input_file, output_file, dummy = prepare_io(input_file, output_file, output_format, need_working_dir=False)
    if output_format == 'txt':
        unoconv_format = 'text'
    else:
        unoconv_format = output_format
    # Only a name is reserved here: the file itself is created by the
    # command below, which runs as a different user.
    tmpfile = tempfile.mktemp(dir=CFG_OPENOFFICE_TMPDIR, suffix=normalize_format(output_format))
    unoconv_command = (
        'sudo', '-u', CFG_OPENOFFICE_USER, CFG_PATH_OPENOFFICE_PYTHON,
        os.path.join(CFG_PYLIBDIR, 'invenio', 'unoconv.py'),
        '-v', '-s', CFG_OPENOFFICE_SERVER_HOST,
        # Command-line arguments must be strings, whatever the type of the
        # configured port.
        '-p', str(CFG_OPENOFFICE_SERVER_PORT),
        '--outputfile', tmpfile, '-f', unoconv_format, input_file)
    try:
        execute_command(*unoconv_command)
    except InvenioWebSubmitFileConverterError:
        # Retry exactly once after a short pause.
        time.sleep(5)
        execute_command(*unoconv_command)
    if not os.path.exists(tmpfile):
        raise InvenioWebSubmitFileConverterError('No output was generated by OpenOffice')
    output_format = normalize_format(output_format)
    if output_format == '.pdf' and pdfopt:
        pdf2pdfopt(tmpfile, output_file)
    else:
        shutil.copy(tmpfile, output_file)
    # The temporary file belongs to the OpenOffice user, hence the sudo.
    execute_command('sudo', '-u', CFG_OPENOFFICE_USER, CFG_PATH_OPENOFFICE_PYTHON, '-c', 'import os; os.remove(%s)' % repr(tmpfile))
    # Converters are expected to hand back the path of what they produced.
    return output_file
_("An error has happened in trying to retrieve the requested file."), req, CFG_SITE_NAME, ln ) else: return warningMsg(_("Not enough information to retrieve the document"), req, CFG_SITE_NAME, ln) else: if not name and docid: ## Let's obtain the name from the docid try: bibdoc = BibDoc(docid) name = bibdoc.get_docname() except InvenioWebSubmitFileError, e: return warningMsg( _("An error has happened in trying to retrieving the requested file."), req, CFG_SITE_NAME, ln ) format = normalize_format(format) redirect_to_url( req, "%s/%s/%s/files/%s%s?ln=%s%s" % (CFG_SITE_URL, CFG_SITE_RECORD, recid, name, format, ln, version and "version=%s" % version or ""), apache.HTTP_MOVED_PERMANENTLY, ) return _getfile_py(req, **args) # -------------------------------------------------- from invenio.websubmit_engine import home, action, interface, endaction