Python convert_fileの例、invenio.legacy.websubmit.file_converter.convert_file Pythonの例

コード例 #1

0

ファイルを表示

ファイル: bom_textdoc.py プロジェクト: chokribr/invenio-1

    def extract_text(self, version=None, perform_ocr=False, ln='en'):
        """
        Extract the text of one version of this document and cache it on disk.

        The text is written to the file C{.text;<version>} inside
        C{self.basedir}. Files that are originals or that were OCRed are
        tried first; if none of them can be used, every file of the version
        is considered. When no file is suitable at all, an empty cache file
        is written and the method returns.

        @param version: the version of the document for which text is required.
            If not specified the text will be retrieved from the last version.
        @type version: integer
        @param perform_ocr: whether to perform OCR.
        @type perform_ocr: bool
        @param ln: a two letter language code to give as a hint to the OCR
            procedure.
        @type ln: string
        @raise InvenioBibDocFileError: if the conversion to text fails.
        @note: the text is extracted and cached for later use. Use L{get_text}
            to retrieve it.
        """
        from invenio.legacy.websubmit.file_converter import get_best_format_to_extract_text_from, convert_file, InvenioWebSubmitFileConverterError
        if version is None:
            version = self.get_latest_version()
        docfiles = self.list_version_files(version)
        ## Single place where the on-disk cache location is built.
        text_path = os.path.join(self.basedir, '.text;%i' % version)
        ## We try to extract text only from original or OCRed documents.
        filenames = [
            docfile.get_full_path() for docfile in docfiles
            if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags
        ]
        try:
            filename = get_best_format_to_extract_text_from(filenames)
        except InvenioWebSubmitFileConverterError:
            ## We fall back on considering all the documents
            filenames = [docfile.get_full_path() for docfile in docfiles]
            try:
                filename = get_best_format_to_extract_text_from(filenames)
            except InvenioWebSubmitFileConverterError:
                ## Nothing can be converted: cache an empty text. The context
                ## manager closes the file handle deterministically.
                with open(text_path, 'w') as empty_cache:
                    empty_cache.write('')
                return
        try:
            convert_file(filename,
                         text_path,
                         '.txt',
                         perform_ocr=perform_ocr,
                         ln=ln)
            ## The extraction date is only recorded for the latest version.
            if version == self.get_latest_version():
                run_sql(
                    "UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s",
                    (self.id, ))
        except InvenioWebSubmitFileConverterError as e:
            register_exception(
                alert_admin=True,
                prefix="Error in extracting text from bibdoc %i, version %i" %
                (self.id, version))
            ## Call-style raise is valid on both Python 2 and Python 3.
            raise InvenioBibDocFileError(str(e))

コード例 #2

0

ファイルを表示

ファイル: Shared_Functions.py プロジェクト: mhellmic/b2share

def createRelatedFormats(fullpath, overwrite=True, debug=False):
    """Convert the file at C{fullpath} into the formats it is still missing.

    Every file sharing the same directory and base name as C{fullpath} is
    handed to C{get_missing_formats}; each reported (path, format) pair is
    then converted next to the original file.  Conversion failures are
    registered and skipped, they do not abort the remaining conversions.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats
        (accepted for interface compatibility; this implementation does
        not consult it)
    @param debug: (bool) if True, trace progress on stderr and switch the
        file converter logger to DEBUG for the duration of the call
    Return a list of the paths to the converted files
    """
    def trace(template, *values):
        # Progress messages are only formatted and emitted in debug mode.
        if debug:
            print(template % values, file=sys.stderr)

    converter_log = get_file_converter_logger()
    saved_level = converter_log.getEffectiveLevel()
    if debug:
        converter_log.setLevel(DEBUG)
    created = []
    try:
        basedir, filename, extension = decompose_file(fullpath)
        extension = extension.lower()
        trace("basedir: %s, filename: %s, extension: %s", basedir, filename, extension)

        siblings = glob.glob(os.path.join(basedir, '%s*' % filename))
        trace("filelist: %s", siblings)
        missing_formats = get_missing_formats(siblings)
        trace("missing_formats: %s", missing_formats)
        for source_path, wanted in iteritems(missing_formats):
            trace("... path: %s, formats: %s", source_path, wanted)
            for target_format in wanted:
                trace("...... aformat: %s", target_format)
                target_path = os.path.join(basedir, filename + target_format)
                trace("...... newpath: %s", target_path)
                try:
                    convert_file(source_path, target_path)
                    created.append(target_path)
                except InvenioWebSubmitFileConverterError as msg:
                    trace("...... Exception: %s", msg)
                    register_exception(alert_admin=True)
    finally:
        # Undo the temporary DEBUG level set above.
        if debug:
            converter_log.setLevel(saved_level)
    return created

コード例 #3

0

ファイルを表示

ファイル: bom_textdoc.py プロジェクト: SCOAP3/invenio

    def extract_text(self, version=None, perform_ocr=False, ln='en'):
        """
        Extract the text of one version of this document and cache it on disk.

        The text is written to the file C{.text;<version>} inside
        C{self.basedir}. Files that are originals or that were OCRed are
        tried first; if none of them can be used, every file of the version
        is considered. When no file is suitable at all, an empty cache file
        is written and the method returns.

        @param version: the version of the document for which text is required.
            If not specified the text will be retrieved from the last version.
        @type version: integer
        @param perform_ocr: whether to perform OCR.
        @type perform_ocr: bool
        @param ln: a two letter language code to give as a hint to the OCR
            procedure.
        @type ln: string
        @raise InvenioBibDocFileError: if the conversion to text fails.
        @note: the text is extracted and cached for later use. Use L{get_text}
            to retrieve it.
        """
        from invenio.legacy.websubmit.file_converter import get_best_format_to_extract_text_from, convert_file, InvenioWebSubmitFileConverterError
        if version is None:
            version = self.get_latest_version()
        docfiles = self.list_version_files(version)
        ## Single place where the on-disk cache location is built.
        text_path = os.path.join(self.basedir, '.text;%i' % version)
        ## We try to extract text only from original or OCRed documents.
        filenames = [docfile.get_full_path() for docfile in docfiles if 'CONVERTED' not in docfile.flags or 'OCRED' in docfile.flags]
        try:
            filename = get_best_format_to_extract_text_from(filenames)
        except InvenioWebSubmitFileConverterError:
            ## We fall back on considering all the documents
            filenames = [docfile.get_full_path() for docfile in docfiles]
            try:
                filename = get_best_format_to_extract_text_from(filenames)
            except InvenioWebSubmitFileConverterError:
                ## Nothing can be converted: cache an empty text. The context
                ## manager closes the file handle deterministically.
                with open(text_path, 'w') as empty_cache:
                    empty_cache.write('')
                return
        try:
            convert_file(filename, text_path, '.txt', perform_ocr=perform_ocr, ln=ln)
            ## The extraction date is only recorded for the latest version.
            if version == self.get_latest_version():
                run_sql("UPDATE bibdoc SET text_extraction_date=NOW() WHERE id=%s", (self.id, ))
        except InvenioWebSubmitFileConverterError as e:
            register_exception(alert_admin=True, prefix="Error in extracting text from bibdoc %i, version %i" % (self.id, version))
            ## Call-style raise is valid on both Python 2 and Python 3.
            raise InvenioBibDocFileError(str(e))

コード例 #4

0

ファイルを表示

ファイル: test_websubmit.py プロジェクト: cranmer/opendata.cern.ch

 def _run_test(self):
     """
     Convert C{self.input_file} to C{self.to_format} and check the outcome.

     The test fails when the conversion raises, when the reported output
     file does not exist, or when entries other than the output file
     appear in C{cfg['CFG_TMPDIR']} during the conversion. The failure
     message includes the content of C{self.log}.
     """
     from invenio.legacy.websubmit.file_converter import InvenioWebSubmitFileConverterError, convert_file
     try:
         tmpdir_snapshot1 = set(os.listdir(cfg['CFG_TMPDIR']))
         output_file = convert_file(self.input_file, output_format=self.to_format)
         tmpdir_snapshot2 = set(os.listdir(cfg['CFG_TMPDIR']))
         # The converted file itself is expected, so it is not a leftover.
         tmpdir_snapshot2.discard(os.path.basename(output_file))
         if not os.path.exists(output_file):
             raise InvenioWebSubmitFileConverterError("output_file %s was not correctly created" % output_file)
         leftovers = tmpdir_snapshot2 - tmpdir_snapshot1
         if leftovers:
             raise InvenioWebSubmitFileConverterError("Some temporary files were left over: %s" % (leftovers, ))
     except Exception as err:
         # "as" form works on Python 2.6+ and Python 3, unlike "except X, err".
         register_exception(alert_admin=True)
         self.fail("ERROR: when converting from %s to %s: %s, the log contained: %s" % (self.from_format, self.to_format, err, self.log.getvalue()))

コード例 #5

0

ファイルを表示

 def _run_test(self):
     """
     Convert C{self.input_file} to C{self.to_format} and check the outcome.

     The test fails when the conversion raises, when the reported output
     file does not exist, or when entries other than the output file
     appear in C{cfg['CFG_TMPDIR']} during the conversion. The failure
     message includes the content of C{self.log}.
     """
     from invenio.legacy.websubmit.file_converter import InvenioWebSubmitFileConverterError, convert_file
     try:
         tmpdir_snapshot1 = set(os.listdir(cfg['CFG_TMPDIR']))
         output_file = convert_file(self.input_file,
                                    output_format=self.to_format)
         tmpdir_snapshot2 = set(os.listdir(cfg['CFG_TMPDIR']))
         # The converted file itself is expected, so it is not a leftover.
         tmpdir_snapshot2.discard(os.path.basename(output_file))
         if not os.path.exists(output_file):
             raise InvenioWebSubmitFileConverterError(
                 "output_file %s was not correctly created" % output_file)
         leftovers = tmpdir_snapshot2 - tmpdir_snapshot1
         if leftovers:
             raise InvenioWebSubmitFileConverterError(
                 "Some temporary files were left over: %s" %
                 (leftovers, ))
     except Exception as err:
         # "as" form works on Python 2.6+ and Python 3, unlike "except X, err".
         register_exception(alert_admin=True)
         self.fail(
             "ERROR: when converting from %s to %s: %s, the log contained: %s"
             % (self.from_format, self.to_format, err, self.log.getvalue()))

コード例 #6

0

ファイルを表示

ファイル: Shared_Functions.py プロジェクト: chokribr/invenio-1

def createRelatedFormats(fullpath,
                         overwrite=True,
                         debug=False,
                         consider_version=False):
    """Given a fullpath, this function extracts the file's extension and
    finds in which additional format the file can be converted and converts it.

    When C{overwrite} is True only C{fullpath} itself is passed to
    C{get_missing_formats}; otherwise the files sharing its directory and
    base name (and version, when one was found) are passed.  If the file
    lives under C{CFG_BIBDOCFILE_FILEDIR} the converted files are created
    at the location chosen by C{convert_file}; otherwise they are written
    next to the original.  Conversion failures are registered and skipped.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats
    @param debug: (bool) if True, trace progress on stderr and switch the
                  file converter logger to DEBUG for the duration of the call
    @param consider_version: (bool) if True, consider the version info
                             in C{fullpath} to find missing format
                             for that specific version, if C{fullpath}
                             contains version info
    Return a list of the paths to the converted files
    """
    file_converter_logger = get_file_converter_logger()
    old_logging_level = file_converter_logger.getEffectiveLevel()
    if debug:
        file_converter_logger.setLevel(DEBUG)
    try:
        createdpaths = []
        if consider_version:
            try:
                basedir, filename, extension, version = decompose_file_with_version(
                    fullpath)
            except Exception:
                # No usable version info in the path: fall back to the plain
                # decomposition.  "Exception" rather than a bare except so
                # that KeyboardInterrupt/SystemExit are not swallowed.
                basedir, filename, extension = decompose_file(fullpath)
                version = 0
        else:
            basedir, filename, extension = decompose_file(fullpath)
            version = 0
        extension = extension.lower()
        if debug:
            print("basedir: %s, filename: %s, extension: %s" %
                  (basedir, filename, extension),
                  file=sys.stderr)

        if overwrite:
            missing_formats = get_missing_formats([fullpath])
        else:
            if version:
                filelist = glob.glob(
                    os.path.join(basedir, '%s*;%s' % (filename, version)))
            else:
                filelist = glob.glob(os.path.join(basedir, '%s*' % filename))
            if debug:
                print("filelist: %s" % filelist, file=sys.stderr)
            missing_formats = get_missing_formats(filelist)
        if debug:
            print("missing_formats: %s" % missing_formats, file=sys.stderr)
        for path, formats in iteritems(missing_formats):
            if debug:
                print("... path: %s, formats: %s" % (path, formats),
                      file=sys.stderr)
            for aformat in formats:
                if debug:
                    print("...... aformat: %s" % aformat, file=sys.stderr)
                newpath = os.path.join(basedir, filename + aformat)
                if debug:
                    print("...... newpath: %s" % newpath, file=sys.stderr)
                try:
                    if CFG_BIBDOCFILE_FILEDIR in basedir:
                        # We should create the new files in a temporary location, not
                        # directly inside the BibDoc directory.
                        newpath = convert_file(path, output_format=aformat)
                    else:
                        convert_file(path, newpath)
                    createdpaths.append(newpath)
                except InvenioWebSubmitFileConverterError as msg:
                    if debug:
                        print("...... Exception: %s" % msg, file=sys.stderr)
                    register_exception(alert_admin=True)
    finally:
        # Undo the temporary DEBUG level set above.
        if debug:
            file_converter_logger.setLevel(old_logging_level)
    return createdpaths

コード例 #7

0

ファイルを表示

ファイル: BibIndexFulltextTokenizer.py プロジェクト: mhellmic/b2share

    def get_words_from_fulltext(self, url_direct_or_indirect):
        """Return the words of the fulltext behind URL_DIRECT_OR_INDIRECT.

           For an internal (bibdocfile) URL the text comes from the matching
           document.  When the 'fulltext' index is not handled by the
           'native' indexer, the text of every record linked to the document
           is dispatched to SOLR or Xapian instead and an empty list is
           returned.

           For an external URL nothing is indexed when
           CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY is set.  Otherwise,
           if the URL matches a splash-page pattern of
           CFG_BIBINDEX_SPLASH_PAGES, the page is fetched and those of its
           links that match the associated pattern are selected; when no
           link is selected the URL itself is used.  Every selected URL is
           downloaded, converted to text and tokenized.

           Errors are registered and reported on stderr.  A failure on one
           external URL only skips that URL; any other failure makes the
           method return an empty list.
        """
        write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
        try:
            if bibdocfile_url_p(url_direct_or_indirect):
                write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
                indexer = get_idx_indexer('fulltext')
                if indexer != 'native':
                    # A document might belong to multiple records
                    for rec_link in bibdoc.bibrec_links:
                        recid = rec_link["recid"]
                        # Adds fulltexts of all files once per records
                        if recid not in fulltext_added:
                            bibrecdocs = BibRecDocs(recid)
                            text = bibrecdocs.get_text()
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(recid, text)
                            elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                xapian_add(recid, 'fulltext', text)

                        fulltext_added.add(recid)
                    # we are relying on an external information retrieval system
                    # to provide full-text indexing, so dispatch text to it and
                    # return nothing here:
                    return []
                else:
                    text = ""
                    if hasattr(bibdoc, "get_text"):
                        text = bibdoc.get_text()
                    return self.tokenize_for_words_default(text)
            else:
                if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                    write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                    return []
                write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
                urls_to_index = set()
                for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                    if re.match(splash_re, url_direct_or_indirect):
                        write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                        html = urllib2.urlopen(url_direct_or_indirect).read()
                        urls = get_links_in_html_page(html)
                        write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                        for url in urls:
                            if re.match(url_re, url):
                                write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                                urls_to_index.add(url)
                if not urls_to_index:
                    urls_to_index.add(url_direct_or_indirect)
                write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
                # Distinct words collected over all the downloaded documents.
                words = set()
                for url in urls_to_index:
                    tmpdoc = download_url(url)
                    file_converter_logger = get_file_converter_logger()
                    old_logging_level = file_converter_logger.getEffectiveLevel()
                    if self.verbose > 3:
                        file_converter_logger.setLevel(logging.DEBUG)
                    try:
                        try:
                            tmptext = convert_file(tmpdoc, output_format='.txt')
                            # Close the handle explicitly and drop the
                            # temporary text file even when reading fails.
                            try:
                                with open(tmptext) as text_file:
                                    text = text_file.read()
                            finally:
                                os.remove(tmptext)

                            indexer = get_idx_indexer('fulltext')
                            if indexer != 'native':
                                if indexer == 'SOLR' and CFG_SOLR_URL:
                                    solr_add_fulltext(None, text) # FIXME: use real record ID
                                if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                    #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                    pass
                                # we are relying on an external information retrieval system
                                # to provide full-text indexing, so dispatch text to it and
                                # return nothing here:
                                tmpwords = []
                            else:
                                tmpwords = self.tokenize_for_words_default(text)
                            words.update(tmpwords)
                        except Exception as e:
                            message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                            register_exception(prefix=message, alert_admin=True)
                            write_message(message, stream=sys.stderr)
                    finally:
                        # The downloaded copy is always removed and the
                        # converter logger level restored.
                        os.remove(tmpdoc)
                        if self.verbose > 3:
                            file_converter_logger.setLevel(old_logging_level)
                return list(words)
        except Exception as e:
            message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
            register_exception(prefix=message, alert_admin=True)
            write_message(message, stream=sys.stderr)
            return []

コード例 #8

0

ファイルを表示

ファイル: Shared_Functions.py プロジェクト: SCOAP3/invenio

def createRelatedFormats(fullpath, overwrite=True, debug=False, consider_version=False):
    """Given a fullpath, this function extracts the file's extension and
    finds in which additional format the file can be converted and converts it.

    When C{overwrite} is True only C{fullpath} itself is passed to
    C{get_missing_formats}; otherwise the files sharing its directory and
    base name (and version, when one was found) are passed.  If the file
    lives under C{CFG_BIBDOCFILE_FILEDIR} the converted files are created
    at the location chosen by C{convert_file}; otherwise they are written
    next to the original.  Conversion failures are registered and skipped.

    @param fullpath: (string) complete path to file
    @param overwrite: (bool) overwrite already existing formats
    @param debug: (bool) if True, trace progress on stderr and switch the
                  file converter logger to DEBUG for the duration of the call
    @param consider_version: (bool) if True, consider the version info
                             in C{fullpath} to find missing format
                             for that specific version, if C{fullpath}
                             contains version info
    Return a list of the paths to the converted files
    """
    file_converter_logger = get_file_converter_logger()
    old_logging_level = file_converter_logger.getEffectiveLevel()
    if debug:
        file_converter_logger.setLevel(DEBUG)
    try:
        createdpaths = []
        if consider_version:
            try:
                basedir, filename, extension, version = decompose_file_with_version(fullpath)
            except Exception:
                # No usable version info in the path: fall back to the plain
                # decomposition.  "Exception" rather than a bare except so
                # that KeyboardInterrupt/SystemExit are not swallowed.
                basedir, filename, extension = decompose_file(fullpath)
                version = 0
        else:
            basedir, filename, extension = decompose_file(fullpath)
            version = 0
        extension = extension.lower()
        if debug:
            print("basedir: %s, filename: %s, extension: %s" % (basedir, filename, extension), file=sys.stderr)

        if overwrite:
            missing_formats = get_missing_formats([fullpath])
        else:
            if version:
                filelist = glob.glob(os.path.join(basedir, '%s*;%s' % (filename, version)))
            else:
                filelist = glob.glob(os.path.join(basedir, '%s*' % filename))
            if debug:
                print("filelist: %s" % filelist, file=sys.stderr)
            missing_formats = get_missing_formats(filelist)
        if debug:
            print("missing_formats: %s" % missing_formats, file=sys.stderr)
        for path, formats in iteritems(missing_formats):
            if debug:
                print("... path: %s, formats: %s" % (path, formats), file=sys.stderr)
            for aformat in formats:
                if debug:
                    print("...... aformat: %s" % aformat, file=sys.stderr)
                newpath = os.path.join(basedir, filename + aformat)
                if debug:
                    print("...... newpath: %s" % newpath, file=sys.stderr)
                try:
                    if CFG_BIBDOCFILE_FILEDIR in basedir:
                        # We should create the new files in a temporary location, not
                        # directly inside the BibDoc directory.
                        newpath = convert_file(path, output_format=aformat)
                    else:
                        convert_file(path, newpath)
                    createdpaths.append(newpath)
                except InvenioWebSubmitFileConverterError as msg:
                    if debug:
                        print("...... Exception: %s" % msg, file=sys.stderr)
                    register_exception(alert_admin=True)
    finally:
        # Undo the temporary DEBUG level set above.
        if debug:
            file_converter_logger.setLevel(old_logging_level)
    return createdpaths

コード例 #9

0

ファイルを表示

    def get_words_from_fulltext(self, url_direct_or_indirect):
        """Return the words of the fulltext behind URL_DIRECT_OR_INDIRECT.

           For an internal (bibdocfile) URL the text comes from the matching
           document; if the URL cannot be resolved to a document an empty
           list is returned.  When the 'fulltext' index is not handled by
           the 'native' indexer, the text of every record linked to the
           document is dispatched to SOLR or Xapian instead (records whose
           text cannot be obtained are skipped) and an empty list is
           returned.

           For an external URL nothing is indexed when
           CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY is set.  Otherwise,
           if the URL matches a splash-page pattern of
           CFG_BIBINDEX_SPLASH_PAGES, the page is fetched and those of its
           links that match the associated pattern are selected; when no
           link is selected the URL itself is used.  Every selected URL is
           downloaded, converted to text and tokenized.

           Errors are registered and reported on stderr.  A failure on one
           external URL only skips that URL; any other failure makes the
           method return an empty list.
        """
        write_message("... reading fulltext files from %s started" %
                      url_direct_or_indirect,
                      verbose=2)
        try:
            if bibdocfile_url_p(url_direct_or_indirect):
                write_message("... %s is an internal document" %
                              url_direct_or_indirect,
                              verbose=2)
                try:
                    bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
                except InvenioBibDocFileError:
                    # Outdated 8564 tag
                    return []
                indexer = get_idx_indexer('fulltext')
                if indexer != 'native':
                    # A document might belong to multiple records
                    for rec_link in bibdoc.bibrec_links:
                        recid = rec_link["recid"]
                        # Adds fulltexts of all files once per records
                        if recid not in fulltext_added:
                            bibrecdocs = BibRecDocs(recid)
                            try:
                                text = bibrecdocs.get_text()
                            except InvenioBibDocFileError:
                                # Invalid PDF
                                continue
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(recid, text)
                            elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                xapian_add(recid, 'fulltext', text)

                        fulltext_added.add(recid)
                    # we are relying on an external information retrieval system
                    # to provide full-text indexing, so dispatch text to it and
                    # return nothing here:
                    return []
                else:
                    text = ""
                    if hasattr(bibdoc, "get_text"):
                        text = bibdoc.get_text()
                    return self.tokenize_for_words_default(text)
            else:
                if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                    write_message(
                        "... %s is external URL but indexing only local files"
                        % url_direct_or_indirect,
                        verbose=2)
                    return []
                write_message("... %s is an external URL" %
                              url_direct_or_indirect,
                              verbose=2)
                urls_to_index = set()
                for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                    if re.match(splash_re, url_direct_or_indirect):
                        write_message("... %s is a splash page (%s)" %
                                      (url_direct_or_indirect, splash_re),
                                      verbose=2)
                        html = urllib2.urlopen(url_direct_or_indirect).read()
                        urls = get_links_in_html_page(html)
                        write_message(
                            "... found these URLs in %s splash page: %s" %
                            (url_direct_or_indirect, ", ".join(urls)),
                            verbose=3)
                        for url in urls:
                            if re.match(url_re, url):
                                write_message(
                                    "... will index %s (matched by %s)" %
                                    (url, url_re),
                                    verbose=2)
                                urls_to_index.add(url)
                if not urls_to_index:
                    urls_to_index.add(url_direct_or_indirect)
                write_message("... will extract words from %s" %
                              ', '.join(urls_to_index),
                              verbose=2)
                # Distinct words collected over all the downloaded documents.
                words = set()
                for url in urls_to_index:
                    tmpdoc = download_url(url)
                    file_converter_logger = get_file_converter_logger()
                    old_logging_level = file_converter_logger.getEffectiveLevel(
                    )
                    if self.verbose > 3:
                        file_converter_logger.setLevel(logging.DEBUG)
                    try:
                        try:
                            tmptext = convert_file(tmpdoc,
                                                   output_format='.txt')
                            # Close the handle explicitly and drop the
                            # temporary text file even when reading fails.
                            try:
                                with open(tmptext) as text_file:
                                    text = text_file.read()
                            finally:
                                os.remove(tmptext)

                            indexer = get_idx_indexer('fulltext')
                            if indexer != 'native':
                                if indexer == 'SOLR' and CFG_SOLR_URL:
                                    solr_add_fulltext(
                                        None,
                                        text)  # FIXME: use real record ID
                                if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                    #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                    pass
                                # we are relying on an external information retrieval system
                                # to provide full-text indexing, so dispatch text to it and
                                # return nothing here:
                                tmpwords = []
                            else:
                                tmpwords = self.tokenize_for_words_default(
                                    text)
                            words.update(tmpwords)
                        except Exception as e:
                            message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (
                                url, url_direct_or_indirect, e)
                            register_exception(prefix=message,
                                               alert_admin=True)
                            write_message(message, stream=sys.stderr)
                    finally:
                        # The downloaded copy is always removed and the
                        # converter logger level restored.
                        os.remove(tmpdoc)
                        if self.verbose > 3:
                            file_converter_logger.setLevel(old_logging_level)
                return list(words)
        except Exception as e:
            message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (
                url_direct_or_indirect, e)
            register_exception(prefix=message, alert_admin=True)
            write_message(message, stream=sys.stderr)
            return []