Example #1
0
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        Build the command line for the URL of test file 94, parse it with
        ``bibclassify_cli._read_options``, run the keyword extraction with
        the parsed options and validate the captured output with
        ``check_pdf2``.
        """
        # Only the URL half of the pair is used by this test.
        _path, url = self.get_test_file(94)

        args = ("-k %s.rdf %s" % (self.taxonomy_name, url)).split()
        options = bibclassify_cli._read_options(args)

        self.redirect()
        try:
            bibclassify_engine.output_keywords_for_sources(
                options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"])
        finally:
            # Always pair redirect() with unredirect(), even when the
            # extraction raises; otherwise the redirection (presumably of the
            # output streams -- helper is defined elsewhere) would stay active
            # for whatever runs after this test.
            results, _errors = self.unredirect()

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)
Example #2
0
    def test_cli_extract_from_directory(self):
        """bibclassify -k ${taxonomy}.rdf directory/

        Run the keyword extraction on the directory that contains test file
        94 and validate the captured output with ``check_pdf2``. When that
        directory does not exist, a note is written to stderr and the test
        returns without checking anything.
        """
        # Only the local path is used; the URL is ignored.
        path, _url = self.get_test_file(94)

        path = os.path.dirname(path)

        if not os.path.exists(path):
            sys.stderr.write("No PDF folder for testing found, returning\n")
            return

        args = ("-k %s.rdf %s" % (self.taxonomy_name, path)).split()
        options = bibclassify_cli._read_options(args)

        self.redirect()
        try:
            bibclassify_engine.output_keywords_for_sources(
                options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"])
        finally:
            # Undo the redirection even if the extraction raises, so a
            # failure here cannot leave it in place for later tests.
            results, _errors = self.unredirect()

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        The URL of test file 94 is passed on a ``-k <taxonomy>.rdf <url>``
        command line; the options parsed from it drive the keyword
        extraction, whose captured output must satisfy ``check_pdf2``.
        """
        # get_test_file() yields a (path, url) pair; the path is not needed.
        _unused_path, url = self.get_test_file(94)

        args = ("-k %s.rdf %s" % (self.taxonomy_name, url)).split()
        options = bibclassify_cli._read_options(args)

        self.redirect()
        try:
            bibclassify_engine.output_keywords_for_sources(
                options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"],
            )
        finally:
            # unredirect() must run on the error path too: without it an
            # exception in the engine would leave redirect() in effect after
            # this test has finished.
            results, _errors = self.unredirect()

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)
    def test_cli_extract_from_directory(self):
        """bibclassify -k ${taxonomy}.rdf directory/

        The parent directory of test file 94 is passed on a
        ``-k <taxonomy>.rdf <directory>`` command line and the captured
        extraction output must satisfy ``check_pdf2``. If the directory is
        missing, the test writes a note to stderr and returns early.
        """
        # get_test_file() yields a (path, url) pair; the URL is not needed.
        path, _unused_url = self.get_test_file(94)

        path = os.path.dirname(path)

        if not os.path.exists(path):
            sys.stderr.write("No PDF folder for testing found, returning\n")
            return

        args = ("-k %s.rdf %s" % (self.taxonomy_name, path)).split()
        options = bibclassify_cli._read_options(args)

        self.redirect()
        try:
            bibclassify_engine.output_keywords_for_sources(
                options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"],
            )
        finally:
            # Keep redirect()/unredirect() balanced when the engine raises,
            # so the redirection never outlives this test.
            results, _errors = self.unredirect()

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)
Example #5
0
def main():
    """Run bibclassify from the command line.

    The arguments come from ``sys.argv``: everything after the first entry
    containing ``'bibclassify'`` is used, or everything after the first entry
    when no entry contains it.

    The daemon is started when any of the following holds:

    * there are no arguments and ``bconfig.STANDALONE`` is false;
    * the only argument consists of digits (bibsched style);
    * any argument starts with one of the daemon-specific options.

    Otherwise the arguments are parsed with ``_read_options``, the taxonomy
    is optionally checked, and keywords are output for the given sources.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index + 1:]
            break
    else:
        arguments = arguments[1:]

    run_as_daemon = False

    # Check if running in standalone or daemon mode.
    if not arguments and not bconfig.STANDALONE:
        run_as_daemon = True
    elif len(arguments) == 1 and arguments[0].isdigit():
        # Running the task with its PID number (bibsched style).
        run_as_daemon = True

    # str.startswith accepts a tuple, so one pass over the arguments covers
    # every daemon-specific option prefix.
    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')
    if any(arg.startswith(specific_daemon_options) for arg in arguments):
        run_as_daemon = True

    if run_as_daemon:
        # A failing import would raise before the error branch below could
        # run, so turn ImportError into "no daemon available".
        try:
            from invenio import bibclassify_daemon as daemon
        except ImportError:
            daemon = None
        if daemon:
            daemon.bibclassify_daemon()
        else:
            log.error(
                "We are running in a standalone mode, can't start daemon")
    else:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
Example #6
0
def main():
    """Command-line entry point: start the daemon or extract keywords.

    Arguments are read from ``sys.argv``. If an entry contains
    ``'bibclassify'``, only the entries after the first such entry are kept;
    otherwise the first entry is dropped.

    Daemon mode is selected when there are no arguments and
    ``bconfig.STANDALONE`` is false, when the single argument is all digits
    (bibsched style), or when an argument starts with one of ``-i``,
    ``--recid``, ``-c``, ``--collection`` or ``-f``. In every other case the
    arguments are parsed with ``_read_options`` and the keyword extraction
    is run with the resulting options.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index + 1:]
            break
    else:
        arguments = arguments[1:]

    run_as_daemon = False

    # Check if running in standalone or daemon mode.
    if not arguments and not bconfig.STANDALONE:
        run_as_daemon = True
    elif len(arguments) == 1 and arguments[0].isdigit():
        # Running the task with its PID number (bibsched style).
        run_as_daemon = True

    # A tuple of prefixes lets a single startswith() call test them all.
    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')
    if any(arg.startswith(specific_daemon_options) for arg in arguments):
        run_as_daemon = True

    if run_as_daemon:
        # If the daemon module cannot be imported, report it through the
        # error branch below instead of letting ImportError escape and make
        # that branch unreachable.
        try:
            from invenio import bibclassify_daemon as daemon
        except ImportError:
            daemon = None
        if daemon:
            daemon.bibclassify_daemon()
        else:
            log.error("We are running in a standalone mode, can't start daemon")
    else:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Run the extraction twice on the local copy of test file 94: once
        without ``-m`` and once with ``-m partial``. The captured output of
        the first run is checked with ``check_pdf2`` and that of the second
        with ``check_pdf1``. When the file does not exist, a note is written
        to stderr and the test returns without checking anything.
        """
        # Only the local path is used; the URL is ignored.
        path, _url = self.get_test_file(94)

        if not os.path.exists(path):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        results = []
        for case in [
            "-k %s.rdf %s" % (self.taxonomy_name, path),
            "-k %s.rdf %s -m partial" % (self.taxonomy_name, path),
        ]:
            args = case.split()
            options = bibclassify_cli._read_options(args)

            self.redirect()
            try:
                bibclassify_engine.output_keywords_for_sources(
                    options["text_files"],
                    options["taxonomy"],
                    rebuild_cache=options["rebuild_cache"],
                    no_cache=options["no_cache"],
                    output_mode=options["output_mode"],
                    output_limit=options["output_limit"],
                    spires=options["spires"],
                    match_mode=options["match_mode"],
                    with_author_keywords=options["with_author_keywords"],
                    extract_acronyms=options["extract_acronyms"],
                    only_core_tags=options["only_core_tags"],
                )
            finally:
                # Undo the redirection even if this run raises, so it never
                # stays active past the failing iteration.
                output, _errors = self.unredirect()
            results.append(output)

        # results[0] is the run without -m, results[1] the "-m partial" run.
        res, msg = check_pdf1(results[1])
        if not res:
            self.fail(msg)
        res, msg = check_pdf2(results[0])
        if not res:
            self.fail(msg)
Example #8
0
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Two command lines are exercised against the local copy of test file
        94: the plain ``-k <taxonomy>.rdf <path>`` form and the same form
        with ``-m partial`` appended. The output captured for the plain run
        must satisfy ``check_pdf2``; the output of the ``-m partial`` run
        must satisfy ``check_pdf1``. If the file is missing, the test writes
        a note to stderr and returns early.
        """
        # get_test_file() yields a (path, url) pair; the URL is not needed.
        path, _unused_url = self.get_test_file(94)

        if not os.path.exists(path):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        results = []
        for case in [
                "-k %s.rdf %s" % (self.taxonomy_name, path),
                "-k %s.rdf %s -m partial" % (self.taxonomy_name, path)
        ]:
            args = case.split()
            options = bibclassify_cli._read_options(args)

            self.redirect()
            try:
                bibclassify_engine.output_keywords_for_sources(
                    options["text_files"],
                    options["taxonomy"],
                    rebuild_cache=options["rebuild_cache"],
                    no_cache=options["no_cache"],
                    output_mode=options["output_mode"],
                    output_limit=options["output_limit"],
                    spires=options["spires"],
                    match_mode=options["match_mode"],
                    with_author_keywords=options["with_author_keywords"],
                    extract_acronyms=options["extract_acronyms"],
                    only_core_tags=options["only_core_tags"])
            finally:
                # Keep redirect()/unredirect() balanced on the error path so
                # a failing run cannot leave the redirection in place.
                captured, _errors = self.unredirect()
            results.append(captured)

        # Index 0 holds the plain run, index 1 the "-m partial" run.
        res, msg = check_pdf1(results[1])
        if not res:
            self.fail(msg)
        res, msg = check_pdf2(results[0])
        if not res:
            self.fail(msg)