Esempio n. 1
0
    def test_cli_extract_from_url(self):
        """bibclassify -k HEP.rdf http://arxiv.org/pdf/0808.1825

        Parses the command line with ``bibclassify_cli._read_options``, runs
        the engine with the parsed options, then validates the text read back
        from ``self.stdout`` with ``check_pdf0``.
        """
        cli_args = "-k HEP.rdf http://arxiv.org/pdf/0808.1825".split()
        opts = bibclassify_cli._read_options(cli_args)

        # Options forwarded verbatim to the engine as keyword arguments.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, opts[key]) for key in forwarded)

        bibclassify_engine.output_keywords_for_sources(
            opts["text_files"], opts["taxonomy"], **engine_kwargs)

        # Rewind each stream before reading it back in full.
        self.stdout.seek(0)
        captured_out = self.stdout.read()
        self.stderr.seek(0)
        captured_err = self.stderr.read()  # read back but not inspected

        passed, reason = check_pdf0(captured_out)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        Uses the URL half of the pair returned by ``self.get_test_file(94)``
        as the extraction source and validates the first value returned by
        ``self.unredirect()`` with ``check_pdf2``.
        """
        # Only the URL is needed here; the local path is ignored.
        _, record_url = self.get_test_file(94)

        command_line = "-k %s.rdf %s" % (self.taxonomy_name, record_url)
        opts = bibclassify_cli._read_options(command_line.split())

        self.redirect()

        bibclassify_engine.output_keywords_for_sources(
            opts["text_files"], opts["taxonomy"],
            rebuild_cache=opts["rebuild_cache"],
            no_cache=opts["no_cache"],
            output_mode=opts["output_mode"],
            output_limit=opts["output_limit"],
            spires=opts["spires"],
            match_mode=opts["match_mode"],
            with_author_keywords=opts["with_author_keywords"],
            extract_acronyms=opts["extract_acronyms"],
            only_core_tags=opts["only_core_tags"],
        )

        # The second value of the pair is not inspected.
        captured, _ = self.unredirect()

        passed, reason = check_pdf2(captured)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_directory(self):
        """bibclassify -k ${taxonomy}.rdf directory/

        Runs the extraction on the directory containing the file returned by
        ``self.get_test_file(94)``. Returns early, after writing a note to
        ``sys.stderr``, when that directory does not exist.
        """
        file_path, _ = self.get_test_file(94)
        folder = os.path.dirname(file_path)

        if not os.path.exists(folder):
            sys.stderr.write("No PDF folder for testing found, returning\n")
            return

        command_line = "-k %s.rdf %s" % (self.taxonomy_name, folder)
        opts = bibclassify_cli._read_options(command_line.split())

        self.redirect()

        # Options forwarded verbatim to the engine as keyword arguments.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        bibclassify_engine.output_keywords_for_sources(
            opts["text_files"], opts["taxonomy"],
            **dict((key, opts[key]) for key in forwarded))

        captured, _ = self.unredirect()

        passed, reason = check_pdf2(captured)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        Feeds the URL returned by ``self.get_test_file(94)`` to the engine
        and checks the first value returned by ``self.unredirect()`` with
        ``check_pdf2``.
        """
        test_file = self.get_test_file(94)
        source_url = test_file[1]  # (path, url) pair; only the URL is used

        argv = ("-k %s.rdf %s" % (self.taxonomy_name, source_url)).split()
        parsed = bibclassify_cli._read_options(argv)

        self.redirect()

        bibclassify_engine.output_keywords_for_sources(
            parsed["text_files"],
            parsed["taxonomy"],
            rebuild_cache=parsed["rebuild_cache"],
            no_cache=parsed["no_cache"],
            output_mode=parsed["output_mode"],
            output_limit=parsed["output_limit"],
            spires=parsed["spires"],
            match_mode=parsed["match_mode"],
            with_author_keywords=parsed["with_author_keywords"],
            extract_acronyms=parsed["extract_acronyms"],
            only_core_tags=parsed["only_core_tags"])

        output, _ = self.unredirect()

        outcome = check_pdf2(output)
        if not outcome[0]:
            self.fail(outcome[1])
    def test_cli_extract_from_filepath(self):
        """bibclassify -k ${taxonomy}.rdf {cache}/article.pdf

        Runs the extraction on the local path returned by
        ``self.get_test_file(94)``. Returns early, after writing a note to
        ``sys.stderr``, when that path does not exist.
        """
        pdf_path, _ = self.get_test_file(94)

        if not os.path.exists(pdf_path):
            sys.stderr.write("No PDF for testing found, please load demo records\n")
            return

        command_line = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
        opts = bibclassify_cli._read_options(command_line.split())

        self.redirect()

        # Options forwarded verbatim to the engine as keyword arguments.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, opts[key]) for key in forwarded)
        bibclassify_engine.output_keywords_for_sources(
            opts["text_files"], opts["taxonomy"], **engine_kwargs)

        captured, _ = self.unredirect()

        passed, reason = check_pdf2(captured)
        if not passed:
            self.fail(reason)
Esempio n. 6
0
def main():
    """Run bibclassify either as a daemon task or as a standalone tool.

    Every element of ``sys.argv`` up to and including the first one that
    contains ``'bibclassify'`` is discarded; when no element matches, only
    ``sys.argv[0]`` is dropped.

    Daemon mode is selected when any of the following holds:

    * there are no arguments left and ``bconfig.STANDALONE`` is false;
    * the only argument consists of digits (bibsched-style task number);
    * any argument starts with one of the daemon-specific options.

    Otherwise the arguments are parsed with ``_read_options`` and the
    keyword extraction engine is run on the requested sources.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index + 1:]
            break
    else:
        arguments = arguments[1:]

    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')

    # ``str.startswith`` accepts a tuple, so one pass over the arguments
    # covers every daemon-specific option.
    run_as_daemon = (
        (not arguments and not bconfig.STANDALONE)
        or (len(arguments) == 1 and arguments[0].isdigit())
        or any(arg.startswith(specific_daemon_options) for arg in arguments)
    )

    if run_as_daemon:
        # A successful import always yields a truthy module, so the
        # "can't start daemon" branch is only reachable by catching the
        # ImportError raised when the daemon module is unavailable.
        try:
            import bibclassify_daemon as daemon
        except ImportError:
            daemon = None

        if daemon is None:
            log.error(
                "We are running in a standalone mode, can't start daemon")
        else:
            daemon.bibclassify_daemon()
    else:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
Esempio n. 7
0
def main():
    """Dispatch the bibclassify command line to the daemon or the engine.

    Arguments are taken from ``sys.argv`` after dropping everything up to
    and including the first element containing ``'bibclassify'`` (or just
    the first element when none matches).

    The daemon is started when no arguments remain and
    ``bconfig.STANDALONE`` is false, when the single remaining argument is
    all digits (bibsched-style task number), or when any argument starts
    with a daemon-specific option. In every other case the options are read
    with ``_read_options`` and keywords are output for the given sources.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index + 1:]
            break
    else:
        arguments = arguments[1:]

    run_as_daemon = False

    # Check if running in standalone or daemon mode.
    if not arguments and not bconfig.STANDALONE:
        run_as_daemon = True
    elif len(arguments) == 1 and arguments[0].isdigit():
        # Running the task with its PID number (bibsched style).
        run_as_daemon = True

    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')
    # ``startswith`` takes the whole tuple of prefixes at once.
    if any(arg.startswith(specific_daemon_options) for arg in arguments):
        run_as_daemon = True

    if run_as_daemon:
        # Importing can only produce a module or raise; the error branch
        # is therefore driven by ImportError rather than by truthiness.
        try:
            import bibclassify_daemon as daemon
        except ImportError:
            log.error("We are running in a standalone mode, can't start daemon")
        else:
            daemon.bibclassify_daemon()
    else:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
Esempio n. 8
0
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Runs the engine twice on the same PDF, first with the default
        matching mode and then with ``-m partial``, collecting what each run
        wrote to ``self.stdout``. The default-mode output is validated with
        ``check_pdf2`` and the partial-mode output with ``check_pdf1``.
        Returns early, after writing a note to ``sys.stderr``, when the PDF
        is not present.
        """

        path = os.path.join(os.path.dirname(__file__), '../../../var/data/files/g0/90/9611103.pdf;1')
        if not os.path.exists(path):
            sys.stderr.write("No PDF for testing found, returning")
            return

        results = []
        for case in ["-k HEP.rdf %s" % path, "-k HEP.rdf %s -m partial" % path]:
            args = (case).split()
            options = bibclassify_cli._read_options(args)

            # Rewind before truncating: truncate() on file objects and io
            # streams leaves the position untouched, so after the previous
            # iteration's read() the next write would land past the (now
            # empty) end and pad the stream with NUL bytes.
            for stream in (self.stdout, self.stderr):
                stream.seek(0)
                stream.truncate(0)

            bibclassify_engine.output_keywords_for_sources(options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"])

            self.stdout.flush()
            self.stdout.seek(0)
            results.append(self.stdout.read())
            # stderr content is not inspected; it is reset at the top of
            # the next iteration.
            self.stderr.flush()

        # results[0] is the default-mode run, results[1] the partial one.
        res, msg = check_pdf1(results[1])
        if not res:
            self.fail(msg)
        res, msg = check_pdf2(results[0])
        if not res:
            self.fail(msg)
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Runs the engine on the path returned by ``self.get_test_file(94)``
        once with the default matching mode and once with ``-m partial``.
        The first run is validated with ``check_pdf2`` and the second with
        ``check_pdf1``. Returns early, after writing a note to
        ``sys.stderr``, when the path does not exist.
        """
        pdf_path, _ = self.get_test_file(94)

        if not os.path.exists(pdf_path):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        full_case = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
        partial_case = "-k %s.rdf %s -m partial" % (self.taxonomy_name, pdf_path)

        # Options forwarded verbatim to the engine as keyword arguments.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")

        outputs = []
        for command_line in (full_case, partial_case):
            opts = bibclassify_cli._read_options(command_line.split())

            self.redirect()

            bibclassify_engine.output_keywords_for_sources(
                opts["text_files"], opts["taxonomy"],
                **dict((key, opts[key]) for key in forwarded))

            captured, _ = self.unredirect()
            outputs.append(captured)

        # outputs[0] is the default-mode run, outputs[1] the partial one.
        passed, reason = check_pdf1(outputs[1])
        if not passed:
            self.fail(reason)
        passed, reason = check_pdf2(outputs[0])
        if not passed:
            self.fail(reason)
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Extracts keywords from the path returned by
        ``self.get_test_file(94)`` twice: with the default matching mode and
        with ``-m partial``. ``check_pdf2`` validates the former and
        ``check_pdf1`` the latter. Returns early, after writing a note to
        ``sys.stderr``, when the path does not exist.
        """
        test_file = self.get_test_file(94)
        source = test_file[0]  # (path, url) pair; only the path is used

        if not os.path.exists(source):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        cases = [
            "-k %s.rdf %s" % (self.taxonomy_name, source),
            "-k %s.rdf %s -m partial" % (self.taxonomy_name, source),
        ]

        captured_runs = []
        for command_line in cases:
            parsed = bibclassify_cli._read_options(command_line.split())

            self.redirect()

            bibclassify_engine.output_keywords_for_sources(
                parsed["text_files"],
                parsed["taxonomy"],
                rebuild_cache=parsed["rebuild_cache"],
                no_cache=parsed["no_cache"],
                output_mode=parsed["output_mode"],
                output_limit=parsed["output_limit"],
                spires=parsed["spires"],
                match_mode=parsed["match_mode"],
                with_author_keywords=parsed["with_author_keywords"],
                extract_acronyms=parsed["extract_acronyms"],
                only_core_tags=parsed["only_core_tags"],
            )

            # Keep only the first value of the returned pair.
            captured_runs.append(self.unredirect()[0])

        # The partial-mode run is checked first, then the default-mode run.
        ok, message = check_pdf1(captured_runs[1])
        if not ok:
            self.fail(message)
        ok, message = check_pdf2(captured_runs[0])
        if not ok:
            self.fail(message)
Esempio n. 11
0
    def test_cli_extract_from_directory(self):
        """bibclassify -k HEP.rdf directory/"""


        path = os.path.abspath(os.path.dirname(__file__) + '/../../../var/data/files/g0/90')

        if not os.path.exists(path):
            print "No PDF folder for testing found, returning"
            return


        args = ("-k HEP.rdf %s" % path).split()
        options = bibclassify_cli._read_options(args)

        bibclassify_engine.output_keywords_for_sources(options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])


        self.stdout.flush()
        self.stdout.seek(0)
        results = self.stdout.read()
        self.stderr.flush()
        self.stderr.seek(0)
        errors = self.stderr.read()

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)