def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
        match_mode="full", no_cache=False, with_author_keywords=False):
    """Returns the single and composite keywords found in a local file."""
    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # Without an ontology there is nothing to match against.
            return None

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
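# A hedged usage sketch (the file name and the "HEP" taxonomy name are
# illustrative assumptions, not fixtures from this module): extracting the
# single and composite keyword matches from one local document.
single_kws, composite_kws = get_keywords_from_local_file(
    "article.pdf", "HEP", match_mode="partial")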
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
        match_mode="full", no_cache=False, with_author_keywords=False,
        rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # Without an ontology there is nothing to match against.
            return None

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
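# Hedged example (taxonomy name assumed): producing a MARCXML-formatted
# keyword list for a document, reusing the line extractor from above.
marcxml = get_keywords_from_text(text_lines_from_local_file("article.pdf"),
    taxonomy="HEP", output_mode="marcxml", output_limit=20)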
def test_rebuild_cache(self):
    """bibclassify - test rebuilding cache (takes long time)"""
    info = bibclassify_ontology_reader._get_ontology(self.taxonomy_name)
    if info[0]:
        cache = bibclassify_ontology_reader._get_cache_path(info[0])
        if os.path.exists(cache):
            ctime = os.stat(cache)[stat.ST_CTIME]
        else:
            ctime = -1
        rex = bibclassify_ontology_reader.get_regular_expressions(
            self.taxonomy_name, rebuild=True)
        self.assertTrue(os.path.exists(cache))
        ntime = os.stat(cache)[stat.ST_CTIME]
        self.assertTrue(ntime > ctime)
    else:
        raise Exception("Taxonomy wasn't found")
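# A minimal sketch (an assumption, not part of the module) of the freshness
# check that get_regular_expressions() presumably performs before reusing a
# cache file: the cache is only trusted when it is newer than its taxonomy.
import os

def _cache_is_fresh(cache_path, taxonomy_path):
    # Hypothetical helper; the taxonomy file is assumed to exist.
    if not os.path.exists(cache_path):
        return False
    return os.path.getmtime(cache_path) >= os.path.getmtime(taxonomy_path)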
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None, only_core_tags=False,
        extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
            stream=sys.stderr, verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Join the paths instead of concatenating them, so that
                # directories given without a trailing slash also work.
                filepath = os.path.join(entry, filename)
                if os.path.isfile(filepath):
                    text_lines = text_lines_from_local_file(filepath)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode, output_limit=output_limit,
                spires=spires, match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                            for expansion in expansions])
                        acronyms_str.append("%s %s" % (acronym,
                            expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."
                print keywords + acronyms_str + "\n"
            else:
                print keywords
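# Hedged usage sketch: the sources and the "HEP" taxonomy name are
# illustrative; any local file, directory, or URL is accepted as a source.
output_keywords_for_sources(
    ["paper.pdf", "http://example.org/preprint.pdf"], "HEP",
    output_mode="text", with_author_keywords=True, extract_acronyms=True)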
def test_cache_accessibility(self):
    """bibclassify - test cache accessibility/writability"""
    # We will do the tests with a copy of the test taxonomy, in case
    # anything goes wrong...
    orig_name, orig_taxonomy_path, orig_taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(self.taxonomy_name)

    taxonomy_path = orig_taxonomy_path.replace('.rdf', '.copy.rdf')
    taxonomy_name = self.taxonomy_name + '.copy'
    shutil.copy(orig_taxonomy_path, taxonomy_path)
    assert os.path.exists(taxonomy_path)

    name, taxonomy_path, taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(taxonomy_name)
    cache = bibclassify_ontology_reader._get_cache_path(
        os.path.basename(taxonomy_path))

    if not name:
        raise Exception("Taxonomy wasn't found")

    if os.path.exists(cache):
        os.remove(cache)

    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    assert os.path.exists(cache)

    log.error('Testing corrupted states, please ignore errors...')

    # Set cache unreadable.
    os.chmod(cache, 000)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=False, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 000 but no exception raised')

    # Set cache unreadable and test writing.
    os.chmod(cache, 000)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=True, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 000 but no exception raised')

    # Set cache unreadable but don't care for it.
    os.chmod(cache, 000)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=True)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=True)

    # Set cache readable and test writing.
    # Note: the mode values below are decimal literals (600 == 0o1130),
    # which leave the owner without read/write permission.
    os.chmod(cache, 600)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=True, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 600 but no exception raised')

    # Set cache writable only.
    os.chmod(cache, 200)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=False)

    # Set cache readable/writable but corrupted (must rebuild itself).
    os.chmod(cache, 600)
    os.remove(cache)
    open(cache, 'w').close()
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=False)

    # Set cache readable/writable but corrupted (must rebuild itself).
    open(cache, 'w').close()
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            pass
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    # Make cache ok, but corrupt source.
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            time.sleep(.1)
            # Touch the cache so that it appears newer than the taxonomy.
            os.utime(cache, (time.time() + 100, time.time() + 100))
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            # The rename-back is handled by the finally clause; doing it
            # here as well would make the second rename fail.
            raise Exception('Cache exists and is ok, but was ignored')
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    # Make cache ok (but old), and corrupt source.
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            pass
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    log.error('...testing of corrupted states finished.')

    name, taxonomy_path, taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(taxonomy_name)
    cache = bibclassify_ontology_reader._get_cache_path(name)
    os.remove(taxonomy_path)
    os.remove(cache)
def get_keywords_from_text(text_lines, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False,
        only_core_tags=False, extract_acronyms=False, **kwargs):
    """Extracts keywords from the list of strings.

    @var text_lines: list of strings (will be normalized before being
        joined into one string)
    @keyword taxonomy_name: string, name of the taxonomy
    @keyword output_mode: string - text|html|marcxml|raw
    @keyword output_limit: int
    @keyword spires: boolean, if True the marcxml output reflects SPIRES
        codes
    @keyword match_mode: str - partial|full; in partial mode only the
        beginning of the fulltext is searched
    @keyword no_cache: boolean, means loaded definitions will not be saved
    @keyword with_author_keywords: boolean, extract keywords from the pdfs
    @keyword rebuild_cache: boolean
    @keyword only_core_tags: boolean
    @return: if output_mode=raw, it will return (single_keywords,
        composite_keywords, author_keywords, acronyms); for other output
        modes it returns a formatted string
    """
    start_time = time.time()

    cache = reader.get_cache(taxonomy_name)
    if not cache:
        reader.set_cache(taxonomy_name,
            reader.get_regular_expressions(taxonomy_name,
                rebuild=rebuild_cache, no_cache=no_cache))
        cache = reader.get_cache(taxonomy_name)

    _skw = cache[0]
    _ckw = cache[1]

    text_lines = normalizer.cut_references(text_lines)
    fulltext = normalizer.normalize_fulltext("\n".join(text_lines))

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    author_keywords = None
    if with_author_keywords:
        author_keywords = extract_author_keywords(_skw, _ckw, fulltext)

    acronyms = {}
    if extract_acronyms:
        acronyms = extract_abbreviations(fulltext)

    single_keywords = extract_single_keywords(_skw, fulltext)
    composite_keywords = extract_composite_keywords(_ckw, fulltext,
        single_keywords)

    if only_core_tags:
        single_keywords = clean_before_output(
            _filter_core_keywors(single_keywords))
        composite_keywords = _filter_core_keywors(composite_keywords)
    else:
        # Filter out the "nonstandalone" keywords.
        single_keywords = clean_before_output(single_keywords)

    log.info('Keywords generated in: %.1f sec' % (time.time() - start_time))

    if output_mode == "raw":
        if output_limit:
            return (_kw(_sort_kw_matches(single_keywords, output_limit)),
                _kw(_sort_kw_matches(composite_keywords, output_limit)),
                author_keywords,  # this we don't limit (?)
                _kw(_sort_kw_matches(acronyms, output_limit)))
        else:
            return (single_keywords, composite_keywords, author_keywords,
                acronyms)
    else:
        return get_keywords_output(single_keywords, composite_keywords,
            taxonomy_name, author_keywords, acronyms, output_mode,
            output_limit, spires, only_core_tags)
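# Hedged usage sketch of the raw output mode (the file and taxonomy names
# are illustrative assumptions): raw mode returns the match structures
# instead of a formatted string, handy for further processing.
lines = open('fulltext.txt').read().split('\n')
single, composite, author_kws, acronyms = get_keywords_from_text(lines,
    'HEP', output_mode='raw', output_limit=20, extract_acronyms=True)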