Esempio n. 1
0
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
    output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
Esempio n. 2
0
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
    match_mode="full", no_cache=False, with_author_keywords=False):

    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
    output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
    match_mode="full", no_cache=False, with_author_keywords=False):

    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)

    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
Esempio n. 5
0
def get_keywords_from_text(
        text_lines,
        taxonomy_name,
        output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        rebuild_cache=False,
        only_core_tags=False,
        extract_acronyms=False,
        **kwargs):
    """Extracts keywords from the list of strings

    @var text_lines: list of strings (will be normalized before being
        joined into one string)
    @keyword taxonomy_name: string, name of the taxonomy_name
    @keyword output_mode: string - text|html|marcxml|raw
    @keyword output_limit: int
    @keyword spires: boolean, if True marcxml output reflect spires
        codes
    @keyword match_mode: str - partial|full; in partial mode only
        beginning of the fulltext is searched
    @keyword no_cache: boolean, means loaded definitions will not be saved
    @keyword with_author_keywords: boolean, extract keywords from the
        pdfs
    @keyword rebuild_cache: boolean
    @keyword only_core_tags: boolean
    @return: if output_mode=raw, it will return
            (single_keywords, composite_keywords, author_keywords, acronyms)
            for other output modes it returns formatted string
    """

    start_time = time.time()
    cache = reader.get_cache(taxonomy_name)
    if not cache:
        reader.set_cache(
            taxonomy_name,
            reader.get_regular_expressions(taxonomy_name,
                                           rebuild=rebuild_cache,
                                           no_cache=no_cache))
        cache = reader.get_cache(taxonomy_name)

    _skw = cache[0]
    _ckw = cache[1]

    text_lines = normalizer.cut_references(text_lines)
    fulltext = normalizer.normalize_fulltext("\n".join(text_lines))

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    author_keywords = None
    if with_author_keywords:
        author_keywords = extract_author_keywords(_skw, _ckw, fulltext)

    acronyms = {}
    if extract_acronyms:
        acronyms = extract_abbreviations(fulltext)

    single_keywords = extract_single_keywords(_skw, fulltext)
    composite_keywords = extract_composite_keywords(_ckw, fulltext,
                                                    single_keywords)

    if only_core_tags:
        single_keywords = clean_before_output(
            _filter_core_keywors(single_keywords))
        composite_keywords = _filter_core_keywors(composite_keywords)
    else:
        # Filter out the "nonstandalone" keywords
        single_keywords = clean_before_output(single_keywords)

    log.info('Keywords generated in: %.1f sec' % (time.time() - start_time))

    if output_mode == "raw":
        if output_limit:
            return (
                _kw(_sort_kw_matches(single_keywords, output_limit)),
                _kw(_sort_kw_matches(composite_keywords, output_limit)),
                author_keywords,  # this we don't limit (?)
                _kw(_sort_kw_matches(acronyms, output_limit)))
        else:
            return (single_keywords, composite_keywords, author_keywords,
                    acronyms)
    else:
        return get_keywords_output(single_keywords, composite_keywords,
                                   taxonomy_name, author_keywords, acronyms,
                                   output_mode, output_limit, spires,
                                   only_core_tags)
def get_keywords_from_text(text_lines, taxonomy_name, output_mode="text",
    output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False, extract_acronyms=False,
    **kwargs):
    """Extracts keywords from the list of strings

    @var text_lines: list of strings (will be normalized before being
        joined into one string)
    @keyword taxonomy_name: string, name of the taxonomy_name
    @keyword output_mode: string - text|html|marcxml|raw
    @keyword output_limit: int
    @keyword spires: boolean, if True marcxml output reflect spires
        codes
    @keyword match_mode: str - partial|full; in partial mode only
        beginning of the fulltext is searched
    @keyword no_cache: boolean, means loaded definitions will not be saved
    @keyword with_author_keywords: boolean, extract keywords from the
        pdfs
    @keyword rebuild_cache: boolean
    @keyword only_core_tags: boolean
    @return: if output_mode=raw, it will return
            (single_keywords, composite_keywords, author_keywords, acronyms)
            for other output modes it returns formatted string
    """

    cache = reader.get_cache(taxonomy_name)
    if not cache:
        reader.set_cache(taxonomy_name, reader.get_regular_expressions(taxonomy_name,
                rebuild=rebuild_cache, no_cache=no_cache))
        cache = reader.get_cache(taxonomy_name)


    _skw = cache[0]
    _ckw = cache[1]

    text_lines = normalizer.cut_references(text_lines)
    fulltext = normalizer.normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = extract_author_keywords(_skw, _ckw, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = extract_single_keywords(_skw, fulltext)

    composite_keywords = extract_composite_keywords(_ckw, fulltext, single_keywords)

    acronyms = {}
    if extract_acronyms:
        acronyms = extract_abbreviations(fulltext)

    if output_mode == "raw":
        if output_limit:
            return (_kw(_sort_kw_matches(single_keywords, output_limit)),
                    _kw(_sort_kw_matches(composite_keywords, output_limit)),
                    author_keywords, # this we don't limit (?)
                    _kw(_sort_kw_matches(acronyms, output_limit)))
        else:
            return (single_keywords, composite_keywords, author_keywords, acronyms)
    else:
        return _get_keywords_output(single_keywords, composite_keywords, taxonomy_name,
                                    author_keywords, acronyms, output_mode, output_limit,
                                    spires, only_core_tags)