def output_keywords_for_local_file(
        local_file,
        taxonomy,
        rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full",
        no_cache=False,
        with_author_keywords=False,
        spires=False,
        verbose=None):
    """Extract and return the keywords found in a single local file.

    The file is read into text lines and handed to the generic keyword
    extractor together with the chosen taxonomy and output options.
    Passing ``verbose`` adjusts the module-wide verbosity level first.
    """
    if verbose is not None:
        set_verbose_level(verbose)

    write_message("INFO: Analyzing keywords for local file %s." % local_file,
                  stream=sys.stderr,
                  verbose=3)

    extracted_lines = text_lines_from_local_file(local_file)

    # Gather the extraction options once, then forward them in a single call.
    extraction_options = {
        "taxonomy": taxonomy,
        "output_mode": output_mode,
        "output_limit": output_limit,
        "match_mode": match_mode,
        "spires": spires,
        "with_author_keywords": with_author_keywords,
        "rebuild_cache": rebuild_cache,
        "no_cache": no_cache,
    }
    return get_keywords_from_text(extracted_lines, **extraction_options)
# --- Esempio n. 2 (scraped example separator; vote count 0) ---
def output_keywords_for_local_file(local_file, taxonomy, rebuild_cache=False,
    output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
    match_mode="full", no_cache=False, with_author_keywords=False,
    spires=False, verbose=None):
    """Run keyword extraction over one local file and return the result.

    An explicit ``verbose`` value overrides the module verbosity before
    anything else happens; the file content is then converted to text
    lines and passed through the shared keyword-extraction pipeline.
    """
    # Honour a caller-supplied verbosity level before emitting any output.
    if verbose is not None:
        set_verbose_level(verbose)

    write_message(
        "INFO: Analyzing keywords for local file %s." % local_file,
        stream=sys.stderr,
        verbose=3,
    )

    file_lines = text_lines_from_local_file(local_file)

    return get_keywords_from_text(
        file_lines,
        taxonomy=taxonomy,
        output_mode=output_mode,
        output_limit=output_limit,
        match_mode=match_mode,
        spires=spires,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        no_cache=no_cache,
    )
# --- Esempio n. 3 (scraped example separator; vote count 0) ---
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
    output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
    match_mode="full", no_cache=False, with_author_keywords=False,
    spires=False, verbose=None, only_core_tags=False, extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry, stream=sys.stderr,
            verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if os.path.isfile(entry + filename):
                    text_lines = text_lines_from_local_file(entry + filename)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode,
                output_limit=output_limit,
                spires=spires,
                match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                                                    for expansion in expansions])

                        acronyms_str.append("%s  %s" % (acronym, expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."

                print keywords + acronyms_str + "\n"
            else:
                print keywords
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
    output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
    match_mode="full", no_cache=False, with_author_keywords=False,
    spires=False, verbose=None, only_core_tags=False, extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry, stream=sys.stderr,
            verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                if os.path.isfile(entry + filename):
                    text_lines = text_lines_from_local_file(entry + filename)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode,
                output_limit=output_limit,
                spires=spires,
                match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                                                    for expansion in expansions])

                        acronyms_str.append("%s  %s" % (acronym, expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."

                print keywords + acronyms_str + "\n"
            else:
                print keywords