def output_keywords_for_sources(input_sources, taxonomy_name,
                                output_mode="text",
                                output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Outputs the keywords for each source in sources."""
    # Inner function that does the actual work. Refactoring the call
    # site would be too much work, and it must be defined outside the
    # loop: previously, multiple files were not processed.
    def process_lines():
        if output_mode == "text":
            print "Input file: %s" % source

        output = get_keywords_from_text(text_lines,
                                        taxonomy_name,
                                        output_mode=output_mode,
                                        output_limit=output_limit,
                                        spires=spires,
                                        match_mode=match_mode,
                                        no_cache=no_cache,
                                        with_author_keywords=with_author_keywords,
                                        rebuild_cache=rebuild_cache,
                                        only_core_tags=only_core_tags,
                                        extract_acronyms=extract_acronyms)
        print output

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(entry,
                user_agent=bconfig.CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
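A minimal invocation sketch for the version above. The directory, URL, and taxonomy name are illustrative only, and `bconfig` is assumed to be BibClassify's imported config module, as in the function body:

# Hypothetical call: harvest keywords from every file in a directory and
# from one remote document, against the HEP taxonomy (illustrative paths).
output_keywords_for_sources(["/tmp/fulltexts", "http://example.org/paper.pdf"],
                            "HEP",
                            output_mode="text",
                            output_limit=20,
                            with_author_keywords=True)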
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
                                output_mode="text",
                                output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                spires=False,
                                verbose=None,
                                only_core_tags=False,
                                extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize the cache of compiled keyword regular expressions.
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy,
                                           rebuild=rebuild_cache,
                                           no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
                      stream=sys.stderr, verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Join the path properly instead of concatenating strings,
                # which broke when the directory lacked a trailing separator.
                filepath = os.path.join(entry, filename)
                if os.path.isfile(filepath):
                    text_lines = text_lines_from_local_file(filepath)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                                              output_mode=output_mode,
                                              output_limit=output_limit,
                                              spires=spires,
                                              match_mode=match_mode,
                                              with_author_keywords=with_author_keywords,
                                              only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                                                    for expansion in expansions])
                        acronyms_str.append("%s %s" % (acronym, expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."
                print keywords + acronyms_str + "\n"
            else:
                print keywords
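The acronym block above implies that `get_acronyms` returns a mapping from each acronym to a list of `(expansion, count)` tuples; that shape is inferred from the `"%s (%d)" % expansion` format string, not documented here. A self-contained sketch of the same formatting with made-up data:

# Made-up data in the shape the loop expects: acronym -> [(expansion, count)].
acronyms = {"LHC": [("Large Hadron Collider", 5)],
            "QCD": [("quantum chromodynamics", 3), ("QCD theory", 1)]}
acronyms_str = ["\nAcronyms:"]
for acronym, expansions in acronyms.iteritems():
    # Each (expansion, count) tuple feeds the "%s (%d)" format directly.
    expansions_str = ", ".join(["%s (%d)" % expansion
                                for expansion in expansions])
    acronyms_str.append("%s %s" % (acronym, expansions_str))
print "\n".join(acronyms_str)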
def output_keywords_for_sources(input_sources, taxonomy_name,
                                output_mode="text",
                                output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                spires=False,
                                match_mode="full",
                                no_cache=False,
                                with_author_keywords=False,
                                rebuild_cache=False,
                                only_core_tags=False,
                                extract_acronyms=False,
                                **kwargs):
    """Outputs the keywords for each source in sources."""
    # Inner function that does the actual work. Refactoring the call
    # site would be too much work, and it must be defined outside the
    # loop: previously, multiple files were not processed.
    def process_lines():
        if output_mode == "text":
            print "Input file: %s" % source

        output = get_keywords_from_text(text_lines,
                                        taxonomy_name,
                                        output_mode=output_mode,
                                        output_limit=output_limit,
                                        spires=spires,
                                        match_mode=match_mode,
                                        no_cache=no_cache,
                                        with_author_keywords=with_author_keywords,
                                        rebuild_cache=rebuild_cache,
                                        only_core_tags=only_core_tags,
                                        extract_acronyms=extract_acronyms)
        print output

    # Get the fulltext for each source.
    for entry in input_sources:
        log.info("Trying to read input file %s." % entry)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                filename = os.path.join(entry, filename)
                if os.path.isfile(filename):
                    text_lines = extractor.text_lines_from_local_file(filename)
                    if text_lines:
                        source = filename
                        process_lines()
        elif os.path.isfile(entry):
            text_lines = extractor.text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
                process_lines()
        else:
            # Treat as a URL.
            text_lines = extractor.text_lines_from_url(
                entry, user_agent=make_user_agent_string("BibClassify"))
            if text_lines:
                source = entry.split("/")[-1]
                process_lines()
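The only functional change from the first version is the user agent for URL fetches: a string built by `make_user_agent_string("BibClassify")` instead of the static `bconfig.CFG_BIBCLASSIFY_USER_AGENT` constant. A guarded-import sketch, assuming the helper lives in `invenio.textutils` as in Invenio 1.x; the fallback definition is purely hypothetical:

try:
    from invenio.textutils import make_user_agent_string
except ImportError:
    # Hypothetical fallback mirroring the older behaviour: a static,
    # component-tagged user agent string (not Invenio's actual default).
    def make_user_agent_string(component):
        return "Invenio %s" % component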