def extract_abbreviations(fulltext):
    """Extract the acronyms found in the fulltext.

    Every acronym reported by acronymer.get_acronyms() is wrapped in a
    reader.KeywordToken of type 'acronym' and used as a key of the result;
    the value reported for that acronym is stored unchanged.

    @var fulltext: utf-8 string
    @return: dictionary of matches in the format
        { <keyword object>: <value reported for that acronym>, ... }
        or an empty {} when no acronym is reported
    """
    make_token = reader.KeywordToken
    reported = acronymer.get_acronyms(fulltext)
    return dict((make_token(name, type='acronym'), matches)
                for name, matches in reported.items())
def _iter_source_texts(entry):
    """Yield a (source name, text lines) pair for each readable input.

    A directory yields one pair per regular file found directly inside it,
    an existing file yields itself, and anything else is fetched as a URL.
    Inputs for which no text lines were extracted are left out.
    """
    if os.path.isdir(entry):
        # Sorted so that the output order does not depend on the order in
        # which the operating system lists the directory.
        for filename in sorted(os.listdir(entry)):
            # os.path.join() keeps the path valid whether or not the
            # directory was given with a trailing separator.
            path = os.path.join(entry, filename)
            if not os.path.isfile(path):
                continue
            text_lines = text_lines_from_local_file(path)
            if text_lines:
                yield filename, text_lines
    elif os.path.isfile(entry):
        text_lines = text_lines_from_local_file(entry)
        if text_lines:
            yield os.path.basename(entry), text_lines
    else:
        # Treat as a URL.
        text_lines = text_lines_from_url(
            entry, user_agent=CFG_BIBCLASSIFY_USER_AGENT)
        if text_lines:
            yield entry.split("/")[-1], text_lines


def _format_acronyms_report(acronyms):
    """Return the text block that lists the acronyms and their expansions."""
    if not acronyms:
        return "\nNo acronyms."
    report = ["\nAcronyms:"]
    for acronym, expansions in acronyms.items():
        # Each expansion is interpolated into "%s (%d)", i.e. it is expected
        # to be a two-item tuple ending with a number.
        expansions_str = ", ".join(["%s (%d)" % expansion
                                    for expansion in expansions])
        report.append("%s %s" % (acronym, expansions_str))
    return "\n".join(report)


def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
                                output_mode="text",
                                output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
                                match_mode="full", no_cache=False,
                                with_author_keywords=False, spires=False,
                                verbose=None, only_core_tags=False,
                                extract_acronyms=False):
    """Output the keywords for each source in input_sources.

    Every entry may be a directory (each regular file directly inside it is
    processed), a local file, or anything else, which is fetched as a URL.
    The result for every input that produced text is printed to standard
    output.

    @var input_sources: iterable of directory paths, file paths or URLs
    @var taxonomy: passed to get_regular_expressions() to load the module
        level _SKWS and _CKWS caches
    @var rebuild_cache: forwarded to get_regular_expressions() as 'rebuild'
    @var output_mode: forwarded to get_keywords_from_text(); when it is
        "text" an "Input file: ..." header is printed before the keywords
    @var output_limit: forwarded to get_keywords_from_text()
    @var match_mode: forwarded to get_keywords_from_text()
    @var no_cache: forwarded to get_regular_expressions()
    @var with_author_keywords: forwarded to get_keywords_from_text()
    @var spires: forwarded to get_keywords_from_text()
    @var verbose: when not None, passed to set_verbose_level()
    @var only_core_tags: forwarded to get_keywords_from_text()
    @var extract_acronyms: when true, a list of acronyms found in the text
        is appended to the printed keywords
    @return: None
    """
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
                                           no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
                      stream=sys.stderr, verbose=3)

        for source, text_lines in _iter_source_texts(entry):
            if not source:
                # An empty name (e.g. a URL ending in "/") is skipped.
                continue

            # A single parenthesised argument prints identically with the
            # Python 2 print statement and the Python 3 print function.
            if output_mode == "text":
                print("Input file: %s" % source)

            keywords = get_keywords_from_text(
                text_lines,
                output_mode=output_mode,
                output_limit=output_limit,
                spires=spires,
                match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                # NOTE(review): extract_abbreviations() reaches this helper
                # as acronymer.get_acronyms; confirm that the bare name is
                # imported in this module.
                acronyms = get_acronyms("\n".join(text_lines))
                print(keywords + _format_acronyms_report(acronyms) + "\n")
            else:
                print(keywords)