def get_keywords_from_local_file(local_file, taxonomy, rebuild_cache=False,
        match_mode="full", no_cache=False, with_author_keywords=False):
    """Returns the single and composite keywords found in a local file."""
    text_lines = text_lines_from_local_file(local_file)

    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # Without an ontology there is nothing to match against.
            return None

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return (single_keywords, composite_keywords)
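# A hedged usage sketch (the file name and the "HEP" taxonomy name are
# illustrative assumptions, not fixtures from this module): extracting the
# single and composite keyword matches from one local document.
single_kws, composite_kws = get_keywords_from_local_file(
    "article.pdf", "HEP", match_mode="partial")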
def get_keywords_from_text(text_lines, taxonomy=None, output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
        match_mode="full", no_cache=False, with_author_keywords=False,
        rebuild_cache=False, only_core_tags=False):
    """Returns a formatted string containing the keywords for a single
    document."""
    global _SKWS
    global _CKWS
    if not _SKWS:
        if taxonomy is not None:
            _SKWS, _CKWS = get_regular_expressions(taxonomy,
                rebuild=rebuild_cache, no_cache=no_cache)
        else:
            write_message("ERROR: Please specify an ontology in order to "
                "extract keywords.", stream=sys.stderr, verbose=1)
            # Without an ontology there is nothing to match against.
            return None

    text_lines = cut_references(text_lines)
    fulltext = normalize_fulltext("\n".join(text_lines))

    author_keywords = None
    if with_author_keywords:
        author_keywords = get_author_keywords(_SKWS, _CKWS, fulltext)

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    single_keywords = get_single_keywords(_SKWS, fulltext)
    composite_keywords = get_composite_keywords(_CKWS, fulltext,
        single_keywords)

    return _get_keywords_output(single_keywords, composite_keywords, taxonomy,
        author_keywords, output_mode, output_limit, spires, only_core_tags)
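# Hedged example (taxonomy name assumed): producing a MARCXML-formatted
# keyword list for a document, reusing the line extractor from above.
marcxml = get_keywords_from_text(text_lines_from_local_file("article.pdf"),
    taxonomy="HEP", output_mode="marcxml", output_limit=20)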
def test_rebuild_cache(self):
    """bibclassify - test rebuilding cache (takes long time)"""
    info = bibclassify_ontology_reader._get_ontology(self.taxonomy_name)
    if info[0]:
        cache = bibclassify_ontology_reader._get_cache_path(info[0])
        if os.path.exists(cache):
            ctime = os.stat(cache)[stat.ST_CTIME]
        else:
            ctime = -1
        rex = bibclassify_ontology_reader.get_regular_expressions(
            self.taxonomy_name, rebuild=True)
        self.assertTrue(os.path.exists(cache))
        ntime = os.stat(cache)[stat.ST_CTIME]
        self.assertTrue(ntime > ctime)
    else:
        raise Exception("Taxonomy wasn't found")
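# A minimal sketch (an assumption, not part of the module) of the freshness
# check that get_regular_expressions() presumably performs before reusing a
# cache file: the cache is only trusted when it is newer than its taxonomy.
import os

def _cache_is_fresh(cache_path, taxonomy_path):
    # Hypothetical helper; the taxonomy file is assumed to exist.
    if not os.path.exists(cache_path):
        return False
    return os.path.getmtime(cache_path) >= os.path.getmtime(taxonomy_path)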
def output_keywords_for_sources(input_sources, taxonomy, rebuild_cache=False,
        output_mode="text",
        output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        match_mode="full", no_cache=False, with_author_keywords=False,
        spires=False, verbose=None, only_core_tags=False,
        extract_acronyms=False):
    """Outputs the keywords for each source in sources."""
    if verbose is not None:
        set_verbose_level(verbose)

    # Initialize cache
    global _SKWS
    global _CKWS
    _SKWS, _CKWS = get_regular_expressions(taxonomy, rebuild=rebuild_cache,
        no_cache=no_cache)

    # Get the fulltext for each source.
    for entry in input_sources:
        write_message("INFO: Trying input file %s." % entry,
            stream=sys.stderr, verbose=3)
        text_lines = None
        source = ""
        if os.path.isdir(entry):
            for filename in os.listdir(entry):
                # Join the paths instead of concatenating them, so that
                # directories given without a trailing slash also work.
                filepath = os.path.join(entry, filename)
                if os.path.isfile(filepath):
                    text_lines = text_lines_from_local_file(filepath)
                    if text_lines:
                        source = filename
        elif os.path.isfile(entry):
            text_lines = text_lines_from_local_file(entry)
            if text_lines:
                source = os.path.basename(entry)
        else:
            # Treat as a URL.
            text_lines = text_lines_from_url(entry,
                user_agent=CFG_BIBCLASSIFY_USER_AGENT)
            if text_lines:
                source = entry.split("/")[-1]

        if source:
            if output_mode == "text":
                print "Input file: %s" % source

            keywords = get_keywords_from_text(text_lines,
                output_mode=output_mode, output_limit=output_limit,
                spires=spires, match_mode=match_mode,
                with_author_keywords=with_author_keywords,
                only_core_tags=only_core_tags)

            if extract_acronyms:
                acronyms = get_acronyms("\n".join(text_lines))
                if acronyms:
                    acronyms_str = ["\nAcronyms:"]
                    for acronym, expansions in acronyms.iteritems():
                        expansions_str = ", ".join(["%s (%d)" % expansion
                            for expansion in expansions])
                        acronyms_str.append("%s %s" % (acronym,
                            expansions_str))
                    acronyms_str = "\n".join(acronyms_str)
                else:
                    acronyms_str = "\nNo acronyms."
                print keywords + acronyms_str + "\n"
            else:
                print keywords
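# Hedged usage sketch: the sources and the "HEP" taxonomy name are
# illustrative; any local file, directory, or URL is accepted as a source.
output_keywords_for_sources(
    ["paper.pdf", "http://example.org/preprint.pdf"], "HEP",
    output_mode="text", with_author_keywords=True, extract_acronyms=True)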
def test_cache_accessibility(self):
    """bibclassify - test cache accessibility/writability"""
    # We will do the tests with a copy of the test taxonomy, in case
    # anything goes wrong...
    orig_name, orig_taxonomy_path, orig_taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(self.taxonomy_name)

    taxonomy_path = orig_taxonomy_path.replace('.rdf', '.copy.rdf')
    taxonomy_name = self.taxonomy_name + '.copy'
    shutil.copy(orig_taxonomy_path, taxonomy_path)
    assert os.path.exists(taxonomy_path)

    name, taxonomy_path, taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(taxonomy_name)
    cache = bibclassify_ontology_reader._get_cache_path(
        os.path.basename(taxonomy_path))

    if not name:
        raise Exception("Taxonomy wasn't found")

    if os.path.exists(cache):
        os.remove(cache)

    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    assert os.path.exists(cache)

    log.error('Testing corrupted states, please ignore errors...')

    # Set cache unreadable.
    os.chmod(cache, 000)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=False, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 000 but no exception raised')

    # Set cache unreadable and test writing.
    os.chmod(cache, 000)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=True, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 000 but no exception raised')

    # Set cache unreadable but don't care for it.
    os.chmod(cache, 000)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=True)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=True)

    # Set cache readable and test writing.
    # Note: the mode values below are decimal literals (600 == 0o1130),
    # which leave the owner without read/write permission.
    os.chmod(cache, 600)
    try:
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
            rebuild=True, no_cache=False)
    except:
        pass
    else:
        raise Exception('cache chmod to 600 but no exception raised')

    # Set cache writable only.
    os.chmod(cache, 200)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=False)

    # Set cache readable/writable but corrupted (must rebuild itself).
    os.chmod(cache, 600)
    os.remove(cache)
    open(cache, 'w').close()
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=False, no_cache=False)

    # Set cache readable/writable but corrupted (must rebuild itself).
    open(cache, 'w').close()
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            pass
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    # Make cache ok, but corrupt source.
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            time.sleep(.1)
            # Touch the cache so that it appears newer than the taxonomy.
            os.utime(cache, (time.time() + 100, time.time() + 100))
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            # The rename-back is handled by the finally clause; doing it
            # here as well would make the second rename fail.
            raise Exception('Cache exists and is ok, but was ignored')
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    # Make cache ok (but old), and corrupt source.
    bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
        rebuild=True, no_cache=False)
    try:
        try:
            os.rename(taxonomy_path, taxonomy_path + 'x')
            open(taxonomy_path, 'w').close()
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                rebuild=False, no_cache=False)
        except:
            pass
    finally:
        os.rename(taxonomy_path + 'x', taxonomy_path)

    log.error('...testing of corrupted states finished.')

    name, taxonomy_path, taxonomy_url = \
        bibclassify_ontology_reader._get_ontology(taxonomy_name)
    cache = bibclassify_ontology_reader._get_cache_path(name)
    os.remove(taxonomy_path)
    os.remove(cache)
def get_keywords_from_text(text_lines, taxonomy_name, output_mode="text",
        output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER,
        spires=False, match_mode="full", no_cache=False,
        with_author_keywords=False, rebuild_cache=False,
        only_core_tags=False, extract_acronyms=False, **kwargs):
    """Extracts keywords from the list of strings.

    @var text_lines: list of strings (will be normalized before being
        joined into one string)
    @keyword taxonomy_name: string, name of the taxonomy
    @keyword output_mode: string - text|html|marcxml|raw
    @keyword output_limit: int
    @keyword spires: boolean, if True the marcxml output reflects SPIRES
        codes
    @keyword match_mode: str - partial|full; in partial mode only the
        beginning of the fulltext is searched
    @keyword no_cache: boolean, means loaded definitions will not be saved
    @keyword with_author_keywords: boolean, extract keywords from the pdfs
    @keyword rebuild_cache: boolean
    @keyword only_core_tags: boolean
    @return: if output_mode=raw, it will return (single_keywords,
        composite_keywords, author_keywords, acronyms); for other output
        modes it returns a formatted string
    """
    start_time = time.time()

    cache = reader.get_cache(taxonomy_name)
    if not cache:
        reader.set_cache(taxonomy_name,
            reader.get_regular_expressions(taxonomy_name,
                rebuild=rebuild_cache, no_cache=no_cache))
        cache = reader.get_cache(taxonomy_name)

    _skw = cache[0]
    _ckw = cache[1]

    text_lines = normalizer.cut_references(text_lines)
    fulltext = normalizer.normalize_fulltext("\n".join(text_lines))

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    author_keywords = None
    if with_author_keywords:
        author_keywords = extract_author_keywords(_skw, _ckw, fulltext)

    acronyms = {}
    if extract_acronyms:
        acronyms = extract_abbreviations(fulltext)

    single_keywords = extract_single_keywords(_skw, fulltext)
    composite_keywords = extract_composite_keywords(_ckw, fulltext,
        single_keywords)

    if only_core_tags:
        single_keywords = clean_before_output(
            _filter_core_keywors(single_keywords))
        composite_keywords = _filter_core_keywors(composite_keywords)
    else:
        # Filter out the "nonstandalone" keywords.
        single_keywords = clean_before_output(single_keywords)

    log.info('Keywords generated in: %.1f sec' % (time.time() - start_time))

    if output_mode == "raw":
        if output_limit:
            return (_kw(_sort_kw_matches(single_keywords, output_limit)),
                _kw(_sort_kw_matches(composite_keywords, output_limit)),
                author_keywords,  # this we don't limit (?)
                _kw(_sort_kw_matches(acronyms, output_limit)))
        else:
            return (single_keywords, composite_keywords, author_keywords,
                acronyms)
    else:
        return get_keywords_output(single_keywords, composite_keywords,
            taxonomy_name, author_keywords, acronyms, output_mode,
            output_limit, spires, only_core_tags)
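# Hedged usage sketch of the raw output mode (the file and taxonomy names
# are illustrative assumptions): raw mode returns the match structures
# instead of a formatted string, handy for further processing.
lines = open('fulltext.txt').read().split('\n')
single, composite, author_kws, acronyms = get_keywords_from_text(lines,
    'HEP', output_mode='raw', output_limit=20, extract_acronyms=True)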