def test_rebuild_cache(self):
        """bibclassify - test rebuilding cache (takes long time)"""

        info = bibclassify_ontology_reader._get_ontology(self.taxonomy_name)

        if info[0]:
            cache = bibclassify_ontology_reader._get_cache_path(info[0])

            if os.path.exists(cache):
                ctime = os.stat(cache)[stat.ST_CTIME]
            else:
                ctime = -1

        rex = bibclassify_ontology_reader.get_regular_expressions(
            self.taxonomy_name, rebuild=True)

            self.assertTrue(os.path.exists(cache))
            ntime = os.stat(cache)[stat.ST_CTIME]

        self.assertTrue(ntime > ctime)
        else:
            raise Exception("Taxonomy wasn't found")
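The test relies on the cache file's timestamp being refreshed by a forced rebuild. The staleness check it exercises can be sketched with the standard library alone; the helper name and the mtime-based policy below are illustrative, not part of bibclassify_ontology_reader:

import os


def is_cache_stale(cache_path, source_path):
    """Return True when the cache is missing or older than its source.

    Illustrative helper: it compares modification times, the same kind of
    timestamp signal the test above inspects via os.stat().
    """
    if not os.path.exists(cache_path):
        return True
    return os.path.getmtime(cache_path) < os.path.getmtime(source_path)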
    def test_cache_accessibility(self):
        """bibclassify - test cache accessibility/writability"""

        # run the tests on a copy of the test taxonomy, in case anything goes wrong...
        orig_name, orig_taxonomy_path, orig_taxonomy_url = bibclassify_ontology_reader._get_ontology(
            self.taxonomy_name)

        taxonomy_path = orig_taxonomy_path.replace('.rdf', '.copy.rdf')
        taxonomy_name = self.taxonomy_name + '.copy'

        shutil.copy(orig_taxonomy_path, taxonomy_path)
        assert os.path.exists(taxonomy_path)

        name, taxonomy_path, taxonomy_url = bibclassify_ontology_reader._get_ontology(
            taxonomy_name)

        cache = bibclassify_ontology_reader._get_cache_path(
            os.path.basename(taxonomy_path))

        if not name:
            raise Exception("Taxonomy wasn't found")

        if os.path.exists(cache):
            os.remove(cache)

        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=True,
                                                            no_cache=False)
        assert os.path.exists(cache)

        log.error('Testing corrupted states, please ignore errors...')

        # set cache unreadable
        os.chmod(cache, 0o000)
        try:
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                                rebuild=False,
                                                                no_cache=False)
        except Exception:
            pass
        else:
            raise Exception('cache chmod to 0o000 but no exception raised')

        # set cache unreadable and test writing
        os.chmod(cache, 0o000)
        try:
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                                rebuild=True,
                                                                no_cache=False)
        except Exception:
            pass
        else:
            raise Exception('cache chmod to 0o000 but no exception raised')

        # set cache unreadable, but ignore it (no_cache=True)
        os.chmod(cache, 0o000)
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=False,
                                                            no_cache=True)
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=True,
                                                            no_cache=True)

        # set cache read-only and test writing (the rebuild must fail)
        os.chmod(cache, 0o400)
        try:
            bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                                rebuild=True,
                                                                no_cache=False)
        except Exception:
            pass
        else:
            raise Exception('cache chmod to 0o400 but no exception raised')

        # set cache writable only
        os.chmod(cache, 0o200)
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=True,
                                                            no_cache=False)
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=False,
                                                            no_cache=False)

        # set cache readable/writable but corrupted (must rebuild itself)
        os.chmod(cache, 0o600)
        os.remove(cache)
        open(cache, 'w').close()
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=False,
                                                            no_cache=False)

        # corrupt both the cache and the source taxonomy
        open(cache, 'w').close()
        try:
            try:
                os.rename(taxonomy_path, taxonomy_path + 'x')
                open(taxonomy_path, 'w').close()
                bibclassify_ontology_reader.get_regular_expressions(
                    taxonomy_name, rebuild=False, no_cache=False)
            except Exception:
                pass
        finally:
            os.rename(taxonomy_path + 'x', taxonomy_path)

        # make cache ok, but corrupt source
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=True,
                                                            no_cache=False)

        try:
            try:
                os.rename(taxonomy_path, taxonomy_path + 'x')
                open(taxonomy_path, 'w').close()
                time.sleep(0.1)
                # touch the cache so it is newer than the (corrupted) source
                os.utime(cache, (time.time() + 100, time.time() + 100))
                bibclassify_ontology_reader.get_regular_expressions(
                    taxonomy_name, rebuild=False, no_cache=False)
            except Exception:
                raise Exception('Cache exists and is ok, but was ignored')
        finally:
            os.rename(taxonomy_path + 'x', taxonomy_path)

        # make cache ok (but old), and corrupt source
        bibclassify_ontology_reader.get_regular_expressions(taxonomy_name,
                                                            rebuild=True,
                                                            no_cache=False)
        try:
            try:
                os.rename(taxonomy_path, taxonomy_path + 'x')
                open(taxonomy_path, 'w').close()
                bibclassify_ontology_reader.get_regular_expressions(
                    taxonomy_name, rebuild=False, no_cache=False)
            except Exception:
                pass
        finally:
            os.rename(taxonomy_path + 'x', taxonomy_path)

        log.error('...testing of corrupted states finished.')

        name, taxonomy_path, taxonomy_url = bibclassify_ontology_reader._get_ontology(
            taxonomy_name)
        cache = bibclassify_ontology_reader._get_cache_path(name)
        os.remove(taxonomy_path)
        os.remove(cache)
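The rename-away/restore dance around the taxonomy file appears three times in the test above. A context manager would keep that try/finally pattern in one place; this is a sketch using only the standard library, and the name temporarily_corrupted is illustrative rather than part of bibclassify:

import os
from contextlib import contextmanager


@contextmanager
def temporarily_corrupted(path):
    """Replace *path* with an empty file for the duration of the block.

    The original file is moved aside and always restored on exit,
    mirroring the os.rename()/finally pattern used in the test above.
    """
    backup = path + 'x'
    os.rename(path, backup)
    open(path, 'w').close()
    try:
        yield path
    finally:
        os.remove(path)
        os.rename(backup, path)

With it, each corrupted-source block shrinks to:

with temporarily_corrupted(taxonomy_path):
    try:
        bibclassify_ontology_reader.get_regular_expressions(
            taxonomy_name, rebuild=False, no_cache=False)
    except Exception:
        pass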
def get_keywords_from_text(text_lines, taxonomy_name, output_mode="text",
    output_limit=bconfig.CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False,
    match_mode="full", no_cache=False, with_author_keywords=False,
    rebuild_cache=False, only_core_tags=False, extract_acronyms=False,
    **kwargs):
    """Extracts keywords from the list of strings

    @var text_lines: list of strings (will be normalized before being
        joined into one string)
    @keyword taxonomy_name: string, name of the taxonomy_name
    @keyword output_mode: string - text|html|marcxml|raw
    @keyword output_limit: int
    @keyword spires: boolean, if True marcxml output reflect spires
        codes
    @keyword match_mode: str - partial|full; in partial mode only
        beginning of the fulltext is searched
    @keyword no_cache: boolean, means loaded definitions will not be saved
    @keyword with_author_keywords: boolean, extract keywords from the
        pdfs
    @keyword rebuild_cache: boolean
    @keyword only_core_tags: boolean
    @return: if output_mode=raw, it will return
            (single_keywords, composite_keywords, author_keywords, acronyms)
            for other output modes it returns formatted string
    """

    start_time = time.time()
    cache = reader.get_cache(taxonomy_name)
    if not cache:
        reader.set_cache(taxonomy_name,
                         reader.get_regular_expressions(taxonomy_name,
                                                        rebuild=rebuild_cache,
                                                        no_cache=no_cache))
        cache = reader.get_cache(taxonomy_name)

    _skw = cache[0]
    _ckw = cache[1]

    text_lines = normalizer.cut_references(text_lines)
    fulltext = normalizer.normalize_fulltext("\n".join(text_lines))

    if match_mode == "partial":
        fulltext = _get_partial_text(fulltext)

    author_keywords = None
    if with_author_keywords:
        author_keywords = extract_author_keywords(_skw, _ckw, fulltext)

    acronyms = {}
    if extract_acronyms:
        acronyms = extract_abbreviations(fulltext)

    single_keywords = extract_single_keywords(_skw, fulltext)
    composite_keywords = extract_composite_keywords(_ckw, fulltext,
                                                    single_keywords)

    if only_core_tags:
        single_keywords = clean_before_output(
            _filter_core_keywors(single_keywords))
        composite_keywords = _filter_core_keywors(composite_keywords)
    else:
        # Filter out the "nonstandalone" keywords
        single_keywords = clean_before_output(single_keywords)

    log.info('Keywords generated in: %.1f sec' % (time.time() - start_time))

    if output_mode == "raw":
        if output_limit:
            return (_kw(_sort_kw_matches(single_keywords, output_limit)),
                    _kw(_sort_kw_matches(composite_keywords, output_limit)),
                    author_keywords, # this we don't limit (?)
                    _kw(_sort_kw_matches(acronyms, output_limit)))
        else:
            return (single_keywords, composite_keywords, author_keywords, acronyms)
    else:
        return get_keywords_output(single_keywords, composite_keywords,
                                   taxonomy_name, author_keywords, acronyms,
                                   output_mode, output_limit, spires,
                                   only_core_tags)
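A short usage sketch: the taxonomy name 'HEP' and the sample text below are hypothetical, and the call assumes the module's collaborators (bconfig, reader, normalizer and the extract_* helpers) are already set up:

# 'HEP' is an assumed taxonomy name; substitute one installed locally.
sample_lines = [
    "We study quark-gluon plasma formation in heavy-ion collisions",
    "and discuss the implications for lattice QCD calculations.",
]
single, composite, author_kws, acronyms = get_keywords_from_text(
    sample_lines, 'HEP', output_mode="raw", with_author_keywords=True)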