Exemple #1
0
    def setUp(self):
        """Prepare the fixture used by the tests.

        Redirects ``config.CFG_TMPDIR`` to the system temporary directory,
        remembers the current ``sys.stdout``/``sys.stderr`` objects, sets the
        taxonomy name to ``"test"`` and raises the bibclassify logging level
        to CRITICAL. The previous values are kept on the instance; restoring
        them is presumably done in ``tearDown`` (not visible here).
        """
        ## NOTE next time please make sure that you change global variables
        ## back to initial values in tearDown. Thank you!!!
        # Keep the original temp dir before overriding the module-level value.
        # NOTE(review): a double leading underscore is name-mangled by Python
        # when the method lives inside a class body.
        self.__CFG_TMPDIR = config.CFG_TMPDIR
        config.CFG_TMPDIR = tempfile.gettempdir()

        # Remember the current standard streams; the replacement slots start
        # out empty.
        self.oldstdout = sys.stdout
        self.oldstderr = sys.stderr
        self.stdout = None
        self.stderr = None

        self.taxonomy_name = "test"
        # Imported locally, inside the method.
        from invenio import bibclassify_config as bconfig
        self.log = bconfig.get_logger("bibclassify.tests")
        # Save the current logging level, then silence everything below
        # CRITICAL.
        self.log_level = bconfig.logging_level
        bconfig.set_global_level(bconfig.logging.CRITICAL)
Exemple #2
0
import time
import urllib2
import traceback
import xml.sax
import thread
import time

# rdflib is treated as optional: if it cannot be imported, both names are
# bound to None instead of letting the ImportError propagate.
try:
    import rdflib
    # Alias for rdflib's base exception class.
    rdflib_exceptions_Error = rdflib.exceptions.Error
except ImportError:
    rdflib = None
    rdflib_exceptions_Error = None

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

# dbquery and the Invenio URL opener are used only when not running in standalone mode
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio import dbquery
    from invenio.urlutils import make_invenio_opener
    urlopen = make_invenio_opener('BibClassify').open

_contains_digit = re.compile("\d")
_starts_with_non = re.compile("(?i)^non[a-z]")
_starts_with_anti = re.compile("(?i)^anti[a-z]")
_split_by_punctuation = re.compile("(\W+)")
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""Module for running microtests on how well the extraction works -
this module is STANDALONE safe"""

import ConfigParser
import glob
import traceback
import codecs

from invenio import bibclassify_config as bconfig
from invenio import bibclassify_engine as engine
log = bconfig.get_logger("bibclassify.microtest")


def run(glob_patterns,
        verbose=20,
        plevel = 1
        ):
    """Execute microtests"""

    if verbose is not None:
        log.setLevel(int(verbose))

    results = {}
    for pattern in glob_patterns:
        log.info("Looking for microtests: %s" % pattern)
        for cfgfile in glob.glob(pattern):
"""
BibClassify keyword analyser.

This module contains methods to extract keywords from texts. It provides 3
different methods for 3 different types of keywords: single keywords, composite
keywords and author keywords.

This module is STANDALONE safe
"""

import re
import sys
import time

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.keyword_analyzer")

# Length of the longest separator listed in
# bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS.
_MAXIMUM_SEPARATOR_LENGTH = max([len(_separator)
    for _separator in bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS])


# XXX - rebuild this whole thing
def get_single_keywords(skw_db, fulltext):
    """Find single keywords in the fulltext
    @var skw_db: list of KeywordToken objects
    @var fulltext: string, which will be searched
    @return : dictionary of matches in a format {
            <keyword object>, [[position, position...], ],
            ..
            }
    """
This module provides methods to clean the text lines. Currently, the methods
are tuned to work with the output of pdftotext and documents in the HEP field.
Methods can be tuned to your needs through the configuration file.

This module uses the refextract module of BibEdit in order to find the
references section and to replace unicode characters.
"""

import sys
import re
from invenio import bibclassify_config as bconfig

from invenio.docextract_pdf import replace_undesirable_characters
from invenio.refextract_find import find_reference_section, find_end_of_reference_section

log = bconfig.get_logger("bibclassify.text_normalizer")

_washing_regex = []
def get_washing_regex():
    global _washing_regex
    if len(_washing_regex):
        return _washing_regex

    washing_regex = [
        # Replace non and anti with non- and anti-. This allows a better
        # detection of keywords such as nonabelian.
        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
        # Remove all leading numbers (e.g. 2-pion -> pion).
        (re.compile(r"\s\d-"), " "),
        # Remove multiple spaces.
Exemple #6
0
from warnings import warn

from invenio.config import CFG_SITE_URL
from invenio.testutils import make_test_suite, run_test_suite, \
    test_web_page_content

from invenio.bibclassify_unit_tests import BibClassifyTestCase, suite

from invenio import bibclassify_config as bconfig
from invenio.testutils import make_test_suite, run_test_suite
from invenio import config
from invenio import bibclassify_engine
from invenio import bibclassify_cli
from invenio import bibclassify_ontology_reader

log = bconfig.get_logger("bibclassify.regression_tests")

# do this only if not in STANDALONE mode
bibclassify_daemon = dbquery = None
if not bconfig.STANDALONE:
    from invenio import dbquery
    from invenio import bibclassify_daemon
    from invenio import bibdocfile


class BibClassifyRegressionTest(BibClassifyTestCase):
    """Check BibClassify web pages whether they are up or not."""
    def test_availability_bibclassify_admin_guide(self):
        """bibclassify - availability of BibClassify Admin Guide page"""
        self.assertEqual(
            [],
import sys

from invenio.testutils import InvenioTestCase
import tempfile
import cStringIO
import os
import time
import stat
import shutil

from invenio import bibclassify_config as bconfig
from invenio.testutils import make_test_suite, run_test_suite, nottest
from invenio import config
from invenio import bibclassify_ontology_reader

log = bconfig.get_logger("bibclassify.tests")

# do this only if not in STANDALONE mode
bibclassify_daemon = dbquery = None
if not bconfig.STANDALONE:
    from invenio import bibdocfile


class BibClassifyTestCase(InvenioTestCase):
    """ Abusive test suite - the one that takes sooooo long """

    def setUp(self):
        """Initialize stuff"""
        #self.tmpdir = invenio.config.CFG_TMPDIR
        config.CFG_TMPDIR = tempfile.gettempdir()
But unfortunately there is a confusion between running in a standalone mode
and producing output suitable for printing, and running in a web-based
mode where the webtemplate is used. For the moment the pieces of the representation
code are left in this module.

This module is STANDALONE safe
"""

import os
import random
import sys
import time
import cgi

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.engine")

from invenio import bibclassify_ontology_reader as reader
from invenio import bibclassify_text_extractor as extractor
from invenio import bibclassify_text_normalizer as normalizer
from invenio import bibclassify_keyword_analyzer as keyworder
from invenio import bibclassify_acronym_analyzer as acronymer

from invenio.urlutils import make_user_agent_string
from invenio.textutils import encode_for_xml

# ---------------------------------------------------------------------
#                          API
# ---------------------------------------------------------------------

import time
import urllib2
import traceback
import xml.sax
import thread
import time

# rdflib is treated as optional: if it cannot be imported, both names are
# bound to None instead of letting the ImportError propagate.
try:
    import rdflib
    # Alias for rdflib's base exception class.
    rdflib_exceptions_Error = rdflib.exceptions.Error
except ImportError:
    rdflib = None
    rdflib_exceptions_Error = None

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

# dbquery and the Invenio URL opener are used only when not running in standalone mode
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio import dbquery
    from invenio.urlutils import make_invenio_opener
    urlopen = make_invenio_opener('BibClassify').open

_contains_digit = re.compile("\d")
_starts_with_non = re.compile("(?i)^non[a-z]")
_starts_with_anti = re.compile("(?i)^anti[a-z]")
_split_by_punctuation = re.compile("(\W+)")
Exemple #10
0
This module provides methods to clean the text lines. Currently, the methods
are tuned to work with the output of pdftotext and documents in the HEP field.
Methods can be tuned to your needs through the configuration file.

This module uses the refextract module of BibEdit in order to find the
references section and to replace unicode characters.
"""

import sys
import re
from invenio import bibclassify_config as bconfig

from invenio.docextract_pdf import replace_undesirable_characters
from invenio.refextract_find import find_reference_section, find_end_of_reference_section

log = bconfig.get_logger("bibclassify.text_normalizer")

_washing_regex = []


def get_washing_regex():
    global _washing_regex
    if len(_washing_regex):
        return _washing_regex

    washing_regex = [
        # Replace non and anti with non- and anti-. This allows a better
        # detection of keywords such as nonabelian.
        (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"),
        (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"),
        # Remove all leading numbers (e.g. 2-pion -> pion).
import os
from warnings import warn

from invenio.config import CFG_SITE_URL
from invenio.testutils import make_test_suite, run_test_suite, test_web_page_content

from invenio.bibclassify_unit_tests import BibClassifyTestCase, suite

from invenio import bibclassify_config as bconfig
from invenio.testutils import make_test_suite, run_test_suite
from invenio import config
from invenio import bibclassify_engine
from invenio import bibclassify_cli
from invenio import bibclassify_ontology_reader

log = bconfig.get_logger("bibclassify.regression_tests")

# do this only if not in STANDALONE mode
bibclassify_daemon = dbquery = None
if not bconfig.STANDALONE:
    from invenio import dbquery
    from invenio import bibclassify_daemon
    from invenio import bibdocfile


class BibClassifyRegressionTest(BibClassifyTestCase):
    """Check BibClassify web pages whether they are up or not."""

    def test_availability_bibclassify_admin_guide(self):
        """bibclassify - availability of BibClassify Admin Guide page"""
        self.assertEqual(
"""Template for the bibclassify -
this module is NOT standalone safe - it is not expected to be
used in a standalone mode ever.

Some template variables are coming directly from the config
module, those starting with CFG_BIBCLASSIFY_WEB....
"""

import cgi
from invenio import config
from invenio.messages import gettext_set_language
from urllib import quote
from invenio.htmlutils import escape_html
from invenio import bibclassify_config as bconfig
from invenio import bibclassify_ontology_reader as reader
log = bconfig.get_logger("bibclassify.template")


class Template:
    def tmpl_page(self,
                  keywords=None,
                  top='',
                  middle='',
                  bottom='',
                  navbar=None,
                  req=None,
                  ln=None,
                  generate=None,
                  sorting=None,
                  type=None,
                  numbering=None,
"""Template for the bibclassify -
this module is NOT standalone safe - it is not expected to be
used in a standalone mode ever.

Some template variables are coming directly from the config
module, those starting with CFG_BIBCLASSIFY_WEB....
"""

import cgi
from invenio import config
from invenio.messages import gettext_set_language
from urllib import quote
from invenio.htmlutils import escape_html
from invenio import bibclassify_config as bconfig
from invenio import bibclassify_ontology_reader as reader
log = bconfig.get_logger("bibclassify.template")


class Template:

    def tmpl_page(self,
                  keywords=None,
                  top='',
                  middle='',
                  bottom='',
                  navbar=None,
                  req=None,
                  ln=None,
                  generate=None,
                  sorting=None,
                  type=None,
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BibClassify command-line interface.

This module provides a CLI for BibClassify. It reads the options and calls
the method output_keywords_for_sources from bibclassify_engine.

This module is STANDALONE safe
"""

import getopt
import sys

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.cli")

from invenio import bibclassify_engine as engine
from invenio import bibclassify_ontology_reader as reader

daemon = None


def get_recids_list(recids_string):
    """Returns a list of recIDs."""
    recids = {}
    elements = recids_string.split(",")
    for element in elements:
        bounds = element.split("-")
        bounds_nb = len(bounds)
        if bounds_nb == 1:
    perform_request_search, get_record, print_record
from invenio.websearchadminlib import get_detailed_page_tabs
from invenio.template import load
from invenio.webinterface_handler import wash_urlargd
from invenio.webuser import collect_user_info
from invenio import access_control_engine as acce
from invenio import dbquery
from invenio import bibtask
from invenio import bibrecord

from invenio import bibclassify_config as bconfig
from invenio import bibclassify_text_extractor
from invenio import bibclassify_engine
from invenio import bibclassify_ontology_reader as bor

log = bconfig.get_logger("bibclassify.webinterface")

template = load('bibclassify')



def main_page(req, recid, tabs, ln, template):
    """Generates the main page for the keyword tab - http://url/record/[recid]/keywords
    @var req: request object
    @var recid: int docid
    @var tabs: list of tab links
    @var ln: language id
    @var template: template object
    @return: nothing, writes using req object
    """
Exemple #16
0
"""

import os
import re
import sys
import tempfile
import urllib2
from invenio import bibclassify_config as bconfig

if bconfig.STANDALONE:
    from urllib2 import urlopen
else:
    from invenio.urlutils import make_invenio_opener
    urlopen = make_invenio_opener('BibClassify').open

log = bconfig.get_logger("bibclassify.text_extractor")

_ONE_WORD = re.compile("[A-Za-z]{2,}")


def text_lines_from_local_file(document, remote=False):
    """Returns the fulltext of the local file.
    @var document: fullpath to the file that should be read
    @var remote: boolean, if True does not count lines (gosh!)
    @return: list of lines if st was read or an empty list"""

    # FIXME - this does not care if we open anything, including binary files

    try:
        if is_pdf(document):
            if not executable_exists("pdftotext"):
Exemple #17
0
    perform_request_search, get_record, print_record
from invenio.websearchadminlib import get_detailed_page_tabs
from invenio.template import load
from invenio.webinterface_handler import wash_urlargd
from invenio.webuser import collect_user_info
from invenio import access_control_engine as acce
from invenio import dbquery
from invenio import bibtask
from invenio import bibrecord

from invenio import bibclassify_config as bconfig
from invenio import bibclassify_text_extractor
from invenio import bibclassify_engine
from invenio import bibclassify_ontology_reader as bor

log = bconfig.get_logger("bibclassify.webinterface")

template = load('bibclassify')


def main_page(req, recid, tabs, ln, template):
    """Generates the main page for the keyword tab - http://url/record/[recid]/keywords
    @var req: request object
    @var recid: int docid
    @var tabs: list of tab links
    @var ln: language id
    @var template: template object
    @return: nothing, writes using req object
    """

    form = req.form
Exemple #18
0
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Module for running microtests on how well the extraction works -
this module is STANDALONE safe"""

import ConfigParser
import glob
import traceback
import codecs

from invenio import bibclassify_config as bconfig
from invenio import bibclassify_engine as engine

log = bconfig.get_logger("bibclassify.microtest")


def run(glob_patterns, verbose=20, plevel=1):
    """Execute microtests"""

    if verbose is not None:
        log.setLevel(int(verbose))

    results = {}
    for pattern in glob_patterns:
        log.info("Looking for microtests: %s" % pattern)
        for cfgfile in glob.glob(pattern):
            log.debug("processing: %s" % (cfgfile))

            try:
"""
BibClassify command-line interface.

This module provides a CLI for BibClassify. It reads the options and calls
the method output_keywords_for_sources from bibclassify_engine.

This module is STANDALONE safe
"""

import getopt
import sys


from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.cli")


from invenio import bibclassify_engine as engine
from invenio import bibclassify_ontology_reader as reader

daemon = None

def get_recids_list(recids_string):
    """Returns a list of recIDs."""
    recids = {}
    elements = recids_string.split(",")
    for element in elements:
        bounds = element.split("-")
        bounds_nb = len(bounds)
        if bounds_nb == 1:
Exemple #20
0
import sys

from invenio.testutils import InvenioTestCase
import tempfile
import cStringIO
import os
import time
import stat
import shutil

from invenio import bibclassify_config as bconfig
from invenio.testutils import make_test_suite, run_test_suite, nottest
from invenio import config
from invenio import bibclassify_ontology_reader

log = bconfig.get_logger("bibclassify.tests")

# do this only if not in STANDALONE mode
bibclassify_daemon = dbquery = None
if not bconfig.STANDALONE:
    from invenio import bibdocfile


class BibClassifyTestCase(InvenioTestCase):
    """ Abusive test suite - the one that takes sooooo long """
    def setUp(self):
        """Initialize stuff"""
        #self.tmpdir = invenio.config.CFG_TMPDIR
        self.original_tmpdir = config.CFG_TMPDIR
        config.CFG_TMPDIR = tempfile.gettempdir()
Exemple #21
0
"""
BibClassify keyword analyser.

This module contains methods to extract keywords from texts. It provides 3
different methods for 3 different types of keywords: single keywords, composite
keywords and author keywords.

This module is STANDALONE safe
"""

import re
import sys
import time

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.keyword_analyzer")

# Length of the longest separator listed in
# bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS.
_MAXIMUM_SEPARATOR_LENGTH = max([
    len(_separator) for _separator in bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS
])


# XXX - rebuild this whole thing
def get_single_keywords(skw_db, fulltext):
    """Find single keywords in the fulltext
    @var skw_db: list of KeywordToken objects
    @var fulltext: string, which will be searched
    @return : dictionary of matches in a format {
            <keyword object>, [[position, position...], ],
            ..
            }
"""

import os
import re
import sys
import tempfile
import urllib2
from invenio import bibclassify_config as bconfig

if bconfig.STANDALONE:
    from urllib2 import urlopen
else:
    from invenio.urlutils import make_invenio_opener
    urlopen = make_invenio_opener('BibClassify').open

log = bconfig.get_logger("bibclassify.text_extractor")


_ONE_WORD = re.compile("[A-Za-z]{2,}")

def text_lines_from_local_file(document, remote=False):
    """Returns the fulltext of the local file.
    @var document: fullpath to the file that should be read
    @var remote: boolean, if True does not count lines (gosh!)
    @return: list of lines if st was read or an empty list"""

    # FIXME - this does not care if we open anything, including binary files

    try:
        if is_pdf(document):
            if not executable_exists("pdftotext"):
Exemple #23
0
But unfortunately there is a confusion between running in a standalone mode
and producing output suitable for printing, and running in a web-based
mode where the webtemplate is used. For the moment the pieces of the representation
code are left in this module.

This module is STANDALONE safe
"""

import os
import random
import sys
import time
import cgi

from invenio import bibclassify_config as bconfig
log = bconfig.get_logger("bibclassify.engine")

from invenio import bibclassify_ontology_reader as reader
from invenio import bibclassify_text_extractor as extractor
from invenio import bibclassify_text_normalizer as normalizer
from invenio import bibclassify_keyword_analyzer as keyworder
from invenio import bibclassify_acronym_analyzer as acronymer

# Prefer the helper from invenio.urlutils; define a local stand-in with the
# same name when that import is not possible.
try:
    from invenio.urlutils import make_user_agent_string
except ImportError:
    ## Not in Invenio, we simply use default agent
    def make_user_agent_string(component=None):
        """Fallback used when invenio.urlutils cannot be imported.

        @var component: accepted but not used by this fallback
        @return: the value of bconfig.CFG_BIBCLASSIFY_USER_AGENT
        """
        return bconfig.CFG_BIBCLASSIFY_USER_AGENT

try: