def setUp(self):
        """Prepare global state and fixtures for a test.

        Redirects ``config.CFG_TMPDIR`` to the system temporary directory,
        records the current ``sys.stdout``/``sys.stderr``, switches the
        bibclassify global logging level to CRITICAL and stores the result of
        ``taxonomies_registry()`` in the application's registry under
        ``'classifierext.taxonomies'``. The previous temp dir and logging
        level are kept on ``self``.
        """
        from invenio import config

        # Remember the configured temp dir before overriding it.
        previous_tmpdir = config.CFG_TMPDIR
        self.original_tmpdir = previous_tmpdir
        config.CFG_TMPDIR = tempfile.gettempdir()

        # Snapshot the real streams; the replacement slots start out empty.
        self.oldstdout, self.oldstderr = sys.stdout, sys.stderr
        self.stdout = self.stderr = None

        self.taxonomy_name = "test"

        from invenio.legacy.bibclassify import config as bconfig

        self.log = bconfig.get_logger("bibclassify.tests")
        # Save the current level before switching the global one to CRITICAL.
        self.log_level = bconfig.logging_level
        bconfig.set_global_level(bconfig.logging.CRITICAL)

        # Build the registry first, then publish it on the application.
        taxonomy_registry = taxonomies_registry()
        self.app.extensions['registry']['classifierext.taxonomies'] = \
            taxonomy_registry
Exemple #2
0
    def setUp(self):
        """Set up the global state needed by a test.

        ``config.CFG_TMPDIR`` is pointed at the system temporary directory
        and the bibclassify global logging level is set to CRITICAL; the
        previous values of both are stored on ``self``. The current
        ``sys.stdout``/``sys.stderr`` are recorded as well, and the result of
        ``taxonomies_registry()`` is placed in the application's registry
        under ``'classifierext.taxonomies'``.
        """
        from invenio import config

        # Keep the configured temp dir so it is not lost by the override.
        configured_tmpdir = config.CFG_TMPDIR
        self.original_tmpdir = configured_tmpdir
        config.CFG_TMPDIR = tempfile.gettempdir()

        # Record the real streams; the capture attributes start as None.
        self.oldstdout, self.oldstderr = sys.stdout, sys.stderr
        self.stdout = self.stderr = None

        self.taxonomy_name = "test"

        from invenio.legacy.bibclassify import config as bconfig

        self.log = bconfig.get_logger("bibclassify.tests")
        # The current level is saved before the global level is changed.
        self.log_level = bconfig.logging_level
        bconfig.set_global_level(bconfig.logging.CRITICAL)

        # Create the registry object, then register it on the application.
        new_registry = taxonomies_registry()
        self.app.extensions['registry']['classifierext.taxonomies'] = \
            new_registry
Exemple #3
0
    def setUp(self):
        """Set up the global state needed by a test.

        Overrides ``config.CFG_TMPDIR`` with the system temporary directory,
        records ``sys.stdout``/``sys.stderr``, sets the bibclassify global
        logging level to CRITICAL and stores the result of
        ``taxonomies_registry()`` in the application's registry under
        ``'classifierext.taxonomies'``. The overridden temp dir and logging
        level are saved on ``self``.
        """
        # NOTE: whenever a global variable is changed here, make sure that
        # tearDown puts the initial value back.
        from invenio import config

        configured_tmpdir = config.CFG_TMPDIR
        self.__CFG_TMPDIR = configured_tmpdir
        config.CFG_TMPDIR = tempfile.gettempdir()

        # Record the real streams; the capture attributes start as None.
        self.oldstdout, self.oldstderr = sys.stdout, sys.stderr
        self.stdout = self.stderr = None

        self.taxonomy_name = "test"

        from invenio.legacy.bibclassify import config as bconfig

        self.log = bconfig.get_logger("bibclassify.tests")
        # The current level is saved before the global level is changed.
        self.log_level = bconfig.logging_level
        bconfig.set_global_level(bconfig.logging.CRITICAL)

        # Create the registry object, then register it on the application.
        new_registry = taxonomies_registry()
        self.app.extensions['registry']['classifierext.taxonomies'] = \
            new_registry
Exemple #4
0
This module is STANDALONE safe
"""

import os
import re

from invenio.legacy.bibclassify import config as bconfig

# Choose the URL opener: plain urllib2 when bconfig.STANDALONE is set,
# otherwise the opener returned by make_invenio_opener('BibClassify').
if bconfig.STANDALONE:
    from urllib2 import urlopen
else:
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

# Logger used by the text extractor.
log = bconfig.get_logger("bibclassify.text_extractor")

# Matches a run of at least two ASCII letters.
_ONE_WORD = re.compile("[A-Za-z]{2,}")


def is_pdf(document):
    """Tell whether ``document`` looks like a PDF file.

    When the ``pdftotext`` executable is not available, a warning is logged
    and only the file name is inspected: the result is True if it ends with
    ".pdf" (case-insensitively) and False otherwise.
    """
    pdftotext_available = executable_exists('pdftotext')
    if not pdftotext_available:
        # NOTE(review): the message names "GNU file" although the check above
        # is for 'pdftotext' -- confirm which tool is actually required.
        log.warning("GNU file was not found on the system. "
                    "Switching to a weak file extension test.")
        return document.lower().endswith(".pdf")
    # Tested with file version >= 4.10. First test is secure and works
    # with file version 4.25. Second condition is tested for file
    # version 4.10.
Exemple #5
0
import six
from cgi import escape
from invenio.base.i18n import gettext_set_language
from invenio.legacy.bibdocfile.api import BibRecDocs
from invenio.legacy.search_engine import get_record
from invenio.legacy.template import load
from invenio.ext.legacy.handler import wash_urlargd
import invenio.modules.access.engine as acce
from invenio.legacy.bibsched import bibtask
from invenio.legacy.bibupload.engine import open_marc_file, xml_marc_to_records
from invenio.legacy import bibrecord, dbquery
from invenio.legacy.bibclassify.engine import get_tmp_file, build_marc, _parse_marc_code
from invenio.legacy.bibclassify import (config as bconfig, ontology_reader as
                                        bor)

# Logger used by the BibClassify web interface.
log = bconfig.get_logger("bibclassify.webinterface")

# Template object returned by load() for 'bibclassify'.
template = load('bibclassify')


def main_page(req, recid, tabs, ln, template):
    """Render the main page of the keyword tab.

    The page is served under URLs of the form
    http://url/record/[recid]/keywords.

    :param req: request object; the output is written through it
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object
    :return: nothing, writes using req object
    """
import os
import re
import sys
import tempfile
import time
import urllib2
import traceback
import xml.sax
import thread
import rdflib

from invenio.legacy.bibclassify import config as bconfig
from invenio.modules.classifier.errors import TaxonomyError

# Logger used by the ontology reader.
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

from invenio.modules.classifier.registry import taxonomies

# Standalone mode: dbquery is disabled (set to None) and the plain urllib2
# opener is used. Otherwise: Invenio's dbquery plus the opener returned by
# make_invenio_opener('BibClassify').
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio.legacy import dbquery
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

_contains_digit = re.compile("\d")
Exemple #7
0
BibClassify command-line interface.

This module provides a CLI for BibClassify. It reads the options and calls
the method output_keywords_for_sources from bibclassify_engine.

This module is STANDALONE safe.
"""
from __future__ import print_function


import getopt
import sys

from invenio.legacy.bibclassify import config as bconfig

# Logger used by the command-line interface.
log = bconfig.get_logger("bibclassify.cli")

# These imports come after the logger is created (original order kept).
from invenio.legacy.bibclassify import engine
from invenio.legacy.bibclassify import ontology_reader as reader

# NOTE(review): initialised to None here; presumably assigned elsewhere when
# the CLI runs in daemon mode -- confirm against the callers.
daemon = None


def get_recids_list(recids_string):
    """Return a list of recIDs."""
    recids = {}
    elements = recids_string.split(",")
    for element in elements:
        bounds = element.split("-")
        bounds_nb = len(bounds)
        if bounds_nb == 1:
Exemple #8
0
import os
import re
import sys
import tempfile
import time
import urllib2
import traceback
import xml.sax
import thread
import rdflib

from invenio.legacy.bibclassify import config as bconfig
from invenio.modules.classifier.errors import TaxonomyError

# Logger used by the ontology reader.
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

from invenio.modules.classifier.registry import taxonomies

# Standalone mode: dbquery is disabled (set to None) and the plain urllib2
# opener is used. Otherwise: Invenio's dbquery plus the opener returned by
# make_invenio_opener('BibClassify').
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio.legacy import dbquery
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

_contains_digit = re.compile("\d")
Exemple #9
0
import os
import six
from cgi import escape
from invenio.base.i18n import gettext_set_language
from invenio.legacy.bibdocfile.api import BibRecDocs
from invenio.legacy.search_engine import get_record
from invenio.legacy.template import load
from invenio.ext.legacy.handler import wash_urlargd
import invenio.modules.access.engine as acce
from invenio.legacy.bibsched import bibtask
from invenio.legacy.bibupload.engine import open_marc_file, xml_marc_to_records
from invenio.legacy import bibrecord, dbquery
from invenio.legacy.bibclassify.engine import get_tmp_file, build_marc, _parse_marc_code
from invenio.legacy.bibclassify import config as bconfig, ontology_reader as bor

# Logger used by the BibClassify web interface.
log = bconfig.get_logger("bibclassify.webinterface")

# Template object returned by load() for "bibclassify".
template = load("bibclassify")


def main_page(req, recid, tabs, ln, template):
    """Render the main page of the keyword tab.

    The page is served under URLs of the form
    http://url/record/[recid]/keywords.

    :param req: request object; the output is written through it
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object
    :return: nothing, writes using req object
    """
Exemple #10
0
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""Module for running microtests on how well the extraction works.

This module is STANDALONE safe.
"""

# The docstring has to come before the __future__ import: a string placed
# after it is a plain expression statement and never becomes __doc__.
from __future__ import print_function

import ConfigParser
import glob
import traceback
import codecs

from invenio.legacy.bibclassify import config as bconfig
from invenio.legacy.bibclassify import engine as engine

# Logger used by the microtest runner.
log = bconfig.get_logger("bibclassify.microtest")


def run(glob_patterns,
        verbose=20,
        plevel=1
):
    """Execute microtests"""

    if verbose is not None:
        log.setLevel(int(verbose))

    results = {}
    for pattern in glob_patterns:
        log.info("Looking for microtests: %s" % pattern)
        for cfgfile in glob.glob(pattern):
Exemple #11
0
"""

import os
import re
import tempfile
import urllib2
from invenio.legacy.bibclassify import config as bconfig

# Choose the URL opener: plain urllib2 when bconfig.STANDALONE is set,
# otherwise the opener returned by make_invenio_opener('BibClassify').
if bconfig.STANDALONE:
    from urllib2 import urlopen
else:
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

# Logger used by the text extractor.
log = bconfig.get_logger("bibclassify.text_extractor")

# Matches a run of at least two ASCII letters.
_ONE_WORD = re.compile("[A-Za-z]{2,}")


def is_pdf(document):
    """Checks if a document is a PDF file. Returns True if is is."""
    if not executable_exists('pdftotext'):
        log.warning("GNU file was not found on the system. "
                    "Switching to a weak file extension test.")
        if document.lower().endswith(".pdf"):
            return True
        return False
        # Tested with file version >= 4.10. First test is secure and works
    # with file version 4.25. Second condition is tested for file
    # version 4.10.