def setUp(self):
    """Prepare the fixture shared by the BibClassify tests.

    Redirects ``config.CFG_TMPDIR`` to the system temporary directory,
    records the current ``sys.stdout``/``sys.stderr`` on the instance and
    raises the global bibclassify log level to CRITICAL.  The previous
    ``CFG_TMPDIR`` and logging level are stored on the instance.
    """
    ## NOTE next time please make sure that you change global variables
    ## back to initial values in tearDown. Thank you!!!
    previous_tmpdir = config.CFG_TMPDIR
    self.__CFG_TMPDIR = previous_tmpdir
    config.CFG_TMPDIR = tempfile.gettempdir()

    # Remember the real streams; the capture slots start out empty.
    self.oldstdout, self.oldstderr = sys.stdout, sys.stderr
    self.stdout = self.stderr = None

    self.taxonomy_name = "test"

    # Local import, exactly as before: the logging helpers come from the
    # bibclassify configuration module.
    from invenio import bibclassify_config as bconfig
    logger = bconfig.get_logger("bibclassify.tests")
    self.log = logger
    # Store the current level before forcing everything to CRITICAL.
    self.log_level = bconfig.logging_level
    bconfig.set_global_level(bconfig.logging.CRITICAL)
import time import urllib2 import traceback import xml.sax import thread import time try: import rdflib rdflib_exceptions_Error = rdflib.exceptions.Error except ImportError: rdflib = None rdflib_exceptions_Error = None from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.ontology_reader") from invenio import config # only if not running in a standalone mode if bconfig.STANDALONE: dbquery = None from urllib2 import urlopen else: from invenio import dbquery from invenio.urlutils import make_invenio_opener urlopen = make_invenio_opener('BibClassify').open _contains_digit = re.compile("\d") _starts_with_non = re.compile("(?i)^non[a-z]") _starts_with_anti = re.compile("(?i)^anti[a-z]") _split_by_punctuation = re.compile("(\W+)")
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Module for running microtests on how well the extraction works - this module is STANDALONE safe""" import ConfigParser import glob import traceback import codecs from invenio import bibclassify_config as bconfig from invenio import bibclassify_engine as engine log = bconfig.get_logger("bibclassify.microtest") def run(glob_patterns, verbose=20, plevel = 1 ): """Execute microtests""" if verbose is not None: log.setLevel(int(verbose)) results = {} for pattern in glob_patterns: log.info("Looking for microtests: %s" % pattern) for cfgfile in glob.glob(pattern):
""" BibClassify keyword analyser. This module contains methods to extract keywords from texts. It provides 3 different methods for 3 different types of keywords: single keywords, composite keywords and author keywords. This module is STANDALONE safe """ import re import sys import time from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.keyword_analyzer") _MAXIMUM_SEPARATOR_LENGTH = max([len(_separator) for _separator in bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS]) # XXX - rebuild this whole thing def get_single_keywords(skw_db, fulltext): """Find single keywords in the fulltext @var skw_db: list of KeywordToken objects @var fulltext: string, which will be searched @return : dictionary of matches in a format { <keyword object>, [[position, position...], ], .. } """
This module provides methods to clean the text lines. Currently, the methods are tuned to work with the output of pdftotext and documents in the HEP field. Methods can be tuned to your needs through the configuration file. This module uses the refextract module of BibEdit in order to find the references section and to replace unicode characters. """ import sys import re from invenio import bibclassify_config as bconfig from invenio.docextract_pdf import replace_undesirable_characters from invenio.refextract_find import find_reference_section, find_end_of_reference_section log = bconfig.get_logger("bibclassify.text_normalizer") _washing_regex = [] def get_washing_regex(): global _washing_regex if len(_washing_regex): return _washing_regex washing_regex = [ # Replace non and anti with non- and anti-. This allows a better # detection of keywords such as nonabelian. (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"), (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"), # Remove all leading numbers (e.g. 2-pion -> pion). (re.compile(r"\s\d-"), " "), # Remove multiple spaces.
from warnings import warn from invenio.config import CFG_SITE_URL from invenio.testutils import make_test_suite, run_test_suite, \ test_web_page_content from invenio.bibclassify_unit_tests import BibClassifyTestCase, suite from invenio import bibclassify_config as bconfig from invenio.testutils import make_test_suite, run_test_suite from invenio import config from invenio import bibclassify_engine from invenio import bibclassify_cli from invenio import bibclassify_ontology_reader log = bconfig.get_logger("bibclassify.regression_tests") # do this only if not in STANDALONE mode bibclassify_daemon = dbquery = None if not bconfig.STANDALONE: from invenio import dbquery from invenio import bibclassify_daemon from invenio import bibdocfile class BibClassifyRegressionTest(BibClassifyTestCase): """Check BibClassify web pages whether they are up or not.""" def test_availability_bibclassify_admin_guide(self): """bibclassify - availability of BibClassify Admin Guide page""" self.assertEqual( [],
import sys from invenio.testutils import InvenioTestCase import tempfile import cStringIO import os import time import stat import shutil from invenio import bibclassify_config as bconfig from invenio.testutils import make_test_suite, run_test_suite, nottest from invenio import config from invenio import bibclassify_ontology_reader log = bconfig.get_logger("bibclassify.tests") # do this only if not in STANDALONE mode bibclassify_daemon = dbquery = None if not bconfig.STANDALONE: from invenio import bibdocfile class BibClassifyTestCase(InvenioTestCase): """ Abusive test suite - the one that takes sooooo long """ def setUp(self): """Initialize stuff""" #self.tmpdir = invenio.config.CFG_TMPDIR config.CFG_TMPDIR = tempfile.gettempdir()
But unfortunately there is a confusion between running in a standalone mode and producing output suitable for printing, and running in a web-based mode where the webtemplate is used. For the moment the pieces of the representation code are left in this module. This module is STANDALONE safe """ import os import random import sys import time import cgi from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.engine") from invenio import bibclassify_ontology_reader as reader from invenio import bibclassify_text_extractor as extractor from invenio import bibclassify_text_normalizer as normalizer from invenio import bibclassify_keyword_analyzer as keyworder from invenio import bibclassify_acronym_analyzer as acronymer from invenio.urlutils import make_user_agent_string from invenio.textutils import encode_for_xml # --------------------------------------------------------------------- # API # ---------------------------------------------------------------------
import time import urllib2 import traceback import xml.sax import thread import time try: import rdflib rdflib_exceptions_Error = rdflib.exceptions.Error except ImportError: rdflib = None rdflib_exceptions_Error = None from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.ontology_reader") from invenio import config # only if not running in a standalone mode if bconfig.STANDALONE: dbquery = None from urllib2 import urlopen else: from invenio import dbquery from invenio.urlutils import make_invenio_opener urlopen = make_invenio_opener('BibClassify').open _contains_digit = re.compile("\d") _starts_with_non = re.compile("(?i)^non[a-z]") _starts_with_anti = re.compile("(?i)^anti[a-z]") _split_by_punctuation = re.compile("(\W+)")
This module provides methods to clean the text lines. Currently, the methods are tuned to work with the output of pdftotext and documents in the HEP field. Methods can be tuned to your needs through the configuration file. This module uses the refextract module of BibEdit in order to find the references section and to replace unicode characters. """ import sys import re from invenio import bibclassify_config as bconfig from invenio.docextract_pdf import replace_undesirable_characters from invenio.refextract_find import find_reference_section, find_end_of_reference_section log = bconfig.get_logger("bibclassify.text_normalizer") _washing_regex = [] def get_washing_regex(): global _washing_regex if len(_washing_regex): return _washing_regex washing_regex = [ # Replace non and anti with non- and anti-. This allows a better # detection of keywords such as nonabelian. (re.compile(r"(\snon)[- ](\w+)"), r"\1\2"), (re.compile(r"(\santi)[- ](\w+)"), r"\1\2"), # Remove all leading numbers (e.g. 2-pion -> pion).
import os from warnings import warn from invenio.config import CFG_SITE_URL from invenio.testutils import make_test_suite, run_test_suite, test_web_page_content from invenio.bibclassify_unit_tests import BibClassifyTestCase, suite from invenio import bibclassify_config as bconfig from invenio.testutils import make_test_suite, run_test_suite from invenio import config from invenio import bibclassify_engine from invenio import bibclassify_cli from invenio import bibclassify_ontology_reader log = bconfig.get_logger("bibclassify.regression_tests") # do this only if not in STANDALONE mode bibclassify_daemon = dbquery = None if not bconfig.STANDALONE: from invenio import dbquery from invenio import bibclassify_daemon from invenio import bibdocfile class BibClassifyRegressionTest(BibClassifyTestCase): """Check BibClassify web pages whether they are up or not.""" def test_availability_bibclassify_admin_guide(self): """bibclassify - availability of BibClassify Admin Guide page""" self.assertEqual(
"""Template for the bibclassify - this module is NOT standalone safe - it is not expected to be used in a standalone mode ever. Some template variables are coming directly from the config module, those starting with CFG_BIBCLASSIFY_WEB.... """ import cgi from invenio import config from invenio.messages import gettext_set_language from urllib import quote from invenio.htmlutils import escape_html from invenio import bibclassify_config as bconfig from invenio import bibclassify_ontology_reader as reader log = bconfig.get_logger("bibclassify.template") class Template: def tmpl_page(self, keywords=None, top='', middle='', bottom='', navbar=None, req=None, ln=None, generate=None, sorting=None, type=None, numbering=None,
"""Template for the bibclassify - this module is NOT standalone safe - it is not expected to be used in a standalone mode ever. Some template variables are coming directly from the config module, those starting with CFG_BIBCLASSIFY_WEB.... """ import cgi from invenio import config from invenio.messages import gettext_set_language from urllib import quote from invenio.htmlutils import escape_html from invenio import bibclassify_config as bconfig from invenio import bibclassify_ontology_reader as reader log = bconfig.get_logger("bibclassify.template") class Template: def tmpl_page(self, keywords=None, top='', middle='', bottom='', navbar=None, req=None, ln=None, generate=None, sorting=None, type=None,
## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibClassify command-line interface. This module provides a CLI for BibClassify. It reads the options and calls the method output_keywords_for_sources from bibclassify_engine. This module is STANDALONE safe """ import getopt import sys from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.cli") from invenio import bibclassify_engine as engine from invenio import bibclassify_ontology_reader as reader daemon = None def get_recids_list(recids_string): """Returns a list of recIDs.""" recids = {} elements = recids_string.split(",") for element in elements: bounds = element.split("-") bounds_nb = len(bounds) if bounds_nb == 1:
perform_request_search, get_record, print_record from invenio.websearchadminlib import get_detailed_page_tabs from invenio.template import load from invenio.webinterface_handler import wash_urlargd from invenio.webuser import collect_user_info from invenio import access_control_engine as acce from invenio import dbquery from invenio import bibtask from invenio import bibrecord from invenio import bibclassify_config as bconfig from invenio import bibclassify_text_extractor from invenio import bibclassify_engine from invenio import bibclassify_ontology_reader as bor log = bconfig.get_logger("bibclassify.webinterface") template = load('bibclassify') def main_page(req, recid, tabs, ln, template): """Generates the main page for the keyword tab - http://url/record/[recid]/keywords @var req: request object @var recid: int docid @var tabs: list of tab links @var ln: language id @var template: template object @return: nothing, writes using req object """
""" import os import re import sys import tempfile import urllib2 from invenio import bibclassify_config as bconfig if bconfig.STANDALONE: from urllib2 import urlopen else: from invenio.urlutils import make_invenio_opener urlopen = make_invenio_opener('BibClassify').open log = bconfig.get_logger("bibclassify.text_extractor") _ONE_WORD = re.compile("[A-Za-z]{2,}") def text_lines_from_local_file(document, remote=False): """Returns the fulltext of the local file. @var document: fullpath to the file that should be read @var remote: boolean, if True does not count lines (gosh!) @return: list of lines if st was read or an empty list""" # FIXME - this does not care if we open anything, including binary files try: if is_pdf(document): if not executable_exists("pdftotext"):
perform_request_search, get_record, print_record from invenio.websearchadminlib import get_detailed_page_tabs from invenio.template import load from invenio.webinterface_handler import wash_urlargd from invenio.webuser import collect_user_info from invenio import access_control_engine as acce from invenio import dbquery from invenio import bibtask from invenio import bibrecord from invenio import bibclassify_config as bconfig from invenio import bibclassify_text_extractor from invenio import bibclassify_engine from invenio import bibclassify_ontology_reader as bor log = bconfig.get_logger("bibclassify.webinterface") template = load('bibclassify') def main_page(req, recid, tabs, ln, template): """Generates the main page for the keyword tab - http://url/record/[recid]/keywords @var req: request object @var recid: int docid @var tabs: list of tab links @var ln: language id @var template: template object @return: nothing, writes using req object """ form = req.form
## ## You should have received a copy of the GNU General Public License ## along with Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Module for running microtests on how well the extraction works - this module is STANDALONE safe""" import ConfigParser import glob import traceback import codecs from invenio import bibclassify_config as bconfig from invenio import bibclassify_engine as engine log = bconfig.get_logger("bibclassify.microtest") def run(glob_patterns, verbose=20, plevel=1): """Execute microtests""" if verbose is not None: log.setLevel(int(verbose)) results = {} for pattern in glob_patterns: log.info("Looking for microtests: %s" % pattern) for cfgfile in glob.glob(pattern): log.debug("processing: %s" % (cfgfile)) try:
""" BibClassify command-line interface. This module provides a CLI for BibClassify. It reads the options and calls the method output_keywords_for_sources from bibclassify_engine. This module is STANDALONE safe """ import getopt import sys from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.cli") from invenio import bibclassify_engine as engine from invenio import bibclassify_ontology_reader as reader daemon = None def get_recids_list(recids_string): """Returns a list of recIDs.""" recids = {} elements = recids_string.split(",") for element in elements: bounds = element.split("-") bounds_nb = len(bounds) if bounds_nb == 1:
import sys from invenio.testutils import InvenioTestCase import tempfile import cStringIO import os import time import stat import shutil from invenio import bibclassify_config as bconfig from invenio.testutils import make_test_suite, run_test_suite, nottest from invenio import config from invenio import bibclassify_ontology_reader log = bconfig.get_logger("bibclassify.tests") # do this only if not in STANDALONE mode bibclassify_daemon = dbquery = None if not bconfig.STANDALONE: from invenio import bibdocfile class BibClassifyTestCase(InvenioTestCase): """ Abusive test suite - the one that takes sooooo long """ def setUp(self): """Initialize stuff""" #self.tmpdir = invenio.config.CFG_TMPDIR self.original_tmpdir = config.CFG_TMPDIR config.CFG_TMPDIR = tempfile.gettempdir()
""" BibClassify keyword analyser. This module contains methods to extract keywords from texts. It provides 3 different methods for 3 different types of keywords: single keywords, composite keywords and author keywords. This module is STANDALONE safe """ import re import sys import time from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.keyword_analyzer") _MAXIMUM_SEPARATOR_LENGTH = max([ len(_separator) for _separator in bconfig.CFG_BIBCLASSIFY_VALID_SEPARATORS ]) # XXX - rebuild this whole thing def get_single_keywords(skw_db, fulltext): """Find single keywords in the fulltext @var skw_db: list of KeywordToken objects @var fulltext: string, which will be searched @return : dictionary of matches in a format { <keyword object>, [[position, position...], ], .. }
""" import os import re import sys import tempfile import urllib2 from invenio import bibclassify_config as bconfig if bconfig.STANDALONE: from urllib2 import urlopen else: from invenio.urlutils import make_invenio_opener urlopen = make_invenio_opener('BibClassify').open log = bconfig.get_logger("bibclassify.text_extractor") _ONE_WORD = re.compile("[A-Za-z]{2,}") def text_lines_from_local_file(document, remote=False): """Returns the fulltext of the local file. @var document: fullpath to the file that should be read @var remote: boolean, if True does not count lines (gosh!) @return: list of lines if st was read or an empty list""" # FIXME - this does not care if we open anything, including binary files try: if is_pdf(document): if not executable_exists("pdftotext"):
But unfortunately there is a confusion between running in a standalone mode and producing output suitable for printing, and running in a web-based mode where the webtemplate is used. For the moment the pieces of the representation code are left in this module. This module is STANDALONE safe """ import os import random import sys import time import cgi from invenio import bibclassify_config as bconfig log = bconfig.get_logger("bibclassify.engine") from invenio import bibclassify_ontology_reader as reader from invenio import bibclassify_text_extractor as extractor from invenio import bibclassify_text_normalizer as normalizer from invenio import bibclassify_keyword_analyzer as keyworder from invenio import bibclassify_acronym_analyzer as acronymer try: from invenio.urlutils import make_user_agent_string except ImportError: ## Not in Invenio, we simply use default agent def make_user_agent_string(component=None): return bconfig.CFG_BIBCLASSIFY_USER_AGENT try: