Exemple #1
0
 def test_paper_id_is_not_set(self, mock_request):
     """Log output contains ``arxiv:null`` when only a request ID is set."""
     # The request environ carries a request ID and nothing else.
     mock_request.environ = {'REQUEST_ID': 'foo-id-1234'}
     buffer = StringIO()
     log = logging.getLogger('foologger', buffer)
     log.error('what')
     # Read the captured text before the buffer is closed.
     output = buffer.getvalue()
     buffer.close()
     self.assertIn(
         'arxiv:null',
         output,
         "Paper ID should be null in log messages",
     )
Exemple #2
0
 def test_get_logger_with_request(self, mock_request):
     """The request ID from the request environ shows up in log output."""
     mock_request.environ = {'REQUEST_ID': 'foo-id-1234'}
     buffer = StringIO()
     log = logging.getLogger('foologger', buffer)
     self.assertIsInstance(
         log,
         pyLogging.Logger,
         "Should return a logging.Logger instance",
     )
     log.error('foo')
     # Read the captured text before the buffer is closed.
     output = buffer.getvalue()
     buffer.close()
     self.assertIn(
         'foo-id-1234',
         output,
         "Should include request ID in log messages",
     )
Exemple #3
0
 def test_config_sets_loglevel(self, mock_get_config, mock_request):
     """A ``LOGLEVEL`` of 10 in the config lets DEBUG messages through."""
     mock_get_config.return_value = {'LOGLEVEL': 10}
     mock_request.environ = {'REQUEST_ID': 'foo-id-1234'}
     failure_msg = ("Changing LOGLEVEL in the app config should change the"
                    " logger log level")
     buffer = StringIO()
     log = logging.getLogger('foologger', buffer)
     log.debug('foo')
     # Read the captured text before the buffer is closed.
     output = buffer.getvalue()
     buffer.close()
     self.assertIn('DEBUG', output, failure_msg)
Exemple #4
0
    def test_get_logger_no_app_nor_request(self):
        """Logging still works without an application or request context."""
        buffer = StringIO()
        log = logging.getLogger('foologger', buffer)
        self.assertIsInstance(
            log,
            pyLogging.Logger,
            "Should return a logging.Logger instance",
        )

        log.error('foo')
        # Read the captured text before the buffer is closed.
        output = buffer.getvalue()
        buffer.close()

        self.assertIn(
            'ERROR: "foo"',
            output,
            "Should log normally even if request is not present",
        )
Exemple #5
0
from string import punctuation

from elasticsearch_dsl import Search, Q, SF

from arxiv.base import logging

from search.domain import SimpleQuery, Query, AdvancedQuery, Classification, \
    ClassificationList
from .util import strip_tex, Q_, is_tex_query, is_literal_query, escape, \
    wildcard_escape, remove_single_characters, has_wildcard, is_old_papernum, \
    parse_date, parse_date_partial

from .highlighting import HIGHLIGHT_TAG_OPEN, HIGHLIGHT_TAG_CLOSE
from .authors import author_query, author_id_query, orcid_query

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# START_YEAR is a fixed constant. END_YEAR is the current year, evaluated once
# when this module is imported, so it does not advance inside a process that
# keeps running across a year boundary.
# NOTE(review): ``datetime`` is not imported in the visible part of this
# file -- confirm it is imported further up.
START_YEAR = 1991
END_YEAR = datetime.now().year


def _query_title(term: str, default_operator: str = 'AND') -> Q:
    if is_tex_query(term):
        return Q("match", **{f'title.tex': {'query': term}})
    fields = ['title.english']
    if is_literal_query(term):
        fields += ['title']
    return Q("query_string",
             fields=fields,
             default_operator=default_operator,
             allow_leading_wildcard=False,
Exemple #6
0
    DataRequired

from http import HTTPStatus as status
from arxiv.taxonomy import CATEGORIES_ACTIVE as CATEGORIES
from arxiv.taxonomy import ARCHIVES_ACTIVE as ARCHIVES
from arxiv.base import logging, alerts
from arxiv.forms import csrf
from arxiv.users.domain import Session
from arxiv.submission import save, RequestCrossList, Submission
from arxiv.submission.exceptions import SaveError

from ..util import load_submission
from .util import user_and_client_from_session, OptGroupSelectField, \
    validate_command

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)  # pylint: disable=C0103

# Alias for a three-part tuple: a str-keyed dict, an int, and another
# str-keyed dict.
# NOTE(review): presumably (response data, HTTP status code, headers) --
# confirm against the functions that return it.
Response = Tuple[Dict[str, Any], int, Dict[str, Any]]  # pylint: disable=C0103


# Markup fragment that points the user to arXiv support via a mailto link.
# NOTE(review): the ``mailto:`` target below reads "[email protected]", which
# looks like an e-mail obfuscation placeholder rather than a real address --
# restore the actual support address.
CONTACT_SUPPORT = Markup(
    'If you continue to experience problems, please contact'
    ' <a href="mailto:[email protected]"> arXiv support</a>.'
)


class HiddenListField(HiddenField):
    def process_formdata(self, valuelist):
        """Store the truthy entries of ``valuelist`` as a list of strings.

        Falsy entries (such as empty strings or ``None``) are dropped; every
        remaining entry is converted with ``str`` and the resulting list is
        assigned to ``self.data``.
        """
        self.data = [str(item) for item in valuelist if item]

    def process_data(self, value):
Exemple #7
0
    QueryError,
    IndexConnectionError,
    DocumentNotFound,
    IndexingError,
    OutsideAllowedRange,
    MappingError,
)
from search.services.index.util import MAX_RESULTS
from search.services.index.advanced import advanced_search
from search.services.index.simple import simple_search
from search.services.index.api import api_search
from search.services.index.classic_api import classic_search
from search.services.index import highlighting
from search.services.index import results

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Disable the Elasticsearch logger. When enabled, the Elasticsearch logger
# dumps entire tracebacks prior to propagating exceptions. Thus we end up with
# tracebacks in the logs even for handled exceptions.
logging.getLogger("elasticsearch").disabled = True

ALL_SEARCH_FIELDS = [
    "author",
    "title",
    "abstract",
    "comments",
    "journal_ref",
    "acm_class",
    "msc_class",
    "report_num",
import sys
sys.path.append('.')
from unittest import mock
from references.process import extract
from references.process.merge import align, arbitrate, beliefs, normalize, priors
import os
from pprint import pprint
import csv
from arxiv.base import logging
# Raise the threshold of the extraction module's logger to 40, which is ERROR
# in the standard ``logging`` level numbering.
logging.getLogger('references.process.extract').setLevel(40)

# Absolute path of the directory in which the PDFs are looked up.
basepath = os.path.abspath('evaluation/pdfs')

if __name__ == '__main__':
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are parsed correctly.
    with open('evaluation/referenceCounts.csv', newline='') as f:
        raw = list(csv.reader(f))

    # The first row holds the column names. It is kept out of the records:
    # pairing it with itself would yield a bogus entry such as
    # {'pdf': 'pdf', ...}. Rows whose width differs from the header cannot be
    # mapped onto the columns and are dropped.
    header = raw[0] if raw else []
    referenceCounts = [dict(zip(header, row))
                       for row in raw[1:] if len(row) == len(header)]

    for row in referenceCounts:
        full_path = os.path.join(basepath, row['pdf'])
        if not os.path.exists(full_path):
            # Nothing on disk for this row, so there is nothing to extract.
            continue
        # The document ID is the file name minus its last four characters
        # (presumably the ".pdf" suffix -- confirm against the CSV contents).
        document_id = row['pdf'][:-4]
        print('Extracting %s' % document_id)

        # For every entry of the extraction result, print its key, the number
        # of references it holds, and the value of this row's ``N`` column.
        extractions = extract.extract(full_path, document_id)
        for extractor, refs in extractions.items():
            print(extractor, len(refs), row['N'])