Example #1
 def __init__(self, app_name, *args, **kwargs):
     Celery.__init__(self, *args, **kwargs)
     self._config = adsputils.load_config()
     self._session = None
     self._engine = None
     self._app_name = app_name
     self.logger = adsputils.setup_logging(app_name)  #default logger
Example #2
    def __init__(self, fields, ignore_fields, new_fields):
        self.fields = fields
        self.ignore_fields = ignore_fields
        self.new_fields = new_fields

        self.logger = setup_logging('validate', 'INFO')
        self.config = {}
        self.config.update(load_config())
Example #3
    def __init__(self, file_):
        self._file = file_
        self.read_count = 0   # needed for logging
        self.logger = setup_logging('AdsDataSqlSync', 'DEBUG')
        self.logger.info('nonbib file ingest, file {}'.format(self._file))
        self.config = {}
        self.config.update(load_config())

        self._iostream = open(file_, 'r')
Example #4
    def test_load_config(self):
        with patch('adsputils.load_module') as load_module:
            c = adsputils.load_config()
            f = os.path.abspath(
                os.path.join(os.path.dirname(inspect.getsourcefile(adsputils)),
                             '..'))
            self.assertEqual((f + '/config.py', ),
                             load_module.call_args_list[0][0])
            self.assertEqual((f + '/local_config.py', ),
                             load_module.call_args_list[1][0])
            self.assertEqual(c['PROJ_HOME'], f)

        with patch('adsputils.load_module') as load_module:
            adsputils.load_config('/tmp')
            self.assertEqual(('/tmp/config.py', ),
                             load_module.call_args_list[0][0])
            self.assertEqual(('/tmp/local_config.py', ),
                             load_module.call_args_list[1][0])
Example #5
def query_Kibana(
        query='"+@log_group:\\"backoffice-orcid_pipeline-daemon\\" +@message:\\"Claim refused\\""',
        n_days=7,
        rows=5):
    """
    Function to query Kibana for a given input query and return the response.

    :param query: string query, same as would be entered in the Kibana search input (be sure to escape quotes and wrap
        query in double quotes - see default query for formatting)
    :param n_days: number of days backwards to query, starting now (=0 for all time)
    :param rows: number of results to return. If you just need the total number of hits and not the results
        themselves, can be small.
    :return: JSON results
    """

    config = {}
    config.update(load_config())

    # get start and end timestamps (in milliseconds since 1970 epoch)
    now = datetime.datetime.now(tzutc())
    epoch = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=pytz.UTC)
    end_time = (now - epoch).total_seconds() * 1000.
    if n_days != 0:
        start_time = (now - datetime.timedelta(days=n_days) -
                      epoch).total_seconds() * 1000.
    else:
        start_time = 0.

    data = (
        '{"index":["cwl-*"]}\n{"size":%.0f,"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],'
        % (rows) +
        '"query":{"bool":{"must":[{"query_string":{"analyze_wildcard":true, "query":'
        + query + '}}, ' +
        '{"range": {"@timestamp": {"gte": %.0f, "lte": %.0f,"format": "epoch_millis"}}}], "must_not":[]}}, '
        % (start_time, end_time) + '"docvalue_fields":["@timestamp"]}\n\n')

    header = {
        'origin': 'https://pipeline-kibana.kube.adslabs.org',
        'authorization': 'Basic ' + config['KIBANA_TOKEN'],
        'content-type': 'application/x-ndjson',
        'kbn-version': '5.5.2'
    }

    url = 'https://pipeline-kibana.kube.adslabs.org/_plugin/kibana/elasticsearch/_msearch'

    # set to bypass SSL cert problem w/ Kibana
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    resp = app.client.post(url, data=data, headers=header, verify=False)

    if resp.status_code == 200:
        results = resp.json()
        return results
    logger.warning('For query {}, there was a network problem: {}\n'.format(
        query, resp))
    return None
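
A minimal usage sketch for query_Kibana; the day range is illustrative, and the hit-count lookup assumes the Elasticsearch _msearch response layout used in the later reporting examples:

# hypothetical call: count "Claim refused" messages over the last 30 days
resp = query_Kibana(n_days=30, rows=5)
if resp is not None:
    total = resp['responses'][0]['hits']['total']
    print('total hits: {}'.format(total))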
Example #6
 def init(cls):
     if cls._initted is False:
         config = load_config()
         root_dir = config.get('INPUT_DATA_ROOT',
                               './adsdata/tests/data1/config/')
         cls._reference_network = _Network(root_dir +
                                           data_files['reference']['path'])
         cls._citation_network = _Network(root_dir +
                                          data_files['citation']['path'])
         cls._refereed_list = _Refereed(root_dir +
                                        data_files['refereed']['path'])
         cls._initted = True
Example #7
    def __init__(self, schema_='metrics'):
        self.logger = setup_logging('AdsDataSqlSync', 'INFO')

        self.schema = schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
        self.config = {}
        self.config.update(load_config())
Example #8
def create_app(app_name='adstb', local_config=None):
    """Builds and initializes the Celery application."""

    conf = adsputils.load_config()
    if local_config:
        conf.update(local_config)

    app = ADSTurboBeeCelery(app_name,
                            broker=conf.get('CELERY_BROKER', 'pyamqp://'),
                            include=conf.get('CELERY_INCLUDE',
                                             ['adstb.tasks']))

    return app
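
A usage sketch for create_app; the broker override below is only an illustration of how local_config values take precedence over what config.py/local_config.py provide:

# hypothetical local override of the Celery broker for development
app = create_app(app_name='adstb',
                 local_config={'CELERY_BROKER': 'pyamqp://guest@localhost//'})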
Example #9
    def __init__(self, schema_='metrics'):
        self.logger = setup_logging('AdsDataSqlSync', 'INFO')

        self.schema = schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
        self.config = {}
        self.config.update(load_config())
Example #10
    def __init__(self,
                 sqlachemy_url,
                 group_changes_in_chunks_of=1,
                 sqlalchemy_echo=False,
                 schema_prefix="citation_capture_",
                 force=False):
        """
        Initializes the class and prepares DB connection.

        :param sqlachemy_url: URL to connect to the DB.
        :param group_changes_in_chunks_of: Number of citation changes to be
            grouped when iterating.
        :param sqlalchemy_echo: Print every SQL statement.
        :param schema_prefix: Data is stored in schemas that correspond to a
            prefix + file last access date.
        :param force: If tables already exists in DB, drop them and re-ingest.
        """
        self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
        self.connection = self.engine.connect()
        self.session = sessionmaker(bind=self.engine)()
        #
        # - Use app logger:
        #import logging
        #self.logger = logging.getLogger('ads-citation-capture')
        # - Or individual logger for this file:
        from adsputils import setup_logging, load_config
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        config = load_config(proj_home=proj_home)
        self.logger = setup_logging(__name__,
                                    proj_home=proj_home,
                                    level=config.get('LOGGING_LEVEL', 'INFO'),
                                    attach_stdout=config.get(
                                        'LOG_STDOUT', False))
        #
        self.table_name = RawCitation.__tablename__
        self.expanded_table_name = "expanded_" + self.table_name
        self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
        self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
        self.joint_table_name = CitationChanges.__tablename__
        self.schema_prefix = schema_prefix
        self.schema_name = None
        self.previous_schema_name = None
        self.input_refids_filename = None
        self.group_changes_in_chunks_of = group_changes_in_chunks_of
        self.offset = 0
        self.n_changes = 0
        self.force = force
        self.last_modification_date = None
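
A hedged construction sketch for the class whose __init__ is shown above; the class name is a placeholder (the snippet does not show it), and the connection URL and chunk size are illustrative:

# hypothetical instantiation; CitationCaptureDB is a stand-in name for the class above
manager = CitationCaptureDB('postgresql://postgres@localhost:5432/citation_capture_pipeline',
                            group_changes_in_chunks_of=100,
                            sqlalchemy_echo=False,
                            force=False)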
Example #11
    def __init__(self, fields, ignore_fields, new_fields):
        self.fields = fields
        self.ignore_fields = ignore_fields
        self.new_fields = new_fields

        # - Use app logger:
        # import logging
        # self.logger = logging.getLogger('master-pipeline')
        # - Or individual logger for this file:
        from adsputils import setup_logging, load_config
        proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(__name__, proj_home=proj_home,
                                    level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                    attach_stdout=self.config.get('LOG_STDOUT', False))
Example #12
def main():
    global config
    config.update(load_config())
    global logger
    logger = setup_logging('ADSData', config.get('LOG_LEVEL', 'INFO'))

    parser = argparse.ArgumentParser(description='generate nonbib data')
    args = parser.parse_args()

    load(config)

    # compute metrics for a bibcode
    compute_metrics('2012ApJS..199...26H')
    # lots_of_metrics(config)
    logger.info('end of program')
Example #13
 def __init__(self, schema_='nonbib'):
     self.schema = schema_
     self.meta = MetaData()
     self.table = models.NonBibTable()
     self.table.schema = self.schema
     # - Use app logger:
     #import logging
     #logger = logging.getLogger('ads-data')
     # - Or individual logger for this file:
     proj_home = os.path.realpath(
         os.path.join(os.path.dirname(__file__), '../'))
     config = load_config(proj_home=proj_home)
     self.logger = setup_logging(__name__,
                                 proj_home=proj_home,
                                 level=config.get('LOGGING_LEVEL', 'INFO'),
                                 attach_stdout=config.get(
                                     'LOG_STDOUT', False))
Example #14
    def __init__(self, file_):
        self._file = file_
        self.read_count = 0  # needed for logging
        # - Use app logger:
        #import logging
        #logger = logging.getLogger('ads-data')
        # - Or individual logger for this file:
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(
            __name__,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))

        self.logger.info('nonbib file ingest, file {}'.format(self._file))
        self._iostream = open(file_, 'r')
Example #15
    def setUp(self):
        unittest.TestCase.setUp(self)
        config = load_config()
        proj_home = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '../..'))
        self.app = app.ADSMasterPipelineCelery('test', local_config=\
            {
            'SQLALCHEMY_URL': 'sqlite:///',
            'METRICS_SQLALCHEMY_URL': 'postgresql://postgres@127.0.0.1:15678/test',
            'SQLALCHEMY_ECHO': True,
            'PROJ_HOME' : proj_home,
            'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'),
            })
        Base.metadata.bind = self.app._session.get_bind()
        Base.metadata.create_all()

        MetricsBase.metadata.bind = self.app._metrics_engine
        MetricsBase.metadata.create_all()
Example #16
    def setUp(self):
        unittest.TestCase.setUp(self)
        config = load_config()
        proj_home = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '../..'))
        self.app = app.ADSMasterPipelineCelery('test', local_config=\
            {
            'SQLALCHEMY_URL': config.get('METRICS_SQLALCHEMY_URL'),
            'METRICS_SQLALCHEMY_URL': config.get('METRICS_SQLALCHEMY_URL'),
            'SQLALCHEMY_ECHO': False,
            'PROJ_HOME' : proj_home,
            'TEST_DIR' : os.path.join(proj_home, 'adsmp/tests'),
            })
        Base.metadata.bind = self.app._session.get_bind()
        Base.metadata.create_all()

        MetricsBase.metadata.bind = self.app._session.get_bind()
        MetricsBase.metadata.create_all()
Example #17
    def __init__(self, schema_='metrics'):
        # - Use app logger:
        #import logging
        #logger = logging.getLogger('ads-data')
        # - Or individual logger for this file:
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(
            __name__,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))

        self.schema = schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
Example #18
File: app.py Project: adsabs/oboi
    def __init__(self, app_name, *args, **kwargs):
        """
        :param: app_name - string, name of the application (can be anything)
        :keyword: local_config - dict, configuration that should be applied
            over the default config (that is loaded from config.py and local_config.py)
        """
        proj_home = None
        if 'proj_home' in kwargs:
            proj_home = kwargs.pop('proj_home')
        self.config = load_config(extra_frames=1,
                                  proj_home=proj_home,
                                  app_name=app_name)

        local_config = None
        if 'local_config' in kwargs and kwargs['local_config']:
            local_config = kwargs.pop('local_config')
            self.config.update(local_config)  #our config
        if not proj_home:
            proj_home = self.config.get('PROJ_HOME', None)
        self.logger = setup_logging(
            app_name,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))
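
A hedged sketch of constructing an application object built on the __init__ above; the class name is a placeholder, and the keyword arguments only illustrate how proj_home and local_config are consumed:

# hypothetical subclass following the pattern above; name and values are illustrative
app = MyPipelineCelery('my-pipeline',
                       proj_home='/app/my-pipeline',
                       local_config={'LOGGING_LEVEL': 'DEBUG', 'LOG_STDOUT': True})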
Example #19
 def setUp(self):
     unittest.TestCase.setUp(self)
     self.proj_home = tasks.app.conf['PROJ_HOME']
     self._app = tasks.app
     # Use a different database for unit tests since they will modify it
     self.sqlalchemy_url = "{}_test".format(load_config().get(
         'SQLALCHEMY_URL',
         'postgres://postgres@localhost:5432/citation_capture_pipeline'))
     config = {
         "TESTING_MODE": False,
         "CELERY_ALWAYS_EAGER": False,
         "CELERY_EAGER_PROPAGATES_EXCEPTIONS": False,
         "SQLALCHEMY_URL": self.sqlalchemy_url,
     }
     self.app = app.ADSCitationCaptureCelery('test',
                                             proj_home=self.proj_home,
                                             local_config=config)
     tasks.app = self.app  # monkey-patch the app object
     self._init_mock_data()
     try:
         Base.metadata.create_all(bind=self.app._engine, checkfirst=True)
     except:
         # Database not empty!
         raise
Example #20
class NonbibFileReader(object):
    """reads nonbib column files

    file reading/parsing is controlled by the file's properties dict in file_defs
    every line must start with a bibcode
    file must be sorted by bibcode
    """

    bibcode_length = 19
    config = load_config()

    def __init__(self, filetype, file_info):
        """passed file type (e.g., canonical) and relevant part of file_defs"""
        self.filetype = filetype
        self.file_info = file_info
        self.filename = self.config.get('INPUT_DATA_ROOT',
                                        './') + file_info['path']
        self.logger = tasks.app.logger
        self.read_count = 0  # used in logging
        self.buffer = None  # holds at most one line of text
        self._iostream = open(self.filename, 'r', encoding='utf-8')

    def __enter__(self, *args, **kwargs):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._iostream)

    def close(self):
        self._iostream.close()
        del self._iostream

    def _pushline(self, s):
        """the buffer is used when we read a line that is beyond the desired bibcode
           and we need to unread it"""
        if self.buffer:
            self.logger.error(
                'error in file {}, {}, _pushline called when buffer was not empty.  File line number: {}, read line: {}, buffer: {}'
                .format(self.filetype, self.filename, self.read_count, s,
                        self.buffer))
        self.buffer = s

    def _readline(self):
        """return the next valid line or empty string at eof 
           used to read all files"""
        self.read_count += 1
        if self.buffer:
            line = self.buffer
            self.buffer = None
            return line
        if self._iostream.closed:
            return ''
        line = self._iostream.readline()
        while len(line) > 0 and len(line) < self.bibcode_length:
            self.logger.error(
                'error, invalid short line in readline {} filename: {} at line {}, line length less than length of bibcode, line: {}'
                .format(self.filetype, self.filename, self.read_count, line))
            self.read_count += 1
            line = self._iostream.readline()
        return line

    def read_value_for(self, bibcode):
        """return the value from the file for the passed bibcode
        returns default value if bibcode is not in file

        return value is a dict with the key of self.filetype

        some files repeat a bibcode on consecutive lines to provide multiple values,
        other files do not repeat a bibcode and provide multiple values on a single line,
        and other files (e.g., relevance/docmetrics.tab) have multiple values;
        some files have associated effects on values (e.g., the property field);
        this reader handles all cases based on the file property dict
        """
        # first, are we at eof?
        current_line = self._readline()
        if len(current_line) == 0:
            # here if we are already at eof, bibcode isn't in file
            return self._convert_value(self.file_info['default_value'])

        # next, skip over lines in file until we:
        #   either find the passed bibcode or determine it isn't in the file
        skip_count = 0
        while len(current_line) != 0 and self._get_bibcode(
                current_line) < bibcode:
            current_line = self._readline()
            skip_count = skip_count + 1

        # at this point, we have either read to the desired bibcode
        # or it doesn't exist and we read past it
        if len(current_line
               ) == 0 or bibcode != self._get_bibcode(current_line):
            # bibcode not in file
            self._pushline(current_line)
            return self._convert_value(self.file_info['default_value'])

        if isinstance(self.file_info['default_value'], bool):
            return self._convert_value(
                True)  # boolean files only hold bibcodes, all values are True

        # at this point, we have the first line with the bibcode in it
        # roll up possible other values on adjacent lines in file
        value = []
        value.append(self._get_rest(current_line))
        current_line = self._readline()
        while self.file_info.get(
                'multiline', False) and (current_line is not None) and (
                    bibcode == self._get_bibcode(current_line)):
            value.append(self._get_rest(current_line))
            current_line = self._readline()

        # at this point we have read beyond the desired bibcode, must back up
        self._pushline(current_line)
        # finally, convert raw input into something useful
        return self._convert_value(value)

    def _convert_value(self, value):
        """convert file string line to something more useful
        
        return a dict with filetype as key and value converted
        """

        if isinstance(value, str) and '\x00' in value:
            # there should not be nulls in strings
            self.logger.error(
                'error string contained a null in file {} {}, line number: {}, value: {}'
                .format(self.filetype, self.filename, self.read_count, value))
            value = value.replace('\x00', '')

        return_value = value
        if isinstance(value, bool):
            d = {self.filetype: return_value}
            if 'extra_values' in self.file_info and value != self.file_info[
                    'default_value']:
                d.update(self.file_info['extra_values'])
            return {self.filetype: d}
        elif (len(value) > 0 and '\t' in value[0]
              and not self.file_info.get('tabs_to_spaces', False)):
            # tab separator in string means we need to convert elements to array
            z = []
            for r in value:
                x = r.split('\t')
                if self.file_info.get('string_to_number', True):
                    # convert valid ints and floats to numeric representation
                    t = []
                    for y in x:
                        t.append(self._convert_scalar(y))
                    z.append(t)
                else:
                    # assumption: keep the split strings when conversion is disabled
                    z.append(x)
            return_value = z
            if len(return_value) == 1:
                return_value = return_value[0]
        elif 'interleave' in self.file_info and value != self.file_info[
                'default_value']:
            # here on multi-line dict (e.g., associations)
            # interleave data on successive lines e.g., merge first element in each array, second element, etc.
            #   since they also have subparts, these arrays will then be put in a dict with the corresponding key
            x = {}
            for k in self.file_info['subparts']:
                x[k] = []
            for r in value:
                # For instance, in associations 'r' should contain:
                #   URL title
                # where title may contain spaces too
                parts = r.split(' ', 1)  # parts will contain [URL, title]
                if len(parts) < len(self.file_info['subparts']):
                    self.logger.error(
                        'error in reader with interleave for {} file {}, incomplete value in line.  value = {}, parts = {} at line {}'
                        .format(self.filetype, self.filename, value, parts,
                                self.read_count))
                else:
                    for i, k in enumerate(self.file_info['subparts']):
                        v = parts[i].strip()
                        x[k].append(v)
            return_value = x
        elif (self.file_info.get('tabs_to_spaces', False)):
            # files like simbad_objects have tabs that we simply convert to spaces
            x = []
            for a in value:
                x.append(a.replace('\t', ' '))
            return_value = x
        elif (len(value) > 1):
            x = []
            for r in value:
                x.append(r.replace('\t', ' ').strip())
            return_value = x
        # convert array to dict if needed
        if 'subparts' in self.file_info and return_value != self.file_info[
                'default_value'] and 'interleave' not in self.file_info:
            if type(return_value[0]) is list:
                x = []
                for r in return_value:
                    x.append(self._convert_subparts(r))
            else:
                x = self._convert_subparts(return_value)
            return_value = x

        # are there extra_values to add to dict
        if 'extra_values' in self.file_info:
            self._add_extra_values(return_value)
        return {self.filetype: return_value}

    def _add_extra_values(self, current):
        if current != self.file_info['default_value'] and type(
                current) is dict:
            current.update(self.file_info['extra_values'])
        elif current != self.file_info['default_value'] and type(
                current) is list:
            # here with array of dicts, put extra_values in each dict
            for x in current:
                v = self.file_info['extra_values']
                if type(v) is dict and type(x) is dict:
                    x.update(v)
                else:
                    self.logger.error(
                        'serious error in reader._add_extra_values, non dict value, extra_values = {}, processing element = {}, passed current = {}'
                        .format(v, x, current))

    def _convert_subparts(self, current):
        d = {}
        for i, k in enumerate(self.file_info['subparts']):
            v = ''
            if i < len(current):
                v = current[i]
            if type(k) is list:
                # here if key is in a list by itself which means values should be in a list
                k = k[0]
                v = [v]
            d[k] = v
        return d

    def _get_bibcode(self, s):
        """return the bibcode from the front of the line"""
        if s is None:
            return None
        if len(s) < self.bibcode_length:
            self.logger.error(
                'error, invalid short line in file {} {} at line {}, line length less than length of bibcode, line = {}'
                .format(self.filetype, self.filename, self.read_count, s))
            return s
        return s[:self.bibcode_length].strip()

    def _get_rest(self, s):
        """return the text after the bibcode and first tab separator"""
        if len(s) < self.bibcode_length + 1:
            self.logger.error(
                'error, in _get_rest with invalid short line in file {} {} at line {}, line length less than length of bibcode plus 1, line = {}'
                .format(self.filetype, self.filename, self.read_count, s))
            return ''
        return s[self.bibcode_length + 1:].strip()

    def _convert_scalar(self, s):
        if s.isdigit():
            return int(s)
        try:
            x = float(s)
            return x
        except ValueError:
            return s.strip()
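
A hedged usage sketch for NonbibFileReader; the file_info dict only illustrates the kind of file_defs entry the reader expects (path, default_value, extra_values), and running it for real requires the surrounding tasks/app setup and input files:

# hypothetical file_defs-style entry; the keys mirror those read by the class above
file_info = {'path': 'refereed/all.links',
             'default_value': False,
             'extra_values': {'property': ['REFEREED']}}
with NonbibFileReader('refereed', file_info) as reader:
    # returns a dict keyed by the filetype, or the default value if the bibcode is absent
    value = reader.read_value_for('2012ApJS..199...26H')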
Example #21
import os
import requests
import json
from adsputils import date2solrstamp
import sys
import time
from collections import OrderedDict

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('master-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__,
                       proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

# =============================== FUNCTIONS ======================================= #


def extract_metrics_pipeline(data, solrdoc):

    citation = data.get('citations', [])

    return dict(citation=citation)
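
A small usage sketch for extract_metrics_pipeline; the input dict and bibcode are illustrative, and only the 'citations' key is read (solrdoc is ignored):

# hypothetical record data
metrics = extract_metrics_pipeline({'citations': ['2010AN....331..852K']}, solrdoc=None)
# metrics == {'citation': ['2010AN....331..852K']}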

Example #22
 def setUp(self):
     self.config = {}
     self.config.update(load_config())
     self.assertEqual(-1, self.config['MAX_ROWS'],
                      'tests fail when local_config.py sets MAX_ROWS')
Example #23
import collections
import copy
import datetime
import itertools
import os
import sys
import types

from aip.libs import enforce_schema, author_match
import adsputils as utils

_config = utils.load_config()

def mergeRecords(records):
    completeRecords = []
    e = enforce_schema.Enforcer() # TODO: no need to create new instances?
    for r in copy.deepcopy(records):
        r['text'] = Merger().mergeText(r['text'])
        blocks = e.ensureList(r['metadata'])
        #Multiply defined blocks need merging.
        metadatablockCounter = collections.Counter([i['tempdata']['type'] for i in blocks])
        needsMerging = dict([(k, []) for k, v in metadatablockCounter.items() if v > 1])
    
        completeMetadata = {}
        #First pass: Add the singly defined blocks to the complete record
        for b in blocks:
            _type = b['tempdata']['type']
            if _type not in needsMerging:
                completeMetadata[_type] = b
            else:
                needsMerging[_type].append(b)
Example #24
 def setUp(self):
     self.config = {}
     self.config.update(load_config())
     self.assertEqual(-1, self.config['MAX_ROWS'], 'tests fail when local_config.py sets MAX_ROWS')
Example #25
def claimed_records(debug=False, test=False):
    """
    Reporting function; checks SOLR for the following:
        - number of records that have been claimed by at least one ORCID ID, in orcid_pub, orcid_user, orcid_other
            (each reported separately)
        - total number of accepted claims of each of orcid_pub, orcid_user, orcid_other (i.e. if a single record
            has two separate authors who have successfully created a claim, the number reported here is 2)
        - total number of bibcodes that have been claimed, of any type

    The report is designed to be run regularly, and the results compared to previous report runs (via logs)

    :return: None (output to logs)
    """
    if test:
        logger = setup_logging('test_claimed')
    else:
        logger = setup_logging('reporting')

    config = {}
    config.update(load_config())

    # the first 7 digits of ORCID IDs are zero padding
    orcid_wild = '000000*'
    resp_pub = query_solr(config['SOLR_URL'],
                          'orcid_pub:"' + orcid_wild + '"',
                          rows=10,
                          sort="bibcode desc",
                          fl='bibcode')
    resp_user = query_solr(config['SOLR_URL'],
                           'orcid_user:"' + orcid_wild + '"',
                           rows=10,
                           sort="bibcode desc",
                           fl='bibcode')
    resp_other = query_solr(config['SOLR_URL'],
                            'orcid_other:"' + orcid_wild + '"',
                            rows=10,
                            sort="bibcode desc",
                            fl='bibcode')

    logger.info('Number of records with an orcid_pub: {}'.format(
        resp_pub['response']['numFound']))
    logger.info('Number of records with an orcid_user: {}'.format(
        resp_user['response']['numFound']))
    logger.info('Number of records with an orcid_other: {}'.format(
        resp_other['response']['numFound']))

    start = 0
    rows = 1000

    results = resp_pub['response']['docs']
    num_orcid_pub = 0
    num_orcid_user = 0
    num_orcid_other = 0

    bibcode_pub = set()
    bibcode_user = set()
    bibcode_other = set()
    while results:
        results = query_records(start=start, rows=rows)
        for i in range(len(results)):
            try:
                results[i]['orcid_pub']
            except KeyError:
                pass
            else:
                num_p = len(
                    fnmatch.filter(results[i].get('orcid_pub'), '0000*'))
                num_orcid_pub += num_p
                bibcode_pub.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_user']
            except KeyError:
                pass
            else:
                num_u = len(
                    fnmatch.filter(results[i].get('orcid_user'), '0000*'))
                num_orcid_user += num_u
                bibcode_user.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_other']
            except KeyError:
                pass
            else:
                num_o = len(
                    fnmatch.filter(results[i].get('orcid_other'), '0000*'))
                num_orcid_other += num_o
                bibcode_other.add(results[i].get('bibcode'))

        if debug:
            if (start + rows) % 10000 == 0:
                logger.info(
                    'Number of results processed so far: {}'.format(start +
                                                                    rows))

        if test:
            break
        else:
            start += rows

    logger.info('Total number of orcid_pub claims: {}'.format(num_orcid_pub))
    logger.info('Total number of orcid_user claims: {}'.format(num_orcid_user))
    logger.info(
        'Total number of orcid_other claims: {}'.format(num_orcid_other))

    orcid_bibcodes = bibcode_pub.union(bibcode_user).union(bibcode_other)
    logger.info('Total number of records with any ORCID claims: {}'.format(
        len(orcid_bibcodes)))
Example #26
import os
import fnmatch
import datetime
import cachetools
import time
import pytz
import urllib3

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('orcid-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__,
                       proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

app = tasks.app

records_cache = cachetools.TTLCache(maxsize=1024,
                                    ttl=3600,
                                    timer=time.time,
                                    missing=None,
                                    getsizeof=None)

# =============================== FUNCTIONS ======================================= #
Example #27
def num_missing_profile(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of profiles reported missing over a given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """

    if test:
        logger = setup_logging('test_kibana')

    query = '"+@log_group:\\"backoffice-orcid_pipeline-daemon\\" +@message:\\"Missing profile for\\""'

    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info('Number of missing profile errors in the last {} days: {}'.format(n_days, total))

if __name__ == '__main__':
    # Runs all reporting scripts, outputs results to logs

    # Before running, tunnel into SOLR and postgres and specify localhost URLs for
    # SOLR_URL and SQLALCHEMY_URL, respectively, in local_config.py

    config = {}
    config.update(load_config())
    claimed_records()
    num_claims(n_days=7)
    num_refused_claims(n_days=7)
    num_missing_profile(n_days=7)
Example #28
def main():
    parser = argparse.ArgumentParser(description='process column files into Postgres')
    parser.add_argument('-t', '--rowViewBaselineSchemaName', default='nonbibstaging', 
                        help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d', '--diagnose', default=False, action='store_true', help='run simple test')
    parser.add_argument('-f', '--filename', default='bibcodes.txt', help='name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m', '--metricsSchemaName', default='metrics', help='name of the postgres metrics schema')
    parser.add_argument('-n', '--metricsSchemaName2', default='', help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r', '--rowViewSchemaName', default='nonbib', help='name of the postgres row view schema')
    parser.add_argument('-s', '--batchSize', default=100,  help='used when queuing data')
    parser.add_argument('-b', '--bibcodes', default='',  help='comma separate list of bibcodes send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema ' \
                        + ' | createJoinedRows | createMetricsTable | dropMetricsTable ' \
                        + ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta ' \
                        + ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes ' \
                        + ' | runRowViewPipelineDelta | runMetricsPipelineDelta '\
                        + ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                        + ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare')

    args = parser.parse_args()

    config.update(load_config())

    global logger
    logger = setup_logging('AdsDataSqlSync', config.get('LOG_LEVEL', 'INFO'))
    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))
    nonbib_connection_string = config.get('INGEST_DATABASE',
                                   'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get('METRICS_DATABASE',
                                   'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string)
    metrics_db_conn = metrics_db_engine.connect()
    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()

    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)

    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)

    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)

    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)

    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)

    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)
        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()
    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_delta_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()
    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, int(args.batchSize))
    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))

    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from new pipeline to production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get('METRICS_DATABASE2',
                                               'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(args.metricsSchemaName2))

        print('m2', metrics_connection_string2)
        print('m2 schema', args.metricsSchemaName2)
        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
                    print('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
        session.close()
        session2.close()

    else:
        print('app.py: illegal command or missing argument, command = ', args.command)
        print('  row view schema name = ', args.rowViewSchemaName)
        print('  row view baseline schema name = ', args.rowViewBaselineSchemaName)
        print('  metrics schema name = ', args.metricsSchemaName)

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
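
Hypothetical command-line invocations of the script above, written as comments because the entry-point filename is an assumption; the flags correspond to the argparse options defined at the top of main():

# python app.py createIngestTables -r nonbib
# python app.py runPipelinesDelta -r nonbib -t nonbibstaging -m metrics
# python app.py nonbibToMasterPipeline -b 2012ApJS..199...26H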
Example #29
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
#target_metadata = None
from adsputils import load_config

opath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
app_conf = load_config(proj_home=opath)

if opath not in sys.path:
    sys.path.insert(0, opath)
from ADSCitationCapture import models

target_metadata = models.Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.
Example #30
class test_resolver(unittest.TestCase):
    """tests for generation of resolver"""

    # Reference to testing.postgresql database instance
    db = None

    # Connection to the database used to set the database state before running each
    # test
    db_con = None

    # Map of database connection parameters passed to the functions we're testing
    db_conf = None

    config = {}
    config.update(load_config())

    def setUp(self):
        """ Module level set-up called once before any tests in this file are
        executed.  Creates a temporary database and sets it up """
        global db, db_con, db_conf
        db = testing.postgresql.Postgresql()
        # Get a map of connection parameters for the database which can be passed
        # to the functions being tested so that they connect to the correct
        # database
        db_conf = db.dsn()
        # Create a connection which can be used by our test functions to set and
        # query the state of the database
        db_con = psycopg2.connect(**db_conf)
        # Commit changes immediately to the database
        db_con.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        with db_con.cursor() as cur:
            # Create the initial database structure (roles, schemas, tables etc.)
            # basically anything that doesn't change
            cur.execute(self.slurp('tests/data/datalinks.sql'))
        self.maxDiff = None

    def tearDown(self):
        """ Called after all of the tests in this file have been executed to close
        the database connecton and destroy the temporary database """
        db_con.close()
        db.stop()

    def slurp(self, path):
        """ Reads and returns the entire contents of a file """
        with open(path, 'r') as f:
            return f.read()

    def test_data_query(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATA_QUERY'].format(
                db='public', bibcode='1903BD....C......0A'))
            self.assertEqual(fetch_data_link_elements_counts(cur.fetchone()),
                             [['CDS:1', 'Vizier:1'], 2])

    def test_esource_query1(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(
                db='public', bibcode='2016Atoms...4...18I'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['EPRINT_HTML', 'EPRINT_PDF'])

    def test_esource_query2(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(
                db='public', bibcode='2014MNRAS.444.1496E'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['PUB_PDF'])

    def test_esource_query3(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(
                db='public', bibcode='2014MNRAS.444.1497S'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['EPRINT_HTML', 'EPRINT_PDF', 'PUB_PDF'])

    def test_property_query1(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(
                db='public', bibcode='2004MNRAS.354L..31M'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['ASSOCIATED', 'ESOURCE', 'INSPIRE'])

    def test_property_query2(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(
                db='public', bibcode='1891opvl.book.....N'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['LIBRARYCATALOG'])

    def test_property_query3(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(
                db='public', bibcode='2018LPI....49.2177B'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['ESOURCE', 'TOC'])

    def test_extra_property_values(self):
        current_row = {}
        extra_properties = [{
            'pub_openaccess': True,
            'private': False,
            'ocrabstract': False,
            'nonarticle': True,
            'refereed': True
        }, {
            'pub_openaccess': False,
            'private': True,
            'ocrabstract': True,
            'nonarticle': False,
            'refereed': False
        }]
        results = [[
            'NONARTICLE', 'REFEREED', 'PUB_OPENACCESS', 'ADS_OPENACCESS',
            'AUTHOR_OPENACCESS', 'EPRINT_OPENACCESS', 'OPENACCESS'
        ], ['ARTICLE', 'NOT REFEREED', 'PRIVATE', 'OCRABSTRACT']]
        esources = [['ADS_PDF', 'AUTHOR_PDF', 'EPRINT_HTML'], []]
        for extra_property, result, esource in zip(extra_properties, results,
                                                   esources):
            current_row['property'] = []
            for key, value in extra_property.items():
                current_row[key] = value
            current_row['esource'] = esource
            current_row = add_data_link_extra_properties(current_row)
            self.assertEqual(current_row['property'], result)

    def test_datalinks_query(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATALINKS_QUERY'].format(
                db='public', bibcode='2004MNRAS.354L..31M'))
            rec = fetch_data_link_record(cur.fetchall())
            expected = [{
                'url':
                ['http://articles.adsabs.harvard.edu/pdf/1825AN......4..241B'],
                'title': [],
                'item_count':
                0,
                'link_type':
                'ESOURCE',
                'link_sub_type':
                'ADS_PDF'
            }, {
                'url': ['1825AN......4..241B', '2010AN....331..852K'],
                'title': ['Main Paper', 'Translation'],
                'item_count': 0,
                'link_type': 'ASSOCIATED',
                'link_sub_type': 'NA'
            }, {
                'url': [],
                'title': [],
                'item_count': 0,
                'link_type': 'INSPIRE',
                'link_sub_type': 'NA'
            }]
            self.assertEqual(rec, expected)

    def test_datalinks_query_for_associated(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATALINKS_QUERY'].format(
                db='public', bibcode='2004MNRAS.354L..31M'))
            self.assertEqual(fetch_data_link_record(cur.fetchall()), [{
                'url':
                ['http://articles.adsabs.harvard.edu/pdf/1825AN......4..241B'],
                'title': [],
                'item_count':
                0,
                'link_type':
                'ESOURCE',
                'link_sub_type':
                'ADS_PDF'
            }, {
                'url': ['1825AN......4..241B', '2010AN....331..852K'],
                'title': ['Main Paper', 'Translation'],
                'item_count':
                0,
                'link_type':
                'ASSOCIATED',
                'link_sub_type':
                'NA'
            }, {
                'url': [],
                'title': [],
                'item_count':
                0,
                'link_type':
                'INSPIRE',
                'link_sub_type':
                'NA'
            }])
Example #31
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
from aip.libs import read_records
from adsputils import setup_logging, load_config
from aip.models import Records
from aip import app, tasks

import time
import mmap
import argparse
from collections import OrderedDict
from sqlalchemy.orm import load_only

config = load_config()
logger = setup_logging('run.py')


def readBibcodesFromFile(files):
    # rca: all here is old code, i don't see why mmap was used
    """Reads contents of the BIBFILES into memory; basically bibcode:json_fingerprint
    pairs.
    
    @param files: list of files to read from
    @return: OrderedDict instance
    """
    start = time.time()
    records = OrderedDict()

    for f in files:
Example #32
 def setUp(self):
     super(TestXMLExtractorBase, self).setUp()
     self.preferred_parser_names = load_config().get(
         'PREFERRED_XML_PARSER_NAMES'
     )  # Iterate through all the parsers defined in config.py
Example #33
from datetime import datetime, timedelta
from os import remove
from shutil import move
import subprocess
import os

from adsputils import setup_logging, load_config

logger = setup_logging('AutomatedIngestReport')
conf = load_config(proj_home='./')


# enums used to generate file names
class FileType:
    CANONICAL = 'CANONICAL'
    SOLR = 'SOLR'
    FULLTEXT = 'FULLTEXT'


class FileAdjective:
    MISSING = 'MISSING'
    DELETED = 'DELETED'
    EXTRA = 'EXTRA'
    NEW = 'NEW'


class Date:
    TODAY = 1
    YESTERDAY = 2