def __init__(self, app_name, *args, **kwargs):
    Celery.__init__(self, *args, **kwargs)
    self._config = adsputils.load_config()
    self._session = None
    self._engine = None
    self._app_name = app_name
    self.logger = adsputils.setup_logging(app_name)  # default logger
def __init__(self, fields, ignore_fields, new_fields):
    self.fields = fields
    self.ignore_fields = ignore_fields
    self.new_fields = new_fields
    self.logger = setup_logging('validate', 'INFO')
    self.config = {}
    self.config.update(load_config())
def __init__(self, file_):
    self._file = file_
    self.read_count = 0  # needed for logging
    self.logger = setup_logging('AdsDataSqlSync', 'DEBUG')
    self.logger.info('nonbib file ingest, file {}'.format(self._file))
    self.config = {}
    self.config.update(load_config())
    self._iostream = open(file_, 'r')
def test_load_config(self):
    with patch('adsputils.load_module') as load_module:
        c = adsputils.load_config()
        f = os.path.abspath(os.path.join(os.path.dirname(inspect.getsourcefile(adsputils)), '..'))
        self.assertEqual((f + '/config.py',), load_module.call_args_list[0][0])
        self.assertEqual((f + '/local_config.py',), load_module.call_args_list[1][0])
        self.assertEqual(c['PROJ_HOME'], f)

    with patch('adsputils.load_module') as load_module:
        adsputils.load_config('/tmp')
        self.assertEqual(('/tmp/config.py',), load_module.call_args_list[0][0])
        self.assertEqual(('/tmp/local_config.py',), load_module.call_args_list[1][0])
def query_Kibana(query='"+@log_group:\\"backoffice-orcid_pipeline-daemon\\" +@message:\\"Claim refused\\""',
                 n_days=7, rows=5):
    """
    Function to query Kibana for a given input query and return the response.

    :param query: string query, same as would be entered in the Kibana search input
        (be sure to escape quotes and wrap query in double quotes - see default query for formatting)
    :param n_days: number of days backwards to query, starting now (=0 for all time)
    :param rows: number of results to return. If you just need the total number of hits
        and not the results themselves, can be small.
    :return: JSON results
    """
    config = {}
    config.update(load_config())

    # get start and end timestamps (in milliseconds since 1970 epoch)
    now = datetime.datetime.now(tzutc())
    epoch = datetime.datetime.utcfromtimestamp(0).replace(tzinfo=pytz.UTC)
    end_time = (now - epoch).total_seconds() * 1000.
    if n_days != 0:
        start_time = (now - datetime.timedelta(days=n_days) - epoch).total_seconds() * 1000.
    else:
        start_time = 0.

    data = ('{"index":["cwl-*"]}\n{"size":%.0f,"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],' % (rows) +
            '"query":{"bool":{"must":[{"query_string":{"analyze_wildcard":true, "query":' + query + '}}, ' +
            '{"range": {"@timestamp": {"gte": %.0f, "lte": %.0f,"format": "epoch_millis"}}}], "must_not":[]}}, ' % (start_time, end_time) +
            '"docvalue_fields":["@timestamp"]}\n\n')

    header = {'origin': 'https://pipeline-kibana.kube.adslabs.org',
              'authorization': 'Basic ' + config['KIBANA_TOKEN'],
              'content-type': 'application/x-ndjson',
              'kbn-version': '5.5.2'}

    url = 'https://pipeline-kibana.kube.adslabs.org/_plugin/kibana/elasticsearch/_msearch'

    # set to bypass SSL cert problem w/ Kibana
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    resp = app.client.post(url, data=data, headers=header, verify=False)

    if resp.status_code == 200:
        results = resp.json()
        return results

    logger.warn('For query {}, there was a network problem: {}\n'.format(query, resp))
    return None
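# Usage sketch for query_Kibana above: count "Claim refused" log messages over the
# last 30 days. Only the hit total is needed, so rows is kept small; the response
# shape ('responses'[0]['hits']['total']) matches how callers such as
# num_missing_profile read it.
results = query_Kibana(n_days=30, rows=1)
if results is not None:
    total = results['responses'][0]['hits']['total']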
def init(cls):
    if cls._initted is False:
        config = load_config()
        root_dir = config.get('INPUT_DATA_ROOT', './adsdata/tests/data1/config/')
        cls._reference_network = _Network(root_dir + data_files['reference']['path'])
        cls._citation_network = _Network(root_dir + data_files['citation']['path'])
        cls._refereed_list = _Refereed(root_dir + data_files['refereed']['path'])
        cls._initted = True
def __init__(self, schema_='metrics'):
    self.logger = setup_logging('AdsDataSqlSync', 'INFO')
    self.schema = schema_
    self.table = models.MetricsTable()
    self.table.schema = self.schema
    # used to buffer writes
    self.upserts = []
    self.tmp_update_buffer = []
    self.tmp_count = 0
    self.config = {}
    self.config.update(load_config())
def create_app(app_name='adstb', local_config=None):
    """Builds and initializes the Celery application."""
    conf = adsputils.load_config()
    if local_config:
        conf.update(local_config)

    app = ADSTurboBeeCelery(app_name,
                            broker=conf.get('CELERY_BROKER', 'pyamqp://'),
                            include=conf.get('CELERY_INCLUDE', ['adstb.tasks']))
    return app
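# Usage sketch for create_app above: build the application with a local override.
# The broker URL is an assumption for illustration, and the .logger attribute is
# assumed to come from the adsputils Celery base class.
app = create_app(local_config={'CELERY_BROKER': 'pyamqp://guest@localhost:5672//'})
app.logger.info('adstb application configured')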
def __init__(self, sqlachemy_url, group_changes_in_chunks_of=1, sqlalchemy_echo=False,
             schema_prefix="citation_capture_", force=False):
    """
    Initializes the class and prepares DB connection.

    :param sqlachemy_url: URL to connect to the DB.
    :param group_changes_in_chunks_of: Number of citation changes to be grouped when iterating.
    :param sqlalchemy_echo: Print every SQL statement.
    :param schema_prefix: Data is stored in schemas that correspond to a prefix + file last access date.
    :param force: If tables already exist in DB, drop them and re-ingest.
    """
    self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
    self.connection = self.engine.connect()
    self.session = sessionmaker(bind=self.engine)()
    #
    # - Use app logger:
    #import logging
    #self.logger = logging.getLogger('ads-citation-capture')
    # - Or individual logger for this file:
    from adsputils import setup_logging, load_config
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=config.get('LOG_STDOUT', False))
    #
    self.table_name = RawCitation.__tablename__
    self.expanded_table_name = "expanded_" + self.table_name
    self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
    self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
    self.joint_table_name = CitationChanges.__tablename__
    self.schema_prefix = schema_prefix
    self.schema_name = None
    self.previous_schema_name = None
    self.input_refids_filename = None
    self.group_changes_in_chunks_of = group_changes_in_chunks_of
    self.offset = 0
    self.n_changes = 0
    self.force = force
    self.last_modification_date = None
def __init__(self, fields, ignore_fields, new_fields):
    self.fields = fields
    self.ignore_fields = ignore_fields
    self.new_fields = new_fields
    # - Use app logger:
    # import logging
    # self.logger = logging.getLogger('master-pipeline')
    # - Or individual logger for this file:
    from adsputils import setup_logging, load_config
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
def main():
    global config
    config.update(load_config())
    global logger
    logger = setup_logging('ADSData', config.get('LOG_LEVEL', 'INFO'))

    parser = argparse.ArgumentParser(description='generate nonbib data')
    args = parser.parse_args()

    load(config)
    # compute metrics for a bibcode
    compute_metrics('2012ApJS..199...26H')
    # lots_of_metrics(config)
    logger.info('end of program')
def __init__(self, schema_='nonbib'):
    self.schema = schema_
    self.meta = MetaData()
    self.table = models.NonBibTable()
    self.table.schema = self.schema
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=config.get('LOG_STDOUT', False))
def __init__(self, file_):
    self._file = file_
    self.read_count = 0  # needed for logging
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
    self.logger.info('nonbib file ingest, file {}'.format(self._file))
    self._iostream = open(file_, 'r')
def setUp(self):
    unittest.TestCase.setUp(self)
    config = load_config()
    proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
    self.app = app.ADSMasterPipelineCelery('test', local_config={
        'SQLALCHEMY_URL': 'sqlite:///',
        'METRICS_SQLALCHEMY_URL': 'postgresql://[email protected]:15678/test',
        'SQLALCHEMY_ECHO': True,
        'PROJ_HOME': proj_home,
        'TEST_DIR': os.path.join(proj_home, 'adsmp/tests'),
    })
    Base.metadata.bind = self.app._session.get_bind()
    Base.metadata.create_all()
    MetricsBase.metadata.bind = self.app._metrics_engine
    MetricsBase.metadata.create_all()
def setUp(self):
    unittest.TestCase.setUp(self)
    config = load_config()
    proj_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
    self.app = app.ADSMasterPipelineCelery('test', local_config={
        'SQLALCHEMY_URL': config.get('METRICS_SQLALCHEMY_URL'),
        'METRICS_SQLALCHEMY_URL': config.get('METRICS_SQLALCHEMY_URL'),
        'SQLALCHEMY_ECHO': False,
        'PROJ_HOME': proj_home,
        'TEST_DIR': os.path.join(proj_home, 'adsmp/tests'),
    })
    Base.metadata.bind = self.app._session.get_bind()
    Base.metadata.create_all()
    MetricsBase.metadata.bind = self.app._session.get_bind()
    MetricsBase.metadata.create_all()
def __init__(self, schema_='metrics'):
    # - Use app logger:
    #import logging
    #logger = logging.getLogger('ads-data')
    # - Or individual logger for this file:
    proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
    self.config = load_config(proj_home=proj_home)
    self.logger = setup_logging(__name__, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
    self.schema = schema_
    self.table = models.MetricsTable()
    self.table.schema = self.schema
    # used to buffer writes
    self.upserts = []
    self.tmp_update_buffer = []
    self.tmp_count = 0
def __init__(self, app_name, *args, **kwargs):
    """
    :param: app_name - string, name of the application (can be anything)
    :keyword: local_config - dict, configuration that should be applied
        over the default config (that is loaded from config.py and local_config.py)
    """
    proj_home = None
    if 'proj_home' in kwargs:
        proj_home = kwargs.pop('proj_home')
    self.config = load_config(extra_frames=1, proj_home=proj_home, app_name=app_name)

    local_config = None
    if 'local_config' in kwargs and kwargs['local_config']:
        local_config = kwargs.pop('local_config')
        self.config.update(local_config)  # our config
    if not proj_home:
        proj_home = self.config.get('PROJ_HOME', None)
    self.logger = setup_logging(app_name, proj_home=proj_home,
                                level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                attach_stdout=self.config.get('LOG_STDOUT', False))
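# Usage sketch for the constructor above (the concrete class name ADSCelery is an
# assumption; this is the shared adsputils-style base class): local_config entries
# override whatever was loaded from config.py / local_config.py.
app = ADSCelery('my-pipeline', local_config={'LOGGING_LEVEL': 'DEBUG'})
app.logger.debug('application configured with overrides')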
def setUp(self):
    unittest.TestCase.setUp(self)
    self.proj_home = tasks.app.conf['PROJ_HOME']
    self._app = tasks.app
    # Use a different database for unit tests since they will modify it
    self.sqlalchemy_url = "{}_test".format(
        load_config().get('SQLALCHEMY_URL', 'postgres://postgres@localhost:5432/citation_capture_pipeline'))
    config = {
        "TESTING_MODE": False,
        "CELERY_ALWAYS_EAGER": False,
        "CELERY_EAGER_PROPAGATES_EXCEPTIONS": False,
        "SQLALCHEMY_URL": self.sqlalchemy_url,
    }
    self.app = app.ADSCitationCaptureCelery('test', proj_home=self.proj_home, local_config=config)
    tasks.app = self.app  # monkey-patch the app object
    self._init_mock_data()
    try:
        Base.metadata.create_all(bind=self.app._engine, checkfirst=True)
    except:
        # Database not empty!
        raise
class NonbibFileReader(object):
    """reads nonbib column files

    file reading/parsing is controlled by the file's properties dict in file_defs
    every line must start with a bibcode
    file must be sorted by bibcode
    """
    bibcode_length = 19
    config = load_config()

    def __init__(self, filetype, file_info):
        """passed file type (e.g., canonical) and relevant part of file_defs"""
        self.filetype = filetype
        self.file_info = file_info
        self.filename = self.config.get('INPUT_DATA_ROOT', './') + file_info['path']
        self.logger = tasks.app.logger
        self.read_count = 0   # used in logging
        self.buffer = None    # holds at most one line of text
        self._iostream = open(self.filename, 'r', encoding='utf-8')

    def __enter__(self, *args, **kwargs):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def __iter__(self):
        return self

    def next(self):
        return next(self._iostream)

    def close(self):
        self._iostream.close()
        del self._iostream

    def _pushline(self, s):
        """the buffer is used when we read a line that is beyond the desired bibcode
        and we need to unread it"""
        if self.buffer:
            self.logger.error(
                'error in file {}, {}, _pushline called when buffer was not empty.'
                ' File line number: {}, read line: {}, buffer: {}'
                .format(self.filetype, self.filename, self.read_count, s, self.buffer))
        self.buffer = s

    def _readline(self):
        """return the next valid line or empty string at eof

        used to read all files"""
        self.read_count += 1
        if self.buffer:
            line = self.buffer
            self.buffer = None
            return line
        if self._iostream.closed:
            return ''
        line = self._iostream.readline()
        while len(line) > 0 and len(line) < self.bibcode_length:
            self.logger.error(
                'error, invalid short line in readline {} filename: {} at line {},'
                ' line length less than length of bibcode, line: {}'
                .format(self.filetype, self.filename, self.read_count, line))
            self.read_count += 1
            line = self._iostream.readline()
        return line

    def read_value_for(self, bibcode):
        """return the value from the file for the passed bibcode

        returns default value if bibcode is not in file
        return value is a dict with the key of self.filetype

        some files repeat a bibcode on consecutive lines to provide multiple values
        other files do not repeat a bibcode and provide multiple values on a single line
        other files (e.g., relevance/docmetrics.tab) have multiple values
        some files have associated effects on values like property field
        this reader handles all cases based on the file property dict
        """
        # first, are we at eof?
        current_line = self._readline()
        if len(current_line) == 0:
            # here if we are already at eof, bibcode isn't in file
            return self._convert_value(self.file_info['default_value'])

        # next, skip over lines in file until we:
        #   either find the passed bibcode or determine it isn't in the file
        skip_count = 0
        while len(current_line) != 0 and self._get_bibcode(current_line) < bibcode:
            current_line = self._readline()
            skip_count = skip_count + 1

        # at this point, we have either read to the desired bibcode
        #   or it doesn't exist and we read past it
        if len(current_line) == 0 or bibcode != self._get_bibcode(current_line):
            # bibcode not in file
            self._pushline(current_line)
            return self._convert_value(self.file_info['default_value'])

        if isinstance(self.file_info['default_value'], bool):
            return self._convert_value(True)  # boolean files only hold bibcodes, all values are True

        # at this point, we have the first line with the bibcode in it
        #   roll up possible other values on adjacent lines in file
        value = []
        value.append(self._get_rest(current_line))
        current_line = self._readline()
        while (self.file_info.get('multiline', False) and (current_line is not None)
               and (bibcode == self._get_bibcode(current_line))):
            value.append(self._get_rest(current_line))
            current_line = self._readline()

        # at this point we have read beyond the desired bibcode, must back up
        self._pushline(current_line)
        # finally, convert raw input into something useful
        return self._convert_value(value)

    def _convert_value(self, value):
        """convert file string line to something more useful

        return a dict with filetype as key and value converted
        """
        if isinstance(value, str) and '\x00' in value:
            # there should not be nulls in strings
            self.logger.error(
                'error string contained a null in file {} {}, line number: {}, value: {}'
                .format(self.filetype, self.filename, self.read_count, value))
            value = value.replace('\x00', '')

        return_value = value
        if isinstance(value, bool):
            d = {self.filetype: return_value}
            if 'extra_values' in self.file_info and value != self.file_info['default_value']:
                d.update(self.file_info['extra_values'])
            return {self.filetype: d}
        elif (len(value) > 0 and '\t' in value[0]
              and not self.file_info.get('tabs_to_spaces', False)):
            # tab separator in string means we need to convert elements to array
            z = []
            for r in value:
                x = r.split('\t')
                if self.file_info.get('string_to_number', True):
                    # convert valid ints and floats to numeric representation
                    t = []
                    for y in x:
                        t.append(self._convert_scalar(y))
                    z.append(t)
            return_value = z
            if len(return_value) == 1:
                return_value = return_value[0]
        elif 'interleave' in self.file_info and value != self.file_info['default_value']:
            # here on multi-line dict (e.g., associations)
            # interleave data on successive lines e.g., merge first element in each array, second element, etc.
            # since they also have subparts, these arrays will then be put in a dict with the corresponding key
            x = {}
            for k in self.file_info['subparts']:
                x[k] = []
            for r in value:
                # For instance, in associations 'r' should contain:
                #   URL title
                # where title may contain spaces too
                parts = r.split(' ', 1)
                # parts will contain [URL, title]
                if len(parts) < len(self.file_info['subparts']):
                    self.logger.error(
                        'error in reader with interleave for {} file {}, incomplete value in line.'
                        ' value = {}, parts = {} at line {}'
                        .format(self.filetype, self.filename, value, parts, self.read_count))
                else:
                    for i, k in enumerate(self.file_info['subparts']):
                        v = parts[i].strip()
                        x[k].append(v)
            return_value = x
        elif self.file_info.get('tabs_to_spaces', False):
            # files like simbad_objects have tabs that we simply convert to spaces
            x = []
            for a in value:
                x.append(a.replace('\t', ' '))
            return_value = x
        elif len(value) > 1:
            x = []
            for r in value:
                x.append(r.replace('\t', ' ').strip())
            return_value = x

        # convert array to dict if needed
        if ('subparts' in self.file_info and return_value != self.file_info['default_value']
                and 'interleave' not in self.file_info):
            if type(return_value[0]) is list:
                x = []
                for r in return_value:
                    x.append(self._convert_subparts(r))
            else:
                x = self._convert_subparts(return_value)
            return_value = x

        # are there extra_values to add to dict
        if 'extra_values' in self.file_info:
            self._add_extra_values(return_value)
        return {self.filetype: return_value}

    def _add_extra_values(self, current):
        if current != self.file_info['default_value'] and type(current) is dict:
            current.update(self.file_info['extra_values'])
        elif current != self.file_info['default_value'] and type(current) is list:
            # here with array of dicts, put extra_values in each dict
            for x in current:
                v = self.file_info['extra_values']
                if type(v) is dict and type(x) is dict:
                    x.update(v)
                else:
                    self.logger.error(
                        'serious error in reader._add_extra_values, non dict value,'
                        ' extra_values = {}, processing element = {}, passed current = {}'
                        .format(x, v, current))

    def _convert_subparts(self, current):
        d = {}
        for i, k in enumerate(self.file_info['subparts']):
            v = ''
            if i < len(current):
                v = current[i]
            if type(k) is list:
                # here if key is in a list by itself which means values should be in a list
                k = k[0]
                v = [v]
            d[k] = v
        return d

    def _get_bibcode(self, s):
        """return the bibcode from the front of the line"""
        if s is None:
            return None
        if len(s) < self.bibcode_length:
            self.logger.error(
                'error, invalid short line in file {} {} at line {},'
                ' line length less than length of bibcode, line = {}'
                .format(self.filetype, self.filename, self.read_count, s))
            return s
        return s[:self.bibcode_length].strip()

    def _get_rest(self, s):
        """return the text after the bibcode and first tab separator"""
        if len(s) < self.bibcode_length + 1:
            self.logger.error(
                'error, in _get_rest with invalid short line in file {} {} at line {},'
                ' line length less than length of bibcode plus 1, line = {}'
                .format(self.filetype, self.filename, self.read_count, s))
            return ''
        return s[self.bibcode_length + 1:].strip()

    def _convert_scalar(self, s):
        if s.isdigit():
            return int(s)
        try:
            x = float(s)
            return x
        except ValueError:
            return s.strip()
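# Usage sketch for NonbibFileReader above (the filetype and file_info values are
# assumptions modeled on the class docstring; real entries come from file_defs).
# Bibcodes must be requested in sorted order because the reader only moves forward.
with NonbibFileReader('refereed', {'path': 'refereed/all.links', 'default_value': False}) as reader:
    value = reader.read_value_for('2012ApJS..199...26H')  # a dict keyed by the filetype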
import os
import requests
import json
from adsputils import date2solrstamp
import sys
import time
from collections import OrderedDict

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('master-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__, proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

# =============================== FUNCTIONS ======================================= #


def extract_metrics_pipeline(data, solrdoc):
    citation = data.get('citations', [])
    return dict(citation=citation)
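# Minimal illustration (hypothetical input) of extract_metrics_pipeline above:
# it only copies the 'citations' list out of the metrics record and ignores solrdoc.
example = extract_metrics_pipeline({'citations': ['2012ApJS..199...26H']}, solrdoc={})
assert example == {'citation': ['2012ApJS..199...26H']}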
def setUp(self):
    self.config = {}
    self.config.update(load_config())
    self.assertEqual(-1, self.config['MAX_ROWS'],
                     'tests fail when local_config.py sets MAX_ROWS')
import collections
import copy
import datetime
import itertools
import os
import sys
import types

from aip.libs import enforce_schema, author_match
import adsputils as utils

_config = utils.load_config()


def mergeRecords(records):
    completeRecords = []
    e = enforce_schema.Enforcer()  # TODO: no need to create new instances?
    for r in copy.deepcopy(records):
        r['text'] = Merger().mergeText(r['text'])
        blocks = e.ensureList(r['metadata'])

        # Multiply defined blocks need merging.
        metadatablockCounter = collections.Counter([i['tempdata']['type'] for i in blocks])
        needsMerging = dict([(k, []) for k, v in metadatablockCounter.iteritems() if v > 1])

        completeMetadata = {}
        # First pass: Add the singly defined blocks to the complete record
        for b in blocks:
            _type = b['tempdata']['type']
            if _type not in needsMerging:
                completeMetadata[_type] = b
            else:
                needsMerging[_type].append(b)
def claimed_records(debug=False, test=False):
    """
    Reporting function; checks SOLR for the following:
        - number of records that have been claimed by at least one ORCID ID, in orcid_pub,
          orcid_user, orcid_other (each reported separately)
        - total number of accepted claims of each of orcid_pub, orcid_user, orcid_other
          (i.e. if a single record has two separate authors who have successfully created
          a claim, the number reported here is 2)
        - total number of bibcodes that have been claimed, of any type

    The report is designed to be run regularly, and the results compared to previous
    report runs (via logs)

    :return: None (output to logs)
    """
    if test:
        logger = setup_logging('test_claimed')
    else:
        logger = setup_logging('reporting')

    config = {}
    config.update(load_config())

    # the first 7 digits of ORCID IDs are zero padding
    orcid_wild = '000000*'
    resp_pub = query_solr(config['SOLR_URL'], 'orcid_pub:"' + orcid_wild + '"',
                          rows=10, sort="bibcode desc", fl='bibcode')
    resp_user = query_solr(config['SOLR_URL'], 'orcid_user:"' + orcid_wild + '"',
                           rows=10, sort="bibcode desc", fl='bibcode')
    resp_other = query_solr(config['SOLR_URL'], 'orcid_other:"' + orcid_wild + '"',
                            rows=10, sort="bibcode desc", fl='bibcode')

    logger.info('Number of records with an orcid_pub: {}'.format(resp_pub['response']['numFound']))
    logger.info('Number of records with an orcid_user: {}'.format(resp_user['response']['numFound']))
    logger.info('Number of records with an orcid_other: {}'.format(resp_other['response']['numFound']))

    start = 0
    rows = 1000

    results = resp_pub['response']['docs']

    num_orcid_pub = 0
    num_orcid_user = 0
    num_orcid_other = 0

    bibcode_pub = set()
    bibcode_user = set()
    bibcode_other = set()
    while results:
        results = query_records(start=start, rows=rows)
        for i in range(len(results)):
            try:
                results[i]['orcid_pub']
            except KeyError:
                pass
            else:
                num_p = len(fnmatch.filter(results[i].get('orcid_pub'), '0000*'))
                num_orcid_pub += num_p
                bibcode_pub.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_user']
            except KeyError:
                pass
            else:
                num_u = len(fnmatch.filter(results[i].get('orcid_user'), '0000*'))
                num_orcid_user += num_u
                bibcode_user.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_other']
            except KeyError:
                pass
            else:
                num_o = len(fnmatch.filter(results[i].get('orcid_other'), '0000*'))
                num_orcid_other += num_o
                bibcode_other.add(results[i].get('bibcode'))
        if debug:
            if (start + rows) % 10000 == 0:
                logger.info('Number of results processed so far: {}'.format(start + rows))
        if test:
            break
        else:
            start += rows

    logger.info('Total number of orcid_pub claims: {}'.format(num_orcid_pub))
    logger.info('Total number of orcid_user claims: {}'.format(num_orcid_user))
    logger.info('Total number of orcid_other claims: {}'.format(num_orcid_other))

    orcid_bibcodes = bibcode_pub.union(bibcode_user).union(bibcode_other)
    logger.info('Total number of records with any ORCID claims: {}'.format(len(orcid_bibcodes)))
import os
import fnmatch
import datetime
import cachetools
import time
import pytz
import urllib3

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('orcid-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__, proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

app = tasks.app
records_cache = cachetools.TTLCache(maxsize=1024, ttl=3600, timer=time.time,
                                    missing=None, getsizeof=None)

# =============================== FUNCTIONS ======================================= #
def num_missing_profile(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of profiles reported missing over a
    given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """
    if test:
        logger = setup_logging('test_kibana')

    query = '"+@log_group:\\"backoffice-orcid_pipeline-daemon\\" +@message:\\"Missing profile for\\""'

    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info('Number of missing profile errors in the last {} days: {}'.format(n_days, total))


if __name__ == '__main__':
    # Runs all reporting scripts, outputs results to logs
    # Before running, tunnel into SOLR and postgres and specify localhost URLs for
    # SOLR_URL and SQLALCHEMY_URL, respectively, in local_config.py
    config = {}
    config.update(load_config())

    claimed_records()
    num_claims(n_days=7)
    num_refused_claims(n_days=7)
    num_missing_profile(n_days=7)
def main():
    parser = argparse.ArgumentParser(description='process column files into Postgres')
    parser.add_argument('-t', '--rowViewBaselineSchemaName', default='nonbibstaging',
                        help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d', '--diagnose', default=False, action='store_true',
                        help='run simple test')
    parser.add_argument('-f', '--filename', default='bibcodes.txt',
                        help='name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m', '--metricsSchemaName', default='metrics',
                        help='name of the postgres metrics schema')
    parser.add_argument('-n', '--metricsSchemaName2', default='',
                        help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r', '--rowViewSchemaName', default='nonbib',
                        help='name of the postgres row view schema')
    parser.add_argument('-s', '--batchSize', default=100,
                        help='used when queuing data')
    parser.add_argument('-b', '--bibcodes', default='',
                        help='comma separate list of bibcodes send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema '
                             ' | createJoinedRows | createMetricsTable | dropMetricsTable '
                             ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta '
                             ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes '
                             ' | runRowViewPipelineDelta | runMetricsPipelineDelta '
                             ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                             ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare')

    args = parser.parse_args()

    config.update(load_config())
    global logger
    logger = setup_logging('AdsDataSqlSync', config.get('LOG_LEVEL', 'INFO'))
    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))

    nonbib_connection_string = config.get('INGEST_DATABASE',
                                          'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get('METRICS_DATABASE',
                                           'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string)
    metrics_db_conn = metrics_db_engine.connect()

    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()

    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)

    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)

    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)

    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)

    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)

    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()

    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)

    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)

    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))

    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_delta_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))

    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()

    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)

    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, int(args.batchSize))

    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine, args.metricsSchemaName,
                                         nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))

    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from new pipeline to production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get('METRICS_DATABASE2',
                                                'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(args.metricsSchemaName2))
        print 'm2', metrics_connection_string2
        print 'm2 schema', args.metricsSchemaName2

        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
                    print '{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch)
        session.close()
        session2.close()

    else:
        print 'app.py: illegal command or missing argument, command = ', args.command
        print '  row view schema name = ', args.rowViewSchemaName
        print '  row view baseline schema name = ', args.rowViewBaselineSchemaName
        print '  metrics schema name = ', args.metricsSchemaName

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
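# Example invocations (sketch) of the command-line tool above, assuming the module
# is run as app.py (as its error message suggests); command names and flags are
# taken from the argparse definition:
#   python app.py createIngestTables
#   python app.py runRowViewPipelineDelta -r nonbib -t nonbibstaging
#   python app.py nonbibToMasterPipeline -b 2012ApJS..199...26H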
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
#target_metadata = None
from adsputils import load_config

opath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
app_conf = load_config(proj_home=opath)
if opath not in sys.path:
    sys.path.insert(0, opath)

from ADSCitationCapture import models
target_metadata = models.Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.
class test_resolver(unittest.TestCase):
    """tests for generation of resolver"""

    # Reference to testing.postgresql database instance
    db = None

    # Connection to the database used to set the database state before running each
    # test
    db_con = None

    # Map of database connection parameters passed to the functions we're testing
    db_conf = None

    config = {}
    config.update(load_config())

    def setUp(self):
        """
        Set-up called before each test in this file is executed.
        Creates a temporary database and sets it up
        """
        global db, db_con, db_conf
        db = testing.postgresql.Postgresql()
        # Get a map of connection parameters for the database which can be passed
        # to the functions being tested so that they connect to the correct
        # database
        db_conf = db.dsn()
        # Create a connection which can be used by our test functions to set and
        # query the state of the database
        db_con = psycopg2.connect(**db_conf)
        # Commit changes immediately to the database
        db_con.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        with db_con.cursor() as cur:
            # Create the initial database structure (roles, schemas, tables etc.)
            # basically anything that doesn't change
            cur.execute(self.slurp('tests/data/datalinks.sql'))
        self.maxDiff = None

    def tearDown(self):
        """
        Called after each test in this file has been executed to close the
        database connection and destroy the temporary database
        """
        db_con.close()
        db.stop()

    def slurp(self, path):
        """
        Reads and returns the entire contents of a file
        """
        with open(path, 'r') as f:
            return f.read()

    def test_data_query(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATA_QUERY'].format(db='public', bibcode='1903BD....C......0A'))
            self.assertEqual(fetch_data_link_elements_counts(cur.fetchone()),
                             [['CDS:1', 'Vizier:1'], 2])

    def test_esource_query1(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(db='public', bibcode='2016Atoms...4...18I'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['EPRINT_HTML', 'EPRINT_PDF'])

    def test_esource_query2(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(db='public', bibcode='2014MNRAS.444.1496E'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()), ['PUB_PDF'])

    def test_esource_query3(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['ESOURCE_QUERY'].format(db='public', bibcode='2014MNRAS.444.1497S'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['EPRINT_HTML', 'EPRINT_PDF', 'PUB_PDF'])

    def test_property_query1(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(db='public', bibcode='2004MNRAS.354L..31M'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()),
                             ['ASSOCIATED', 'ESOURCE', 'INSPIRE'])

    def test_property_query2(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(db='public', bibcode='1891opvl.book.....N'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()), ['LIBRARYCATALOG'])

    def test_property_query3(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['PROPERTY_QUERY'].format(db='public', bibcode='2018LPI....49.2177B'))
            self.assertEqual(fetch_data_link_elements(cur.fetchone()), ['ESOURCE', 'TOC'])

    def test_extra_property_values(self):
        current_row = {}
        extra_properties = [{'pub_openaccess': True, 'private': False, 'ocrabstract': False,
                             'nonarticle': True, 'refereed': True},
                            {'pub_openaccess': False, 'private': True, 'ocrabstract': True,
                             'nonarticle': False, 'refereed': False}]
        results = [['NONARTICLE', 'REFEREED', 'PUB_OPENACCESS', 'ADS_OPENACCESS',
                    'AUTHOR_OPENACCESS', 'EPRINT_OPENACCESS', 'OPENACCESS'],
                   ['ARTICLE', 'NOT REFEREED', 'PRIVATE', 'OCRABSTRACT']]
        esources = [['ADS_PDF', 'AUTHOR_PDF', 'EPRINT_HTML'], []]
        for extra_property, result, esource in zip(extra_properties, results, esources):
            current_row['property'] = []
            for key, value in extra_property.iteritems():
                current_row[key] = value
            current_row['esource'] = esource
            current_row = add_data_link_extra_properties(current_row)
            self.assertEqual(current_row['property'], result)

    def test_datalinks_query(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATALINKS_QUERY'].format(db='public', bibcode='2004MNRAS.354L..31M'))
            rec = fetch_data_link_record(cur.fetchall())
            expected = [{'url': ['http://articles.adsabs.harvard.edu/pdf/1825AN......4..241B'],
                         'title': [], 'item_count': 0,
                         'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF'},
                        {'url': ['1825AN......4..241B', '2010AN....331..852K'],
                         'title': ['Main Paper', 'Translation'], 'item_count': 0,
                         'link_type': 'ASSOCIATED', 'link_sub_type': 'NA'},
                        {'url': [], 'title': [], 'item_count': 0,
                         'link_type': 'INSPIRE', 'link_sub_type': 'NA'}]
            self.assertEqual(rec, expected)

    def test_datalinks_query_for_associated(self):
        with db_con.cursor() as cur:
            cur.execute(self.config['DATALINKS_QUERY'].format(db='public', bibcode='2004MNRAS.354L..31M'))
            self.assertEqual(fetch_data_link_record(cur.fetchall()),
                             [{'url': ['http://articles.adsabs.harvard.edu/pdf/1825AN......4..241B'],
                               'title': [], 'item_count': 0,
                               'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF'},
                              {'url': ['1825AN......4..241B', '2010AN....331..852K'],
                               'title': ['Main Paper', 'Translation'], 'item_count': 0,
                               'link_type': 'ASSOCIATED', 'link_sub_type': 'NA'},
                              {'url': [], 'title': [], 'item_count': 0,
                               'link_type': 'INSPIRE', 'link_sub_type': 'NA'}])
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import sys
from aip.libs import read_records
from adsputils import setup_logging, load_config
from aip.models import Records
from aip import app, tasks
import time
import mmap
import argparse
from collections import OrderedDict
from sqlalchemy.orm import load_only

config = load_config()
logger = setup_logging('run.py')


def readBibcodesFromFile(files):  # rca: all here is old code, i don't see why mmap was used
    """Reads contents of the BIBFILES into memory; basically bibcode:json_fingerprint pairs.

    @param files: list of files to read from
    @return: OrderedDict instance
    """
    start = time.time()
    records = OrderedDict()
    for f in files:
def setUp(self):
    super(TestXMLExtractorBase, self).setUp()
    # Iterate through all the parsers defined in config.py
    self.preferred_parser_names = load_config().get('PREFERRED_XML_PARSER_NAMES')
from datetime import datetime, timedelta
from os import remove
from shutil import move
import subprocess
import os
from adsputils import setup_logging, load_config

logger = setup_logging('AutomatedIngestReport')
conf = load_config(proj_home='./')


# enums used to generate file names
class FileType:
    CANONICAL = 'CANONICAL'
    SOLR = 'SOLR'
    FULLTEXT = 'FULLTEXT'


class FileAdjective:
    MISSING = 'MISSING'
    DELETED = 'DELETED'
    EXTRA = 'EXTRA'
    NEW = 'NEW'


class Date:
    TODAY = 1
    YESTERDAY = 2
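# Hedged sketch of how the enum-like classes above might be combined into a dated
# artifact name; the helper and the name format are assumptions for illustration,
# not part of the module.
def example_file_name(adjective, file_type, when=None):
    when = when or datetime.now()
    return '{}_{}_{}.txt'.format(when.strftime('%Y%m%d'), adjective, file_type)

# e.g. example_file_name(FileAdjective.MISSING, FileType.CANONICAL)
#      might return '20240101_MISSING_CANONICAL.txt'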