Example #1
def num_claims(app=app, n_days=7, test=False):
    """
    Reporting function; checks the postgres database for:
        - number of unique ORCID IDs who have created claims in the given range of time
            - if a single user creates a number of claims in the time period, that user is reported only once here
            - counts claims of all types
        - number of claims on a single bibcode by a single user in the given range of time
            - if a user claims 5 separate records in the given time period, the number of claims reported is 5
            - if a user claims a record multiple times in the given time period, the number of claims reported is 1
            - counts claims of type claimed, updated, and removed
        - total number of claims in the given time period
            - does not remove duplicates
            - meant to be compared to Kibana reports on number of rejected claims
    :param app: application object whose session_scope() provides the database session
    :param n_days: number of days backwards to look, starting from now
    :param test: if True, write to the test logger instead of the reporting logger
    :return: None (outputs to logs)
    """

    if test:
        logger = setup_logging('test_num_claimed')
    else:
        logger = setup_logging('reporting')

    now = datetime.datetime.now(tzutc())
    beginning = now - datetime.timedelta(days=n_days)

    with app.session_scope() as session:
        status_count = session.query(func.count(distinct(
            ClaimsLog.orcidid)), ClaimsLog.status).filter(
                and_(ClaimsLog.created >= beginning,
                     ClaimsLog.created <= now)).group_by(
                         ClaimsLog.status).all()

        for i in range(len(status_count)):
            logger.info(
                'Number of unique ORCID IDs generating claims of type {} in last {} days: {}'
                .format(status_count[i][1], n_days, status_count[i][0]))

        statuses = ['claimed', 'removed', 'updated']

        for s in statuses:
            #claims = session.query(func.count(distinct(ClaimsLog.bibcode)).
            #                       filter(and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now,
            #                                   ClaimsLog.status == s))).all()
            claims = session.query(ClaimsLog).distinct(
                ClaimsLog.bibcode, ClaimsLog.orcidid).filter(
                    and_(ClaimsLog.created >= beginning,
                         ClaimsLog.created <= now,
                         ClaimsLog.status == s)).all()

            logger.info(
                'Number of unique claims by a unique bibcode+ORCID ID pair that have been {} in the last {} days: {}'
                .format(s, n_days, len(claims)))

        total_claims = session.query(ClaimsLog).filter(
            and_(ClaimsLog.created >= beginning, ClaimsLog.created <= now,
                 ClaimsLog.status.in_(statuses))).all()

        logger.info(
            'Total number of non-unique claims with status {} in the last {} days, to compare with logging on rejected claims: {}'
            .format(statuses, n_days, len(total_claims)))
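
The commented-out query above hints at counting the bibcode+ORCID pairs inside the database instead of materializing every row. A minimal sketch of that approach, assuming a PostgreSQL backend and the same ClaimsLog model used above (illustrative only, not the pipeline's actual code):

from sqlalchemy import func, distinct, tuple_, and_
from ADSOrcid.models import ClaimsLog

def count_unique_pairs(session, beginning, now, status):
    # COUNT(DISTINCT (bibcode, orcidid)) lets PostgreSQL deduplicate the pairs,
    # so only a single integer travels back to the client.
    return session.query(
        func.count(distinct(tuple_(ClaimsLog.bibcode, ClaimsLog.orcidid)))
    ).filter(
        and_(ClaimsLog.created >= beginning,
             ClaimsLog.created <= now,
             ClaimsLog.status == status)
    ).scalar()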
Example #2
 def test_setup_logging(self):
     with patch('adsputils.ConcurrentRotatingFileHandler') as cloghandler:
         adsputils.setup_logging('app')
         f = os.path.abspath(
             os.path.join(os.path.abspath(__file__), '../../..'))
         self.assertEqual(
             "call(backupCount=5, encoding=u'UTF-8', filename=u'{filename}/logs/app.log', maxBytes=2097152, mode=u'a')"
             .format(filename=f), str(cloghandler.call_args))
 def test_setup_logging(self):
     with patch('adsputils.ConcurrentRotatingFileHandler') as cloghandler:
         adsputils.setup_logging('app')
         f = os.path.abspath(
             os.path.join(os.path.abspath(__file__), '../../..'))
         if sys.version_info > (3, ):
             test_data = "call(backupCount=10, encoding='UTF-8', filename='{filename}/logs/app.log', maxBytes=10485760, mode='a')".format(
                 filename=f)
         else:
             test_data = "call(backupCount=10, encoding=u'UTF-8', filename=u'{filename}/logs/app.log', maxBytes=10485760, mode=u'a')".format(
                 filename=f)
         self.assertEqual(test_data, str(cloghandler.call_args))
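
Both variants above compare the handler call against a hard-coded repr, which differs between Python 2 and 3 (and between adsputils versions). A sketch that inspects call_args directly instead, assuming unittest.mock and that setup_logging still passes these keyword arguments to the handler:

from unittest.mock import patch
import adsputils

def test_setup_logging_kwargs(self):
    with patch('adsputils.ConcurrentRotatingFileHandler') as cloghandler:
        adsputils.setup_logging('app')
        _, kwargs = cloghandler.call_args
        # assert on the parsed keyword arguments rather than on their string repr
        self.assertEqual(kwargs['backupCount'], 10)
        self.assertEqual(kwargs['encoding'], 'UTF-8')
        self.assertTrue(kwargs['filename'].endswith('/logs/app.log'))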
Example #4
 def __init__(self, app_name, *args, **kwargs):
     Celery.__init__(self, *args, **kwargs)
     self._config = adsputils.load_config()
     self._session = None
     self._engine = None
     self._app_name = app_name
     self.logger = adsputils.setup_logging(app_name)  #default logger
Example #5
    def __init__(self, sqlachemy_url, group_changes_in_chunks_of=1, sqlalchemy_echo=False, schema_prefix="citation_capture_", force=False):
        """
        Initializes the class and prepares DB connection.

        :param sqlachemy_url: URL to connect to the DB.
        :param group_changes_in_chunks_of: Number of citation changes to be
            grouped when iterating.
        :param sqlalchemy_echo: Print every SQL statement.
        :param schema_prefix: Data is stored in schemas that correspond to a
            prefix + file last access date.
        :param force: If tables already exists in DB, drop them and re-ingest.
        """
        self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
        self.connection = self.engine.connect()
        self.session = sessionmaker(bind=self.engine)()
        #
        self.logger = setup_logging(__name__)
        self.logger.propagate = False
        #
        self.table_name = RawCitation.__tablename__
        self.expanded_table_name = "expanded_" + self.table_name
        self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
        self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
        self.joint_table_name = CitationChanges.__tablename__
        self.schema_prefix = schema_prefix
        self.schema_name = None
        self.previous_schema_name = None
        self.input_refids_filename = None
        self.group_changes_in_chunks_of=group_changes_in_chunks_of
        self.offset = 0
        self.n_changes = 0
        self.force = force
        self.last_modification_date = None
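
A hypothetical instantiation of the class whose __init__ is shown above (the class name CitationCaptureDB and the connection URL are placeholders, not taken from the project):

db = CitationCaptureDB('postgresql://postgres@localhost:5432/citations',
                       group_changes_in_chunks_of=500,
                       sqlalchemy_echo=False,
                       force=True)
db.logger.info('connected; raw citation table is %s', db.table_name)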
Example #6
    def __init__(self, fields, ignore_fields, new_fields):
        self.fields = fields
        self.ignore_fields = ignore_fields
        self.new_fields = new_fields

        self.logger = setup_logging('validate', 'INFO')
        self.config = {}
        self.config.update(load_config())
Example #7
    def __init__(self, file_):
        self._file = file_
        self.read_count = 0   # needed for logging
        self.logger = setup_logging('AdsDataSqlSync', 'DEBUG')
        self.logger.info('nonbib file ingest, file {}'.format(self._file))
        self.config = {}
        self.config.update(load_config())

        self._iostream = open(file_, 'r')
Example #8
    def __init__(self, schema_='metrics'):
        self.logger = setup_logging('AdsDataSqlSync', 'INFO')

        self.schema =  schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes                                                                                         
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
        self.config = {}
        self.config.update(load_config())
Example #9
def num_refused_claims(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of refused claims over a given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """

    if test:
        logger = setup_logging('test_kibana')
    else:
        logger = setup_logging('reporting')

    query = '"+@log_group:\\"backoffice-logs\\" "+@log_group:\\"fluent-bit-backoffice_prod_orcid_pipeline_1\\" +@message:\\"Claim refused\\""'

    # don't need the full set of results as the total is passed separately
    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info('Number of claims rejected in the last {} days: {}'.format(
        n_days, total))
Example #10
def num_missing_profile(n_days=7, test=False):
    """
    Queries logs via Kibana to get the number of profiles reported missing over a given time period.

    :param n_days: Number of days backwards to look, starting from now
    :return: None (outputs to logs)
    """

    if test:
        logger = setup_logging('test_kibana')
    else:
        logger = setup_logging('reporting')

    query = '"+@log_group:\\"backoffice-logs\\" "+@log_group:\\"fluent-bit-backoffice_prod_orcid_pipeline_1\\" +@message:\\"Missing profile for\\""'

    resp = query_Kibana(query=query, n_days=n_days, rows=5)

    total = resp['responses'][0]['hits']['total']

    logger.info(
        'Number of missing profile errors in the last {} days: {}'.format(
            n_days, total))
Example #11
    def __init__(self, schema_='metrics'):
        self.logger = setup_logging('AdsDataSqlSync', 'INFO')

        self.schema = schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
        self.config = {}
        self.config.update(load_config())
    def __init__(self,
                 sqlachemy_url,
                 group_changes_in_chunks_of=1,
                 sqlalchemy_echo=False,
                 schema_prefix="citation_capture_",
                 force=False):
        """
        Initializes the class and prepares DB connection.

        :param sqlachemy_url: URL to connect to the DB.
        :param group_changes_in_chunks_of: Number of citation changes to be
            grouped when iterating.
        :param sqlalchemy_echo: Print every SQL statement.
        :param schema_prefix: Data is stored in schemas that correspond to a
            prefix + file last access date.
        :param force: If tables already exists in DB, drop them and re-ingest.
        """
        self.engine = create_engine(sqlachemy_url, echo=sqlalchemy_echo)
        self.connection = self.engine.connect()
        self.session = sessionmaker(bind=self.engine)()
        #
        # - Use app logger:
        #import logging
        #self.logger = logging.getLogger('ads-citation-capture')
        # - Or individual logger for this file:
        from adsputils import setup_logging, load_config
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        config = load_config(proj_home=proj_home)
        self.logger = setup_logging(__name__,
                                    proj_home=proj_home,
                                    level=config.get('LOGGING_LEVEL', 'INFO'),
                                    attach_stdout=config.get(
                                        'LOG_STDOUT', False))
        #
        self.table_name = RawCitation.__tablename__
        self.expanded_table_name = "expanded_" + self.table_name
        self.recreated_previous_expanded_table_name = "recreated_previous_expanded_" + self.table_name
        self.missing_previous_expanded_table_name = "not_processed_" + self.table_name
        self.joint_table_name = CitationChanges.__tablename__
        self.schema_prefix = schema_prefix
        self.schema_name = None
        self.previous_schema_name = None
        self.input_refids_filename = None
        self.group_changes_in_chunks_of = group_changes_in_chunks_of
        self.offset = 0
        self.n_changes = 0
        self.force = force
        self.last_modification_date = None
    def __init__(self, fields, ignore_fields, new_fields):
        self.fields = fields
        self.ignore_fields = ignore_fields
        self.new_fields = new_fields

        # - Use app logger:
        # import logging
        # self.logger = logging.getLogger('master-pipeline')
        # - Or individual logger for this file:
        from adsputils import setup_logging, load_config
        proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(__name__, proj_home=proj_home,
                                    level=self.config.get('LOGGING_LEVEL', 'INFO'),
                                    attach_stdout=self.config.get('LOG_STDOUT', False))
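
The "individual logger for this file" boilerplate above recurs in several of these examples. A minimal sketch of factoring it into a helper (get_module_logger is a hypothetical name, not part of adsputils):

import os
from adsputils import setup_logging, load_config

def get_module_logger(name, caller_file):
    # resolve the project home relative to the calling module, as the examples above do
    proj_home = os.path.realpath(os.path.join(os.path.dirname(caller_file), '../'))
    config = load_config(proj_home=proj_home)
    return setup_logging(name,
                         proj_home=proj_home,
                         level=config.get('LOGGING_LEVEL', 'INFO'),
                         attach_stdout=config.get('LOG_STDOUT', False))

# typical use at module import time:
# logger = get_module_logger(__name__, __file__)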
Example #14
def main():
    global config
    config.update(load_config())
    global logger
    logger = setup_logging('ADSData', config.get('LOG_LEVEL', 'INFO'))

    parser = argparse.ArgumentParser(description='generate nonbib data')
    ars = parser.parse_args()

    load(config)

 
    # compute metrics for a bibcode
    compute_metrics('2012ApJS..199...26H')
    # lots_of_metrics(config)
    logger.info('end of program')
Example #15
 def __init__(self, schema_='nonbib'):
     self.schema = schema_
     self.meta = MetaData()
     self.table = models.NonBibTable()
     self.table.schema = self.schema
     # - Use app logger:
     #import logging
     #logger = logging.getLogger('ads-data')
     # - Or individual logger for this file:
     proj_home = os.path.realpath(
         os.path.join(os.path.dirname(__file__), '../'))
     config = load_config(proj_home=proj_home)
     self.logger = setup_logging(__name__,
                                 proj_home=proj_home,
                                 level=config.get('LOGGING_LEVEL', 'INFO'),
                                 attach_stdout=config.get(
                                     'LOG_STDOUT', False))
Example #16
    def __init__(self, file_):
        self._file = file_
        self.read_count = 0  # needed for logging
        # - Use app logger:
        #import logging
        #logger = logging.getLogger('ads-data')
        # - Or individual logger for this file:
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(
            __name__,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))

        self.logger.info('nonbib file ingest, file {}'.format(self._file))
        self._iostream = open(file_, 'r')
Example #17
    def test_logging(self):
        logdir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '../../logs'))
        foo_log = logdir + '/foo.bar.log'
        if os.path.exists(foo_log):
            os.remove(foo_log)
        logger = adsputils.setup_logging('foo.bar')
        logger.warn('first')
        frameinfo = getframeinfo(currentframe())

        logger.handlers[0].stream.flush()
        #print foo_log
        self.assertTrue(os.path.exists(foo_log))
        c = _read_file(foo_log)
        j = json.loads(c)

        self.assertEqual(j['message'], 'first')
        self.assertTrue('hostname' in j)

        # verify warning has filename and linenumber
        self.assertEqual(os.path.basename(frameinfo.filename), j['filename'])
        self.assertEqual(j['lineno'], frameinfo.lineno - 1)

        time.sleep(0.01)
        # now multiline message
        logger.warn(u'second\nthird')
        logger.warn('last')
        c = _read_file(foo_log)

        found = False
        msecs = False
        for x in c.strip().split('\n'):
            j = json.loads(x)
            self.assertTrue(j)
            if j['message'] == u'second\nthird':
                found = True
            t = adsputils.get_date(j['asctime'])
            if t.microsecond > 0:
                msecs = True

        self.assertTrue(found)
        self.assertTrue(msecs)
Example #18
 def __init__(self, 
              blocks=None, 
              logger=None, 
              merger_rules= _config['MERGER_RULES'],
              priorities = _config['PRIORITIES'],
              references_always_append = _config['REFERENCES_ALWAYS_APPEND']
              ):
   self.blocks = blocks
   self.logger=logger
   self.block = {}
   self.altpublications = []
   self.eL = enforce_schema.Enforcer().ensureList
   self.merger_rules = merger_rules
   self.priorities = priorities
   self.references_always_append = references_always_append
   
   if blocks:
     # Assert that there is only one block type being merged
     assert len(set([i['tempdata']['type'] for i in blocks]))==1
     self.blocktype = blocks[0]['tempdata']['type']
   if not self.logger:
     self.logger = utils.setup_logging('merger')
Example #19
    def __init__(self, schema_='metrics'):
        # - Use app logger:
        #import logging
        #logger = logging.getLogger('ads-data')
        # - Or individual logger for this file:
        proj_home = os.path.realpath(
            os.path.join(os.path.dirname(__file__), '../'))
        self.config = load_config(proj_home=proj_home)
        self.logger = setup_logging(
            __name__,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))

        self.schema = schema_
        self.table = models.MetricsTable()
        self.table.schema = self.schema

        # used to buffer writes
        self.upserts = []
        self.tmp_update_buffer = []
        self.tmp_count = 0
Example #20
    def __init__(self, app_name, *args, **kwargs):
        """
        :param: app_name - string, name of the application (can be anything)
        :keyword: local_config - dict, configuration that should be applied
            over the default config (that is loaded from config.py and local_config.py)
        """
        proj_home = None
        if 'proj_home' in kwargs:
            proj_home = kwargs.pop('proj_home')
        self.config = load_config(extra_frames=1,
                                  proj_home=proj_home,
                                  app_name=app_name)

        local_config = None
        if 'local_config' in kwargs and kwargs['local_config']:
            local_config = kwargs.pop('local_config')
            self.config.update(local_config)  #our config
        if not proj_home:
            proj_home = self.config.get('PROJ_HOME', None)
        self.logger = setup_logging(
            app_name,
            proj_home=proj_home,
            level=self.config.get('LOGGING_LEVEL', 'INFO'),
            attach_stdout=self.config.get('LOG_STDOUT', False))
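
A hypothetical use of the constructor above (ADSApp is a stand-in class name; only the keyword handling is taken from the example):

app = ADSApp('my-pipeline',
             proj_home='/app',
             local_config={'LOGGING_LEVEL': 'DEBUG', 'LOG_STDOUT': True})
app.logger.debug('application initialized with an overridden config')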
Example #21
    def init_app(self, config=None):
        """This function must be called before you start working with the application
        (or worker, script etc)
        
        :return None
        """

        if self._session is not None:  # the app was already instantiated
            return

        if config:
            self._config.update(config)  #our config
            self.conf.update(
                config
            )  #celery's config (devs should be careful to avoid clashes)

        self.logger = adsputils.setup_logging(
            self._app_name, self._config.get('LOGGING_LEVEL', 'INFO'))
        self._engine = create_engine(self._config.get('SQLALCHEMY_URL',
                                                       'sqlite:///'),
                                     echo=self._config.get('SQLALCHEMY_ECHO', False))
        self._session_factory = sessionmaker()
        self._session = scoped_session(self._session_factory)
        self._session.configure(bind=self._engine)
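
A sketch of how init_app is typically driven (the class name and connection URL are assumptions; the config keys match the ones read in the example):

app = ADSCeleryApp('my-pipeline')
app.init_app({'SQLALCHEMY_URL': 'postgresql://postgres@localhost:5432/pipeline',
              'SQLALCHEMY_ECHO': False,
              'LOGGING_LEVEL': 'INFO'})
session = app._session()   # scoped_session bound to the engine created in init_app
try:
    pass                   # work with the session here
finally:
    session.close()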
Example #22

import sys
import os
import utils
import json
import ptree
import traceback

from stat import ST_MTIME
from datetime import datetime
from dateutil.parser import parse
from adsputils import setup_logging
from adsft.utils import get_filenames

logger = setup_logging(__name__)


def file_last_modified_time(file_input):
    """
    Stats the given file to find the last modified time

    :param file_input: path to file
    :return: date time object of the last modified time
    """

    mtime = os.stat(file_input)[ST_MTIME]
    return datetime.fromtimestamp(mtime)
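
An illustrative call of the helper above (the path is a placeholder):

mtime = file_last_modified_time('/data/fulltext/some_article.txt')
logger.debug('source last modified at %s', mtime.isoformat())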


def create_meta_path(dict_input, extract_path):
Example #23
import sys
import time
import argparse
import logging
import traceback
import requests
import warnings
from requests.packages.urllib3 import exceptions
warnings.simplefilter('ignore', exceptions.InsecurePlatformWarning)

from adsputils import setup_logging, get_date
from ADSOrcid import updater, tasks
from ADSOrcid.models import ClaimsLog, KeyValue, Records, AuthorInfo

app = tasks.app
logger = setup_logging('run.py')


def reindex_claims(since=None, orcid_ids=None, **kwargs):
    """
    Re-runs all claims, both from the pipeline and
    from the orcid-service storage.
    
    :param: since - RFC889 formatted string
    :type: str
    
    :return: no return
    """
    if orcid_ids:
        for oid in orcid_ids:
            tasks.task_index_orcid_profile.delay({
Example #24
from adsputils import setup_logging
import spacy

logger = setup_logging(__name__)


def get_facilities(model, text):
    """
    purpose: to identify facilities within the text
    input: model loaded from disk, text to process
    return: list of facilities identified with custom spacy ner model
    """

    doc = model(text)

    facilities = []

    for ent in doc.ents:
        facilities.append(ent.text)

    return facilities


def load_model(dir):

    return spacy.load(dir)
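
Hypothetical usage of the two helpers above (the model directory and the sentence are placeholders):

nlp = load_model('models/facility_ner')
facilities = get_facilities(nlp, 'Spectra were obtained with VLT/UVES and confirmed with ALMA.')
logger.info('facilities found: {}'.format(facilities))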
Example #25
def main():
    parser = argparse.ArgumentParser(description='process column files into Postgres')
    parser.add_argument('-t', '--rowViewBaselineSchemaName', default='nonbibstaging', 
                        help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d', '--diagnose', default=False, action='store_true', help='run simple test')
    parser.add_argument('-f', '--filename', default='bibcodes.txt', help='name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m', '--metricsSchemaName', default='metrics', help='name of the postgres metrics schema')
    parser.add_argument('-n', '--metricsSchemaName2', default='', help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r', '--rowViewSchemaName', default='nonbib', help='name of the postgres row view schema')
    parser.add_argument('-s', '--batchSize', default=100,  help='used when queuing data')
    parser.add_argument('-b', '--bibcodes', default='',  help='comma-separated list of bibcodes to send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema ' \
                        + ' | createJoinedRows | createMetricsTable | dropMetricsTable ' \
                        + ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta ' \
                        + ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes ' \
                        + ' | runRowViewPipelineDelta | runMetricsPipelineDelta '\
                        + ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                        + ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare')

    args = parser.parse_args()

    config.update(load_config())

    global logger
    logger = setup_logging('AdsDataSqlSync', config.get('LOG_LEVEL', 'INFO'))
    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))
    nonbib_connection_string = config.get('INGEST_DATABASE',
                                   'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get('METRICS_DATABASE',
                                   'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string)
    metrics_db_conn = metrics_db_engine.connect()
    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()

    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)

    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)

    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)

    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)

    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)

    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)
        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        sql_sync.create_delta_rows(nonbib_db_conn, args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn, args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn, args.rowViewSchemaName)

    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()
    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            nonbib_bibs_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, bibcodes)
    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_delta_to_master_pipeline(nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))
    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()
    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, int(args.batchSize))
    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine, args.metricsSchemaName, nonbib_db_engine, args.rowViewSchemaName, int(args.batchSize))

    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from new pipeline to production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get('METRICS_DATABASE2',
                                               'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(args.metricsSchemaName2))

        print('m2', metrics_connection_string2)
        print('m2 schema', args.metricsSchemaName2)
        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
                    print('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
        session.close()
        session2.close()

    else:
        print('app.py: illegal command or missing argument, command = ', args.command)
        print('  row view schema name = ', args.rowViewSchemaName)
        print('  row view baseline schema name = ', args.rowViewBaselineSchemaName)
        print('  metrics schema name = ', args.metricsSchemaName)

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
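
For orientation, a few hypothetical invocations of the command-line interface defined above (the script is referred to as app.py in its own error message):

# python app.py createIngestTables
# python app.py runRowViewPipelineDelta -r nonbib -t nonbibstaging
# python app.py metricsCompare -f bibcodes.txt -m metrics -n metrics_old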
Example #26
def claimed_records(debug=False, test=False):
    """
    Reporting function; checks SOLR for the following:
        - number of records that have been claimed by at least one ORCID ID, in orcid_pub, orcid_user, orcid_other
            (each reported separately)
        - total number of accepted claims of each of orcid_pub, orcid_user, orcid_other (i.e. if a single record
            has two separate authors who have successfully created a claim, the number reported here is 2)
        - total number of bibcodes that have been claimed, of any type

    The report is designed to be run regularly, and the results compared to previous report runs (via logs)

    :return: None (output to logs)
    """
    if test:
        logger = setup_logging('test_claimed')
    else:
        logger = setup_logging('reporting')

    config = {}
    config.update(load_config())

    # the first 7 digits of ORCID IDs are zero padding
    orcid_wild = '000000*'
    resp_pub = query_solr(config['SOLR_URL'],
                          'orcid_pub:"' + orcid_wild + '"',
                          rows=10,
                          sort="bibcode desc",
                          fl='bibcode')
    resp_user = query_solr(config['SOLR_URL'],
                           'orcid_user:"' + orcid_wild + '"',
                           rows=10,
                           sort="bibcode desc",
                           fl='bibcode')
    resp_other = query_solr(config['SOLR_URL'],
                            'orcid_other:"' + orcid_wild + '"',
                            rows=10,
                            sort="bibcode desc",
                            fl='bibcode')

    logger.info('Number of records with an orcid_pub: {}'.format(
        resp_pub['response']['numFound']))
    logger.info('Number of records with an orcid_user: {}'.format(
        resp_user['response']['numFound']))
    logger.info('Number of records with an orcid_other: {}'.format(
        resp_other['response']['numFound']))

    start = 0
    rows = 1000

    results = resp_pub['response']['docs']
    num_orcid_pub = 0
    num_orcid_user = 0
    num_orcid_other = 0

    bibcode_pub = set()
    bibcode_user = set()
    bibcode_other = set()
    while results:
        results = query_records(start=start, rows=rows)
        for i in range(len(results)):
            try:
                results[i]['orcid_pub']
            except KeyError:
                pass
            else:
                num_p = len(
                    fnmatch.filter(results[i].get('orcid_pub'), '0000*'))
                num_orcid_pub += num_p
                bibcode_pub.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_user']
            except KeyError:
                pass
            else:
                num_u = len(
                    fnmatch.filter(results[i].get('orcid_user'), '0000*'))
                num_orcid_user += num_u
                bibcode_user.add(results[i].get('bibcode'))
            try:
                results[i]['orcid_other']
            except KeyError:
                pass
            else:
                num_o = len(
                    fnmatch.filter(results[i].get('orcid_other'), '0000*'))
                num_orcid_other += num_o
                bibcode_other.add(results[i].get('bibcode'))

        if debug:
            if (start + rows) % 10000 == 0:
                logger.info(
                    'Number of results processed so far: {}'.format(start +
                                                                    rows))

        if test:
            break
        else:
            start += rows

    logger.info('Total number of orcid_pub claims: {}'.format(num_orcid_pub))
    logger.info('Total number of orcid_user claims: {}'.format(num_orcid_user))
    logger.info(
        'Total number of orcid_other claims: {}'.format(num_orcid_other))

    orcid_bibcodes = bibcode_pub.union(bibcode_user).union(bibcode_other)
    logger.info('Total number of records with any ORCID claims: {}'.format(
        len(orcid_bibcodes)))
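
The per-field try/except blocks above can be written more compactly; a sketch with the same semantics (count entries matching '0000*' and record the bibcode whenever the field is present):

from collections import Counter
import fnmatch

counts = Counter()
claimed_bibcodes = {'orcid_pub': set(), 'orcid_user': set(), 'orcid_other': set()}
for doc in results:
    for field in claimed_bibcodes:
        if field in doc:
            counts[field] += len(fnmatch.filter(doc[field], '0000*'))
            claimed_bibcodes[field].add(doc.get('bibcode'))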
import os
import requests
import json
from adsputils import setup_logging

logger = setup_logging('docmatch_log')


def get_matches(metadata, doctype, mustmatch=False, match_doctype=None):
    """

    :param metadata:
    :param doctype:
    :param mustmatch:
    :param match_doctype: list of doctypes, if specified only this type of doctype is matched
    :return:
    """
    try:
        payload = {
            'abstract': metadata['abstract'].replace('\n', ' '),
            'title': metadata['title'].replace('\n', ' '),
            'author': metadata['authors'],
            'year': metadata['pubdate'][:4],
            'doctype': doctype,
            'bibcode': metadata['bibcode'],
            'doi': metadata.get('doi', None),
            'mustmatch': mustmatch,
            'match_doctype': match_doctype
        }
    except KeyError as e:
        return (metadata['bibcode'], None, e)
Example #28
import gzip
import time
import argparse
from collections import OrderedDict
from sqlalchemy.orm import load_only

from aip.classic import read_records
from adsputils import setup_logging
from aip.models import Records, ChangeLog
from aip import tasks

import pyingest.parsers.aps as aps
import pyingest.parsers.arxiv as arxiv

app = tasks.app
logger = setup_logging('run.py')



def read_bibcodes(files):
    """Reads contents of the BIBFILES into memory; basically bibcode:json_fingerprint
    pairs.

    @param files: list of files to read from
    @return: OrderedDict instance
    """
    start = time.time()
    records = OrderedDict()

    for f in files:
        logger.debug('...loading %s' % f)

try:
    from ads.ADSCachedExports import ADSRecords, init_lookers_cache
    from ads.ADSCachedExports import LOGGER as export_logger
    from aip.classic import conversions
except ImportError:
    sys.path.append('/proj/ads/soft/python/lib/site-packages') #TODO: make it configurable
    try:
        from ads.ADSCachedExports import ADSRecords, init_lookers_cache
        from ads.ADSCachedExports import LOGGER as export_logger
        from aip.classic import conversions
        INIT_LOOKERS_CACHE = init_lookers_cache
    except ImportError:
        print "Unable to import ads.ADSExports.ADSRecords!"
        print "We will be unable to query ADS-classic for records!"


logger = utils.setup_logging('read_records')


def canonicalize_records(records, targets=None, ignore_fingerprints=False):
    '''
    Takes a dict of {bibcode:fingerprint} and resolves each bibcode to its canonical.
    
    Finds all alternates associated with that bibcode and constructs the full JSON_fingerprint
    from all of these associated records
    
    Note: Pops from the input dict with no attempt to copy/deepcopy it.
    '''
    
    #TODO(rca): getAlternates is called multiple times unnecessarily
    start = time.time()
    results = []
Example #30
from adsputils import load_config, setup_logging
from ADSOrcid import tasks
from ADSOrcid.models import ClaimsLog
from levenshtein_default import query_solr
from sqlalchemy import func, and_, distinct
from dateutil.tz import tzutc
import fnmatch
import datetime
import cachetools
import time
import pytz
import urllib3
import requests

app = tasks.app
logger = setup_logging('reporting')

records_cache = cachetools.TTLCache(maxsize=1024, ttl=3600, timer=time.time, missing=None, getsizeof=None)

@cachetools.cached(records_cache)
def query_records(start=0,rows=1000):
    """
    Function to query SOLR for a set of records and return the response.
    Kept as a separate function in order to use a cache.

    :param
        start: Row number to start with; default=0
        rows:  Number of rows to retrieve; default=1000
    :return
        response: Response from the query
    """
Example #31
from __future__ import print_function
from future import standard_library
standard_library.install_aliases()
from builtins import str
import json
import sys
import os
import requests
import argparse
import json
import pickle

# python compare_solrs.py --solr-endpoints http://adsqb.cfa.harvard.edu:9983/solr/BumblebeeETL/select http://adsqb.cfa.harvard.edu:9983/solr/collection1/select --bibcode stdin fields < testBibcodes.txt

from adsputils import setup_logging
logger = setup_logging('compare-solr', level='DEBUG')

SOLR1_PATH = 'http://localhost:9000/solr/select/'
SOLR2_PATH = 'http://localhost:8900/solr/select/'


def query_solr(
    endpoint,
    query,
    start=0,
    rows=200,
    sort='date desc',
    fl=None,
):
    d = {
        'q': query,
Example #32
 def __init__(self, schema_='nonbib'):
     self.schema = schema_
     self.meta = MetaData()
     self.table = models.NonBibTable()
     self.table.schema = self.schema
     self.logger = setup_logging('AdsDataSqlSync', 'INFO')
Example #33
from kombu import Exchange, Queue, BrokerConnection
import datetime


# ============================= INITIALIZATION ==================================== #

app = app_module.create_app()
exch = Exchange(app.conf.get('CELERY_DEFAULT_EXCHANGE', 'ADSWorker'), 
                type=app.conf.get('CELERY_DEFAULT_EXCHANGE_TYPE', 'topic'))
app.conf.CELERY_QUEUES = (
    Queue('errors', exch, routing_key='errors', durable=False, message_ttl=24*3600*5),
    Queue('some-queue', exch, routing_key='check-orcidid')
)


logger = adsputils.setup_logging('ADSWorker', app.conf.get('LOGGING_LEVEL', 'INFO'))


# connection to the other virtual host (for sending data out)
forwarding_connection = BrokerConnection(app.conf.get('OUTPUT_CELERY_BROKER',
                              '%s/%s' % (app.conf.get('CELRY_BROKER', 'pyamqp://'),
                                         app.conf.get('OUTPUT_EXCHANGE', 'other-pipeline'))))
class MyTask(Task):
    def on_failure(self, exc, task_id, args, kwargs, einfo):
        logger.error('{0!r} failed: {1!r}'.format(task_id, exc))



# ============================= TASKS ============================================= #

@app.task(base=MyTask, queue='some-queue')
Example #34
import os
import json
from adsputils import date2solrstamp
import sys
import time
from collections import OrderedDict

# ============================= INITIALIZATION ==================================== #
# - Use app logger:
#import logging
#logger = logging.getLogger('master-pipeline')
# - Or individual logger for this file:
from adsputils import setup_logging, load_config
proj_home = os.path.realpath(os.path.join(os.path.dirname(__file__), '../'))
config = load_config(proj_home=proj_home)
logger = setup_logging(__name__,
                       proj_home=proj_home,
                       level=config.get('LOGGING_LEVEL', 'INFO'),
                       attach_stdout=config.get('LOG_STDOUT', False))

# =============================== FUNCTIONS ======================================= #


def extract_metrics_pipeline(data, solrdoc):

    citation = data.get('citations', [])

    return dict(citation=citation)


def extract_data_pipeline(data, solrdoc):

    reader = data.get('readers', [])
Example #35
def main():
    parser = argparse.ArgumentParser(
        description='process column files into Postgres')
    parser.add_argument(
        '-t',
        '--rowViewBaselineSchemaName',
        default='nonbibstaging',
        help='name of old postgres schema, used to compute delta')
    parser.add_argument('-d',
                        '--diagnose',
                        default=False,
                        action='store_true',
                        help='run simple test')
    parser.add_argument(
        '-f',
        '--filename',
        default=None,
        help=
        'name of file containing the list of bibcode for metrics comparison')
    parser.add_argument('-m',
                        '--metricsSchemaName',
                        default='metrics',
                        help='name of the postgres metrics schema')
    parser.add_argument(
        '-n',
        '--metricsSchemaName2',
        default='',
        help='name of the postgres metrics schema for comparison')
    parser.add_argument('-r',
                        '--rowViewSchemaName',
                        default='nonbib',
                        help='name of the postgres row view schema')
    parser.add_argument('-s',
                        '--batchSize',
                        default=100,
                        help='used when queuing data')
    parser.add_argument(
        '-b',
        '--bibcodes',
        default='',
        help='comma-separated list of bibcodes to send to master pipeline')
    parser.add_argument('command', default='help', nargs='?',
                        help='ingest | verify | createIngestTables | dropIngestTables | renameSchema ' \
                        + ' | createJoinedRows | createMetricsTable | dropMetricsTable ' \
                        + ' | populateMetricsTable | createDeltaRows | populateMetricsTableDelta ' \
                        + ' | runRowViewPipeline | runMetricsPipeline | createNewBibcodes ' \
                        + ' | runRowViewPipelineDelta | runMetricsPipelineDelta '\
                        + ' | runPipelines | runPipelinesDelta | nonbibToMasterPipeline | nonbibDeltaToMasterPipeline'
                        + ' | metricsToMasterPipeline | metricsDeltaToMasterPipeline | metricsCompare'
                        + ' | resetNonbib')

    args = parser.parse_args()

    logger.info('starting AdsDataSqlSync.app with {}'.format(args.command))
    nonbib_connection_string = config.get(
        'INGEST_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    nonbib_db_engine = create_engine(nonbib_connection_string)
    nonbib_db_conn = nonbib_db_engine.connect()

    metrics_connection_string = config.get(
        'METRICS_DATABASE', 'postgresql://postgres@localhost:5432/postgres')
    metrics_db_engine = create_engine(metrics_connection_string, pool_size=30)
    metrics_db_conn = metrics_db_engine.connect()
    sql_sync = nonbib.NonBib(args.rowViewSchemaName)
    if args.command == 'help' and args.diagnose:
        diagnose_nonbib()
        diagnose_metrics()

    elif args.command == 'resetNonbib':
        # detect if pipeline didn't complete and reset postgres tables
        if not nonbib_db_engine.has_table('rowviewm', schema='nonbib'):
            print('merged table not found, resetting database')
            nonbib_db_engine.execute('drop schema if exists nonbib cascade')
            nonbib_db_engine.execute(
                'alter schema nonbibstaging rename to nonbib')
            print('reset complete')
        else:
            print('merged output table found, reset not needed')
    elif args.command == 'createIngestTables':
        sql_sync.create_column_tables(nonbib_db_engine)

    elif args.command == 'dropIngestTables':
        sql_sync.drop_column_tables(nonbib_db_engine)

    elif args.command == 'createJoinedRows':
        sql_sync.create_joined_rows(nonbib_db_conn)

    elif args.command == 'createMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.create_metrics_table(metrics_db_engine)

    elif args.command == 'dropMetricsTable' and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)

    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName and args.filename:
        m = metrics.Metrics(args.metricsSchemaName)
        with open(args.filename, 'r') as f:
            for line in f:
                bibcode = line.strip()
                if bibcode:
                    m.update_metrics_bibcode(bibcode, metrics_db_conn,
                                             nonbib_db_conn)

    elif args.command == 'populateMetricsTable' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics()
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn,
                             args.rowViewSchemaName)

    elif args.command == 'populateMetricsTableDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn,
                                 args.rowViewSchemaName)

    elif args.command == 'renameSchema' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

    elif args.command == 'createDeltaRows' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.create_delta_rows(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)

    elif args.command == 'createNewBibcodes' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.build_new_bibcodes(nonbib_db_conn,
                                    args.rowViewBaselineSchemaName)

    elif args.command == 'logDeltaReasons' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        sql_sync.log_delta_reasons(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)

    elif args.command == 'runRowViewPipeline' and args.rowViewSchemaName:
        # drop tables, create tables, load data, create joined view
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

    elif args.command == 'runMetricsPipeline' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn,
                             args.rowViewSchemaName)

    elif args.command == 'runRowViewPipelineDelta' and args.rowViewSchemaName and args.rowViewBaselineSchemaName:
        # we delete the old data
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        # rename the current to be the old (for later comparison)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)
        # create the new and populate
        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)
        # compute delta between old and new
        sql_sync.create_delta_rows(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)

    elif args.command == 'runMetricsPipelineDelta' and args.rowViewSchemaName and args.metricsSchemaName:
        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn,
                                 args.rowViewSchemaName)

    elif args.command == 'runPipelines' and args.rowViewSchemaName and args.metricsSchemaName:
        # drop tables, create tables, load data, compute metrics
        sql_sync.drop_column_tables(nonbib_db_engine)
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        m = metrics.Metrics(args.metricsSchemaName)
        m.drop_metrics_table(metrics_db_engine)
        m.create_metrics_table(metrics_db_engine)
        m.update_metrics_all(metrics_db_conn, nonbib_db_conn,
                             args.rowViewSchemaName)

    elif args.command == 'runPipelinesDelta' and args.rowViewSchemaName and args.metricsSchemaName and args.rowViewBaselineSchemaName:
        # drop tables, rename schema, create tables, load data, compute delta, compute metrics
        baseline_sql_sync = nonbib.NonBib(args.rowViewBaselineSchemaName)
        baseline_engine = create_engine(nonbib_connection_string)
        baseline_sql_sync.drop_column_tables(baseline_engine)
        sql_sync.rename_schema(nonbib_db_conn, args.rowViewBaselineSchemaName)

        baseline_sql_sync = None
        sql_sync.create_column_tables(nonbib_db_engine)
        load_column_files(config, nonbib_db_engine, nonbib_db_conn, sql_sync)

        sql_sync.create_delta_rows(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)
        sql_sync.log_delta_reasons(nonbib_db_conn,
                                   args.rowViewBaselineSchemaName)

        m = metrics.Metrics(args.metricsSchemaName)
        m.update_metrics_changed(metrics_db_conn, nonbib_db_conn,
                                 args.rowViewSchemaName)

    elif args.command == 'nonbibToMasterPipeline' and args.diagnose:
        diagnose_nonbib()
    elif args.command == 'nonbibToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        nonbib_to_master_pipeline(nonbib_db_engine,
                                  args.rowViewSchemaName,
                                  int(args.batchSize),
                                  source=bibcodes)
    elif args.command == 'nonbibToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
        nonbib_to_master_pipeline(nonbib_db_engine,
                                  args.rowViewSchemaName,
                                  int(args.batchSize),
                                  source=bibcodes)
    elif args.command == 'nonbibToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine,
                                  args.rowViewSchemaName,
                                  int(args.batchSize),
                                  source="models.NonBibTable")
    elif args.command == 'nonbibDeltaToMasterPipeline':
        nonbib_to_master_pipeline(nonbib_db_engine,
                                  args.rowViewSchemaName,
                                  int(args.batchSize),
                                  source="models.NonBibDeltaTable")
    elif args.command == 'metricsToMasterPipeline' and args.diagnose:
        diagnose_metrics()
    elif args.command == 'metricsToMasterPipeline' and args.filename:
        bibcodes = []
        with open(args.filename, 'r') as f:
            for line in f:
                bibcodes.append(line.strip())
                if len(bibcodes) > 100:
                    metrics_bibs_to_master_pipeline(metrics_db_engine,
                                                    args.metricsSchemaName,
                                                    bibcodes)
                    bibcodes = []
        if len(bibcodes) > 0:
            metrics_bibs_to_master_pipeline(metrics_db_engine,
                                            args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline' and args.bibcodes:
        bibcodes = args.bibcodes.split(',')
        metrics_bibs_to_master_pipeline(metrics_db_engine,
                                        args.metricsSchemaName, bibcodes)
    elif args.command == 'metricsToMasterPipeline':
        metrics_to_master_pipeline(metrics_db_engine, args.metricsSchemaName,
                                   int(args.batchSize))
    elif args.command == 'metricsDeltaToMasterPipeline':
        metrics_delta_to_master_pipeline(metrics_db_engine,
                                         args.metricsSchemaName,
                                         nonbib_db_engine,
                                         args.rowViewSchemaName,
                                         int(args.batchSize))

    elif args.command == 'metricsCompare':
        # compare the values in two metrics postgres tables
        # useful to compare results from new pipeline to production pipeline
        # read metrics records from both databases and compare
        metrics_logger = setup_logging('metricsCompare', 'INFO')
        metrics1 = metrics.Metrics(args.metricsSchemaName)
        Session = sessionmaker(bind=metrics_db_engine)
        session = Session()
        if args.metricsSchemaName:
            session.execute('set search_path to {}'.format(
                args.metricsSchemaName))

        metrics2 = metrics.Metrics(args.metricsSchemaName2)
        metrics_connection_string2 = config.get(
            'METRICS_DATABASE2',
            'postgresql://postgres@localhost:5432/postgres')
        metrics_db_engine2 = create_engine(metrics_connection_string2)
        Session2 = sessionmaker(bind=metrics_db_engine2)
        session2 = Session2()
        if args.metricsSchemaName2:
            session2.execute('set search_path to {}'.format(
                args.metricsSchemaName2))

        print('m2', metrics_connection_string2)
        print('m2 schema', args.metricsSchemaName2)
        with open(args.filename) as f:
            for line in f:
                bibcode = line.strip()
                m1 = metrics1.get_by_bibcode(session, bibcode)
                m2 = metrics2.get_by_bibcode(session2, bibcode)
                mismatch = metrics.Metrics.metrics_mismatch(
                    line.strip(), m1, m2, metrics_logger)
                if mismatch:
                    metrics_logger.error('{} MISMATCHED FIELDS: {}'.format(
                        bibcode, mismatch))
                    print('{} MISMATCHED FIELDS: {}'.format(bibcode, mismatch))
        session.close()
        session2.close()

    else:
        print('app.py: illegal command or missing argument, command = ', args.command)
        print('  row view schema name = ', args.rowViewSchemaName)
        print('  row view baseline schema name = ', args.rowViewBaselineSchemaName)
        print('  metrics schema name = ', args.metricsSchemaName)

    if nonbib_db_conn:
        nonbib_db_conn.close()
    if metrics_db_conn:
        metrics_db_conn.close()
    logger.info('completed {}'.format(args.command))
import requests
import json
import os
import sys
import re
import traceback


from adsputils import setup_logging, get_date, date2solrstamp
from aip.classic import enforce_schema

logger = setup_logging('solr_adapter')

ARTICLE_TYPES = set(['eprint', 'article', 'inproceedings', 'inbook'])
AUTHOR_TYPES = set(['regular', 'collaboration'])

def get_date_by_datetype(ADS_record):
    """computes the standard pubdate by selecting the appropriate value
    from the ADS_record and formatting it as YYYY-MM-DD"""

    dates = ADS_record['metadata']['general']['publication']['dates']
    for datetype in [ 'date-published', 'date-thesis', 'date-preprint' ]:
        try:
            return next(i['content'] for i in dates if i['type'].lower() == datetype)
        except StopIteration:
            pass
    return None

def _normalize_author_name(strname):
    if not strname:
        return None
from datetime import datetime, timedelta
from os import remove
from shutil import move
import subprocess
import os

from adsputils import setup_logging, load_config

logger = setup_logging('AutomatedIngestReport')
conf = load_config(proj_home='./')


# enums used to to generate file names
class FileType:
    CANONICAL = 'CANONICAL'
    SOLR = 'SOLR'
    FULLTEXT = 'FULLTEXT'


class FileAdjective:
    MISSING = 'MISSING'
    DELETED = 'DELETED'
    EXTRA = 'EXTRA'
    NEW = 'NEW'


class Date:
    TODAY = 1
    YESTERDAY = 2