Ejemplo n.º 1
0
 def test_task_update_record_metrics_list(self):
     """Pass a two-record MetricsRecordList to task_update_record and check
     that the patched task_index_records.delay has not been called."""
     bibcodes = ('2015ApJ...815..133S', '3015ApJ...815..133Z')
     with patch('adsmp.tasks.task_index_records.delay') as index_delay:
         batch = MetricsRecordList()
         # the list message stores the raw protobuf payload of each record
         batch.metrics_records.extend(
             [MetricsRecord(bibcode=b)._data for b in bibcodes])
         tasks.task_update_record(batch)
         self.assertFalse(index_delay.called)
Ejemplo n.º 2
0
 def test_task_update_record_metrics(self):
     """Updating a single MetricsRecord must trigger task_index_records.

     The patched ``delay`` has to be untouched before the update, called
     afterwards, and its first positional argument has to carry the
     bibcode of the updated record.
     """
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         self.assertFalse(next_task.called)
         tasks.task_update_record(
             MetricsRecord(bibcode='2015ApJ...815..133S'))
         self.assertTrue(next_task.called)
         # assertTrue(a, b) uses b only as the failure message, so the
         # call arguments were never actually checked; assert on them.
         self.assertIn('2015ApJ...815..133S', next_task.call_args[0][0])
Ejemplo n.º 3
0
def metrics_to_master_pipeline(metrics_engine, schema, batch_size=1):
    """Send all metrics data to the queue for delivery to master pipeline.

    Every row of ``models.MetricsTable`` is converted to a ``MetricsRecord``
    and queued through ``tasks.task_output_metrics`` inside
    ``MetricsRecordList`` messages holding ``batch_size`` records each.
    When ``config['MAX_ROWS']`` is positive, at most that many rows are sent.

    :param metrics_engine: engine the session is bound to
    :param schema: schema name put on the search path; it is interpolated
        into the SQL text, so it must come from a trusted source
    :param batch_size: number of records per queued message
    """
    Session = sessionmaker(bind=metrics_engine)
    session = Session()
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        max_rows = config['MAX_ROWS']
        for current_row in session.query(models.MetricsTable).yield_per(100):
            # Check the limit before taking the row: checking after the
            # append let one extra record through in the final flush.
            if max_rows > 0 and i >= max_rows:
                break
            current_row = row2dict(current_row)
            current_row.pop('id')
            rec = MetricsRecord(**current_row)
            tmp.append(rec._data)
            i += 1
            if len(tmp) >= batch_size:
                recs = MetricsRecordList()
                recs.metrics_records.extend(tmp)
                logger.info(
                    "Calling metrics 'app.forward_message' count = '%s'", i)
                tasks.task_output_metrics.delay(recs)
                tmp = []

        # flush the last, partially filled batch
        if len(tmp) > 0:
            recs = MetricsRecordList()
            recs.metrics_records.extend(tmp)
            logger.debug(
                "Calling metrics 'app.forward_message' with count = '%s'", i)
            tasks.task_output_metrics.delay(recs)
    finally:
        # always release the session, even when a row fails
        session.close()
Ejemplo n.º 4
0
    def process_bibcodes(self, bibcodes):
        """Send nonbib and metrics records to master for the given bibcodes.

        For each bibcode the nonbib data is read and converted into a nonbib
        protobuf; when ``self.compute_metrics`` is set, metrics are computed
        and turned into a metrics protobuf as well. A failure on one bibcode
        is logged and the loop moves on to the next one."""
        # messages are batched into two lists and queued once at the end
        nonbib_batch = NonBibRecordList()
        metrics_batch = MetricsRecordList()
        error_template = 'serious error in process.process_bibcodes for bibcode {}, error {}'

        for bibcode in bibcodes:
            try:
                raw = self._read_next_bibcode(bibcode)
                nonbib_record = NonBibRecord(**self._convert(raw))
                nonbib_batch.nonbib_records.extend([nonbib_record._data])
                if not self.compute_metrics:
                    continue
                metrics_record = MetricsRecord(**self._compute_metrics(raw))
                metrics_batch.metrics_records.extend([metrics_record._data])
            except Exception as err:
                self.logger.error(error_template.format(bibcode, err))
                self.logger.exception('general stacktrace')
        tasks.task_output_nonbib.delay(nonbib_batch)
        tasks.task_output_metrics.delay(metrics_batch)
Ejemplo n.º 5
0
def metrics_delta_to_master_pipeline(metrics_engine,
                                     metrics_schema,
                                     nonbib_engine,
                                     nonbib_schema,
                                     batch_size=1):
    """Send data for changed metrics to master pipeline.

    The delta table was computed by comparing two sets of nonbib data,
    perhaps ingested on successive days. For every bibcode in
    ``models.NonBibDeltaTable`` the metrics row is looked up and queued
    through ``tasks.task_output_metrics`` in ``MetricsRecordList`` messages of
    ``batch_size`` records. When ``config['MAX_ROWS']`` is positive, at most
    that many records are sent.

    :param metrics_engine: engine the metrics session is bound to
    :param metrics_schema: schema name for the metrics session search path
    :param nonbib_engine: engine the nonbib session is bound to
    :param nonbib_schema: schema name for the nonbib session search path
    :param batch_size: number of records per queued message

    Both schema names are interpolated into SQL text and must be trusted.
    """
    Nonbib_Session = sessionmaker(bind=nonbib_engine)
    nonbib_session = Nonbib_Session()
    Metrics_Session = sessionmaker(bind=metrics_engine)
    metrics_session = Metrics_Session()
    try:
        nonbib_session.execute('set search_path to {}'.format(nonbib_schema))
        metrics_session.execute(
            'set search_path to {}'.format(metrics_schema))

        m = metrics.Metrics(metrics_schema)
        # NOTE(review): not used below; kept in case construction has side
        # effects that cannot be seen from here - confirm and drop.
        n = nonbib.NonBib(nonbib_schema)
        max_rows = config['MAX_ROWS']
        tmp = []
        i = 0
        rec = None
        for current_delta in nonbib_session.query(
                models.NonBibDeltaTable).yield_per(100):
            # Check the limit before taking the row: checking after the
            # append let one extra record through in the final flush.
            if max_rows > 0 and i >= max_rows:
                break
            row = m.get_by_bibcode(metrics_session, current_delta.bibcode)
            if not row:
                # presumably the lookup is falsy for a bibcode without a
                # metrics row; skip it rather than crash in row2dict
                logger.warning(
                    "No metrics row for delta bibcode '%s', skipping",
                    current_delta.bibcode)
                continue
            fields = row2dict(row)
            fields.pop('id')
            rec = MetricsRecord(**dict(fields))
            tmp.append(rec._data)
            i += 1
            if len(tmp) >= batch_size:
                recs = MetricsRecordList()
                recs.metrics_records.extend(tmp)
                logger.debug(
                    "Calling metrics 'app.forward_message' with '%s' messages",
                    len(recs.metrics_records))
                tasks.task_output_metrics.delay(recs)
                tmp = []

        # flush the last, partially filled batch
        if len(tmp) > 0:
            recs = MetricsRecordList()
            recs.metrics_records.extend(tmp)
            # pass rec itself so it is only stringified when debug is on
            logger.debug(
                "Calling metrics 'app.forward_message' with final '%s'", rec)
            tasks.task_output_metrics.delay(recs)
    finally:
        # always release both sessions, even when a row fails
        try:
            metrics_session.close()
        finally:
            nonbib_session.close()
Ejemplo n.º 6
0
def diagnose_metrics():
    """send hard coded metrics data the master pipeline

    useful for testing to verify connectivity"""

    test_data = {
        'bibcode':
        '2003ASPC..295..361M',
        'refereed':
        False,
        'rn_citations':
        0,
        'rn_citation_data': [],
        'downloads':
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        'reads':
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        'an_citations':
        0,
        'refereed_citation_num':
        0,
        'citation_num':
        0,
        'reference_num':
        0,
        'citations': [],
        'refereed_citations': [],
        'author_num':
        2,
        'an_refereed_citations':
        0
    }
    recs = MetricsRecordList()
    rec = MetricsRecord(**test_data)
    recs.metrics_records.extend([rec._data])
    print 'sending metrics data for bibocde', test_data[
        'bibcode'], 'to master pipeline'
    print 'using CELERY_BROKER', config['CELERY_BROKER']
    print '  CELERY_DEFAULT_EXCHANGE', config['CELERY_DEFAULT_EXCHANGE']
    print '  CELERY_DEFAULT_EXCHANGE_TYPE', config[
        'CELERY_DEFAULT_EXCHANGE_TYPE']
    print '  OUTPUT_CELERY_BROKER', config['OUTPUT_CELERY_BROKER']
    print '  OUTPUT_TASKNAME', config['OUTPUT_TASKNAME']
    print 'this action did not use ingest database (configured at', config[
        'INGEST_DATABASE'], ')'
    print '  or the metrics database (at', config['METRICS_DATABASE'], ')'
    tasks.task_output_metrics.delay(recs)
Ejemplo n.º 7
0
def task_update_record(msg):
    """Store the payload of an incoming message and trigger indexing.

    A message with status 'deleted' is handed to ``task_delete_documents``;
    a message with any status other than 'active' is logged as an error.
    An active message is saved through ``app.update_storage`` and the
    collected bibcodes are passed to ``task_index_records``.

    @param msg: protobuf that contains at minimum
        - bibcode
        - and specific payload
      or a list message exposing ``nonbib_records`` / ``metrics_records``
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)

    if status == 'deleted':
        task_delete_documents(msg.bibcode)
        return
    if status != 'active':
        logger.error('Received a message with unclear status: %s', msg)
        return

    # the message type doubles as the name of the repeated field on lists
    msg_type = app.get_msg_type(msg)
    if msg_type == 'nonbib_records':
        record_cls = NonBibRecord
    elif msg_type == 'metrics_records':
        record_cls = MetricsRecord
    else:
        record_cls = None

    bibcodes = []
    if record_cls is None:
        # message describes a single bibcode
        bibcodes.append(msg.bibcode)
        record = app.update_storage(msg.bibcode, msg_type, msg.toJSON())
        logger.debug('Saved record: %s', record)
    else:
        # message carries a list of records; save each one
        for packed in getattr(msg, msg_type):
            item = record_cls.deserializer(packed.SerializeToString())
            item_type = app.get_msg_type(item)
            bibcodes.append(item.bibcode)
            record = app.update_storage(item.bibcode, item_type,
                                        item.toJSON())
            logger.debug('Saved record from list: %s', record)

    # trigger further processing
    task_index_records.delay(bibcodes)
Ejemplo n.º 8
0
def metrics_bibs_to_master_pipeline(metrics_engine, metrics_schema, bibcodes):
    """send the passed list of bibcodes to master"""
    Metrics_Session = sessionmaker(bind=metrics_engine)
    metrics_session = Metrics_Session()
    metrics_session.execute('set search_path to {}'.format(metrics_schema))
    tmp = []
    m = metrics.Metrics(metrics_schema)
    for bibcode in bibcodes:
        row = m.get_by_bibcode(metrics_session, bibcode)
        if row:
            rec = row2dict(row)
            rec.pop('id')
            rec = MetricsRecord(**dict(rec))
            tmp.append(rec._data)
        else:
            print 'unknown bibcode: ', bibcode

    recs = MetricsRecordList()
    recs.metrics_records.extend(tmp)
    logger.debug("Calling metrics 'app.forward_message' for '%s' bibcodes",
                 str(recs))
    tasks.task_output_metrics.delay(recs)
Ejemplo n.º 9
0
 def test_task_update_record_metrics(self):
     """Update a single MetricsRecord and check that the patched
     task_index_records.apply_async is called neither before nor after."""
     target = 'adsmp.tasks.task_index_records.apply_async'
     with patch(target) as index_apply_async:
         self.assertFalse(index_apply_async.called)
         metrics_msg = MetricsRecord(bibcode='2015ApJ...815..133S')
         tasks.task_update_record(metrics_msg)
         self.assertFalse(index_apply_async.called)