def test_task_update_record_metrics_list(self):
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        recs = MetricsRecordList()
        metrics_data = {'bibcode': '2015ApJ...815..133S'}
        metrics_data2 = {'bibcode': '3015ApJ...815..133Z'}
        rec = MetricsRecord(**metrics_data)
        rec2 = MetricsRecord(**metrics_data2)
        recs.metrics_records.extend([rec._data, rec2._data])
        tasks.task_update_record(recs)
        self.assertFalse(next_task.called)
def test_task_update_record_metrics(self):
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        self.assertFalse(next_task.called)
        tasks.task_update_record(MetricsRecord(bibcode='2015ApJ...815..133S'))
        self.assertTrue(next_task.called)
        # task_index_records.delay receives a list of bibcodes
        self.assertEqual(next_task.call_args[0], (['2015ApJ...815..133S'],))
def metrics_to_master_pipeline(metrics_engine, schema, batch_size=1):
    """send all metrics data to the queue for delivery to the master pipeline"""
    global config
    Session = sessionmaker(bind=metrics_engine)
    session = Session()
    session.execute('set search_path to {}'.format(schema))
    tmp = []
    i = 0
    max_rows = config['MAX_ROWS']
    for current_row in session.query(models.MetricsTable).yield_per(100):
        current_row = row2dict(current_row)
        current_row.pop('id')
        rec = MetricsRecord(**current_row)
        tmp.append(rec._data)
        i += 1
        if max_rows > 0 and i > max_rows:
            break
        if len(tmp) >= batch_size:
            recs = MetricsRecordList()
            recs.metrics_records.extend(tmp)
            logger.debug("Calling metrics 'app.forward_message' with count = '%s'", i)
            tasks.task_output_metrics.delay(recs)
            tmp = []
    # flush any remaining rows that did not fill a complete batch
    if len(tmp) > 0:
        recs = MetricsRecordList()
        recs.metrics_records.extend(tmp)
        logger.debug("Calling metrics 'app.forward_message' with count = '%s'", i)
        tasks.task_output_metrics.delay(recs)
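# A minimal usage sketch for metrics_to_master_pipeline, assuming the module-level
# `config` has already been loaded; the postgres URL and the 'metrics' schema name
# below are illustrative assumptions, not values defined in this module.
def example_send_all_metrics():
    from sqlalchemy import create_engine
    metrics_engine = create_engine('postgresql://user:pwd@localhost:5432/metrics')
    # a larger batch_size sends fewer, larger protobuf lists to the broker
    metrics_to_master_pipeline(metrics_engine, 'metrics', batch_size=100)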
def process_bibcodes(self, bibcodes):
    """send nonbib and metrics records to master for the passed bibcodes

    for each bibcode: read nonbib data from files and generate a nonbib protobuf;
    if enabled, compute metrics and generate a metrics protobuf"""
    # batch up messages to master for improved performance
    nonbib_protos = NonBibRecordList()
    metrics_protos = MetricsRecordList()
    for bibcode in bibcodes:
        try:
            nonbib = self._read_next_bibcode(bibcode)
            converted = self._convert(nonbib)
            nonbib_proto = NonBibRecord(**converted)
            nonbib_protos.nonbib_records.extend([nonbib_proto._data])
            if self.compute_metrics:
                met = self._compute_metrics(nonbib)
                metrics_proto = MetricsRecord(**met)
                metrics_protos.metrics_records.extend([metrics_proto._data])
        except Exception as e:
            self.logger.error('serious error in process.process_bibcodes for bibcode {}, error {}'.format(bibcode, e))
            self.logger.exception('general stacktrace')
    tasks.task_output_nonbib.delay(nonbib_protos)
    tasks.task_output_metrics.delay(metrics_protos)
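# Illustrative driver for process_bibcodes; `proc` stands in for whatever object
# this method is bound to (a hypothetical name, not defined here). Chunking the
# input bounds the size of each batched message sent to master.
def example_process_in_chunks(proc, bibcodes, chunk_size=100):
    # one nonbib protobuf list and one metrics protobuf list go out per chunk
    for i in range(0, len(bibcodes), chunk_size):
        proc.process_bibcodes(bibcodes[i:i + chunk_size])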
def metrics_delta_to_master_pipeline(metrics_engine, metrics_schema, nonbib_engine, nonbib_schema, batch_size=1):
    """send data for changed metrics to the master pipeline

    the delta table was computed by comparing two sets of nonbib data,
    perhaps ingested on successive days"""
    global config
    Nonbib_Session = sessionmaker(bind=nonbib_engine)
    nonbib_session = Nonbib_Session()
    nonbib_session.execute('set search_path to {}'.format(nonbib_schema))
    Metrics_Session = sessionmaker(bind=metrics_engine)
    metrics_session = Metrics_Session()
    metrics_session.execute('set search_path to {}'.format(metrics_schema))
    m = metrics.Metrics(metrics_schema)
    n = nonbib.NonBib(nonbib_schema)
    max_rows = config['MAX_ROWS']
    tmp = []
    i = 0
    for current_delta in nonbib_session.query(models.NonBibDeltaTable).yield_per(100):
        row = m.get_by_bibcode(metrics_session, current_delta.bibcode)
        rec = row2dict(row)
        rec.pop('id')
        rec = MetricsRecord(**rec)
        tmp.append(rec._data)
        i += 1
        if max_rows > 0 and i > max_rows:
            break
        if len(tmp) >= batch_size:
            recs = MetricsRecordList()
            recs.metrics_records.extend(tmp)
            logger.debug("Calling metrics 'app.forward_message' with '%s' messages",
                         len(recs.metrics_records))
            tasks.task_output_metrics.delay(recs)
            tmp = []
    # flush any remaining rows that did not fill a complete batch
    if len(tmp) > 0:
        recs = MetricsRecordList()
        recs.metrics_records.extend(tmp)
        logger.debug("Calling metrics 'app.forward_message' with final '%s' messages",
                     len(recs.metrics_records))
        tasks.task_output_metrics.delay(recs)
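# Sketch of invoking the delta sender; it needs both engines because the delta
# table lives in the nonbib schema while the payload rows come from the metrics
# schema. The URLs and the 'nonbib'/'metrics' schema names are assumptions.
def example_send_metrics_delta():
    from sqlalchemy import create_engine
    nonbib_engine = create_engine('postgresql://user:pwd@localhost:5432/nonbib')
    metrics_engine = create_engine('postgresql://user:pwd@localhost:5432/metrics')
    metrics_delta_to_master_pipeline(metrics_engine, 'metrics',
                                     nonbib_engine, 'nonbib', batch_size=100)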
def diagnose_metrics():
    """send hard coded metrics data to the master pipeline

    useful for testing to verify connectivity"""
    test_data = {
        'bibcode': '2003ASPC..295..361M', 'refereed': False,
        'rn_citations': 0, 'rn_citation_data': [],
        'downloads': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 2],
        'reads': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 2, 5, 1, 0, 0, 1, 0, 0, 2, 4, 5],
        'an_citations': 0, 'refereed_citation_num': 0,
        'citation_num': 0, 'reference_num': 0,
        'citations': [], 'refereed_citations': [],
        'author_num': 2, 'an_refereed_citations': 0
    }
    recs = MetricsRecordList()
    rec = MetricsRecord(**test_data)
    recs.metrics_records.extend([rec._data])
    print 'sending metrics data for bibcode', test_data['bibcode'], 'to master pipeline'
    print 'using CELERY_BROKER', config['CELERY_BROKER']
    print '      CELERY_DEFAULT_EXCHANGE', config['CELERY_DEFAULT_EXCHANGE']
    print '      CELERY_DEFAULT_EXCHANGE_TYPE', config['CELERY_DEFAULT_EXCHANGE_TYPE']
    print '      OUTPUT_CELERY_BROKER', config['OUTPUT_CELERY_BROKER']
    print '      OUTPUT_TASKNAME', config['OUTPUT_TASKNAME']
    print 'this action did not use the ingest database (configured at', config['INGEST_DATABASE'], ')'
    print '  or the metrics database (at', config['METRICS_DATABASE'], ')'
    tasks.task_output_metrics.delay(recs)
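# diagnose_metrics is handy behind a command line flag in a run script; a minimal
# sketch, assuming argparse and a flag name of our choosing (not part of this module):
def example_run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--diagnose', action='store_true',
                        help='send a canned metrics record to master to verify connectivity')
    args = parser.parse_args()
    if args.diagnose:
        diagnose_metrics()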
def task_update_record(msg):
    """Receives payload to update the record.

    @param msg: protobuf that contains at minimum
        - bibcode
        - and specific payload
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)
    if status == 'deleted':
        task_delete_documents(msg.bibcode)
    elif status == 'active':
        type = app.get_msg_type(msg)
        bibcodes = []
        # save into a database
        # the passed msg may contain details on one bibcode or a list of bibcodes
        if type == 'nonbib_records':
            for m in msg.nonbib_records:
                m = NonBibRecord.deserializer(m.SerializeToString())
                t = app.get_msg_type(m)
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, t, m.toJSON())
                logger.debug('Saved record from list: %s', record)
        elif type == 'metrics_records':
            for m in msg.metrics_records:
                m = MetricsRecord.deserializer(m.SerializeToString())
                t = app.get_msg_type(m)
                bibcodes.append(m.bibcode)
                record = app.update_storage(m.bibcode, t, m.toJSON())
                logger.debug('Saved record from list: %s', record)
        else:
            # here when the record has a single bibcode
            bibcodes.append(msg.bibcode)
            record = app.update_storage(msg.bibcode, type, msg.toJSON())
            logger.debug('Saved record: %s', record)
        # trigger further processing
        task_index_records.delay(bibcodes)
    else:
        logger.error('Received a message with unclear status: %s', msg)
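# Hedged sketch of the single-bibcode path through task_update_record: a message
# whose type is neither 'nonbib_records' nor 'metrics_records' falls through to
# the else branch, is stored once, and its bibcode is queued for indexing. The
# field values here are illustrative.
def example_update_single_record():
    msg = MetricsRecord(bibcode='2015ApJ...815..133S', citation_num=0)
    # stores the payload, then calls task_index_records.delay(['2015ApJ...815..133S'])
    task_update_record(msg)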
def metrics_bibs_to_master_pipeline(metrics_engine, metrics_schema, bibcodes):
    """send the passed list of bibcodes to master"""
    Metrics_Session = sessionmaker(bind=metrics_engine)
    metrics_session = Metrics_Session()
    metrics_session.execute('set search_path to {}'.format(metrics_schema))
    tmp = []
    m = metrics.Metrics(metrics_schema)
    for bibcode in bibcodes:
        row = m.get_by_bibcode(metrics_session, bibcode)
        if row:
            rec = row2dict(row)
            rec.pop('id')
            rec = MetricsRecord(**rec)
            tmp.append(rec._data)
        else:
            print 'unknown bibcode: ', bibcode
    recs = MetricsRecordList()
    recs.metrics_records.extend(tmp)
    logger.debug("Calling metrics 'app.forward_message' with '%s' bibcodes",
                 len(recs.metrics_records))
    tasks.task_output_metrics.delay(recs)
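# Sketch of resending a few known bibcodes, e.g. after a partial failure; the
# engine URL and schema name are illustrative assumptions.
def example_resend_bibcodes():
    from sqlalchemy import create_engine
    metrics_engine = create_engine('postgresql://user:pwd@localhost:5432/metrics')
    metrics_bibs_to_master_pipeline(metrics_engine, 'metrics',
                                    ['2003ASPC..295..361M', '2015ApJ...815..133S'])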
def test_task_update_record_metrics(self):
    with patch('adsmp.tasks.task_index_records.apply_async') as next_task:
        self.assertFalse(next_task.called)
        tasks.task_update_record(MetricsRecord(bibcode='2015ApJ...815..133S'))
        self.assertFalse(next_task.called)