def test_task_update_record_augments_list(self):
    """A list of augment records must be stored without scheduling the
    indexing task."""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        batch = NonBibRecordList()
        first = NonBibRecord(bibcode='2003ASPC..295..361M', boost=3.1)
        second = NonBibRecord(bibcode='3003ASPC..295..361Z', boost=3.2)
        batch.nonbib_records.extend([first._data, second._data])
        tasks.task_update_record(batch)
        # augment payloads are stored only; indexing is not triggered
        self.assertFalse(next_task.called)
def test_task_update_record_nonbib_list(self):
    """A list of nonbib records must be stored and must schedule indexing
    for every bibcode in the list."""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        self.assertFalse(next_task.called)
        recs = NonBibRecordList()
        nonbib_data = {'bibcode': '2003ASPC..295..361M', 'refereed': False}
        nonbib_data2 = {'bibcode': '3003ASPC..295..361Z', 'refereed': True}
        rec = NonBibRecord(**nonbib_data)
        rec2 = NonBibRecord(**nonbib_data2)
        recs.nonbib_records.extend([rec._data, rec2._data])
        tasks.task_update_record(recs)
        self.assertTrue(next_task.called)
        # was assertTrue(a, b): b is only a failure message there, so no
        # comparison ever happened (masking a wrong expected value of
        # '2015ApJ...815..133S').  task_index_records.delay is invoked with
        # the list of bibcodes from the message.
        self.assertEqual(next_task.call_args[0],
                         (['2003ASPC..295..361M', '3003ASPC..295..361Z'],))
def test_task_update_record_delete(self):
    """Deleted payloads must null out the stored column while keeping the
    record row; a deleted denormalized record must schedule the document
    delete task."""
    for field, cls in (('fulltext', FulltextUpdate), ('orcid_claims', OrcidClaims)):
        self.app.update_storage('bibcode', field, {'foo': 'bar'})
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(self.app.get_record('bibcode')[field]['foo'], 'bar')
        with patch('adsmp.tasks.task_index_records.delay'):
            tasks.task_update_record(
                cls(bibcode='bibcode', status='deleted'))
            self.assertEqual(self.app.get_record('bibcode')[field], None)
            self.assertTrue(self.app.get_record('bibcode'))
    recs = NonBibRecordList()
    recs.nonbib_records.extend(
        [NonBibRecord(bibcode='bibcode', status='deleted').data])
    with patch('adsmp.tasks.task_index_records.delay'):
        tasks.task_update_record(recs)
        self.assertEqual(self.app.get_record('bibcode')['metrics'], None)
        self.assertTrue(self.app.get_record('bibcode'))
    with patch('adsmp.tasks.task_delete_documents') as next_task:
        tasks.task_update_record(
            DenormalizedRecord(bibcode='bibcode', status='deleted'))
        self.assertTrue(next_task.called)
        # was assertTrue(a, b), which never compared a to b;
        # task_delete_documents is called positionally with the bibcode
        self.assertEqual(next_task.call_args[0], ('bibcode',))
def diagnose_nonbib(): """send hard coded nonbib data the master pipeline useful for testing to verify connectivity""" test_data = { 'bibcode': '2003ASPC..295..361M', 'simbad_objects': [], 'grants': ['g'], 'boost': 0.11, 'citation_count': 0, 'read_count': 2, 'readers': ['a', 'b'], 'reference': ['c', 'd'] } recs = NonBibRecordList() rec = NonBibRecord(**test_data) recs.nonbib_records.extend([rec._data]) print 'sending nonbib data for bibocde', test_data[ 'bibcode'], 'to master pipeline' print 'using CELERY_BROKER', config['CELERY_BROKER'] print ' CELERY_DEFAULT_EXCHANGE', config['CELERY_DEFAULT_EXCHANGE'] print ' CELERY_DEFAULT_EXCHANGE_TYPE', config[ 'CELERY_DEFAULT_EXCHANGE_TYPE'] print ' OUTPUT_CELERY_BROKER', config['OUTPUT_CELERY_BROKER'] print ' OUTPUT_TASKNAME', config['OUTPUT_TASKNAME'] print 'this action did not use ingest database (configured at', config[ 'INGEST_DATABASE'], ')' print ' or the metrics database (at', config['METRICS_DATABASE'], ')' tasks.task_output_results.delay(recs)
def test_task_update_record_nonbib(self):
    """A single nonbib record must schedule indexing for its bibcode."""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        self.assertFalse(next_task.called)
        tasks.task_update_record(
            NonBibRecord(bibcode='2015ApJ...815..133S'))
        self.assertTrue(next_task.called)
        # was assertTrue(a, b), which never compared; also
        # ('2015ApJ...815..133S') is a plain string, not a 1-tuple.
        # task_index_records.delay receives a list of bibcodes.
        self.assertEqual(next_task.call_args[0], (['2015ApJ...815..133S'],))
def process_bibcodes(self, bibcodes):
    """Send nonbib and metrics records to master for the passed bibcodes.

    For each bibcode: read nonbib data from files and generate a nonbib
    protobuf; if enabled, compute metrics and generate its protobuf."""
    # accumulate all records into two batched messages for performance
    batched_nonbib = NonBibRecordList()
    batched_metrics = MetricsRecordList()
    for bib in bibcodes:
        try:
            raw = self._read_next_bibcode(bib)
            nonbib_proto = NonBibRecord(**self._convert(raw))
            batched_nonbib.nonbib_records.extend([nonbib_proto._data])
            if self.compute_metrics:
                metrics_proto = MetricsRecord(**self._compute_metrics(raw))
                batched_metrics.metrics_records.extend([metrics_proto._data])
        except Exception as e:
            # a bad bibcode is logged and skipped; the batch continues
            self.logger.error(
                'serious error in process.process_bibcodes for bibcode {}, error {}'
                .format(bib, e))
            self.logger.exception('general stacktrace')
    tasks.task_output_nonbib.delay(batched_nonbib)
    tasks.task_output_metrics.delay(batched_metrics)
def nonbib_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send all nonbib data to queue for delivery to master pipeline.

    :param nonbib_engine: sqlalchemy engine bound to the nonbib database
    :param schema: postgres schema holding the nonbib tables
    :param batch_size: number of records per message sent to master
    """
    global config
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # the original leaked the session when any row raised; close it always
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        max_rows = config['MAX_ROWS']
        q = session.query(models.NonBibTable).options(
            load_only(*nonbib_to_master_select_fields))
        for current_row in q.yield_per(100):
            current_row = nonbib_to_master_dict(current_row)
            add_data_link(session, current_row)
            cleanup_for_master(current_row)
            rec = NonBibRecord(**current_row)
            tmp.append(rec._data)
            i += 1
            # MAX_ROWS <= 0 means "no limit"
            if max_rows > 0 and i >= max_rows:
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.info("Calling 'app.forward_message' count = '%s'", i)
                task_output_results.delay(recs)
        # flush the final partial batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.info("Calling 'app.forward_message' with count = '%s'", i)
            task_output_results.delay(recs)
    finally:
        session.close()
def test_task_update_record_nonbib(self):
    """Saving a nonbib record must store its payload under 'nonbib_data';
    in this suite the indexing task is not triggered."""
    with patch('adsmp.tasks.task_index_records.delay') as next_task:
        tasks.task_update_record(
            NonBibRecord(bibcode='2015ApJ...815..133S', read_count=9))
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(
            self.app.get_record(bibcode='2015ApJ...815..133S')
            ['nonbib_data']['read_count'], 9)
        self.assertFalse(next_task.called)
def test_protobuf(self):
    """make sure protobuf are created without an exception"""
    patched_config = {'INPUT_DATA_ROOT': './test/data1/config/'}
    with Processor(compute_metrics=False) as processor, patch(
            'adsputils.load_config', return_value=patched_config):
        raw = processor._read_next_bibcode('1057wjlf.book.....C')
        converted = processor._convert(raw)
        # constructing the protobuf is the assertion: it must not raise
        proto = NonBibRecord(**converted)
        print('nonbib = {}'.format(proto))
def _build_nonbib_record(app, citation_change, record, db_versions, status):
    """Build a NonBibRecord protobuf for the cited item.

    The record carries one ESOURCE data-link row pointing at the
    publisher page for the DOI and, when real associated versions exist,
    an ASSOCIATED row listing them."""
    doi = citation_change.content
    # ESOURCE row for the publisher HTML page of this DOI
    pub_html_row = {
        'link_type': 'ESOURCE',
        'link_sub_type': 'PUB_HTML',
        'url': [app.conf['DOI_URL'] + doi],
        'title': [''],
        'item_count': 0,  # item_count only used for DATA and not ESOURCES
    }
    fields = {
        'status': status,
        'bibcode': record.bibcode,
        'boost': 0.5,  # value between 0 and 1
        'citation_count': record.citation_count,
        'citation_count_norm': record.citation_count_norm,
        'data': [],
        'data_links_rows': [pub_html_row],
        'grants': [],
        'ned_objects': [],
        # log10-normalized count of citations computed on the classic
        # site but not currently used
        'norm_cites': 0,
        'read_count': record.read_count,
        'readers': [],
        'simbad_objects': [],
        'total_link_counts': 0,  # only used for DATA and not for ESOURCES
    }
    # skip the ASSOCIATED row for empty/placeholder version maps
    if db_versions not in [{"": ""}, {}, None]:
        fields['data_links_rows'].append({
            'link_type': 'ASSOCIATED',
            'link_sub_type': '',
            'url': db_versions.values(),
            'title': db_versions.keys(),
            'item_count': 0,
        })
    proto = NonBibRecord(**fields)
    proto.esource.extend(record.esources)
    proto.reference.extend(record.reference)
    proto.property.extend(record.property)
    return proto
def task_update_record(msg):
    """Receives payload to update the record.

    @param msg: protobuff that contains at minimum
        - bibcode
        - and specific payload
    """
    logger.debug('Updating record: %s', msg)
    status = app.get_msg_status(msg)
    if status == 'deleted':
        task_delete_documents(msg.bibcode)
        return
    if status != 'active':
        logger.error('Received a message with unclear status: %s', msg)
        return
    # renamed from `type` to avoid shadowing the builtin
    msg_type = app.get_msg_type(msg)
    bibcodes = []
    # save into a database; the message may carry one bibcode or a list
    if msg_type == 'nonbib_records':
        for item in msg.nonbib_records:
            item = NonBibRecord.deserializer(item.SerializeToString())
            bibcodes.append(item.bibcode)
            saved = app.update_storage(
                item.bibcode, app.get_msg_type(item), item.toJSON())
            logger.debug('Saved record from list: %s', saved)
    elif msg_type == 'metrics_records':
        for item in msg.metrics_records:
            item = MetricsRecord.deserializer(item.SerializeToString())
            bibcodes.append(item.bibcode)
            saved = app.update_storage(
                item.bibcode, app.get_msg_type(item), item.toJSON())
            logger.debug('Saved record from list: %s', saved)
    else:
        # here when record has a single bibcode
        bibcodes.append(msg.bibcode)
        saved = app.update_storage(msg.bibcode, msg_type, msg.toJSON())
        logger.debug('Saved record: %s', saved)
    # trigger further processing
    task_index_records.delay(bibcodes)
def nonbib_delta_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send data for changed bibcodes to master pipeline.

    The delta table was computed by comparing two sets of nonbib data,
    perhaps ingested on successive days.

    :param nonbib_engine: sqlalchemy engine bound to the nonbib database
    :param schema: postgres schema holding the nonbib tables
    :param batch_size: number of records per message sent to master
    """
    global config
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # the original never closed the session; release it in all cases
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        n = nonbib.NonBib(schema)
        max_rows = config['MAX_ROWS']
        for current_delta in session.query(models.NonBibDeltaTable).yield_per(100):
            row = n.get_by_bibcode(nonbib_engine, current_delta.bibcode,
                                   nonbib_to_master_select_fields)
            row = nonbib_to_master_dict(row)
            add_data_link(session, row)
            cleanup_for_master(row)
            rec = NonBibRecord(**row)
            tmp.append(rec._data)
            i += 1
            # was `i > max_rows` (off by one: sent MAX_ROWS + 1 rows);
            # >= stops at exactly MAX_ROWS, matching nonbib_to_master_pipeline
            if max_rows > 0 and i >= max_rows:
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.debug("Calling 'app.forward_message' with '%s' items",
                             len(recs.nonbib_records))
                task_output_results.delay(recs)
        # flush the final partial batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.debug("Calling 'app.forward_message' with final '%s' items",
                         len(recs.nonbib_records))
            task_output_results.delay(recs)
    finally:
        session.close()
def nonbib_bibs_to_master_pipeline(nonbib_engine, schema, bibcodes): """send data for the passed bibcodes to master""" Session = sessionmaker(bind=nonbib_engine) session = Session() session.execute('set search_path to {}'.format(schema)) n = nonbib.NonBib(schema) tmp = [] for bibcode in bibcodes: row = n.get_by_bibcode(nonbib_engine, bibcode, nonbib_to_master_select_fields) if row: row = nonbib_to_master_dict(row) add_data_link(session, row) cleanup_for_master(row) rec = NonBibRecord(**row) tmp.append(rec._data) else: print 'unknown bibcode ', bibcode recs = NonBibRecordList() recs.nonbib_records.extend(tmp) logger.debug("Calling 'app.forward_message' for '%s' bibcodes", len(recs.nonbib_records)) task_output_results.delay(recs)