Esempio n. 1
0
def nonbib_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send all nonbib data to the queue for delivery to the master pipeline.

    Every row of models.NonBibTable is converted to a NonBibRecord and
    forwarded through task_output_results in NonBibRecordList batches.

    :param nonbib_engine: engine the database session is bound to
    :param schema: schema name placed on the session's search path
    :param batch_size: number of records collected before a batch is queued

    When config['MAX_ROWS'] is greater than zero, processing stops once that
    many rows have been read; rows already collected are still sent.
    """
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # try/finally guarantees the session is released even when a row fails
    # to convert or the queue is unreachable.
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        max_rows = config['MAX_ROWS']
        q = session.query(models.NonBibTable).options(
            load_only(*nonbib_to_master_select_fields))
        # yield_per(100) streams the result set instead of loading it whole
        for current_row in q.yield_per(100):
            current_row = nonbib_to_master_dict(current_row)
            add_data_link(session, current_row)
            cleanup_for_master(current_row)
            rec = NonBibRecord(**current_row)
            tmp.append(rec._data)
            i += 1
            if max_rows > 0 and i >= max_rows:
                # the partial batch in tmp is flushed after the loop
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.info("Calling 'app.forward_message' count = '%s'", i)
                task_output_results.delay(recs)

        # send whatever did not fill a complete batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.info("Calling 'app.forward_message' with count = '%s'", i)
            task_output_results.delay(recs)
    finally:
        session.close()
Esempio n. 2
0
    def test_task_update_record_delete(self):
        """Check how task_update_record handles messages with status 'deleted'.

        Fulltext, orcid claims and nonbib deletions are expected to leave the
        stored record in place, while a deleted DenormalizedRecord is expected
        to call task_delete_documents.
        """

        for x, cls in (('fulltext', FulltextUpdate), ('orcid_claims',
                                                      OrcidClaims)):
            self.app.update_storage('bibcode', x, {'foo': 'bar'})
            self.assertEqual(self.app.get_record('bibcode')[x]['foo'], 'bar')
            with patch('adsmp.tasks.task_index_records.delay') as next_task:
                tasks.task_update_record(
                    cls(bibcode='bibcode', status='deleted'))
                # the field is cleared but the record itself survives
                self.assertEqual(self.app.get_record('bibcode')[x], None)
                self.assertTrue(self.app.get_record('bibcode'))

        recs = NonBibRecordList()
        recs.nonbib_records.extend(
            [NonBibRecord(bibcode='bibcode', status='deleted').data])
        with patch('adsmp.tasks.task_index_records.delay') as next_task:
            tasks.task_update_record(recs)
            self.assertEqual(self.app.get_record('bibcode')['metrics'], None)
            self.assertTrue(self.app.get_record('bibcode'))

        with patch('adsmp.tasks.task_delete_documents') as next_task:
            tasks.task_update_record(
                DenormalizedRecord(bibcode='bibcode', status='deleted'))
            self.assertTrue(next_task.called)
            # assertEqual actually compares the positional arguments;
            # assertTrue(x, y) would only use y as the failure message.
            self.assertEqual(next_task.call_args[0], ('bibcode', ))
Esempio n. 3
0
def diagnose_nonbib():
    """Send hard coded nonbib data to the master pipeline.

    Useful for testing to verify connectivity.  The relevant broker and
    database settings from config are printed before the single test
    record is queued through tasks.task_output_results.
    """

    test_data = {
        'bibcode': '2003ASPC..295..361M',
        'simbad_objects': [],
        'grants': ['g'],
        'boost': 0.11,
        'citation_count': 0,
        'read_count': 2,
        'readers': ['a', 'b'],
        'reference': ['c', 'd']
    }
    recs = NonBibRecordList()
    rec = NonBibRecord(**test_data)
    recs.nonbib_records.extend([rec._data])
    # Each print receives one pre-formatted string, which is valid syntax and
    # produces the same text under both Python 2 and Python 3.
    print('sending nonbib data for bibcode %s to master pipeline'
          % (test_data['bibcode'],))
    print('using CELERY_BROKER %s' % (config['CELERY_BROKER'],))
    print('  CELERY_DEFAULT_EXCHANGE %s'
          % (config['CELERY_DEFAULT_EXCHANGE'],))
    print('  CELERY_DEFAULT_EXCHANGE_TYPE %s'
          % (config['CELERY_DEFAULT_EXCHANGE_TYPE'],))
    print('  OUTPUT_CELERY_BROKER %s' % (config['OUTPUT_CELERY_BROKER'],))
    print('  OUTPUT_TASKNAME %s' % (config['OUTPUT_TASKNAME'],))
    print('this action did not use ingest database (configured at %s )'
          % (config['INGEST_DATABASE'],))
    print('  or the metrics database (at %s )'
          % (config['METRICS_DATABASE'],))
    tasks.task_output_results.delay(recs)
Esempio n. 4
0
    def process_bibcodes(self, bibcodes):
        """Queue nonbib and metrics protobufs for the passed bibcodes.

        For each bibcode the nonbib data is read and converted into a
        NonBibRecord; when self.compute_metrics is set, a MetricsRecord is
        computed from the same data.  A failure on one bibcode is logged and
        does not stop the remaining bibcodes.  Both record lists are handed
        to their output tasks once the loop has finished.
        """
        # messages are batched into two lists so each task is queued once
        nonbib_batch = NonBibRecordList()
        metrics_batch = MetricsRecordList()

        for bibcode in bibcodes:
            try:
                raw = self._read_next_bibcode(bibcode)
                nonbib_msg = NonBibRecord(**self._convert(raw))
                nonbib_batch.nonbib_records.extend([nonbib_msg._data])
                if not self.compute_metrics:
                    continue
                metrics_msg = MetricsRecord(**self._compute_metrics(raw))
                metrics_batch.metrics_records.extend([metrics_msg._data])
            except Exception as err:
                message = 'serious error in process.process_bibcodes for bibcode {}, error {}'.format(
                    bibcode, err)
                self.logger.error(message)
                self.logger.exception('general stacktrace')

        tasks.task_output_nonbib.delay(nonbib_batch)
        tasks.task_output_metrics.delay(metrics_batch)
Esempio n. 5
0
 def test_task_update_record_augments_list(self):
     """A nonbib list holding only boost values must not queue indexing."""
     payloads = (
         {'bibcode': '2003ASPC..295..361M', 'boost': 3.1},
         {'bibcode': '3003ASPC..295..361Z', 'boost': 3.2},
     )
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         batch = NonBibRecordList()
         batch.nonbib_records.extend(
             [NonBibRecord(**payload)._data for payload in payloads])
         tasks.task_update_record(batch)
         # task_index_records.delay is patched; it must stay untouched
         self.assertFalse(next_task.called)
Esempio n. 6
0
 def test_task_update_record_nonbib_list(self):
     """A nonbib list with 'refereed' values must queue task_index_records."""
     payloads = (
         {'bibcode': '2003ASPC..295..361M', 'refereed': False},
         {'bibcode': '3003ASPC..295..361Z', 'refereed': True},
     )
     with patch('adsmp.tasks.task_index_records.delay') as next_task:
         # nothing may have been queued before the task runs
         self.assertFalse(next_task.called)
         batch = NonBibRecordList()
         batch.nonbib_records.extend(
             [NonBibRecord(**payload)._data for payload in payloads])
         tasks.task_update_record(batch)
         self.assertTrue(next_task.called)
         # NOTE(review): assertTrue treats its second argument as the failure
         # message, so this only checks that positional args are non-empty.
         # The tuple also names a bibcode that is not part of this test's
         # data -- confirm the intended expectation before using assertEqual.
         expected_args = ('2015ApJ...815..133S', '3003ASPC..295..361Z')
         self.assertTrue(next_task.call_args[0], expected_args)
Esempio n. 7
0
def nonbib_delta_to_master_pipeline(nonbib_engine, schema, batch_size=1):
    """Send data for changed bibcodes to the master pipeline.

    The delta table was computed by comparing two sets of nonbib data,
    perhaps ingested on successive days.  For every bibcode listed in
    models.NonBibDeltaTable the current nonbib row is fetched, converted to
    a NonBibRecord and forwarded through task_output_results in batches.

    :param nonbib_engine: engine the database session is bound to
    :param schema: schema name placed on the session's search path
    :param batch_size: number of records collected before a batch is queued

    When config['MAX_ROWS'] is greater than zero, processing stops once that
    many rows have been collected; rows already collected are still sent.
    """
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # try/finally guarantees the session is released on every exit path
    try:
        session.execute('set search_path to {}'.format(schema))
        tmp = []
        i = 0
        n = nonbib.NonBib(schema)
        max_rows = config['MAX_ROWS']
        deltas = session.query(models.NonBibDeltaTable).yield_per(100)
        for current_delta in deltas:
            row = n.get_by_bibcode(nonbib_engine, current_delta.bibcode,
                                   nonbib_to_master_select_fields)
            if not row:
                # same guard as nonbib_bibs_to_master_pipeline: a bibcode
                # without nonbib data cannot be converted, so skip it
                logger.warning("No nonbib data for delta bibcode '%s'",
                               current_delta.bibcode)
                continue
            row = nonbib_to_master_dict(row)
            add_data_link(session, row)
            cleanup_for_master(row)
            rec = NonBibRecord(**row)
            tmp.append(rec._data)
            i += 1
            # '>=' so exactly MAX_ROWS rows are sent, matching
            # nonbib_to_master_pipeline ('>' let one extra row through)
            if max_rows > 0 and i >= max_rows:
                break
            if len(tmp) >= batch_size:
                recs = NonBibRecordList()
                recs.nonbib_records.extend(tmp)
                tmp = []
                logger.debug("Calling 'app.forward_message' with '%s' items",
                             len(recs.nonbib_records))
                task_output_results.delay(recs)

        # send whatever did not fill a complete batch
        if len(tmp) > 0:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.debug("Calling 'app.forward_message' with final '%s' items",
                         len(recs.nonbib_records))
            task_output_results.delay(recs)
    finally:
        session.close()
Esempio n. 8
0
def nonbib_bibs_to_master_pipeline(nonbib_engine, schema, bibcodes):
    """Send data for the passed bibcodes to master.

    Each bibcode is looked up through nonbib.NonBib.get_by_bibcode; known
    bibcodes are converted to NonBibRecord protobufs and queued together in
    one NonBibRecordList.  Unknown bibcodes are reported on stdout and
    skipped.  Nothing is queued when no bibcode could be resolved.

    :param nonbib_engine: engine the database session is bound to
    :param schema: schema name placed on the session's search path
    :param bibcodes: iterable of bibcodes to send
    """
    Session = sessionmaker(bind=nonbib_engine)
    session = Session()
    # try/finally guarantees the session is released on every exit path
    try:
        session.execute('set search_path to {}'.format(schema))
        n = nonbib.NonBib(schema)
        tmp = []
        for bibcode in bibcodes:
            row = n.get_by_bibcode(nonbib_engine, bibcode,
                                   nonbib_to_master_select_fields)
            if row:
                row = nonbib_to_master_dict(row)
                add_data_link(session, row)
                cleanup_for_master(row)
                rec = NonBibRecord(**row)
                tmp.append(rec._data)
            else:
                # single pre-formatted argument keeps this valid, with the
                # same output, under both Python 2 and Python 3
                print('unknown bibcode  %s' % (bibcode,))
        # like the sibling pipeline functions, do not queue an empty message
        if tmp:
            recs = NonBibRecordList()
            recs.nonbib_records.extend(tmp)
            logger.debug("Calling 'app.forward_message' for '%s' bibcodes",
                         len(recs.nonbib_records))
            task_output_results.delay(recs)
    finally:
        session.close()