def test_update_store(self):
        """update_store() should POST all serialized messages to the store endpoint."""
        first = TurboBeeMsg(qid='foo', value='bar')
        second = TurboBeeMsg(qid='foo', value='bar')
        fake_resp = MagicMock()
        fake_resp.raise_for_status = lambda: 1
        with patch.object(self.app._client, 'post', return_value=fake_resp) as post:
            self.app.update_store([first, second])

            # positional arg 0 is the target URL
            called_url = post.call_args[0][0]
            assert called_url == u'https://api.adsabs.harvard.edu/v1/store/update'
            # both messages were attached as files, keyed by index
            sent_files = post.call_args[1]['files']
            assert len(sent_files) == 2
            assert sent_files['0'] == '\n\x03fooR\x03bar'
def store(qid=None):
    """Flask view: create/update pages (POST) or delete one page (DELETE).

    POST accepts TurboBeeMsg protobufs either as uploaded files or as
    base64-encoded form values; every message received is handed to
    ``current_app.set_pages()`` in a single batch.

    DELETE removes the page identified by the ``qid`` URL parameter.

    :param qid: page identifier (used by the DELETE branch only)
    :return: (json response, http status) tuple
    """
    if request.method == 'POST':
        # there might be many objects in there...
        msgs = []
        for _, fo in request.files.items():
            if not hasattr(fo, 'read'):
                continue  # not a file object

            # on error, we'll crash early; that's OK
            msg = TurboBeeMsg.loads('adsmsg.turbobee.TurboBeeMsg', fo.read())
            msgs.append(msg)

        # also read data posted the normal way (base64-encoded form values)
        for k, v in request.form.items():
            msg = TurboBeeMsg.loads('adsmsg.turbobee.TurboBeeMsg',
                                    base64.decodestring(v))
            msgs.append(msg)

        if not msgs:
            return jsonify({'msg':
                            'Empty stream, no messages were received'}), 501

        # the early return above guarantees msgs is non-empty here
        out = current_app.set_pages(msgs)
        if 'errors' in out:
            return jsonify(out), 400
        return jsonify(out), 200

    elif request.method == 'DELETE':
        with current_app.session_scope() as session:
            pages = session.query(Pages).options(
                load_only('qid')).filter_by(qid=qid).first()

            if not pages:
                # NOTE(review): 'qi' looks like a typo for 'qid' -- kept for
                # compatibility with existing clients
                return jsonify({'qi': None, 'msg': 'Not found'}), 404

            qid = pages.qid
            session.delete(pages)

            try:
                session.commit()
            except exc.IntegrityError:
                # best-effort delete: roll back quietly and still report below
                session.rollback()

            return jsonify({'qid': qid, 'status': 'deleted'}), 200
    def test_proto_empty(self):
        """Posting an empty protobuf as a file upload still creates one page."""
        empty_msg = TurboBeeMsg()
        payload = {
            'file_field': (StringIO(empty_msg.dump()[1]), 'turbobee_msg.proto')
        }

        resp = self.client.post(url_for('turbobee_app.store', qid='asdf'),
                                content_type='multipart/form-data',
                                data=payload)
        self.assertEqual(resp.status_code, 200)

        # exactly one row must have landed in the Pages table
        with self.app.session_scope() as session:
            assert len(session.query(Pages).all()) == 1
 def submit(bibcode):
     """Build a TurboBeeMsg for *bibcode* and queue it.

     ``tmpl`` and ``queue`` come from the enclosing scope.
     """
     message = TurboBeeMsg(target=tmpl % {'bibcode': bibcode})
     if queue == 'harvest-bumblebee':
         tasks.task_harvest_bumblebee.delay(message)
         return
     if queue == 'static-bumblebee':
         tasks.task_static_bumblebee.delay(message)
         return
     raise Exception('Unknown target: %s' % queue)
def submit_url(url, queue='harvest-bumblebee'):
    """Submits a specific URL for processing."""
    message = TurboBeeMsg(target=url)
    # dispatch table keyed by queue name
    handlers = {
        'harvest-bumblebee': tasks.task_harvest_bumblebee,
        'static-bumblebee': tasks.task_static_bumblebee,
    }
    task = handlers.get(queue)
    if task is None:
        raise Exception('Unknown target: %s' % queue)
    task.delay(message)
    def test_task_harvest_bumblebee(self):
        """task_harvest_bumblebee normalizes targets, stamps timestamps, and
        forwards the message to task_output_results."""
        with patch.object(tasks.app, '_load_url', return_value='<html><head>foo</head> 2019MNRAS.482.1872B </html>') as loader, \
            patch.object(tasks.task_output_results, 'delay') as next_task:

            msg = TurboBeeMsg(target='2019MNRAS.482.1872B')
            tasks.task_harvest_bumblebee(msg)
            # a bare bibcode is expanded into the full BBB abstract URL
            self.assertEqual(loader.call_args[0], (
                'https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.1872B/abstract',
            ))
            # the stored target is protocol-relative and without the '#'
            self.assertEqual(
                msg.target,
                '//ui.adsabs.harvard.edu/abs/2019MNRAS.482.1872B/abstract')
            self.assertTrue(next_task.called)
            self.assertTrue(msg.updated.seconds > 0)
            self.assertTrue(
                msg.expires.seconds >= msg.updated.seconds + 24 * 60 * 60)
            self.assertTrue(
                msg.eol.seconds >= msg.updated.seconds + 24 * 60 * 60 * 30)
            self.assertTrue(msg.ctype == msg.ContentType.html)

            # '#abs' URLs, plain '/abs' URLs and protocol-relative URLs all
            # normalize to the same fetch URL and the same stored target
            for target in ('https://dev.adsabs.harvard.edu/#abs/foobar',
                           'https://dev.adsabs.harvard.edu/abs/foobar',
                           '//dev.adsabs.harvard.edu/abs/foobar'):
                msg = TurboBeeMsg(target=target)
                tasks.task_harvest_bumblebee(msg)
                self.assertEqual(
                    loader.call_args[0],
                    ('https://dev.adsabs.harvard.edu/#abs/foobar', ))
                self.assertEqual(msg.target,
                                 '//dev.adsabs.harvard.edu/abs/foobar')
    def test_serializer(self):
        """Repeated dump/load round-trips must not drift the stored date
        (i.e. no loss of nanosecond precision)."""
        rec = TurboBeeMsg()
        stamp = datetime.utcnow()
        rec.created = rec.get_timestamp(stamp)

        for _ in xrange(10050):
            rec = rec.loads(*rec.dump())
            self.assertEqual(rec.get_datetime(rec.created), stamp)
    def test_connection_error(self):
        """harvest_webpage must raise ConnectionError for an unreachable endpoint.

        The previous version swallowed the error with try/except/pass, so the
        test also passed when NO exception was raised; assertRaises makes the
        expectation explicit.
        """
        msg = TurboBeeMsg(qid='foo',
                          value='bar',
                          target='http://www.google.com')
        # point the harvester at a port nothing listens on
        self.app.conf[
            'PUPPETEER_ENDPOINT'] = 'http://localhost:30012222/scrape'
        with self.assertRaises(ConnectionError):
            self.app.harvest_webpage(msg)
 def test_files_form_post(self):
     """Check we can send data through both channels."""
     msg = TurboBeeMsg(value=u'\u6789'.encode('utf8'))
     msg2 = TurboBeeMsg(value=u'\u6789'.encode('utf8'))
     # 'foo' goes through the form channel (base64), 'bar' as a file upload
     r = self.client.post(url_for('turbobee_app.store'),
                          content_type='multipart/form-data',
                          data={
                              'foo':
                              base64.encodestring(msg.dump()[1]),
                              'bar': (StringIO(msg2.dump()[1]),
                                      'turbobee_msg.proto'),
                          })
     assert len(r.json['created']) == 2
     msgs = list(self.app.get_pages(r.json['created']))
     for m in msgs:
         # bug fix: the comparison result was previously discarded
         # (missing assert), so the content check never actually ran
         assert m['content'].decode('utf8') == u'\u6789'
    def test_store_post(self):
        """End-to-end POST flow: create, mixed create/update, then GET/HEAD."""
        msg = TurboBeeMsg()
        stamp = dt.datetime.utcnow()
        msg.created = msg.get_timestamp(stamp)
        msg.updated = msg.get_timestamp(stamp)
        msg.expires = msg.get_timestamp(stamp)
        msg.eol = msg.get_timestamp(stamp)
        msg.set_value('hello world')
        msg.ctype = msg.ContentType.html
        msg.target = 'https:///some.com'
        msg.owner = 234

        # first POST creates a brand new page
        payload = {
            'file_field': (StringIO(msg.dump()[1]), 'turbobee_msg.proto')
        }
        resp = self.client.post(url_for('turbobee_app.store', qid='asdf'),
                                content_type='multipart/form-data',
                                data=payload)
        self.assertEqual(resp.status_code, 200)
        assert len(resp.json['created']) == 1

        # second POST: one message updates (it carries the new qid),
        # the clone (dumped before qid was set) creates another page
        clone = msg.loads(*msg.dump())
        msg.qid = resp.json['created'][0]
        resp = self.client.post(url_for('turbobee_app.store'),
                                content_type='multipart/form-data',
                                data={
                                    'foo': (StringIO(msg.dump()[1]),
                                            'turbobee_msg.proto'),
                                    'bar': (StringIO(clone.dump()[1]),
                                            'turbobee_msg.proto'),
                                })
        self.assertEqual(resp.status_code, 200)
        assert len(resp.json['created']) == 1
        assert len(resp.json['updated']) == 1

        # existing page is retrievable via GET and HEAD
        resp = self.client.get(url_for('turbobee_app.store_get', qid=msg.qid))
        self.assertEqual(resp.status_code, 200)
        resp = self.client.head(url_for('turbobee_app.store_get', qid=msg.qid))
        self.assertEqual(resp.status_code, 200)

        # unknown qid yields 404 for both verbs
        resp = self.client.get(url_for('turbobee_app.store_get', qid='foo'))
        self.assertEqual(resp.status_code, 404)
        resp = self.client.head(url_for('turbobee_app.store_get', qid='foo'))
        self.assertEqual(resp.status_code, 404)
    def test_is_valid(self):
        """Exercise is_valid() plus set_value/get_value round-trips."""
        msg = TurboBeeMsg()
        msg.qid = 'foo'
        self.assertTrue(msg.is_valid())

        # even an absurdly long qid is still considered valid
        msg.qid = 'unlimiteeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeed' * 512
        self.assertTrue(msg.is_valid())

        msg.status = Status.new
        stamp = datetime.utcnow()

        # timestamps survive the datetime round-trip exactly
        msg.created = msg.get_timestamp(stamp)
        self.assertEqual(stamp, msg.get_datetime(msg.created))

        msg.value = 'foobar'
        msg.ctype = msg.ContentType.text
        msg.ctype = 0  # unknown
        self.assertTrue(msg.is_valid())

        msg.set_value(1)
        self.assertTrue(msg.is_valid())

        # unicode values come back utf8-encoded for text and binary ctypes
        msg.set_value(u'\ud789')
        msg.ctype = msg.ContentType.text
        self.assertEqual(msg.get_value(), u'\ud789'.encode('utf8'))
        msg.ctype = msg.ContentType.binary
        self.assertEqual(msg.get_value(), u'\ud789'.encode('utf8'))

        # json-typed values round-trip as structures, not strings
        msg.set_value({'foo': u'\ud789'}, msg.ContentType.json)
        self.assertEqual(msg.get_value(), {'foo': u'\ud789'})

        msg.set_value(u'\ud789', msg.ContentType.text)
        self.assertEqual(msg.get_value(), u'\ud789'.encode('utf8'))
 def test_set_get_pages(self):
     """Full page lifecycle: create, update, delete, re-create, batch updates."""
     msg = TurboBeeMsg()
     stamp = datetime.utcnow()

     msg.created = msg.get_timestamp(stamp)
     msg.updated = msg.get_timestamp(stamp)
     msg.expires = msg.get_timestamp(stamp)
     msg.eol = msg.get_timestamp(stamp)
     msg.set_value('hello world')
     msg.ctype = msg.ContentType.html
     msg.target = 'https:///some.com'
     msg.owner = 234

     # a message without a qid results in a freshly created page
     result = self.app.set_pages([msg])
     assert 'created' in result
     assert len(result['created']) == 1

     pages = list(self.app.get_pages(result['created']))
     expected = {
         'id': 1,
         'target': u'https:///some.com',
         'content_type': u'application/html',
         'content': 'hello world',
         'created': get_date(stamp).isoformat(),
         'updated': get_date(stamp).isoformat(),
         'expires': get_date(stamp).isoformat(),
         'lifetime': get_date(stamp).isoformat(),
         'owner': 234,
         'qid': pages[0]['qid']
         }
     assert pages[0] == expected

     # sending the same qid again performs an update
     msg.qid = pages[0]['qid']
     result = self.app.set_pages([msg])
     assert 'updated' in result
     assert len(result['updated']) == 1
     assert result['updated'][0] == expected['qid']

     # deleted status removes the page...
     msg.status = Status.deleted
     result = self.app.set_pages([msg])
     assert 'deleted' in result
     assert result['deleted'][0] == expected['qid']

     # ...and deleting it again is reported as ignored
     result = self.app.set_pages([msg])
     assert result['ignored-deleted'][0] == expected['qid']
     assert len(list(self.app.get_pages(expected['qid']))) == 0

     # insert it again
     msg.status = Status.active
     result = self.app.set_pages([msg])
     assert result['created'][0]
     assert result['created'][0] != expected['qid']

     # field selection: unknown fields are silently dropped
     rows = list(self.app.get_pages(result['created'],
                                    fields=['foo', 'qid', 'content', 'created']))
     assert rows[0]['qid'] == result['created'][0]
     assert rows[0]['created'] == expected['created']
     assert rows[0]['content'] == 'hello world'
     assert 'updated' not in rows[0]
     assert 'foo' not in rows[0]

     # set multiple objects at once
     msg.qid = result['created'][0]
     msg2 = msg.loads(*msg.dump())
     msg2.qid = ''
     result = self.app.set_pages([msg, msg2])
     assert result['created'][0]
     assert result['updated'][0] == msg.qid

     # update one by one
     msg2.qid = result['created'][0]
     result = self.app.set_pages([msg, msg2], one_by_one=True)
     assert msg.qid in result['updated']
     assert msg2.qid in result['updated']

     # duplicates within one batch collapse to the distinct qids
     result = self.app.set_pages([msg, msg2, msg, msg, msg], one_by_one=True)
     assert set(result['updated']) == set([msg.qid, msg2.qid])
# Example #13
    def _retrieve_abstract_template(self, url):
        """Fetch a BBB abstract page and turn it into a reusable template.

        Three dynamic regions of the harvested HTML are replaced with
        placeholders: the 'data-highwire' meta-tag block becomes ``{{tags}}``,
        the <article> body becomes ``{{abstract}}``, and (when the parsed URL
        contains one) the bibcode becomes ``{{bibcode}}``.  Two <noscript>
        warning sections are also cut out.

        :param url: abstract page URL to fetch
        :return: templated HTML (unicode)
        :raises Exception: when the page lacks the expected sections
        """
        msg = TurboBeeMsg(target=url)
        i = 0

        parts = self._parse_bbb_url(url)

        # retry up to 3 times while harvest_webpage() reports failure (falsy)
        while not self.harvest_webpage(msg) and i < 3:
            self.logger.warn('Retrying to fetch: ' + url)
            i += 1

        html = msg.get_value()
        html = html.decode('utf8')

        # some basic checks
        if 'data-widget="ShowAbstract"' not in html:
            raise Exception(
                'Cannot process fetched page, or data-widget for {}'.format(
                    url))
        # TODO; find the sections and replace them with symbolic names {tags}, {abstract}....
        # back `x` up from the first 'data-highwire' marker to its opening '<'
        x = html.find('data-highwire')
        while x > 0:
            x -= 1
            if html[x] == '<':
                break
        # advance `end` to the LAST 'data-highwire' occurrence...
        end = html.find('data-highwire', x)
        while html.find('data-highwire', end + 1) > 0:
            end = html.find('data-highwire', end + 1)

        # ...then to the closing '>' of that tag
        while html[end] != '>':
            end += 1
        # NOTE(review): `end` has already been advanced above, so the
        # `end == -1` test can no longer fire here; a page entirely missing
        # 'data-highwire' would misbehave earlier -- confirm the marker is
        # always present in harvested pages
        if end == -1 or x == 0:
            raise Exception("Cannot find tags section")
        html = html[0:x] + '{{tags}}' + html[end + 1:]

        x = html.find('<article')
        end = html.find('</article')

        if x == -1 or end == -1:
            raise Exception(
                'Cannot process fetched page, cannot find abstract section for {}'
                .format(url))

        # move `x` just past the '>' that closes the opening <article ...> tag
        while x < len(html) and x < end:
            x += 1
            if html[x] == '>':
                x += 1
                break

        if x > end:
            raise Exception(
                'Cannot process fetched page, cannot find abstract section for {}'
                .format(url))

        html = html[0:x] + '{{abstract}}' + html[end:]

        if 'bibcode' in parts and parts['bibcode']:
            html = html.replace(parts['bibcode'], u'{{bibcode}}')

        # finally, cut out noscript warning that says javascript is required
        x = html.find('id="noscriptmsg"')  # div id within noscript
        if x > -1:
            start = html.rfind('<noscript', 0, x)
            end = html.find('</noscript>', start) + len('</noscript>')
            if end > start:
                html = html[:start] + html[end:]
        # and the first noscript that includes a style, it hides the abstract div
        x = html.find('<noscript>')
        if x > -1:
            offset = x + len('<noscript>')
            if html[offset:].strip().startswith('<style>'):
                end = html.find('</noscript>') + len('</noscript>')
                if end > x:
                    html = html[:x] + html[end:]

        return html
    def test_build_abstract_page(self):
        """build_static_page() should produce a static abstract page from the
        harvested template plus search-API metadata: og:* tags and the
        abstract text are present, while the JS-required warning and the SPA
        container are stripped."""
        msg = TurboBeeMsg(
            target=
            'https://ui.adsabs.harvard.edu/#abs/2019LRR....22....1I/abstract')

        # harvested-page template fixture stored next to this test module
        html = ''
        with open(os.path.dirname(__file__) + '/abs.html', 'r') as f:
            html = f.read()

        # canned search-API (solr-style) response for the bibcode above
        json = {
            u'responseHeader': {
                u'status': 0,
                u'QTime': 12,
                u'params': {
                    u'x-amzn-trace-id':
                    u'Root=1-5c89826f-96274e0a2ef0646da1cdc944',
                    u'rows':
                    u'10',
                    u'q':
                    u'identifier:"2019LRR....22....1I"',
                    u'start':
                    u'0',
                    u'wt':
                    u'json',
                    u'fl':
                    u'[citations],abstract,aff,author,bibcode,citation_count,comment,data,doi,esources,first_author,id,isbn,issn,issue,keyword,links_data,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,year'
                }
            },
            u'response': {
                u'start':
                0,
                u'numFound':
                1,
                u'docs': [{
                    u'read_count':
                    514,
                    u'pubdate':
                    u'2019-12-00',
                    u'first_author':
                    u'Ishak, Mustapha',
                    u'abstract':
                    u'We review recent developments and results in testing general relativity (GR) at cosmological scales. The subject has witnessed rapid growth during the last two decades with the aim of addressing the question of cosmic acceleration and the dark energy associated with it. However, with the advent of precision cosmology, it has also become a well-motivated endeavor by itself to test gravitational physics at cosmic scales. We overview cosmological probes of gravity, formalisms and parameterizations for testing deviations from GR at cosmological scales, selected modified gravity (MG) theories, gravitational screening mechanisms, and computer codes developed for these tests. We then provide summaries of recent cosmological constraints on MG parameters and selected MG models. We supplement these cosmological constraints with a summary of implications from the recent binary neutron star merger event. Next, we summarize some results on MG parameter forecasts with and without astrophysical systematics that will dominate the uncertainties. The review aims at providing an overall picture of the subject and an entry point to students and researchers interested in joining the field. It can also serve as a quick reference to recent results and constraints on testing gravity at cosmological scales.',
                    u'links_data': [
                        u'{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1806.10122"}',
                        u'{"access": "open", "instances": "", "title": "", "type": "electr", "url": "https://doi.org/10.1007%2Fs41114-018-0017-4"}'
                    ],
                    u'year':
                    u'2019',
                    u'pubnote': [
                        u'Invited review article for Living Reviews in Relativity. 201 pages, 17 figures. Matches published version; doi:10.1007/s41114-018-0017-4'
                    ],
                    u'id':
                    u'15558883',
                    u'bibcode':
                    u'2019LRR....22....1I',
                    u'author': [u'Ishak, Mustapha'],
                    u'aff': [
                        u'Department of Physics, The University of Texas at Dallas, Richardson, TX, USA'
                    ],
                    u'esources': [u'EPRINT_HTML', u'EPRINT_PDF', u'PUB_HTML'],
                    u'issue':
                    u'1',
                    u'pub_raw':
                    u'Living Reviews in Relativity, Volume 22, Issue 1, article id. 1, <NUMPAGES>204</NUMPAGES> pp.',
                    u'pub':
                    u'Living Reviews in Relativity',
                    u'volume':
                    u'22',
                    u'doi': [u'10.1007/s41114-018-0017-4'],
                    u'keyword': [
                        u'Tests of relativistic gravity',
                        u'Theories of gravity', u'Modified gravity',
                        u'Cosmological tests', u'Post-Friedmann limit',
                        u'Gravitational waves',
                        u'Astrophysics - Cosmology and Nongalactic Astrophysics',
                        u'Astrophysics - Astrophysics of Galaxies',
                        u'General Relativity and Quantum Cosmology'
                    ],
                    u'title': [u'Testing general relativity in cosmology'],
                    u'citation_count':
                    18,
                    u'[citations]': {
                        u'num_citations': 18,
                        u'num_references': 928
                    },
                    u'property': [
                        u'ESOURCE', u'ARTICLE', u'REFEREED', u'PUB_OPENACCESS',
                        u'EPRINT_OPENACCESS', u'EPRINT_OPENACCESS',
                        u'OPENACCESS'
                    ],
                    u'page': [u'1']
                }]
            }
        }

        # stub out both the page fetch and the metadata lookup
        with patch.object(self.app, '_load_url', return_value=html) as loader, \
            patch.object(self.app, 'search_api', return_value=json) as searcher:
            assert self.app.build_static_page(msg)
            p = msg.get_value().decode('utf8')
            # metadata made it into the rendered page...
            assert u'og:image' in p
            assert u'We review recent' in p
            # ...and the dynamic-app leftovers were removed
            assert u'ADS requires JavaScript' not in p
            assert u'#app-container' not in p
def harvest_by_null(queue='priority-bumblebee',
                    max_num=-1,
                    **kwargs):
    """
    Process pages retrieved by a {'null': True} query to the ads api
    /v1/store/search endpoint.  This endpoint returns ~50 records per call.
    Passing null selects rows of pages where created==None.
    These rows include a target field which is a url without the protocol
    (e.g., //dev.adsabs.harvard.edu/abs/1997A%26A...326..950F/styles/img/favicon.ico or
    //devui.adsabs.harvard.edu/abs/doi:10.1117/12.2314185).

    :param: queue - where to send the claims
    :param: max_num - rough cap on the number of processed rows; -1 = unlimited

    :return: no return
    """

    url = app.conf.get('STORE_SEARCH_ENDPOINT', 'https://api.adsabs.harvard.edu/v1/store/search')
    params = {'null': True}

    i = 0           # rows examined across all API pages
    seen = set()    # ids already handled; guards against the API repeating itself
    # resume position persisted in the KeyValue table from the previous run
    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.null').first()
        if kv is not None:
            last_id = kv.value
        else:
            last_id = -1
    app.logger.info('harvest_by_null running with last_id of {}'.format(last_id))

    while True:
        # reuse the http ads api client with keep-alive connections and bearer token
        r = app._client.get(url, params=params)
        r.raise_for_status()
        j = i  # snapshot to detect a pass that made no progress
        for d in r.json():
            if app.iscachable(d['target']):
                # NOTE(review): tasks always go to task_priority_queue; the
                # ``queue`` argument is never consulted here -- confirm intent
                msg = TurboBeeMsg(target=d['target'],
                                  qid=d['qid'])

                app.logger.info('harvest_by_null queuing target {}'.format(d['target']))
                tasks.task_priority_queue.delay(msg)
            # advance the pagination cursor for the next API call
            params['last_id'] = d['id']
            last_id = d['id']
            # NOTE(review): a repeated id is detected only AFTER it was queued
            # again above, so a duplicate may be processed twice
            if d['id'] in seen:
                break
            seen.add(d['id'])
            i += 1

            # NOTE(review): `i > max_num` lets one extra row through and only
            # breaks the inner loop; the outer loop still runs until a pass
            # adds no new rows -- confirm this is the intended stop condition
            if max_num > 0 and i > max_num:
                break

        if j == i:
            break  # this pass added nothing new; we're done

    if i > 0:
        # persist the resume point for the next invocation
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.null').first()
            if kv is None:
                kv = KeyValue(key='last.null', value=last_id)
                session.add(kv)
            else:
                kv.value = last_id
            session.commit()

    app.logger.info('Done submitting {0} pages.'.format(i))
    print i, last_id  # Python 2 print statement (debug output)