def test_update_store(self):
    msg = TurboBeeMsg(qid='foo', value='bar')
    msg2 = TurboBeeMsg(qid='foo', value='bar')
    resp = MagicMock()
    resp.raise_for_status = lambda: 1
    with patch.object(self.app._client, 'post', return_value=resp) as post:
        r = self.app.update_store([msg, msg2])
        assert post.call_args[0][0] == u'https://api.adsabs.harvard.edu/v1/store/update'
        assert len(post.call_args[1]['files']) == 2
        assert post.call_args[1]['files']['0'] == '\n\x03fooR\x03bar'
def store(qid=None):
    if request.method == 'POST':
        # there might be many objects in there...
        msgs = []
        for _, fo in request.files.items():
            if not hasattr(fo, 'read'):
                continue  # not a file object
            # on error, we'll crash early; that's OK
            msg = TurboBeeMsg.loads('adsmsg.turbobee.TurboBeeMsg', fo.read())
            msgs.append(msg)
        # also read data posted the normal way
        for k, v in request.form.items():
            msg = TurboBeeMsg.loads('adsmsg.turbobee.TurboBeeMsg',
                                    base64.decodestring(v))
            msgs.append(msg)
        if not msgs:
            return jsonify({'msg': 'Empty stream, no messages were received'}), 400
        out = current_app.set_pages(msgs)
        if 'errors' in out:
            return jsonify(out), 400
        return jsonify(out), 200
    elif request.method == 'DELETE':
        with current_app.session_scope() as session:
            pages = session.query(Pages).options(
                load_only('qid')).filter_by(qid=qid).first()
            if not pages:
                return jsonify({'qid': qid, 'msg': 'Not found'}), 404
            qid = pages.qid
            session.delete(pages)
            try:
                session.commit()
            except exc.IntegrityError:
                session.rollback()
                return jsonify({'qid': qid, 'msg': 'Delete failed'}), 409
        return jsonify({'qid': qid, 'status': 'deleted'}), 200
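# A hedged client-side sketch of feeding the store endpoint above; the URL is
# the one asserted in test_update_store, everything else (function name, token
# handling) is illustrative only. Both ingestion channels the handler supports
# are shown: multipart file uploads and base64-encoded form fields.
import base64
import requests

def post_messages(msgs, url='https://api.adsabs.harvard.edu/v1/store/update',
                  token=None, as_form=False):
    """Serialize TurboBeeMsg objects and POST them to the store."""
    headers = {'Authorization': 'Bearer %s' % token} if token else {}
    # dump() returns (content_type, payload); the handler only needs the payload
    payloads = dict((str(i), m.dump()[1]) for i, m in enumerate(msgs))
    if as_form:
        # plain form fields are base64-decoded by the handler
        data = dict((k, base64.encodestring(v)) for k, v in payloads.items())
        r = requests.post(url, data=data, headers=headers)
    else:
        # file uploads are read raw
        r = requests.post(url, files=payloads, headers=headers)
    r.raise_for_status()
    return r.json()  # e.g. {'created': [...], 'updated': [...]}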
def test_proto_empty(self):
    msg = TurboBeeMsg()
    my_data = {
        'file_field': (StringIO(msg.dump()[1]), 'turbobee_msg.proto')
    }
    r = self.client.post(url_for('turbobee_app.store', qid='asdf'),
                         content_type='multipart/form-data',
                         data=my_data)
    self.assertEqual(r.status_code, 200)
    with self.app.session_scope() as session:
        assert len(session.query(Pages).all()) == 1
def submit(bibcode, queue='harvest-bumblebee',
           tmpl='https://ui.adsabs.harvard.edu/#abs/%(bibcode)s/abstract'):
    """Submits a bibcode for processing; queue and tmpl were referenced but
    undefined in the original signature. The tmpl default is an assumption
    based on the URL exercised in test_task_harvest_bumblebee."""
    msg = TurboBeeMsg(target=tmpl % {'bibcode': bibcode})
    if queue == 'harvest-bumblebee':
        tasks.task_harvest_bumblebee.delay(msg)
    elif queue == 'static-bumblebee':
        tasks.task_static_bumblebee.delay(msg)
    else:
        raise Exception('Unknown queue: %s' % queue)
def submit_url(url, queue='harvest-bumblebee'):
    """Submits a specific URL for processing."""
    msg = TurboBeeMsg(target=url)
    if queue == 'harvest-bumblebee':
        tasks.task_harvest_bumblebee.delay(msg)
    elif queue == 'static-bumblebee':
        tasks.task_static_bumblebee.delay(msg)
    else:
        raise Exception('Unknown queue: %s' % queue)
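# Hedged usage sketch for submit()/submit_url(); the bibcodes are the ones the
# tests in this file exercise, the wrapper function itself is illustrative only.
def submit_examples():
    for bibcode in ['2019MNRAS.482.1872B', '2019LRR....22....1I']:
        submit(bibcode)  # renders the abstract page via harvest-bumblebee
    # arbitrary URLs bypass the bibcode template
    submit_url('https://ui.adsabs.harvard.edu/#abs/2019LRR....22....1I/abstract',
               queue='harvest-bumblebee')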
def test_task_harvest_bumblebee(self):
    with patch.object(tasks.app, '_load_url',
                      return_value='<html><head>foo</head> 2019MNRAS.482.1872B </html>') as loader, \
            patch.object(tasks.task_output_results, 'delay') as next_task:
        msg = TurboBeeMsg(target='2019MNRAS.482.1872B')
        tasks.task_harvest_bumblebee(msg)
        self.assertEqual(
            loader.call_args[0],
            ('https://ui.adsabs.harvard.edu/#abs/2019MNRAS.482.1872B/abstract', ))
        self.assertEqual(
            msg.target,
            '//ui.adsabs.harvard.edu/abs/2019MNRAS.482.1872B/abstract')
        self.assertTrue(next_task.called)
        self.assertTrue(msg.updated.seconds > 0)
        self.assertTrue(msg.expires.seconds >= msg.updated.seconds + 24 * 60 * 60)
        self.assertTrue(msg.eol.seconds >= msg.updated.seconds + 24 * 60 * 60 * 30)
        self.assertTrue(msg.ctype == msg.ContentType.html)

        # a '#abs' url is fetched as-is; the target is normalized to its
        # protocol-less form
        msg = TurboBeeMsg(target='https://dev.adsabs.harvard.edu/#abs/foobar')
        tasks.task_harvest_bumblebee(msg)
        self.assertEqual(loader.call_args[0],
                         ('https://dev.adsabs.harvard.edu/#abs/foobar', ))
        self.assertEqual(msg.target, '//dev.adsabs.harvard.edu/abs/foobar')

        # a plain '/abs/' url is rewritten to the '#abs' form before fetching
        msg = TurboBeeMsg(target='https://dev.adsabs.harvard.edu/abs/foobar')
        tasks.task_harvest_bumblebee(msg)
        self.assertEqual(loader.call_args[0],
                         ('https://dev.adsabs.harvard.edu/#abs/foobar', ))
        self.assertEqual(msg.target, '//dev.adsabs.harvard.edu/abs/foobar')

        # and so is a protocol-less url
        msg = TurboBeeMsg(target='//dev.adsabs.harvard.edu/abs/foobar')
        tasks.task_harvest_bumblebee(msg)
        self.assertEqual(loader.call_args[0],
                         ('https://dev.adsabs.harvard.edu/#abs/foobar', ))
        self.assertEqual(msg.target, '//dev.adsabs.harvard.edu/abs/foobar')
def test_serializer(self):
    # check we are not moving dates (by losing nanosec precision)
    rec = TurboBeeMsg()
    now = datetime.utcnow()
    rec.created = rec.get_timestamp(now)
    for i in xrange(10050):
        rec = rec.loads(*rec.dump())
    self.assertEqual(rec.get_datetime(rec.created), now)
def test_connection_error(self):
    msg = TurboBeeMsg(qid='foo', value='bar', target='http://www.google.com')
    # point the scraper at an unreachable endpoint to force a failure
    self.app.conf['PUPPETEER_ENDPOINT'] = 'http://localhost:30012222/scrape'
    try:
        self.app.harvest_webpage(msg)
    except ConnectionError:
        pass
def test_files_form_post(self):
    """Check we can send data through both channels."""
    msg = TurboBeeMsg(value=u'\u6789'.encode('utf8'))
    msg2 = TurboBeeMsg(value=u'\u6789'.encode('utf8'))
    r = self.client.post(url_for('turbobee_app.store'),
                         content_type='multipart/form-data',
                         data={
                             'foo': base64.encodestring(msg.dump()[1]),
                             'bar': (StringIO(msg2.dump()[1]), 'turbobee_msg.proto'),
                         })
    assert len(r.json['created']) == 2
    msgs = list(self.app.get_pages(r.json['created']))
    for m in msgs:
        assert m['content'].decode('utf8') == u'\u6789'
def test_store_post(self):
    msg = TurboBeeMsg()
    now = dt.datetime.utcnow()
    msg.created = msg.get_timestamp(now)
    msg.updated = msg.get_timestamp(now)
    msg.expires = msg.get_timestamp(now)
    msg.eol = msg.get_timestamp(now)
    msg.set_value('hello world')
    msg.ctype = msg.ContentType.html
    msg.target = 'https:///some.com'
    msg.owner = 234

    my_data = {
        'file_field': (StringIO(msg.dump()[1]), 'turbobee_msg.proto')
    }
    r = self.client.post(url_for('turbobee_app.store', qid='asdf'),
                         content_type='multipart/form-data',
                         data=my_data)
    self.assertEqual(r.status_code, 200)
    assert len(r.json['created']) == 1

    msg2 = msg.loads(*msg.dump())
    msg.qid = r.json['created'][0]
    r = self.client.post(url_for('turbobee_app.store'),
                         content_type='multipart/form-data',
                         data={
                             'foo': (StringIO(msg.dump()[1]), 'turbobee_msg.proto'),
                             'bar': (StringIO(msg2.dump()[1]), 'turbobee_msg.proto'),
                         })
    self.assertEqual(r.status_code, 200)
    assert len(r.json['created']) == 1
    assert len(r.json['updated']) == 1

    r = self.client.get(url_for('turbobee_app.store_get', qid=msg.qid))
    self.assertEqual(r.status_code, 200)
    r = self.client.head(url_for('turbobee_app.store_get', qid=msg.qid))
    self.assertEqual(r.status_code, 200)
    r = self.client.get(url_for('turbobee_app.store_get', qid='foo'))
    self.assertEqual(r.status_code, 404)
    r = self.client.head(url_for('turbobee_app.store_get', qid='foo'))
    self.assertEqual(r.status_code, 404)
def test_is_valid(self):
    rec = TurboBeeMsg()
    rec.qid = 'foo'
    self.assertTrue(rec.is_valid())
    rec.qid = 'unlimiteeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeed' * 512
    self.assertTrue(rec.is_valid())

    rec.status = Status.new
    now = datetime.utcnow()
    rec.created = rec.get_timestamp(now)
    self.assertEqual(now, rec.get_datetime(rec.created))

    rec.value = 'foobar'
    rec.ctype = rec.ContentType.text
    rec.ctype = 0  # unknown
    self.assertTrue(rec.is_valid())
    rec.set_value(1)
    self.assertTrue(rec.is_valid())

    rec.set_value(u'\ud789')
    rec.ctype = rec.ContentType.text
    self.assertEqual(rec.get_value(), u'\ud789'.encode('utf8'))
    rec.ctype = rec.ContentType.binary
    self.assertEqual(rec.get_value(), u'\ud789'.encode('utf8'))
    rec.set_value({'foo': u'\ud789'}, rec.ContentType.json)
    self.assertEqual(rec.get_value(), {'foo': u'\ud789'})
    rec.set_value(u'\ud789', rec.ContentType.text)
    self.assertEqual(rec.get_value(), u'\ud789'.encode('utf8'))
def test_set_get_pages(self):
    msg = TurboBeeMsg()
    now = datetime.utcnow()
    msg.created = msg.get_timestamp(now)
    msg.updated = msg.get_timestamp(now)
    msg.expires = msg.get_timestamp(now)
    msg.eol = msg.get_timestamp(now)
    msg.set_value('hello world')
    msg.ctype = msg.ContentType.html
    msg.target = 'https:///some.com'
    msg.owner = 234

    r = self.app.set_pages([msg])
    assert 'created' in r
    assert len(r['created']) == 1

    pages = list(self.app.get_pages(r['created']))
    expected = {
        'id': 1,
        'target': u'https:///some.com',
        'content_type': u'application/html',
        'content': 'hello world',
        'created': get_date(now).isoformat(),
        'updated': get_date(now).isoformat(),
        'expires': get_date(now).isoformat(),
        'lifetime': get_date(now).isoformat(),
        'owner': 234,
        'qid': pages[0]['qid']
    }
    assert pages[0] == expected

    msg.qid = pages[0]['qid']
    r = self.app.set_pages([msg])
    assert 'updated' in r
    assert len(r['updated']) == 1
    assert r['updated'][0] == expected['qid']

    msg.status = Status.deleted
    r = self.app.set_pages([msg])
    assert 'deleted' in r
    assert r['deleted'][0] == expected['qid']
    r = self.app.set_pages([msg])
    assert r['ignored-deleted'][0] == expected['qid']
    assert len(list(self.app.get_pages(expected['qid']))) == 0

    # insert it again
    msg.status = Status.active
    r = self.app.set_pages([msg])
    assert r['created'][0]
    assert r['created'][0] != expected['qid']

    l = list(self.app.get_pages(r['created'],
                                fields=['foo', 'qid', 'content', 'created']))
    assert l[0]['qid'] == r['created'][0]
    assert l[0]['created'] == expected['created']
    assert l[0]['content'] == 'hello world'
    assert 'updated' not in l[0]
    assert 'foo' not in l[0]

    # set multiple objects at once
    msg.qid = r['created'][0]
    msg2 = msg.loads(*msg.dump())
    msg2.qid = ''
    r = self.app.set_pages([msg, msg2])
    assert r['created'][0]
    assert r['updated'][0] == msg.qid

    # update one by one
    msg2.qid = r['created'][0]
    r = self.app.set_pages([msg, msg2], one_by_one=True)
    assert msg.qid in r['updated']
    assert msg2.qid in r['updated']
    r = self.app.set_pages([msg, msg2, msg, msg, msg], one_by_one=True)
    assert set(r['updated']) == set([msg.qid, msg2.qid])
def _retrieve_abstract_template(self, url):
    msg = TurboBeeMsg(target=url)
    i = 0
    parts = self._parse_bbb_url(url)
    while not self.harvest_webpage(msg) and i < 3:
        self.logger.warn('Retrying to fetch: ' + url)
        i += 1
    html = msg.get_value()
    html = html.decode('utf8')

    # some basic checks
    if 'data-widget="ShowAbstract"' not in html:
        raise Exception(
            'Cannot process fetched page, no ShowAbstract data-widget for {}'
            .format(url))

    # TODO: find the sections and replace them with symbolic names {tags}, {abstract}....

    # walk back from the first 'data-highwire' attribute to the opening '<'...
    x = html.find('data-highwire')
    while x > 0:
        x -= 1
        if html[x] == '<':
            break
    # ...and forward from the last occurrence to the closing '>'
    end = html.find('data-highwire', x)
    while html.find('data-highwire', end + 1) > 0:
        end = html.find('data-highwire', end + 1)
    while html[end] != '>':
        end += 1
    if end == -1 or x == 0:
        raise Exception("Cannot find tags section")
    html = html[0:x] + '{{tags}}' + html[end + 1:]

    # replace the body of the <article> element with the {{abstract}} placeholder
    x = html.find('<article')
    end = html.find('</article')
    if x == -1 or end == -1:
        raise Exception(
            'Cannot process fetched page, cannot find abstract section for {}'
            .format(url))
    while x < len(html) and x < end:
        x += 1
        if html[x] == '>':
            x += 1
            break
    if x > end:
        raise Exception(
            'Cannot process fetched page, cannot find abstract section for {}'
            .format(url))
    html = html[0:x] + '{{abstract}}' + html[end:]

    if 'bibcode' in parts and parts['bibcode']:
        html = html.replace(parts['bibcode'], u'{{bibcode}}')

    # finally, cut out the noscript warning that says javascript is required
    x = html.find('id="noscriptmsg"')  # div id within noscript
    if x > -1:
        start = html.rfind('<noscript', 0, x)
        end = html.find('</noscript>', start) + len('</noscript>')
        if end > start:
            html = html[:start] + html[end:]
    # and the first noscript that includes a style; it hides the abstract div
    x = html.find('<noscript>')
    if x > -1:
        offset = x + len('<noscript>')
        if html[offset:].strip().startswith('<style>'):
            end = html.find('</noscript>') + len('</noscript>')
            if end > x:
                html = html[:x] + html[end:]
    return html
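# A minimal sketch of how the template produced above might be rendered for a
# concrete record; render_abstract_page and its arguments are hypothetical
# stand-ins, only the {{tags}}/{{abstract}}/{{bibcode}} placeholders come from
# _retrieve_abstract_template itself.
def render_abstract_page(template, bibcode, abstract_html, tags_html):
    """Fill the placeholders cut out by _retrieve_abstract_template."""
    page = template.replace(u'{{tags}}', tags_html)
    page = page.replace(u'{{abstract}}', abstract_html)
    return page.replace(u'{{bibcode}}', bibcode)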
def test_build_abstract_page(self):
    msg = TurboBeeMsg(
        target='https://ui.adsabs.harvard.edu/#abs/2019LRR....22....1I/abstract')
    html = ''
    with open(os.path.dirname(__file__) + '/abs.html', 'r') as f:
        html = f.read()
    json = {
        u'responseHeader': {
            u'status': 0,
            u'QTime': 12,
            u'params': {
                u'x-amzn-trace-id': u'Root=1-5c89826f-96274e0a2ef0646da1cdc944',
                u'rows': u'10',
                u'q': u'identifier:"2019LRR....22....1I"',
                u'start': u'0',
                u'wt': u'json',
                u'fl': u'[citations],abstract,aff,author,bibcode,citation_count,comment,data,doi,esources,first_author,id,isbn,issn,issue,keyword,links_data,page,property,pub,pub_raw,pubdate,pubnote,read_count,title,volume,year'
            }
        },
        u'response': {
            u'start': 0,
            u'numFound': 1,
            u'docs': [{
                u'read_count': 514,
                u'pubdate': u'2019-12-00',
                u'first_author': u'Ishak, Mustapha',
                u'abstract': u'We review recent developments and results in testing general relativity (GR) at cosmological scales. The subject has witnessed rapid growth during the last two decades with the aim of addressing the question of cosmic acceleration and the dark energy associated with it. However, with the advent of precision cosmology, it has also become a well-motivated endeavor by itself to test gravitational physics at cosmic scales. We overview cosmological probes of gravity, formalisms and parameterizations for testing deviations from GR at cosmological scales, selected modified gravity (MG) theories, gravitational screening mechanisms, and computer codes developed for these tests. We then provide summaries of recent cosmological constraints on MG parameters and selected MG models. We supplement these cosmological constraints with a summary of implications from the recent binary neutron star merger event. Next, we summarize some results on MG parameter forecasts with and without astrophysical systematics that will dominate the uncertainties. The review aims at providing an overall picture of the subject and an entry point to students and researchers interested in joining the field. It can also serve as a quick reference to recent results and constraints on testing gravity at cosmological scales.',
                u'links_data': [
                    u'{"access": "open", "instances": "", "title": "", "type": "preprint", "url": "http://arxiv.org/abs/1806.10122"}',
                    u'{"access": "open", "instances": "", "title": "", "type": "electr", "url": "https://doi.org/10.1007%2Fs41114-018-0017-4"}'
                ],
                u'year': u'2019',
                u'pubnote': [
                    u'Invited review article for Living Reviews in Relativity. 201 pages, 17 figures. Matches published version; doi:10.1007/s41114-018-0017-4'
                ],
                u'id': u'15558883',
                u'bibcode': u'2019LRR....22....1I',
                u'author': [u'Ishak, Mustapha'],
                u'aff': [
                    u'Department of Physics, The University of Texas at Dallas, Richardson, TX, USA'
                ],
                u'esources': [u'EPRINT_HTML', u'EPRINT_PDF', u'PUB_HTML'],
                u'issue': u'1',
                u'pub_raw': u'Living Reviews in Relativity, Volume 22, Issue 1, article id. 1, <NUMPAGES>204</NUMPAGES> pp.',
                u'pub': u'Living Reviews in Relativity',
                u'volume': u'22',
                u'doi': [u'10.1007/s41114-018-0017-4'],
                u'keyword': [
                    u'Tests of relativistic gravity', u'Theories of gravity',
                    u'Modified gravity', u'Cosmological tests',
                    u'Post-Friedmann limit', u'Gravitational waves',
                    u'Astrophysics - Cosmology and Nongalactic Astrophysics',
                    u'Astrophysics - Astrophysics of Galaxies',
                    u'General Relativity and Quantum Cosmology'
                ],
                u'title': [u'Testing general relativity in cosmology'],
                u'citation_count': 18,
                u'[citations]': {
                    u'num_citations': 18,
                    u'num_references': 928
                },
                u'property': [
                    u'ESOURCE', u'ARTICLE', u'REFEREED', u'PUB_OPENACCESS',
                    u'EPRINT_OPENACCESS', u'EPRINT_OPENACCESS', u'OPENACCESS'
                ],
                u'page': [u'1']
            }]
        }
    }
    with patch.object(self.app, '_load_url', return_value=html) as loader, \
            patch.object(self.app, 'search_api', return_value=json) as searcher:
        assert self.app.build_static_page(msg)
        p = msg.get_value().decode('utf8')
        assert u'og:image' in p
        assert u'We review recent' in p
        assert u'ADS requires JavaScript' not in p
        assert u'#app-container' not in p
def harvest_by_null(queue='priority-bumblebee', max_num=-1, **kwargs):
    """
    Process bibcodes retrieved by a {'null': True} query against the ads api
    /v1/store/search endpoint. This endpoint returns ~50 records. Passing null
    selects rows of pages where created == None. These rows include a target
    field, which is a url without the protocol (e.g.,
    //dev.adsabs.harvard.edu/abs/1997A%26A...326..950F/styles/img/favicon.ico
    or //devui.adsabs.harvard.edu/abs/doi:10.1117/12.2314185).

    :param queue: where to send the claims
    :param max_num: maximum number of pages to queue (-1 for unlimited)
    :return: no return
    """
    url = app.conf.get('STORE_SEARCH_ENDPOINT',
                       'https://api.adsabs.harvard.edu/v1/store/search')
    params = {'null': True}
    i = 0
    seen = set()

    with app.session_scope() as session:
        kv = session.query(KeyValue).filter_by(key='last.null').first()
        if kv is not None:
            last_id = kv.value
        else:
            last_id = -1
    app.logger.info('harvest_by_null running with last_id of {}'.format(last_id))

    while True:
        # reuse the http ads api client with keep-alive connections and bearer token
        r = app._client.get(url, params=params)
        r.raise_for_status()
        j = i
        for d in r.json():
            # stop as soon as the endpoint starts repeating itself
            if d['id'] in seen:
                break
            seen.add(d['id'])
            # advance the cursor over every record, cachable or not
            params['last_id'] = d['id']
            last_id = d['id']
            if app.iscachable(d['target']):
                msg = TurboBeeMsg(target=d['target'], qid=d['qid'])
                app.logger.info('harvest_by_null queuing target {}'.format(d['target']))
                tasks.task_priority_queue.delay(msg)
            i += 1
            if max_num > 0 and i > max_num:
                break
        # stop when a batch yields nothing new, or the cap was reached
        if j == i or (max_num > 0 and i > max_num):
            break

    if i > 0:
        with app.session_scope() as session:
            kv = session.query(KeyValue).filter_by(key='last.null').first()
            if kv is None:
                kv = KeyValue(key='last.null', value=last_id)
                session.add(kv)
            else:
                kv.value = last_id
            session.commit()

    app.logger.info('Done submitting {0} pages.'.format(i))
    print i, last_id
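# Hedged usage sketch: harvest_by_null reads like a run.py style maintenance
# command; a hypothetical argparse driver for it might look like this (the
# flag names and defaults are assumptions, not taken from the project).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='queue store pages whose created field is still null')
    parser.add_argument('--queue', default='priority-bumblebee')
    parser.add_argument('--max-num', type=int, default=-1,
                        help='stop after this many pages; -1 for unlimited')
    args = parser.parse_args()
    harvest_by_null(queue=args.queue, max_num=args.max_num)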