def retrieve_page(url, add_if_not_found=True):
    """Fetch (or create) the Page record for *url* and queue follow-up tasks.

    Runs inside a Flask test request context because this executes in a
    Celery worker, not an ordinary Flask request.

    :param url: URL of the page to retrieve.
    :param add_if_not_found: when True, create the Page record if it does
        not already exist; otherwise return a "not found" response.
    """
    # FIX: parenthesized print works on both Python 2 and 3 (original used
    # a py2-only print statement).
    print("retrieving Page for ....%s" % (url,))
    with app.test_request_context('/'):
        # Adjust for the fact that we are in Celery context and not Flask context.
        app.preprocess_request()
        if Page.get_total_num_of_pages() > app.config['MAX_CORPUS_SIZE']:
            # Corpus cap reached — stop crawling.
            # celery.control.broadcast('shutdown') was used earlier, but that
            # shuts down celery entirely: it stops Page population AND
            # Extraction population, and there has to be a one-to-one between
            # pages and extractions. Therefore we only stop consuming from the
            # "retrieve" queue (the queue retrieve_page is configured to);
            # the other dependent tasks are on a different queue and go through.
            celery.control.cancel_consumer("retrieve")
            # celery.control.add_consumer("retrieve") must be issued before
            # retrieve_page tasks are dispatched again.
            return
        page = Page.get_page_by_url(url)
        if page is None:
            if add_if_not_found:
                page = Page.add_page(url)
            else:
                return pagenotfound()
        retrieve_extraction.delay(page.id)
        find_links.delay(page.id)
        # Extraction dispatch was once disabled here because the
        # boilerpipe_extract_and_populate task was getting overwhelmed by how
        # fast page population grew; the interim approach was: populate 1000
        # pages first, then stop page population and start extraction.
        # FIX: the original ended with an UNTERMINATED triple-quoted string
        # (a syntax error); the dead REST-API snippet it held is kept as a
        # comment instead:
        #   r = requests.get("http://127.0.0.1:5000/pages", params={"url": url})
def find_links(page_id):
    """Queue a retrieve_page task for every link found on the given page."""
    # Establish a Flask request context: this runs inside a Celery worker,
    # not a normal Flask request.
    with app.test_request_context('/'):
        app.preprocess_request()
        page = Page.find_links(page_id)
        if page is None:
            return pagenotfound()
        for url in page.links:
            retrieve_page.delay(url)
def test_check_rate_request_is_required(self):
    """check_rate_request must reject a query string that lacks `origin`."""
    query = '/?destination=NOGJM&date_from=2016-01-01&date_to=2016-01-30'
    with app.test_request_context(query):
        app.preprocess_request()
        try:
            check_rate_request()
        except UnprocessableEntity as err:
            assert err.description.lower() == "origin is required"
        else:
            assert False  # expected UnprocessableEntity
def test_check_rate_request_success(self):
    """A fully-populated query string parses into its four components."""
    query = '/?origin=CNNBO&destination=NOGJM&date_from=2016-01-01&date_to=2016-01-30'
    with app.test_request_context(query):
        app.preprocess_request()
        origin, destination, date_from, date_to = check_rate_request()
        expected = ("CNNBO", "NOGJM", "2016-01-01", "2016-01-30")
        assert (origin, destination, date_from, date_to) == expected
def test_hook(self):
    """LocalStorage should hold a 'nowtime' entry stamped within the current hour."""
    with app4.test_request_context("/"):
        app4.preprocess_request()
        storage = LocalStorage()
        self.assertTrue("nowtime" in storage.list)
        # Compare only down to the hour to avoid flakiness at minute boundaries.
        current_hour = time.strftime("%Y-%m-%d %H:", time.localtime(time.time()))
        self.assertIn(current_hour, storage.get("nowtime"))
        del storage["nowtime"]
def test_check_rate_request_is_date_range(self):
    """A date_to earlier than date_from must be rejected."""
    query = '/?origin=CNNBO&destination=NOGJM&date_from=2016-01-31&date_to=2016-01-20'
    with app.test_request_context(query):
        app.preprocess_request()
        try:
            check_rate_request()
        except UnprocessableEntity as err:
            assert err.description.lower() == 'to_date should be higher than from'
        else:
            assert False  # expected UnprocessableEntity
def test_d_del(self):
    """Deleting accommodation 1 succeeds for a logged-in user."""
    with app.test_request_context("/c2c/del_one_accommodation/1"):
        user_id = self.login()
        app.preprocess_request()
        if not user_id:
            return False
        response = self.app.post('/c2c/del_one_accommodation/1')
        payload = response.get_json()
        print("accommodation return ", payload, "\n")
        assert payload['success']
def test_c_userinfo(self):
    """Fetching /c2c/userinfo after login must not include a 'success' key."""
    with app.test_request_context("/c2c/userinfo"):
        credentials = {'email': "*****@*****.**", 'password': '******'}
        self.app.post('/c2c/login', json=credentials)
        app.preprocess_request()
        response = self.app.get('/c2c/userinfo')
        payload = response.get_json()
        print("userinfo return ", payload, "\n")
        assert "success" not in payload
def test_check_rate_request_is_date_formatted(self):
    """A date that is not yyyy-mm-dd formatted must be rejected."""
    wrong_date = '2016-31-01'
    query = '/?origin=CNNBO&destination=NOGJM&date_from=%s&date_to=2016-01-30' % wrong_date
    with app.test_request_context(query):
        app.preprocess_request()
        try:
            check_rate_request()
        except UnprocessableEntity as err:
            expected = '%s should formatted in yyyy-mm-dd' % wrong_date
            assert err.description.lower() == expected
        else:
            assert False  # expected UnprocessableEntity
def test_e_passwordchange(self):
    """Changing the password after login returns success."""
    with app.test_request_context("/c2c/userinfo"):
        credentials = {'email': "*****@*****.**", 'password': '******'}
        self.app.post('/c2c/login', json=credentials)
        app.preprocess_request()
        response = self.app.post('/c2c/changepassword', json={'password': "******"})
        payload = response.get_json()
        print("password change return ", payload, "\n")
        assert payload['success']
def test_d_userupdate(self):
    """Updating profile fields after login returns success."""
    with app.test_request_context("/c2c/userinfo"):
        credentials = {'email': "*****@*****.**", 'password': '******'}
        self.app.post('/c2c/login', json=credentials)
        app.preprocess_request()
        profile = {
            'nickname': 'testuser',
            'phone': "111111",
            "name": "bbbn",
            "id_card": "3422712732376",
        }
        response = self.app.post('/c2c/userupdate', json=profile)
        payload = response.get_json()
        print("userupdate return ", payload, "\n")
        assert payload['success']
def setUp(self):
    """Open a nested transaction per test and optionally push a request context.

    The cleanup rolls the transaction back so each test sees a pristine
    database state.
    """
    def _teardown():
        # Undo everything the test wrote, then drop the request context.
        self.current_transaction.rollback()
        if self.app_context:
            self.app_context.pop()

    self.addCleanup(_teardown)
    self.current_transaction = self.connection.begin_nested()
    # Bind the session to this test's connection so rollback reverts it all.
    db.session = orm.scoped_session(orm.sessionmaker(bind=self.connection))
    if self.CREATE_DEFAULT_APP_CONTEXT:
        self.app_context = app.test_request_context('/testing')
        self.app_context.push()
        app.preprocess_request()
    else:
        self.app_context = None
def retrieve_extraction(page_id, add_if_not_found=True):
    """Fetch (or create) the Extraction record for *page_id*.

    Runs inside a Flask test request context because this executes in a
    Celery worker, not an ordinary Flask request.

    :param page_id: id of the Page whose extraction to retrieve.
    :param add_if_not_found: when True, create the Extraction record if it
        does not already exist; otherwise return a "not found" response.
    """
    # FIX: parenthesized print works on both Python 2 and 3 (original used
    # a py2-only print statement).
    print("retrieving Extraction for ....%s" % (page_id,))
    with app.test_request_context('/'):
        # Adjust for the fact that we are in Celery context and not Flask context.
        app.preprocess_request()
        extraction = Extraction.get_extraction_by_page_id(page_id)
        if extraction is None:
            if add_if_not_found:
                extraction = Extraction.add_extraction(page_id)
            else:
                # BUG FIX: the original did `return extractionnotfound`,
                # returning the function object instead of calling it
                # (the sibling retrieve_page correctly calls pagenotfound()).
                return extractionnotfound()
        # Boilerpipe extraction is NOT done here — we only create the
        # Extraction row; a separate process runs boilerpipe using
        # page.id and extraction.id.
        # boilerpipe_extract_and_populate.delay(page_id, extraction.id)
        # FIX: the original ended with an UNTERMINATED triple-quoted string
        # (a syntax error); the dead REST-API snippet it held is kept as a
        # comment instead:
        #   rExt = requests.get("http://127.0.0.1:5000/extractions",
        #                       params={"page_id": page_id})
def test_a_publish(self):
    """Publishing a new accommodation succeeds for a logged-in user."""
    with app.test_request_context("/c2c/userinfo"):
        user_id = self.login()
        app.preprocess_request()
        if not user_id:
            return False
        listing = {
            'acc_address': 'weihai',
            'acc_capacity': 300,
            'acc_price': "6000",
            'acc_city': 1,
            'acc_description': "nothing",
            'acc_user_id': 3,
            'acc_type_id': 1,
        }
        response = self.app.post('/c2c/accommodation/add', json=listing)
        payload = response.get_json()
        print("accommodation return ", payload, "\n")
        assert payload['success']
def boilerpipe_extract_and_populate(page_id=None, ext_id=None):
    """Trigger boilerpipe extraction for every page via the REST endpoint.

    NOTE: despite the parameters, the current implementation ignores
    page_id/ext_id and sweeps ALL pages (the single-page variants are
    kept below as comments).
    """
    # FIX: parenthesized print works on both Python 2 and 3 (original used
    # a py2-only print statement).
    print("extracting using boilerpipe...")
    # For some reason directly calling the static method was not working:
    #   with app.test_request_context('/'):
    #       app.preprocess_request()
    #       BoilerpipeExtraction.extract_content(page_id, ext_id)
    # Therefore we call the REST API instead. Single-page variant (unused):
    #   return requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s"
    #                       % (page_id, ext_id))
    # approach 2:
    with app.test_request_context('/'):
        # Adjust for the fact that we are in Celery context and not Flask context.
        app.preprocess_request()
        for page in Page.get_all_pages():
            if page is None:
                continue
            extraction = Extraction.get_extraction_by_page_id(page.id)
            # NOTE(review): assumes every page already has an extraction row
            # (extraction is not None-checked here) — confirm upstream invariant.
            requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s" % (page.id, extraction.id))
    return
def exposed_fetch_unmapped_qidian_items():
    '''
    Scan the RSS feed items of feed 2578 for releases whose series is not
    yet mapped, and print candidate id/title entries for the missing ones.
    '''
    from app import app
    from flask import g

    # with app.app_context():
    with app.test_request_context(""):
        app.preprocess_request()
        print("Querying for rss feed items.")
        # Feed id 2578 is hard coded for this specific database.
        releases = g.session.query(db.RssFeedPost) \
            .filter(db.RssFeedPost.feed_id == 2578) \
            .all()

        print("Processing items")
        urls = [post.contenturl for post in releases]

        # Group releases by their series URL (first five path segments).
        relmap = {}
        for post in releases:
            if "/rssbook/" in post.contenturl:
                continue
            series_url = "/".join(post.contenturl.split("/")[:5]) + "/"
            relmap.setdefault(series_url, []).append(post)
        print("Fetched %s urls, %s distinct series" % (len(urls), len(relmap)))

        # Keep only the oldest (lowest-id) release of each series.
        for grouped in relmap.values():
            grouped.sort(key=lambda x: x.id)
        truncated_releases = [grouped[0] for grouped in relmap.values()]
        print("Truncated releases: %s" % len(truncated_releases))

        items = proto_process_releases(truncated_releases)
        print("Processing resulted in %s feed items" % len(items['missed']))

        feed_urls = [missed[1]['linkUrl'] for missed in items['missed']]
        trimmed = ["/".join(link.split("/")[:5]) + "/" for link in feed_urls]
        new_series_urls = list(set(trimmed))
        print("Releases consolidated to %s distinct series" % len(new_series_urls))

        # Titles that indicate a test/placeholder page rather than a series.
        bad_names = [
            '12testett11223355',
            'webnovel test003',
            'www.webnovel.com',
        ]

        wg = WebRequest.WebGetRobust()
        for url in new_series_urls:
            meta = common.management.util.get_page_title(wg, url)
            if any([bad in meta['title'] for bad in bad_names]):
                continue
            print('Missing: "%s" %s: "%s",' % (url, " " * (50 - len(url)), meta))
            # The series id is the last non-empty path segment.
            segments = [seg for seg in url.split("/") if seg]
            itemid = segments[-1]
            print("'%s' : ('%s', '%s')," % (
                itemid,
                meta['title'].strip(),
                'oel' if 'is-orig' in meta and meta['is-orig'] else 'translated'))
# Interactive shell bootstrap: imports the app's models and pushes a Flask
# request context so ORM globals work from the REPL.
import os
import sys
import readline
import random
import datetime
from pprint import pprint as p
import ujson
# NOTE(review): wildcard imports shadow names unpredictably — narrow these
# to explicit names if possible.
from flask import *
from app import app, db
from app.orders.models import *
from app.taxi.models import *

# Drop into the interactive interpreter after the script body runs
# (equivalent to `python -i`).
os.environ["PYTHONINSPECT"] = "True"

# def flushall():
#     db.drop_all()
#     db.create_all()

# Add test request context
ctx = app.test_request_context()
ctx.push()
app.preprocess_request()