def retrieve_page(url, add_if_not_found=True):
    print "retrieving Page for ....%s" % (url)
    with app.test_request_context('/'):
        # this is to adjust for the fact that we are in a celery context and not a Flask context
        app.preprocess_request()
        if Page.get_total_num_of_pages() > app.config['MAX_CORPUS_SIZE']:
            # now we need to stop crawling
            # celery.control.broadcast('shutdown')
            # Earlier I had this, but it shuts down celery entirely: it stops Page population,
            # but it also stops Extraction population, and there has to be a one-to-one mapping
            # between pages and extractions. Therefore, we just stop consuming from the
            # "retrieve" queue, which is the queue app.tasks.retrieve_page is bound to.
            # The rest are on a different queue, so the other dependent tasks still go through.
            celery.control.cancel_consumer("retrieve")
            # celery.control.add_consumer("retrieve")  # must be issued before retrieve_page tasks are sent again
            return
        page = Page.get_page_by_url(url)
        if page is None:
            if add_if_not_found:
                # add a page
                page = Page.add_page(url)
            else:
                # just return
                return pagenotfound()
        retrieve_extraction.delay(page.id)
        find_links.delay(page.id)
        # retrieve_extraction.delay(page.id)
        # The line above had been commented out because the boilerpipe_extract_and_populate
        # task was getting overwhelmed: the page population was growing too fast.
        # New approach: first populate 1000 pages, then stop page population and start
        # the extraction process.
        # Using the REST API (earlier alternative):
        # r = requests.get("http://127.0.0.1:5000/pages", params={"url": url})
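
# Note: cancel_consumer("retrieve") above only pauses page population if
# retrieve_page really is routed to its own queue. The sketch below shows the
# kind of Celery routing configuration that implies; the queue names other than
# "retrieve" and the task module paths are assumptions for illustration, not the
# project's actual settings, and this would normally live in the Celery config
# module rather than here.
#
#     CELERY_ROUTES = {
#         'app.tasks.retrieve_page':       {'queue': 'retrieve'},
#         'app.tasks.find_links':          {'queue': 'crawl'},
#         'app.tasks.retrieve_extraction': {'queue': 'extract'},
#     }
#
# With routing like this, cancel_consumer("retrieve") stops only new page
# fetches, while find_links / retrieve_extraction keep draining their queues;
# add_consumer("retrieve") resumes crawling later.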
def find_links(page_id):
    with app.test_request_context('/'):
        # this is to adjust for the fact that we are in a celery context and not a Flask context
        app.preprocess_request()
        page = Page.find_links(page_id)
        if page is None:
            return pagenotfound()
        for link in page.links:
            retrieve_page.delay(link)
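
# Illustrative helper (an assumption, not part of the original module): shows how
# the crawl above is expected to be seeded. Each retrieved page enqueues
# find_links, which re-enqueues retrieve_page for every outgoing link, so a few
# seed URLs fan out until MAX_CORPUS_SIZE cuts the "retrieve" queue off.
def seed_crawl(seed_urls):
    for url in seed_urls:
        retrieve_page.delay(url)

# Example (hypothetical seed list):
#     seed_crawl(["http://example.com/"])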
def update(self, bp_content="", jt_content=""):
    # Verifies that the page referenced by self.page_id still exists, then stores the
    # supplied extraction results (bp_content, jt_content) and refreshes last_checked.
    page = Page.load(self.page_id)
    assert page is not None
    self.bp_content = bp_content
    self.jt_content = jt_content
    self.last_checked = datetime.now()
    self.store()
    return self
def extract_content(page_id, ext_id, htmlReturn=False):
    # htmlReturn=False: by default returns text content
    if not page_id or not ext_id:
        return badrequest()
    page = Page.get_page(page_id)
    if page is None:
        return documentnotfound()
    extraction = Extraction.get_extraction(ext_id)
    if extraction is None:
        return documentnotfound()
    original_content = page.content
    if not original_content:
        return nocontent()
    # boilerpipe runs on the JVM via jpype; a thread that did not start the JVM
    # must be attached before making Java calls.
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
    extractor = Extractor(extractor='DefaultExtractor', html=original_content)
    if not htmlReturn:
        bp_content = extractor.getText()
    else:
        bp_content = extractor.getHTML()
    if bp_content is None:
        return nocontent()
    extraction.update(bp_content=bp_content)
    return success()
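
# Illustrative sketch (not called anywhere in the crawler): the boilerpipe
# wrapper used above also ships other strategies, e.g. 'ArticleExtractor', which
# tends to work better than 'DefaultExtractor' on news-style pages. The function
# name and the choice of extractor here are assumptions.
def _boilerpipe_demo(html):
    # Same JVM-attachment dance as extract_content(), since this may run on a
    # thread that did not start the JVM.
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    # getText() returns the main content as plain text; getHTML() keeps its markup.
    return extractor.getText(), extractor.getHTML()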
def boilerpipe_extract_and_populate(page_id=None, ext_id=None):
    print "extracting using boilerpipe..."
    # For some reason the approach of directly calling the static method is not working:
    '''
    with app.test_request_context('/'):
        # this is to adjust for the fact that we are in a celery context and not a Flask context
        app.preprocess_request()
        BoilerpipeExtraction.extract_content(page_id, ext_id)
    '''
    # Therefore, switching to calling the REST API. This seems to be working.
    # Using the REST API, one page/extraction at a time:
    # return requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s" % (page_id, ext_id))
    # approach 2: sweep over all pages in a single task
    with app.test_request_context('/'):
        # this is to adjust for the fact that we are in a celery context and not a Flask context
        app.preprocess_request()
        for page in Page.get_all_pages():
            if page is not None:
                extraction = Extraction.get_extraction_by_page_id(page.id)
                requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s" % (page.id, extraction.id))
    return
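
# Illustrative sketch (an assumption, not part of the original module): ties the
# two phases together as described in retrieve_page() above. Once the corpus is
# large enough, stop consuming the "retrieve" queue and run the extraction sweep.
def _start_extraction_phase():
    celery.control.cancel_consumer("retrieve")
    boilerpipe_extract_and_populate.delay()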