Example #1
def retrieve_page(url, add_if_not_found=True):
    print "retrieving Page for ....%s" % (url)
    with app.test_request_context('/'): # this is to adjust for the fact that we are in celery content and not Flask context 
        app.preprocess_request()
    if Page.get_total_num_of_pages() > app.config['MAX_CORPUS_SIZE']:
        ### at this point we need to stop crawling
        # celery.control.broadcast('shutdown')
        # An earlier version used the broadcast above, but that shuts down Celery itself:
        # it stops Page population, but it also stops Extraction population, and there has
        # to be a one-to-one correspondence between pages and extractions.

        # Therefore, we just stop consuming from the "retrieve" queue, the queue to which
        # app.tasks.retrieve_page is bound. The remaining tasks run on a different queue,
        # so the dependent tasks still go through.
        celery.control.cancel_consumer("retrieve")
        # celery.control.add_consumer("retrieve")  # must be issued again before retrieve_page tasks can run
        
        return
    page = Page.get_page_by_url(url)
    if page is None:
        if add_if_not_found:  # add a page
            page = Page.add_page(url)
        else:  # just return
            return pagenotfound()
    retrieve_extraction.delay(page.id)
    find_links.delay(page.id)
    
    # retrieve_extraction.delay(page.id)
    # The duplicate call above was disabled for a while because the boilerpipe_extract_and_populate
    # task was getting overwhelmed: the page population was growing too fast.
    # New approach: first populate 1000 pages, then stop page population and start
    # the extraction process.

    # Using the REST API instead:
    # r = requests.get("http://127.0.0.1:5000/pages", params={"url": url})
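
The comments in retrieve_page pause the crawl by cancelling consumption from a dedicated "retrieve" queue. Below is a minimal sketch of how that queue routing might be configured and how crawling could be resumed afterwards, assuming a Celery app object named celery; the routing configuration and the resume_crawl helper are illustrative, not from the original project.

from celery import Celery
from kombu import Queue

celery = Celery("app")  # assumed application name

# Route retrieve_page to its own queue so it can be paused independently;
# extraction and link-finding tasks stay on the default "celery" queue.
celery.conf.task_queues = (Queue("celery"), Queue("retrieve"))
celery.conf.task_routes = {"app.tasks.retrieve_page": {"queue": "retrieve"}}

def resume_crawl():
    # mirror image of celery.control.cancel_consumer("retrieve") above:
    # workers start pulling from the "retrieve" queue again
    celery.control.add_consumer("retrieve")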
Example #2
def find_links(page_id):
    with app.test_request_context('/'):  # we are in a Celery worker, not a Flask request context
        app.preprocess_request()
    page = Page.find_links(page_id)
    if page is None: return pagenotfound()
    for link in page.links:
        retrieve_page.delay(link)
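
Every task here opens test_request_context('/') and calls preprocess_request() because Celery workers run outside Flask's request cycle. When a task needs only the application itself (config, database extensions) and not request data, Flask's plain application context can be enough; the variant below is a hypothetical sketch, and whether preprocess_request() can really be dropped depends on what the app's before-request hooks set up.

def find_links_in_app_context(page_id):  # hypothetical variant
    with app.app_context():  # application context only, no simulated request
        page = Page.find_links(page_id)
        if page is None:
            return pagenotfound()
        for link in page.links:
            retrieve_page.delay(link)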
Example #3
def update(self, bp_content="", jt_content=""):
    # sanity-check that the page referenced by page_id still exists, then
    # store the freshly extracted contents on this extraction record
    page = Page.load(self.page_id)
    assert page is not None
    self.bp_content = bp_content
    self.jt_content = jt_content
    self.last_checked = datetime.now()  # requires: from datetime import datetime
    self.store()
    return self
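
A sketch of how update() might be called, reusing names that appear in the other examples (Extraction.get_extraction and Extractor from Example #4, the page_id attribute from above); treat this as illustrative glue, not code from the original project.

from boilerpipe.extract import Extractor

extraction = Extraction.get_extraction(ext_id)     # ext_id assumed to be known
page = Page.get_page(extraction.page_id)
extractor = Extractor(extractor='DefaultExtractor', html=page.content)
extraction.update(bp_content=extractor.getText())  # stores the text and refreshes last_checked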
Example #4
def extract_content(page_id, ext_id, htmlReturn=False):  # htmlReturn=False: returns text content by default
    if not page_id or not ext_id: return badrequest()
    page = Page.get_page(page_id)
    if page is None: return documentnotfound()
    extraction = Extraction.get_extraction(ext_id)
    if extraction is None: return documentnotfound()
    original_content = page.content
    if not original_content: return nocontent()

    # boilerpipe runs on the JVM via jpype; each worker thread must attach first
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
    extractor = Extractor(extractor='DefaultExtractor', html=original_content)
    if not htmlReturn:
        bp_content = extractor.getText()
    else:
        bp_content = extractor.getHTML()
    if bp_content is None: return nocontent()

    extraction.update(bp_content=bp_content)
    return success()
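
The response helpers badrequest(), documentnotfound(), nocontent(), and success() are never defined in these examples. A plausible minimal implementation, assuming they wrap Flask JSON responses; the payload shapes and status codes are guesses.

from flask import jsonify

def badrequest():
    return jsonify(error="bad request"), 400

def documentnotfound():
    return jsonify(error="document not found"), 404

def nocontent():
    return "", 204  # a 204 response carries no body

def success():
    return jsonify(status="ok"), 200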
Example #5
def boilerpipe_extract_and_populate(page_id=None, ext_id=None):
    print "extracting using boilerpipe..."
    
    # For some reason, directly calling the static method (below) did not work:
    '''with app.test_request_context('/'):  # we are in a Celery worker, not a Flask request context
        app.preprocess_request()
    BoilerpipeExtraction.extract_content(page_id, ext_id)'''
    
    # Therefore, switching to calling the REST API instead. This seems to work:
    # return requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s" % (page_id, ext_id))
    
    # approach 2: iterate over all pages and hit the REST endpoint for each
    with app.test_request_context('/'):  # we are in a Celery worker, not a Flask request context
        app.preprocess_request()
    for page in Page.get_all_pages():
        if page is None:
            continue
        extraction = Extraction.get_extraction_by_page_id(page.id)
        if extraction is None:
            continue  # no extraction record for this page yet
        requests.get("http://127.0.0.1:5000/extractions/bp/%s,%s" % (page.id, extraction.id))
    return
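
Example #1 notes that boilerpipe_extract_and_populate was getting overwhelmed when page population grew quickly. Besides pausing population outright, Celery's built-in per-task rate limiting is a gentler lever; the sketch below assumes the same celery app object used elsewhere, and the rate value is arbitrary.

@celery.task(rate_limit="30/m")  # at most 30 extractions per minute, per worker
def boilerpipe_extract_and_populate_throttled(page_id=None, ext_id=None):
    # same body as boilerpipe_extract_and_populate above; Celery spaces out
    # task deliveries so the JVM-backed extractor is not flooded
    pass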