def extract_content(page_id, ext_id, htmlReturn=False):
    """Run the extractor on a stored page and save the result on an extraction.

    Looks up the page and the extraction by id, feeds the page's content to
    ``Extractor`` (configured with ``'DefaultExtractor'``) and stores the
    output on the extraction through ``extraction.update(bp_content=...)``.

    Args:
        page_id: Identifier passed to ``Page.get_page``.
        ext_id: Identifier passed to ``Extraction.get_extraction``.
        htmlReturn: When false (the default) the extractor's text output is
            stored; when true its HTML output is stored instead.

    Returns:
        The result of ``badrequest()`` when either id is ``None`` or an empty
        string, ``documentnotfound()`` when the page or the extraction cannot
        be found, ``nocontent()`` when the page has no content or the
        extractor yields ``None``, and ``success()`` otherwise.
    """
    # The original `(x is None or "")` reduced to `x is None` because the
    # empty-string operand is always falsy, so "" ids were never rejected.
    if page_id in (None, "") or ext_id in (None, ""):
        return badrequest()

    page = Page.get_page(page_id)
    if page is None:
        return documentnotfound()

    extraction = Extraction.get_extraction(ext_id)
    if extraction is None:
        return documentnotfound()

    original_content = page.content
    # Truthiness replaces the identity test `is ""`, which only worked by
    # accident of string interning.
    if not original_content:
        return nocontent()

    # The current thread has to be attached to the JVM before the extractor
    # is used.
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()

    extractor = Extractor(extractor='DefaultExtractor', html=original_content)
    bp_content = extractor.getHTML() if htmlReturn else extractor.getText()

    if bp_content is None:
        # The original dropped this return value, stored None on the
        # extraction and then reported success.
        return nocontent()

    extraction.update(bp_content=bp_content)
    return success()
def __init__(self, url):
    """Attach the current thread to the JVM if needed and record ``url``.

    Args:
        url: Value stored in ``BoilerpipeWrapperWrapper.iUrl``.
    """
    already_attached = jpype.isThreadAttachedToJVM()
    if not already_attached:
        jpype.attachThreadToJVM()

    # NOTE(review): the URL is assigned on the name BoilerpipeWrapperWrapper
    # rather than on `self`; if that is the enclosing class, the value is
    # shared by every instance -- confirm this is intended.
    BoilerpipeWrapperWrapper.iUrl = url