class ExtractionListAPI(Resource):
    """Collection endpoint for Extraction documents (mounted at '/extractions').

    Example requests (assumes the module-level ``parser`` exposes ``page_id``
    as a query/form argument -- confirm against the parser definition):

        curl http://localhost:5000/extractions
        curl http://localhost:5000/extractions?page_id=<page_id>
        curl http://localhost:5000/extractions -d "page_id=<page_id>" -X POST -v
    """

    def get(self):
        """Return all Extractions, or the one matching the given page_id.

        Returns:
            When ``page_id`` is missing or empty: a list holding the result
            of ``_to_json`` for every extraction (``{}`` for falsy entries).
            Otherwise: the result of ``_to_json`` for the extraction found
            for ``page_id``, or the bare value 400 when nothing was found.
        """
        args = parser.parse_args()
        page_id = args['page_id']

        if page_id is None or page_id == "":
            # Requested all Extractions. Build a real list rather than a
            # lazy map object so the result is serializable on Python 3 too.
            return [doc._to_json(doc.id) if doc else {}
                    for doc in Extraction.get_all_extractions()]

        # Requested Extractions filtered by one page_id.
        extraction = Extraction.get_extraction_by_page_id(page_id)
        # NOTE(review): the bare 400 is returned as the payload, not as a
        # status code; kept as-is for compatibility -- confirm the intent.
        return extraction._to_json(extraction.id) if extraction else 400

    def post(self):
        """Create a new Extraction for the given page_id.

        Returns:
            The result of ``_to_json`` for the extraction returned by
            ``Extraction.add_extraction``, or the bare value 400 when
            ``page_id`` is missing/empty or no extraction was returned.
        """
        args = parser.parse_args()
        page_id = args['page_id']

        if page_id is None or page_id == "":
            # NOTE(review): payload 400, not an HTTP status -- see get().
            return 400  # bad request

        extraction = Extraction.add_extraction(page_id)
        return extraction._to_json(extraction.id) if extraction else 400


# Register the Extraction document and its view definitions with the manager,
# sync against the app, then mount the extraction routes.
manager.add_document(Extraction)
manager.add_viewdef(Extraction.all_extractions)
manager.add_viewdef(Extraction.extraction_by_page_id)
manager.sync(app)

api.add_resource(ExtractionAPI, '/extractions/<string:_id>')
api.add_resource(ExtractionListAPI, '/extractions')
@staticmethod def get_by_domain(protocol, domain): r = RobotsTxt.robtxt_by_domian(key=[protocol, domain]) if len(r) > 0: # we have already crawled here and obtained RobotsTxt info. No need to request it again for row in r: doc = RobotsTxt.load(row.value) if doc.is_valid(): return doc else: # We are here for the first time, we need to get the RobotsTxt info and store it for current and future reference doc = RobotsTxt(protocol=protocol, domain=domain) doc.update() return doc manager.add_document(Page) manager.add_viewdef(Page.all_pages) manager.add_viewdef(Page.page_by_url) manager.add_document(RobotsTxt) manager.add_viewdef(RobotsTxt.robtxt_by_domian) manager.sync(app) api.add_resource(PageAPI, '/pages/<string:_id>') api.add_resource(PageListAPI, '/pages') ########### Helper functions ####### def unescape(text): """Removes HTML or XML character references and entities from a text string. keep &, >, < in the source code. """ def fixup(m):