def test_shouldCreateModelWhenNoModelFound(self):
    """A tagging request with no model on disk must train and persist one.

    Verifies the round-trip (fetch tokens, post logical topics), the
    persisted model artifacts, and the success response.
    """
    server = self.stub_http_server
    model_dir = config("app.model_path")
    self.id_list = json.dumps({"documentIds": ["1", "2", "3", "4", "5"]})
    # Start from a clean slate so the service is forced to build a model.
    self.clear_model(model_dir)
    server.response_when(method="GET",
                         path="/documents/tokens",
                         body=self.id_list,
                         response=json.dumps(self.document_response),
                         responseType="application/json")

    response = self.fetch("/tagger/documents",
                          method="POST",
                          body=self.id_list)

    self.assertTrue(
        server.request_received(method="GET",
                                path="/documents/tokens",
                                body=self.id_list))
    self.assertTrue(
        server.request_received(method="POST",
                                path="/documents/logical_topics"))
    # Both gensim artifacts must have been written to the model directory.
    for artifact in ("lda.model", "tokens.dict"):
        self.assertTrue(os.path.exists(os.path.join(model_dir, artifact)))
    self.assertEqual(200, response.code)
    self.assertEqual('success', json.loads(response.body)["status"])
def post_document_logical_topics_association(self, document_id, topics,
                                             topics_tokens_map):
    """Post one document's logical-topic mixture to the content store.

    :param document_id: id of the tagged document
    :param topics: topic mixture inferred for the document
    :param topics_tokens_map: mapping of topic ids to their tokens
    :raises Exception: ``self.STATUS_FAILED`` on a non-200 response,
        ``self.CONNECTION_ERROR`` when the content store is unreachable
    """
    document_topics_mixture = DocumentTopicsMixtureRequest(
        document_id=document_id,
        topics=topics,
        topics_tokens_map=topics_tokens_map)
    post_document_logical_topics_url = config(
        "content_store.host") + config(
            "content_store.post_document_logical_topics")
    response = None
    try:
        logger.info("Posting Document tagged with Logical Topics to %s" %
                    post_document_logical_topics_url)
        response = requests.post(post_document_logical_topics_url,
                                 data=document_topics_mixture.to_json(),
                                 headers=self.HEADERS)
        # BUG FIX: was `is not httplib.OK` — an identity comparison on an
        # int that only happens to work via CPython's small-int cache.
        # Equality is the correct check for a status code.
        if response.status_code != httplib.OK:
            logger.error(response.text)
            raise Exception(self.STATUS_FAILED)
    except requests.ConnectionError as e:
        logger.error(e)
        raise Exception(self.CONNECTION_ERROR)
    logger.info(
        "Successfully posted Logical Topics for Document %s. Server Response: %s"
        % (document_id, response.text))
def generate_a_model_for_inference(self):
    """Train an LDA model from a small fixed corpus so inference tests
    find a model already on disk."""
    # Networking-heavy token run shared by documents 1 and 4.
    networking = [
        "content", "network", "router", "wifi", "cable", "ethernet",
        "socket", "authentication"
    ]
    documents_response_map = [
        {"id": "1", "tokens": networking * 2},
        {"id": "2", "tokens": ["java", "golang", "cool", "awesome"]},
        {"id": "3", "tokens": ["authentication", "golang", "impossible"]},
        {"id": "4", "tokens": [
            "network", "tcp", "ftp", "monitor", "reliability", "cable",
            "ethernet"
        ] + networking},
        {"id": "5", "tokens": ["python", "topic", "modelling", "module"]},
    ]
    tagger = LDATagger(model_path=config("app.model_path"),
                       num_topics=config("app.max_topics"))
    documents_response = DocumentsResponse(documents_response_map)
    docs_tokens_map = documents_response.to_docs_tokens_map()
    tagger.build_topics(docs_tokens_map.values())
def start(self):
    """Bind the Tornado HTTP server and block on the IOLoop.

    Forks ``app.process_count`` worker processes listening on ``app.port``.
    """
    process_count = config("app.process_count")
    port = config("app.port")
    logger.info(
        "Starting Trinity with config at %s with %s sub processes." %
        (self.config_file, process_count))
    logger.info("Listening to requests on port %s" % port)
    server = HTTPServer(self)
    server.bind(port)
    server.start(int(process_count))
    # Blocks until the IOLoop is stopped.
    IOLoop.instance().start()
def test_shouldSendTopicToTokensAssociationToContentStore(self):
    """A successful tagging run must push topic associations back to the
    content store and report success."""
    server = self.stub_http_server
    self.id_list = json.dumps({"documentIds": ["1", "2", "3", "4", "5"]})
    self.clear_model(config("app.model_path"))
    server.response_when(method="GET",
                         path="/documents/tokens",
                         body=self.id_list,
                         response=json.dumps(self.document_response),
                         responseType="application/json")

    response = self.fetch("/tagger/documents",
                          method="POST",
                          body=self.id_list)

    # Tokens were fetched, then topic associations were posted back.
    self.assertTrue(
        server.request_received(method="GET",
                                path="/documents/tokens",
                                body=self.id_list))
    self.assertTrue(
        server.request_received(method="POST",
                                path="/documents/logical_topics"))
    self.assertEqual(200, response.code)
    self.assertEqual('success', json.loads(response.body)["status"])
def test_shouldSendfailureResponseWhenDocumentsFetchFails(self):
    """When the content store returns 500 for the token fetch, the tagger
    must not post topics and must respond with a failure status."""
    server = self.stub_http_server
    self.id_list = json.dumps({"documentIds": ["1", "2", "3", "4", "5"]})
    self.clear_model(config("app.model_path"))
    # Simulate the content store failing the document-token fetch.
    server.response_when(method="GET",
                         path="/documents/tokens",
                         body=self.id_list,
                         response="[]",
                         responseType="application/json",
                         status_code=500)

    response = self.fetch("/tagger/documents",
                          method="POST",
                          body=self.id_list)

    self.assertTrue(
        server.request_received(method="GET",
                                path="/documents/tokens",
                                body=self.id_list))
    # No topics may be posted when the fetch failed.
    self.assertFalse(
        server.request_received(method="POST",
                                path="/documents/logical_topics"))
    self.assertEqual(500, response.code)
    self.assertEqual('failure', json.loads(response.body)["status"])
def post(self):
    """Infer topics and tags for a single document.

    Expects a JSON body with a ``documentId`` key; writes a JSON result
    and sets a 500 status when processing fails.
    """
    document_id = json.loads(self.request.body)["documentId"]
    logger.info("Request to infer topics for document %s received" %
                document_id)
    try:
        document_response = self.content_store_service.fetch_document(
            document_id)
    except Exception as fetch_exc:
        document_fetch_error = "unable to fetch Document for Tagging"
        logger.info(fetch_exc)
        logger.error("%s for Id %s" % (document_fetch_error, document_id))
        return self.error_response(document_fetch_error)

    model_path = config("app.model_path", LDATagger.DEFAULT_MODEL_PATH)
    logger.info("Model Path: %s" % model_path)
    logger.info("Inferring topics and tags for %s" % document_id)
    result = self.document_processor.infer(document_response,
                                           os.path.abspath(model_path))
    logger.info("Topics and tags for %s successfully generated" %
                document_id)

    if not result.is_success():
        self.set_status(httplib.INTERNAL_SERVER_ERROR)
    self.write(result.to_json())
    self.set_header("Content-Type", "application/json")
def infer(self, document_response, tagger_model_path):
    """Infer topics and tags for one document and push both associations
    to the content store.

    :param document_response: tokenised document fetched from the store
    :param tagger_model_path: filesystem path of the trained LDA model
    :returns: a ``Response`` — failure when either content-store update
        raises, success otherwise
    """
    tagger = LDATagger(tagger_model_path,
                       num_topics=config("app.max_topics"))
    topics = tagger.topics_for_document(document_response.tokens())
    topics_tokens_map = tagger.topics_to_tokens()
    generator = TagGenerator(topics_tokens_map)
    tags = generator.generate_tags(topics=topics,
                                   tokens=document_response.tokens())

    try:
        self.content_store_service.post_document_logical_topics_association(
            document_response.document_id(), topics, topics_tokens_map)
    except Exception as topics_exc:
        document_topic_error_msg = "Error updating content store for document with logical topics"
        logger.error(topics_exc)
        return self.error_response(document_topic_error_msg)

    try:
        self.content_store_service.post_document_tags_association(
            document_response.document_id(), tags)
    except Exception as tags_exc:
        doc_tags_error_msg = "Error updating content store with Document Tags"
        logger.error(tags_exc)
        return self.error_response(doc_tags_error_msg)

    return Response(status="success", message="Process Complete")
def process(self, docs_tokens_map, tagger_model_path):
    """Build/update the LDA model from a batch of documents, then push
    topic and tag associations to the content store.

    :param docs_tokens_map: mapping of document id -> token list
    :param tagger_model_path: filesystem path of the LDA model
    :returns: a ``Response`` — failure when either content-store update
        raises, success otherwise
    """
    tagger = LDATagger(tagger_model_path,
                       num_topics=config("app.max_topics"))
    tagger.build_or_update_model(docs_tokens_map.values())
    docs_topics_map = tagger.topics_for_documents(docs_tokens_map)
    topics_tokens_map = tagger.topics_to_tokens()
    generator = TagGenerator(topics_tokens_map)
    docs_tags_map = generator.generate_documents_tag_map(
        documents_tokens_map=docs_tokens_map,
        documents_topics_map=docs_topics_map)

    try:
        self.content_store_service.post_documents_logical_topics_associations(
            docs_topics_map, topics_tokens_map)
    except Exception as topics_exc:
        document_topic_error_msg = "Error updating content store for documents with logical topics"
        logger.error(topics_exc)
        return self.error_response(document_topic_error_msg)

    try:
        self.content_store_service.post_documents_tags_associations(
            docs_tags_map)
    except Exception as tags_exc:
        docs_tags_error_msg = "Error updating content store with Documents' Tags"
        logger.error(tags_exc)
        return self.error_response(docs_tags_error_msg)

    return Response(status="success", message="Process Complete")
def fetch_document(self, document_id):
    """Fetch a single tokenised document from the content store.

    :param document_id: id interpolated into the ``content_store.get_doc``
        URL template
    :returns: a ``DocumentResponse`` built from the JSON body
    :raises Exception: ``self.STATUS_FAILED`` on a non-200 response,
        ``self.CONNECTION_ERROR`` when the content store is unreachable
    """
    get_doc_url = config("content_store.host"
                         ) + config("content_store.get_doc") % document_id
    response = None
    try:
        logger.info("Fetching %s Document to Tag from %s" %
                    (document_id, get_doc_url))
        response = requests.get(get_doc_url, headers=self.HEADERS)
        # BUG FIX: was `is not httplib.OK` — identity comparison on an
        # int relies on CPython's small-int cache; use equality.
        if response.status_code != httplib.OK:
            logger.error(response.text)
            raise Exception(self.STATUS_FAILED)
    except requests.ConnectionError as e:
        logger.error(e)
        raise Exception(self.CONNECTION_ERROR)
    logger.info("Received Tokenised Document for tagging with %s tokens." %
                len(json.loads(response.text)["tokens"]))
    logger.debug(
        "Received Tokenised Document for tagging. Server Response %s" %
        response.text)
    return DocumentResponse(response.json())
def fetch_documents(self, documents_request):
    """Fetch a batch of tokenised documents from the content store.

    :param documents_request: JSON body listing the document ids
        (sent as the GET request body — NOTE(review): unusual for GET;
        the stub server in the tests matches on it, so it is kept)
    :returns: a ``DocumentsResponse`` built from the JSON body
    :raises Exception: ``self.STATUS_FAILED`` on a non-200 response,
        ``self.CONNECTION_ERROR`` when the content store is unreachable
    """
    get_docs_url = config("content_store.host") + config(
        "content_store.get_docs")
    response = None
    try:
        logger.info("Fetching %s Documents to Tag from %s" %
                    (documents_request, get_docs_url))
        response = requests.get(get_docs_url,
                                data=documents_request,
                                headers=self.HEADERS)
        # BUG FIX: was `is not httplib.OK` — identity comparison on an
        # int relies on CPython's small-int cache; use equality.
        if response.status_code != httplib.OK:
            logger.error(response.text)
            raise Exception(self.STATUS_FAILED)
    except requests.ConnectionError as e:
        logger.error(e)
        raise Exception(self.CONNECTION_ERROR)
    logger.debug(
        "Received Tokenised Documents for tagging. Server Response %s" %
        response.text)
    return DocumentsResponse(response.json())
def post_document_tags_association(self, document_id, tags):
    """Post a document's generated tags to the content store.

    :param document_id: id of the tagged document
    :param tags: tags generated for the document
    :raises Exception: ``self.STATUS_FAILED`` on a non-200 response,
        ``self.CONNECTION_ERROR`` when the content store is unreachable
    """
    document_tags_request = DocumentTagsRequest(document_id, tags)
    post_document_tags_url = config("content_store.host") + config(
        "content_store.post_document_tags")
    response = None
    try:
        logger.info("Posting Document-Tags association to %s" %
                    post_document_tags_url)
        response = requests.post(post_document_tags_url,
                                 data=document_tags_request.to_json(),
                                 headers=self.HEADERS)
        # BUG FIX: was `is not httplib.OK` — identity comparison on an
        # int relies on CPython's small-int cache; use equality.
        if response.status_code != httplib.OK:
            logger.error(response.text)
            raise Exception(self.STATUS_FAILED)
    except requests.ConnectionError as e:
        logger.error(e)
        raise Exception(self.CONNECTION_ERROR)
    logger.info(
        "Successfully posted Document-Tags association. Server Response: %s"
        % response.text)
def allStatuses():
    """Return a JSON diagnostics payload describing external dependencies.

    Overall status is "yellow" when any dependency reports "red",
    otherwise "green".
    """
    dependencies = [
        restStatus("ContentStore HTTP connection",
                   config("content_store.host") + "/diagnostics/humans.txt"),
    ]
    # Any single red dependency degrades the overall status to yellow.
    degraded = any(dep["status"] == "red" for dep in dependencies)
    return json.dumps({
        "dependencies": dependencies,
        "status": "yellow" if degraded else "green"
    })
def post(self):
    """Model topics for a batch of documents.

    Expects the raw request body to be the document-id list JSON; writes
    a JSON result and sets a 500 status when processing fails.
    """
    list_of_ids = self.request.body
    logger.info("Request to model topics for documents %s received" %
                str(list_of_ids))
    try:
        documents_response = self.content_store_service.fetch_documents(
            list_of_ids)
    except Exception as fetch_exc:
        document_fetch_error = "unable to fetch Documents for Tagging"
        logger.info(fetch_exc)
        logger.error("%s for Id %s" %
                     (str(document_fetch_error), str(list_of_ids)))
        return self.error_response(document_fetch_error)

    model_path = config("app.model_path", LDATagger.DEFAULT_MODEL_PATH)
    logger.info("Model Path: %s" % model_path)
    docs_tokens_map = documents_response.to_docs_tokens_map()
    result = self.processor.process(docs_tokens_map,
                                    os.path.abspath(model_path))

    if not result.is_success():
        self.set_status(httplib.INTERNAL_SERVER_ERROR)
    self.write(result.to_json())
    self.set_header("Content-Type", "application/json")
def tearDown(self):
    """Remove any model artifacts the test run left on disk."""
    super(InferTaggingIntegrationTest, self).tearDown()
    model_dir = config("app.model_path")
    self.clear_model(model_dir)
def setUp(self):
    """Start from a clean model directory, train a fixture model for
    inference, and reset the stub content-store server."""
    super(InferTaggingIntegrationTest, self).setUp()
    model_dir = config("app.model_path")
    self.clear_model(model_dir)
    self.generate_a_model_for_inference()
    self.stub_http_server.reset()