def test_lowest_dimension_found(self):
    """Content containing "key11" must yield both "key11" and its prefix "key1"."""
    self.doc.content = "key11 and some other stuff"
    self.solr_docs.update(self.doc)

    self.solr_docs.apply_kwm(self.hierarchy.get_keywords())

    self.doc = self.solr_docs.get(self.doc.id)
    wanted = sorted(
        [
            SolrDocKeyword("key1", SolrDocKeywordTypes.KWM),
            SolrDocKeyword("key11", SolrDocKeywordTypes.KWM),
        ]
    )
    # compare order-independently — keyword storage order is not guaranteed
    self.assertEqual(sorted(self.doc.keywords), wanted)
def test_keyword_added_and_deleted(self, client, doc_with_2_keywords):
    """Patching a doc's keywords must keep the keyword statistics in sync."""
    doc = self.solr_docs.get(doc_with_2_keywords)
    doc.keywords.add(SolrDocKeyword("keyword3", SolrDocKeywordTypes.MANUAL))
    doc.keywords.remove(SolrDocKeyword("keyword2", SolrDocKeywordTypes.META))

    client.patch("/changekeywords", data=json.dumps(doc.as_dict()), **POST_JSON)

    # keyword1 was untouched, keyword2 was removed, keyword3 was added
    assert "keyword1" in self.solr_keyword_statistics
    assert "keyword2" not in self.solr_keyword_statistics
    assert "keyword3" in self.solr_keyword_statistics
def change_keywords():
    """
    Handles the updating of keywords for a document.

    Expects a JSON body with ``id`` and ``keywords`` (a list of
    ``{"value": ..., "type": ...}`` dicts).

    :return: json object containing a success/error message, plus HTTP status
        (400 on a malformed request, 502 when Solr fails, 200 on success)
    """
    try:
        payload = request.json
        # renamed from `id` — the original shadowed the builtin
        doc_id = payload.get("id")
        keywords = payload.get("keywords")
    except Exception as e:
        return jsonify(f"Bad Request: {e}"), 400

    try:
        doc = solr.docs.get(doc_id)
        # snapshot the old set so the statistics diff can be computed
        keywords_before = copy.deepcopy(doc.keywords)
        keywords_after = {
            SolrDocKeyword(kw["value"], SolrDocKeywordTypes.from_str(kw["type"]))
            for kw in keywords
        }
        doc.keywords = keywords_after
        solr.docs.update(doc)
        # keep the global keyword statistics consistent with the change
        solr.keyword_statistics.update(keywords_before, keywords_after)
    except Exception as e:
        log.error(f"/changekeywords {e}")
        return jsonify(f"Bad Gateway to solr: {e}"), 502

    # use the module logger instead of print() for consistency with the
    # error path above
    log.info(
        "changed keywords on file %s to %s",
        doc_id,
        ",".join(kw.value for kw in doc.keywords),
    )
    return jsonify("success"), 200
def doc2_has_kw1(client, doc_with_0_keywords_1, solr_docs):
    """Fixture: attach "keyword1" (MANUAL) to the doc via the API, return its id."""
    doc = solr_docs.get(doc_with_0_keywords_1)
    manual_kw = SolrDocKeyword("keyword1", SolrDocKeywordTypes.MANUAL)
    doc.keywords.add(manual_kw)
    client.patch("/changekeywords", data=json.dumps(doc.as_dict()), **POST_JSON)
    return doc.id
def test_apply_tagging_method_kwm(self):
    """
    POSTing a KWM tagging job to /apply should tag only the requested
    documents with the keywords found in their content; all other
    documents must keep an empty keyword set.
    """
    data = json.dumps(dict(
        taggingMethod={'name': 'Keyword Model', 'type': 'KWM'},
        keywordModel={
            'id': 'test',
            'hierarchy': json.dumps([
                {'item': 'test', 'nodeType': 'KEYWORD'},
                {'item': 'text', 'nodeType': 'KEYWORD'},
                {'item': 'faufm', 'nodeType': 'KEYWORD'},
            ]),
            'keywords': ['test', 'text', 'faufm'],
        },
        documents=[{'id': "test.txt"}, {'id': "test.pdf"}],
        options={"applyToAllDocuments": False},
        jobId='JOB-ID',
    ))
    self.application.solr.docs.add(*self.docs)
    tester = self.app.test_client(self)

    response = tester.post("/apply", content_type="application/json", data=data)
    self.assertEqual(response.status_code, 200)

    # The tagging job runs asynchronously; wait for the thread to finish.
    sleep(10)

    def assert_keywords(doc_id, expected):
        # fetch the doc exactly once (the original fetched "test.txt" twice
        # and left an unused local) and compare order-independently
        keywords = self.application.solr.docs.get(doc_id).keywords
        self.assertEqual(sorted(keywords), sorted(expected))

    assert_keywords("test.txt", [
        SolrDocKeyword("text", SolrDocKeywordTypes.KWM),
        SolrDocKeyword("test", SolrDocKeywordTypes.KWM),
    ])
    assert_keywords("test.pdf", [
        SolrDocKeyword("faufm", SolrDocKeywordTypes.KWM),
    ])
    # documents not listed in the request must remain untagged
    assert_keywords("test.docx", [])
    assert_keywords("test.pptx", [])
def test_no_duplicate_keywords(self):
    """Applying the KWM must not duplicate a keyword the doc already carries."""
    self.doc.keywords = [SolrDocKeyword("key1", SolrDocKeywordTypes.KWM)]
    self.doc.content = "key1 and some other stuff"
    self.solr_docs.update(self.doc)

    self.solr_docs.apply_kwm(self.hierarchy.get_keywords())

    # re-fetch from Solr and verify "key1" appears only once
    self.doc = self.solr_docs.get(self.doc.id)
    self.assertEqual(len(self.doc.keywords), 1)
def doc_with_keyword_in_keywords_field(solr_docs):
    """Fixture: index a doc that already carries a MANUAL keyword, return its id."""
    doc_id = "doc_with_kw_in_kws"
    manual_kw = SolrDocKeyword("keyword", SolrDocKeywordTypes.MANUAL)
    doc = SolrDoc(
        doc_id,
        manual_kw,
        content="content",
        title="title",
        file_type="file_type",
        lang="lang",
        size=1,
    )
    solr_docs.update(doc)
    return doc_id
def run(self):
    """
    Background job: compute automated (ML) keywords for self.docs and
    apply them to the corresponding Solr documents, reporting status,
    progress and time-remaining on self as it goes.
    """
    self.status = 'TAGGING_JOB.CREATE_KW'
    # mapping of doc id -> iterable of keyword strings; `self` is passed so
    # the helper can report progress/cancellation on this job object
    auto_keywords = create_automated_keywords(self.docs, self.num_clusters, self.num_keywords, self.default, self)
    self.status = 'TAGGING_JOB.KW_FOUND'
    doc_ids = auto_keywords.keys()
    docs = self.solr_service.docs.get(*doc_ids)
    # get() returns a single doc (not a list) for a single id — normalize
    if len(doc_ids)==1:
        docs=[docs]
    print("ids", doc_ids)
    self.status = 'TAGGING_JOB.APPLYING'
    start_time = time.time()
    time_index = 0  # flag: has the first-iteration timing sample been taken?
    iteration_time = None
    progress_step = 0
    for idx, doc in enumerate(docs):
        if self.cancelled:
            break
        if idx == 0:
            # spread the remaining progress budget evenly over all docs
            progress_step = self.progress / len(docs)
        new_keywords = auto_keywords[doc.id]
        doc.keywords.update(
            SolrDocKeyword(kw, SolrDocKeywordTypes.ML) for kw in new_keywords
        )
        if time_index == 0:
            # time the first iteration and use it to estimate the rest
            end_time = time.time()
            iteration_time = end_time - start_time
            time_index = 1
        remaining_iterations = len(docs) - idx
        # NOTE(review): this increment has no effect — enumerate() rebinds
        # idx on every iteration; confirm intent
        idx += 1
        # NOTE(review): iteration_time starts as None and is never set to -1,
        # so this guard looks like it was meant to be `is not None` — confirm
        if iteration_time != - 1:
            self.time_remaining = iteration_time * remaining_iterations
        self.progress += progress_step
    # bulk-write all modified docs in one call
    self.solr_service.docs.update(*docs)
    keywords_added = set()
    # NOTE(review): this collects ALL keywords on the updated docs, not only
    # the newly added ML ones — confirm that is what the statistics expect
    keywords_added.update(kw for doc in docs for kw in doc.keywords)
    self.solr_service.keyword_statistics.update({}, keywords_added)
    self.status = 'FINISHED'
def test_change_keywords(self):
    """PATCH /changekeywords should replace a document's keyword set."""
    self.application.solr.docs.add(self.docs[0])
    doc_id = self.docs[0].id
    # freshly added doc starts without keywords
    self.assertEqual(self.application.solr.docs.get(doc_id).keywords, set())

    payload = json.dumps({
        "id": doc_id,
        "keywords": [
            {"value": value, "type": "MANUAL"} for value in ["a", "b", "c"]
        ],
    })
    tester = self.app.test_client(self)
    response = tester.patch(
        '/changekeywords', data=payload, content_type='application/json'
    )
    self.assertEqual(response.status_code, 200)

    expected = {
        SolrDocKeyword(value, SolrDocKeywordTypes.MANUAL)
        for value in ["a", "b", "c"]
    }
    self.assertEqual(self.application.solr.docs.get(doc_id).keywords, expected)
def run(self):
    """
    Background job: applies a keyword model to the documents in self.doc_ids.

    The idea is to search the content in Solr for each lemmatized keyword;
    when it is found, the (normal) keyword and its parents are applied to the
    matching documents. Status, progress and time-remaining are reported on
    self as the job runs; self.cancelled aborts between iterations.
    """
    self.status = 'TAGGING_JOB.LEMMA_START'
    lemmatized_keywords = lemmatize_keywords(self.keywords)
    # lemmatization is counted as the first 20% of overall progress
    lemmatize_progress = 20
    self.status = 'TAGGING_JOB.LEMMA_END'
    self.progress = lemmatize_progress
    self.status = 'TAGGING_JOB.DOC_FIND'
    # restricts all keyword searches to the selected documents
    id_query = self.solr_service.docs.build_id_query(self.doc_ids)
    self.status = 'TAGGING_JOB.DOC_FOUND'
    changed_docs = {}  # doc id -> SolrDoc, accumulated for one bulk update
    self.status = 'TAGGING_JOB.APPLY_KWM'
    start_time = time.time()
    time_index = 0  # flag: has the first-iteration timing sample been taken?
    iteration_time = None
    progress_step = 0
    # lemmatized_keywords is assumed to be positionally aligned with
    # self.keywords.items() — presumably guaranteed by lemmatize_keywords
    for idx, (lemmatized_keyword, (keyword, parents)) in enumerate(zip(
        lemmatized_keywords, self.keywords.items()
    )):
        if self.cancelled:
            break
        if idx == 0:
            # spread the remaining 80% of progress evenly over all keywords
            progress_step = (100 - lemmatize_progress) / len(lemmatized_keywords)
        query = self.solr_service.docs.build_kwm_query(id_query, lemmatized_keyword)
        res = self.solr_service.docs.search(query)
        res = [SolrDoc.from_hit(hit) for hit in res]
        for doc in res:
            if self.cancelled:
                break
            # check whether the doc was already updated by an earlier keyword;
            # if so, keep extending that in-memory copy
            if doc.id in changed_docs:
                doc = changed_docs[doc.id]
            # update keywords: the matched keyword plus all of its parents
            doc.keywords.add(
                SolrDocKeyword(keyword, SolrDocKeywordTypes.KWM))
            doc.keywords.update(
                SolrDocKeyword(parent, SolrDocKeywordTypes.KWM)
                for parent in parents
            )
            # store for bulk update
            changed_docs[doc.id] = doc
        if time_index == 0:
            # time the first iteration and use it to estimate the rest
            end_time = time.time()
            iteration_time = end_time - start_time
            time_index = 1
        remaining_iterations = len(lemmatized_keywords) - idx
        # NOTE(review): this increment has no effect — enumerate() rebinds
        # idx on every iteration; confirm intent
        idx += 1
        # NOTE(review): iteration_time starts as None and is never set to -1,
        # so this guard looks like it was meant to be `is not None` — confirm
        if iteration_time != - 1:
            self.time_remaining = iteration_time * remaining_iterations
        self.progress += progress_step
    changed_docs = changed_docs.values()
    self.status = 'TAGGING_JOB.DOC_UPDATE'
    if not self.cancelled:
        # bulk-write all modified docs, then sync keyword statistics
        self.solr_service.docs.update(*changed_docs)
        keywords_added = set()
        # NOTE(review): collects ALL keywords on the changed docs, not only
        # the newly added ones — confirm that is what the statistics expect
        keywords_added.update(kw
                              for doc in changed_docs for kw in doc.keywords)
        self.solr_service.keyword_statistics.update({}, keywords_added)
    self.status = 'FINISHED'
def kw3():
    """Fixture providing the KWM keyword "key3"."""
    keyword = SolrDocKeyword("key3", SolrDocKeywordTypes.KWM)
    return keyword
def kw2():
    """Fixture providing the KWM keyword "key2"."""
    keyword = SolrDocKeyword("key2", SolrDocKeywordTypes.KWM)
    return keyword
def kw1():
    """Fixture providing the KWM keyword "key1"."""
    keyword = SolrDocKeyword("key1", SolrDocKeywordTypes.KWM)
    return keyword