def test_tokenize(self):
    test_case = {
        "text": "2018-06-08T00:00:00Z INFO GET /v1/bundles/"
                "7ef8966b-45ef-4e0a-a51b-44a865372050.2018-06-08T230333.785338Z"
                "?param1=1&param2=2 {\"key\": \"value\"}"
    }
    index_name = self.es_client._format_today_index_name(self.index_prefix)
    index_client = IndicesClient(TestESClient.es)
    with self.new_index(index_name):
        response = index_client.analyze(index=index_name, body=test_case)
    tokens = [t['token'] for t in response['tokens']]
    # The analyzer should split the log line into exactly these 14 tokens.
    self.assertEqual(set(tokens), {
        '7ef8966b-45ef-4e0a-a51b-44a865372050', '2018-06-08T230333.785338Z',
        ':', 'INFO', '1', '2', 'v1', 'bundles', 'key', 'GET', 'param2',
        'param1', '2018-06-08T00:00:00Z', 'value'
    })
    self.assertEqual(len(tokens), 14)
def get_morpheme(self, text):
    setting = {
        "analyzer": "my_analyzer",
        "text": text,
        "attributes": ["posType", "leftPOS", "rightPOS", "morphemes", "reading"],
        "explain": True
    }
    i = IndicesClient(self.es)
    temp = i.analyze(index="pos", body=setting)
    tokens = temp.get('detail').get('tokenizer').get('tokens')
    # Remove duplicates, keyed by token surface form.
    token_list = list({token['token']: token for token in tokens}.values())
    return token_list
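# A minimal standalone sketch of the same morpheme-analysis call as
# get_morpheme above. Assumptions (mine, not the original's): a local cluster
# at http://localhost:9200 and a "pos" index whose "my_analyzer" is a Korean
# morphological analyzer such as nori.
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

es = Elasticsearch("http://localhost:9200")
body = {
    "analyzer": "my_analyzer",
    "text": "아버지가 방에 들어가신다",  # "Father enters the room."
    "attributes": ["posType", "leftPOS", "rightPOS"],
    "explain": True,
}
response = IndicesClient(es).analyze(index="pos", body=body)
# With explain=True, the tokenizer output sits under detail.tokenizer.tokens.
for token in response["detail"]["tokenizer"]["tokens"]:
    print(token["token"])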
def __init__(self):
    # Get all queries.
    directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
    with open(directory, "r") as r:
        allQueries = r.readlines()

    # Establish connection.
    es = Elasticsearch()
    q = IndicesClient(es)
    allDocID = DocIDProviderForJelinek()
    allDoc = allDocID.getDocID()

    # Get all queries in ascending order in a dictionary keyed by query number.
    queryDict = {}
    for each_query in allQueries:
        queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
            each_query.split()[1:])

    # Iterate through each query.
    for queryNo, query in queryDict.items():
        print("Into query ----------------")
        # Per-document score accumulator (the original shadowed queryDict here).
        docScores = {}
        query = query.replace("Document", "")
        # Elasticsearch analyze call for stopword removal and stemming.
        queryResult = q.analyze(index="ap_dataset",
                                analyzer="my_english",
                                text=query)
        # Iterate through each term.
        for term in queryResult['tokens']:
            finalTerm = term['token']  # Already unicode in Python 3; no encode needed.
            termDict = termmDictFunction(allDoc, finalTerm)
            matchDict = matchDictFunction(finalTerm)
            # Both branches of the original if/else assigned the same value,
            # so a plain assignment is equivalent.
            for docID, value in matchDict.items():
                termDict[docID] = value
            for docID, value in termDict.items():
                docScores[docID] = docScores.get(docID, 0) + value
        # print(docScores)
        r = RetrievalModel()
        r.unigramJM(queryNo, docScores)
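# RetrievalModel.unigramJM is not shown in this snippet. As a hedged reference,
# Jelinek-Mercer smoothing conventionally interpolates the document and corpus
# language models; lam=0.7 below is an illustrative choice, not the original's.
import math


def jelinek_mercer_log_prob(tf, doc_len, cf, corpus_len, lam=0.7):
    # log10(lambda * P(w|d) + (1 - lambda) * P(w|C)), summed per query term.
    return math.log10(lam * (tf / doc_len) + (1 - lam) * (cf / corpus_len))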
class ElasticAnalyzer:

    def __init__(self, language="english"):
        self.core = ElasticCore()
        self.indices_client = IndicesClient(self.core.es)
        self.splitter = TextSplitter(split_by="WORD_LIMIT")
        self.language = language

    def chunk_input(self, text):
        # Split input if the token count is greater than 5K;
        # Elastic will complain if it exceeds 10K.
        docs = self.splitter.split(text, max_limit=5000)
        # Extract text chunks from docs.
        text_chunks = [doc["text"] for doc in docs]
        return text_chunks

    def apply_analyzer(self, body):
        try:
            analysis = self.indices_client.analyze(body=body)
            tokens = [token["token"] for token in analysis["tokens"]]
            token_string = " ".join(tokens)
            return token_string
        except elasticsearch.exceptions.RequestError as e:
            reason = e.info["error"]["reason"]
            if "Invalid stemmer class" in reason:
                logging.getLogger(ERROR_LOGGER).warning(e)
            else:
                logging.getLogger(ERROR_LOGGER).exception(e)
            return ""
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
            return ""

    def _prepare_stem_body(self, text, language, strip_html: bool, tokenizer="standard"):
        body = {
            "text": text,
            "tokenizer": tokenizer,
            "filter": [{
                "type": "snowball",
                "language": language
            }]
        }
        if strip_html:
            body["char_filter"] = ["html_strip"]
        return body

    def stem_text(self, text: str, language: Optional[str], strip_html=True, tokenizer="standard"):
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = self._prepare_stem_body(chunk, language, strip_html, tokenizer)
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    def _prepare_tokenizer_body(self, text, tokenizer="standard", strip_html: bool = True):
        body = {"text": text, "tokenizer": tokenizer}
        if strip_html:
            body["char_filter"] = ["html_strip"]
        return body

    def tokenize_text(self, text, tokenizer="standard", strip_html=True):
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = self._prepare_tokenizer_body(chunk, tokenizer, strip_html)
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    def analyze(self, text: str, body: dict) -> str:
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = {**body, "text": chunk}
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    # This exists only for backwards compatibility with the texta-tools
    # library, which calls it as the lemmatizer.
    def lemmatize(self, text):
        return self.stem_text(text=text, language=self.language, strip_html=True)
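# A hedged usage sketch for ElasticAnalyzer above; it assumes ElasticCore
# connects to a reachable cluster, and the sample text is illustrative only.
if __name__ == "__main__":
    analyzer = ElasticAnalyzer(language="english")
    # Snowball stemming; _prepare_stem_body adds html_strip when strip_html=True.
    print(analyzer.stem_text("<p>Cats are running</p>", language="english"))
    # Plain tokenization without stemming.
    print(analyzer.tokenize_text("Cats are running"))
    # lemmatize() is the thin alias kept for texta-tools compatibility.
    print(analyzer.lemmatize("Cats are running"))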
from pprint import pprint as pp

import requests
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient


class handEl:

    def __init__(self, host='localhost', port=9200, index="default", props=None):
        if self.connect(host, port):
            print("Elasticsearch Connection Success [{}:{}]".format(host, port))
            self.es = Elasticsearch(host=host, port=port)
            self.ies = IndicesClient(self.es)
            self.host = host
            self.port = port
            self.index = index
            self.result = None
            self.response = None
            self.props = props

    def connect(self, host, port):
        try:
            res = requests.get(url="http://{}:{}".format(host, port))
            if res.status_code != 200:
                raise ConnectionError(
                    "Elasticsearch Connection Failed. [{}:{}]".format(host, port))
            return True
        except Exception as e:
            self.error(e, "CONNECT")
            return False

    def __str__(self):
        return "\nDescriptions for handEl\nhost : {}, port : {}, index : {}\ncurrent result : {}\n" \
            .format(self.host, self.port, self.index, self.result)

    def __call__(self):
        print("\nCurrent response ▼")
        pp(self.response)

    def indexing(self, index):
        self.index = index

    def search(self, value, index=None, field="alias", fuzziness=0):
        if index:
            self.indexing(index)
        if self.match(value, field=field, fuzziness=fuzziness):
            return self.result
        return None

    def analyze(self, value: str, options: list, analyzer="my_analyzer"):
        if not value or not self.index:
            return []
        body = {
            "analyzer": analyzer,
            "text": value,
            "attributes": options,
            "explain": True
        }
        analies = self.ies.analyze(index=self.index, body=body)['detail']
        # With '-f' in options, return the first token filter's output;
        # otherwise return the raw tokenizer output.
        if '-f' in options:
            return analies['tokenfilters'][0]['tokens']
        return analies['tokenizer']['tokens']

    def tokenize(self, value, tokenizer="natural_tokenizer"):
        if not value:
            return []
        tokens = self.ies.analyze(index=tokenizer,
                                  body={"field": "comments", "text": value})
        if tokens['tokens']:
            return [token['token'] for token in tokens['tokens']]
        return []

    def match(self, value, field="alias", fuzziness=0):
        body = {
            "query": {
                "match": {
                    field: {
                        "query": value,
                        "fuzziness": fuzziness
                    }
                }
            }
        }
        return self.parse(self.es.search(index=self.index, body=body))

    def parse(self, res, score=1.0):
        self.response = res
        if res["hits"]["total"]["value"] > 0:
            self.result = {
                'match_num': res["hits"]["total"]["value"],
                'match_sco': res["hits"]["max_score"],
                'match_ids': [hit["_id"] for hit in res["hits"]["hits"]],
                'match_res': [hit["_source"] for hit in res["hits"]["hits"]],
            }
            return True
        self.result = None
        return False

    def doc(self, docid):
        if self.es.exists(index=self.index, id=docid):
            res = self.es.get(index=self.index, id=docid)
            self.result = {
                'match_num': 1,
                'match_sco': res["_primary_term"],
                'match_ids': [res["_id"]],
                'match_res': res["_source"],
            }
            return True
        self.result = None
        return False

    def num(self):
        # Was self.results (undefined attribute); self.result is set by parse()/doc().
        return self.result['match_num']

    # Create id - documents.
    def prope(self, docid, documents, overturn=False, index=False):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            if index:
                self.indexing(index)
            if self.doc(docid):
                self.update(docid, documents, overturn)
            else:
                self.create(docid, documents)
        except Exception as e:
            self.error(e, "PROPE")

    def unionDict(self, dict0, dict1, appending=True):
        if appending:
            # Merge list-valued fields instead of overwriting them;
            # .get() avoids a KeyError when dict1 lacks the key
            # (the original indexed dict1 directly).
            for k, v in list(filter(lambda x: isinstance(x[1], list), dict0.items())):
                dict1[k] = list(set(dict1.get(k, []) + v))
        dict0.update(dict1)
        return dict0

    def update(self, docid, documents, overturn=False):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            if overturn:
                self.es.update(index=self.index, id=docid,
                               body={'doc': {**documents}})
            else:
                # Merge once and reuse; the original called unionDict twice.
                merged = self.unionDict(self.result['match_res'], documents)
                pp(merged)
                # match_ids is a list; update the first matched document
                # (the original passed the whole list as the id).
                self.es.update(index=self.index,
                               id=self.result['match_ids'][0],
                               body={'doc': {**merged}})
        except Exception as e:
            self.error(e, "UPDATE")

    def create(self, docid, documents):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            self.es.create(index=self.index, id=docid, body={**documents})
        except Exception as e:
            self.error(e, "CREATE")

    def error(self, e, msg=""):
        print("ERROR {} : {}".format(msg, e))
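# A hedged usage sketch for the handEl wrapper above; the index name, field,
# and document contents are illustrative assumptions, not from the original.
if __name__ == "__main__":
    el = handEl(host="localhost", port=9200, index="default")
    # prope() upserts: creates the document if missing, merges if present.
    el.prope("doc-1", {"alias": "sample name", "tags": ["a", "b"]})
    # Fuzzy match against the alias field, then inspect the parsed result.
    if el.search("sample nam", field="alias", fuzziness=1):
        pp(el.result)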
def analyze(self, es, indexName, text, analyzer):
    iclient = IndicesClient(es)
    # Newer clients expect the analyzer and text in the request body rather
    # than as positional/query arguments.
    print(iclient.analyze(index=indexName,
                          body={"analyzer": analyzer, "text": text}))
def __init__(self):
    # Get all queries.
    directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
    with open(directory, "r") as r:
        allQueries = r.readlines()

    # Establish connection.
    es = Elasticsearch()
    q = IndicesClient(es)

    # Get all queries in ascending order in a dictionary keyed by query number.
    queryDict = {}
    for each_query in allQueries:
        queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
            each_query.split()[1:])

    # Iterate through each query.
    for queryNo, query in queryDict.items():
        docDict = {}
        query = query.replace("Document", "")
        # Elasticsearch analyze call for stopword removal and stemming.
        queryResult = q.analyze(index="ap_dataset",
                                analyzer="my_english",
                                text=query)
        # Iterate through each term.
        print(" ------------ into query -----------------")
        for term in queryResult['tokens']:
            # Get each term.
            finalTerm = term['token']
            # Okapi TF scored in a Groovy script; 441.5 is the average
            # document length of the collection.
            inlineQuery = (
                "double tf =_index['text'][\"" + finalTerm + "\"].tf(); "
                "double df =_index['text'][\"" + finalTerm + "\"].df(); "
                "double ttf = _index['text'][\"" + finalTerm + "\"].ttf(); "
                "int docLen = _source.text.split().size(); "
                "return tf/(tf + 0.5 + 1.5*(docLen/441.5))")
            termQuery = {
                "size": 5000,
                "_source": "false",
                "query": {
                    "term": {
                        "text": finalTerm
                    }
                },
                "script_fields": {
                    "okapiScore": {
                        "script": {
                            "lang": "groovy",
                            "inline": inlineQuery
                        }
                    }
                }
            }
            try:
                res = es.search(
                    index="ap_dataset",
                    doc_type="HW1",
                    body=termQuery,
                    filter_path="_scroll_id,hits.hits._id,hits.hits.fields.okapiScore,hits.total",
                    scroll="1m")
            except TransportError as e:
                print(e.info)
                continue  # Skip this term if the search failed (res would be unbound).
            docFreq = res['hits']['total']
            total = res
            if docFreq > 0:
                while len(total['hits']['hits']) != 0:
                    for items in total['hits']['hits']:
                        doc_id = items['_id']
                        stats = float(items['fields']['okapiScore'][0])
                        # Accumulate the per-term score into the document total.
                        docDict[doc_id] = docDict.get(doc_id, 0.0) + stats
                    sid = total['_scroll_id']
                    # Pull the next scroll page.
                    total = es.scroll(scroll_id=sid, scroll='1m')
        r = RetrievalModel()
        r.okapiTfcalculate(queryNo, docDict)
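# The Groovy script above computes a per-term Okapi TF weight. A plain-Python
# restatement for clarity; the 0.5/1.5 constants and the 441.5 average
# document length come straight from the script.
def okapi_tf(tf, doc_len, avg_doc_len=441.5):
    return tf / (tf + 0.5 + 1.5 * (doc_len / avg_doc_len))


# e.g. a term occurring 3 times in a 500-word document:
# okapi_tf(3, 500) ≈ 0.577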
from elasticsearch.client import IndicesClient


def analyze_text(text, es_client, analyzer_index='analisis', analyzer_name='clean'):
    analyze_client = IndicesClient(client=es_client)
    # Newer clients expect the analyzer name and text inside the request body.
    res = analyze_client.analyze(index=analyzer_index,
                                 body={"analyzer": analyzer_name, "text": text})
    return [x['token'] for x in res['tokens']]
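# A hedged usage sketch for analyze_text above; it assumes a local cluster
# with an 'analisis' index whose custom 'clean' analyzer already exists.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
print(analyze_text("Running the cleaning analyzer!", es))
# -> e.g. ['running', 'cleaning', 'analyzer'], depending on the analyzer config.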
def __init__(self):
    # Get all queries.
    directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
    with open(directory, "r") as r:
        allQueries = r.readlines()

    # Establish connection.
    es = Elasticsearch()
    q = IndicesClient(es)
    allDoc = DocIDProvider()
    allDocFinal = allDoc.getDoc()

    # Get all queries in ascending order in a dictionary keyed by query number.
    queryDict = {}
    for each_query in allQueries:
        queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
            each_query.split()[1:])

    # Iterate through each query.
    for queryNo, query in queryDict.items():
        docDict = {}
        query = query.replace("Document", "")
        query = query.replace("report", "")
        # Elasticsearch analyze call for stopword removal and stemming.
        queryResult = q.analyze(index="ap_dataset",
                                analyzer="my_english",
                                text=query)
        totalTermInQuery = len(queryResult['tokens'])
        # Seed every document with the default (all terms unseen) score.
        for docId, default in allDocFinal.items():
            docDict[docId] = totalTermInQuery * default

        # Iterate through each term.
        print(" ------------ into query -----------------")
        for term in queryResult['tokens']:
            # Get each term.
            finalTerm = term['token']
            # Laplace-smoothed log probability, scored in a Groovy script;
            # 178097 is the vocabulary size of the collection.
            inlineQuery = (
                "double tf =_index['text'][\"" + finalTerm + "\"].tf(); "
                "double docLen = _source.text.split().size();"
                "double plaplace = (tf + 1.0)/(docLen + 178097.0); "
                "return Math.log10(plaplace)")
            termQuery = {
                "size": 10000,
                "_source": "false",
                "query": {
                    "match": {
                        "text": finalTerm
                    }
                },
                "script_fields": {
                    "plaplace": {
                        "script": {
                            "lang": "groovy",
                            "inline": inlineQuery
                        }
                    }
                }
            }
            try:
                res = es.search(
                    index="ap_dataset",
                    doc_type="HW1",
                    body=termQuery,
                    filter_path="_scroll_id,hits.hits._id,hits.hits.fields.plaplace,hits.total",
                    scroll="1m")
            except TransportError as e:
                print(e.info)
                continue  # Skip this term if the search failed (res would be unbound).
            docFreq = res['hits']['total']
            total = res
            if docFreq > 0:
                while len(total['hits']['hits']) != 0:
                    for items in total['hits']['hits']:
                        doc_id = items['_id']
                        if doc_id in docDict:
                            # Add the difference between the true smoothed score
                            # and the per-term default, log10(1 / 178097).
                            stats = (float(items['fields']['plaplace'][0])
                                     - math.log10(1.0 / 178097.0))
                            docDict[doc_id] = docDict[doc_id] + stats
                        # The else-branch (unseeded documents) was commented
                        # out in the original.
                    sid = total['_scroll_id']
                    total = es.scroll(scroll_id=sid, scroll='1m')
        r = RetrievalModel()
        r.laplaceSmoothing(queryNo, docDict)
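# The Groovy script above computes a Laplace (add-one) smoothed log
# probability per term over a 178097-term vocabulary. A plain-Python
# restatement of the formula for clarity:
import math


def laplace_log_prob(tf, doc_len, vocab_size=178097.0):
    # Add-one smoothing keeps unseen terms at a small nonzero probability.
    return math.log10((tf + 1.0) / (doc_len + vocab_size))


# The scroll loop above then adds the difference between this true score and
# the per-term default, log10(1 / 178097), to each seeded document total.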