Example #1
 def test_tokenize(self):
     test_case = {
         "text": "2018-06-08T00:00:00Z INFO GET /v1/bundles/7ef8966b-45ef-4e0a-a51b-44a865372050.2018-06-08T230333.785338Z?param1=1&param2=2 {\"key\": \"value\"}"
     }
     index_name = self.es_client._format_today_index_name(self.index_prefix)
     index_client = IndicesClient(TestESClient.es)
     with self.new_index(index_name):
         response = index_client.analyze(index=index_name, body=test_case)
         tokens = [t['token'] for t in response['tokens']]
     self.assertEqual(set(tokens), {
         '7ef8966b-45ef-4e0a-a51b-44a865372050',
         '2018-06-08T230333.785338Z',
         ':',
         'INFO',
         '1',
         '2',
         'v1',
         'bundles',
         'key',
         'GET',
         'param2',
         'param1',
         '2018-06-08T00:00:00Z',
         'value'
     })
     self.assertEqual(len(tokens), 14)
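For reference, the same analysis request can be issued outside the test harness. A minimal sketch, assuming a locally reachable cluster and an index that already defines the analyzer exercised above (the index name below is hypothetical):

from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

es = Elasticsearch()                      # assumes a local cluster on the default port
index_client = IndicesClient(es)
# Run the index's analyzer over one log line and collect the emitted tokens.
response = index_client.analyze(
    index="logs-2018-06-08",              # hypothetical index name
    body={"text": "2018-06-08T00:00:00Z INFO GET /v1/bundles/7ef8966b?param1=1"})
tokens = [t["token"] for t in response["tokens"]]
print(tokens)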
Example #2
 def get_morpheme(self, text):
     setting = {
         "analyzer": "my_analyzer",
         "text": text,
         "attributes": ["posType", "leftPOS", "rightPOS", "morphemes", "reading"],
         "explain": "true"
     }
     i = IndicesClient(self.es)
     response = i.analyze(index="pos", body=setting)
     tokens = response.get('detail').get('tokenizer').get('tokens')
     # Remove duplicates: keep one entry per distinct token string.
     token_list = list({token['token']: token for token in tokens}.values())
     return token_list
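The dict comprehension at the end keeps one token object per distinct token string (later duplicates overwrite earlier ones). A tiny self-contained illustration of that deduplication step, using made-up token dictionaries:

tokens = [{"token": "run", "leftPOS": "VV"},
          {"token": "run", "leftPOS": "VV"},
          {"token": "fast", "leftPOS": "MAG"}]
# Keys collide on the token string, so only one entry per token survives.
deduped = list({token["token"]: token for token in tokens}.values())
print(deduped)   # two entries: 'run' and 'fast'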
Example #3
    def __init__(self):
        # Get all queries
        directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
        with open(directory, "r") as r:
            allQueries = r.readlines()

        # Establish connection
        es = Elasticsearch()
        q = IndicesClient(es)
        allDocID = DocIDProviderForJelinek()
        allDoc = allDocID.getDocID()
        # Get all queries in ascending order in dictionary
        queryDict = {}
        for each_query in allQueries:
            queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
                each_query.split()[1:])
        # Iterate through each query.
        for queryNo, query in queryDict.items():
            print("Into query ----------------")
            # Per-query accumulator of document scores (kept separate from queryDict).
            docDict = {}
            query = query.replace("Document", "")
            # Elastic search query for stopwords removal and stemming
            queryResult = q.analyze(index="ap_dataset",
                                    analyzer="my_english",
                                    text=query)
            # Iterate through each term
            for term in queryResult['tokens']:
                finalTerm = term['token']
                termDict = termmDictFunction(allDoc, finalTerm)
                matchDict = matchDictFunction(finalTerm)

                # Overwrite the default per-term score with the matched score.
                for docID, value in matchDict.items():
                    termDict[docID] = value

                # Accumulate per-document scores for this query.
                for docID, value in termDict.items():
                    docDict[docID] = docDict.get(docID, 0) + value
            r = RetrievalModel()
            r.unigramJM(queryNo, docDict)
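Note that the analyzer=/text= keyword form used above only works with older elasticsearch-py clients; on newer clients the same request is expressed through the request body. A minimal sketch, assuming the "ap_dataset" index and its "my_english" analyzer exist:

from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient

es = Elasticsearch()
indices = IndicesClient(es)
# Stopword removal and stemming via the index's custom analyzer.
result = indices.analyze(index="ap_dataset",
                         body={"analyzer": "my_english", "text": "oil price increases"})
terms = [t["token"] for t in result["tokens"]]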
Example #4
class ElasticAnalyzer:
    def __init__(self, language="english"):
        self.core = ElasticCore()
        self.indices_client = IndicesClient(self.core.es)
        self.splitter = TextSplitter(split_by="WORD_LIMIT")
        self.language = language

    def chunk_input(self, text):
        # Split input if token count greater than 5K.
        # Elastic will complain if token count exceeds 10K.
        docs = self.splitter.split(text, max_limit=5000)
        # Extract text chunks from docs.
        text_chunks = [doc["text"] for doc in docs]
        return text_chunks

    def apply_analyzer(self, body):
        try:
            analysis = self.indices_client.analyze(body=body)
            tokens = [token["token"] for token in analysis["tokens"]]
            token_string = " ".join(tokens)
            return token_string
        except elasticsearch.exceptions.RequestError as e:
            reason = e.info["error"]["reason"]
            if "Invalid stemmer class" in reason:
                logging.getLogger(ERROR_LOGGER).warning(e)
            else:
                logging.getLogger(ERROR_LOGGER).exception(e)
            return ""
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
            return ""

    def _prepare_stem_body(self,
                           text,
                           language,
                           strip_html: bool,
                           tokenizer="standard"):
        body = {
            "text": text,
            "tokenizer": tokenizer,
            "filter": [{
                "type": "snowball",
                "language": language
            }]
        }
        if strip_html:
            body["char_filter"] = ["html_strip"]
        return body

    def stem_text(self,
                  text: str,
                  language: Optional[str],
                  strip_html=True,
                  tokenizer="standard"):
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = self._prepare_stem_body(chunk, language, strip_html,
                                           tokenizer)
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    def _prepare_tokenizer_body(self,
                                text,
                                tokenizer="standard",
                                strip_html: bool = True):
        body = {"text": text, "tokenizer": tokenizer}
        if strip_html:
            body["char_filter"] = ["html_strip"]
        return body

    def tokenize_text(self, text, tokenizer="standard", strip_html=True):
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = self._prepare_tokenizer_body(chunk, tokenizer, strip_html)
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    def analyze(self, text: str, body: dict) -> str:
        analysed_chunks = []
        text_chunks = self.chunk_input(text)
        for chunk in text_chunks:
            body = {
                **body,
                "text": chunk,
            }
            response = self.apply_analyzer(body)
            analysed_chunks.append(response)
        return " ".join(analysed_chunks)

    # This only exists because removing it would break backwards compatibility
    # with the texta-tools library, which calls this method as the lemmatizer.
    def lemmatize(self, text):
        return self.stem_text(text=text,
                              language=self.language,
                              strip_html=True)
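A usage sketch for ElasticAnalyzer, assuming ElasticCore and TextSplitter from the surrounding project are importable and point at a reachable cluster; the sample strings are illustrative:

analyzer = ElasticAnalyzer(language="english")
stemmed = analyzer.stem_text("Running <b>quickly</b> through the fields",
                             language="english", strip_html=True)
tokens = analyzer.tokenize_text("Running quickly through the fields")
print(stemmed)
print(tokens)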
Example #5
class handEl:
    def __init__(self,
                 host='localhost',
                 port=9200,
                 index="default",
                 props=None):
        if self.connect(host, port):
            print("Elasticsearch Connection Success [{}:{}]".format(
                host, port))
            self.es = Elasticsearch(host=host, port=port)
            self.ies = IndicesClient(self.es)
            self.host = host
            self.port = port
            self.index = index
            self.result = None
            self.response = None
            self.props = props
        else:
            pass

    def connect(self, host, port):
        try:
            res = requests.get(url="http://{}:{}".format(host, port))
            if res.status_code != 200:
                raise ConnectionError(
                    "Elasticsearch Connection Failed. [{}:{}]".format(
                        host, port))
            return True
        except Exception as e:
            self.error(e, "CONNECT")
            return False

    def __str__(self):
        return "\nDescriptions for handEl\nhost : {}, port : {}, index : {}\ncurrent result : {}\n" \
                .format(self.host, self.port, self.index, self.result)

    def __call__(self):
        print("\nCurrent response ▼")
        pp(self.response)

    def indexing(self, index):
        self.index = index

    def search(self, value, index=None, field="alias", fuzziness=0):
        if index: self.indexing(index)
        if self.match(value, field=field, fuzziness=fuzziness):
            return self.result
        else:
            return None

    def analyze(self, value: str, options: list, analyzer="my_analyzer"):
        if not value or not self.index: return []
        body = {
            "analyzer": analyzer,
            "text": value,
            "attributes": options,
            "explain": True
        }
        analysis = self.ies.analyze(index=self.index, body=body)['detail']
        # With the '-f' option, return the tokens after the first token filter;
        # otherwise return the raw tokenizer output.
        if '-f' in options:
            return analysis['tokenfilters'][0]['tokens']
        return analysis['tokenizer']['tokens']

    def tokenize(self, value, tokenizer="natural_tokenizer"):
        if not value: return []
        # Analyze using the analyzer mapped to the "comments" field of the given index.
        tokens = self.ies.analyze(index=tokenizer,
                                  body={
                                      "field": "comments",
                                      "text": value
                                  })
        if tokens['tokens']:
            return [token['token'] for token in tokens['tokens']]
        else:
            return []

    def match(self, value, field="alias", fuzziness=0):
        body = {
            "query": {
                "match": {
                    field: {
                        "query": value,
                        "fuzziness": fuzziness
                    }
                }
            }
        }
        return self.parse(self.es.search(index=self.index, body=body))

    def parse(self, res, score=1.0):
        self.response = res
        if res["hits"]["total"]["value"] > 0:
            self.result = {
                'match_num': res["hits"]["total"]["value"],
                'match_sco': res["hits"]["max_score"],
                'match_ids': [hit["_id"] for hit in res["hits"]["hits"]],
                'match_res': [hit["_source"] for hit in res["hits"]["hits"]],
            }
            return True
        else:
            self.result = None
            return False

    def doc(self, docid):
        if self.es.exists(index=self.index, id=docid):
            res = self.es.get(index=self.index, id=docid)
            self.result = {
                'match_num': 1,
                'match_sco': res["_primary_term"],
                'match_ids': [res["_id"]],
                'match_res': res["_source"],
            }
            return True
        else:
            self.result = None
            return False

    def num(self):
        return self.result['match_num']

    # Create id - documents
    def prope(self, docid, documents, overturn=False, index=False):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            if index: self.indexing(index)
            if self.doc(docid): self.update(docid, documents, overturn)
            else: self.create(docid, documents)
        except Exception as e:
            self.error(e, "PROPE")

    def unionDict(self, dict0, dict1, appending=True):
        if appending:
            # Merge list-valued fields rather than overwriting them.
            for k, v in [item for item in dict0.items() if isinstance(item[1], list)]:
                dict1[k] = list(set(dict1.get(k, []) + v))
        dict0.update(dict1)
        return dict0

    def update(self, docid, documents, overturn=False):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            if overturn:
                self.es.update(index=self.index,
                               id=docid,
                               body={'doc': {**documents}})
            else:
                # Merge the new fields into the stored document before updating.
                merged = self.unionDict(self.result['match_res'], documents)
                pp(merged)
                self.es.update(index=self.index,
                               id=self.result['match_ids'][0],  # doc() stores a single-element list
                               body={'doc': {**merged}})
        except Exception as e:
            self.error(e, "UPDATE")

    def create(self, docid, documents):
        try:
            if not isinstance(documents, dict):
                raise Exception("The type of contents is not a dictionary!")
            self.es.create(index=self.index, id=docid, body={**documents})
        except Exception as e:
            self.error(e, "CREATE")

    def error(self, e, msg=""):
        print("ERROR {} : {}".format(msg, e))
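A usage sketch for handEl, assuming a local Elasticsearch node and an existing index; the index name, document id, and field values below are hypothetical:

el = handEl(host='localhost', port=9200, index="people")
el.prope("doc-1", {"alias": "John Doe", "tags": ["customer"]})   # create, or merge-update if present
result = el.search("Jon Doe", field="alias", fuzziness=1)        # fuzzy match on the alias field
if result:
    print(result['match_ids'], result['match_sco'])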
Example #6
 def analyze(self, es, indexName, text, analyzer):
     iclient = IndicesClient(es)
     print(iclient.analyze(indexName, text, analyzer=analyzer))
Example #7
    def __init__(self):
        # Get all queries
        directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
        with open(directory, "r") as r:
            allQueries = r.readlines()

        # Establish connection
        es = Elasticsearch()
        q = IndicesClient(es)

        # Get all queries in ascending order in dictionary
        queryDict = {}
        for each_query in allQueries:
            queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
                each_query.split()[1:])

        # Iterate through each query.
        for queryNo, query in queryDict.items():
            docDict = {}
            query = query.replace("Document", "")
            # Elastic search query for stopwords removal and stemming
            queryResult = q.analyze(index="ap_dataset",
                                    analyzer="my_english",
                                    text=query)
            # Iterate through each term
            print(" ------------ into query -----------------")
            for term in queryResult['tokens']:
                # Get each term
                finalTerm = term['token']
                inlineQuery = "double tf =_index['text']" + "[\"" + finalTerm + "\"].tf(); double df =_index['text'] " + "[\"" + finalTerm + "\"].df(); double ttf = _index['text']" + "[\"" + finalTerm + "\"].ttf(); int docLen = _source.text.split().size(); return tf/(tf + 0.5 + 1.5*(docLen/441.5))"
                termQuery = {
                    "size": 5000,
                    "_source": "false",
                    "query": {
                        "term": {
                            "text": finalTerm
                        }
                    },
                    "script_fields": {
                        "okapiScore": {
                            "script": {
                                "lang": "groovy",
                                "inline": inlineQuery
                            }
                        }
                    }
                }
                try:
                    res = es.search(
                        index="ap_dataset",
                        doc_type="HW1",
                        body=termQuery,
                        filter_path=
                        "_scroll_id,hits.hits._id,hits.hits.fields.okapiScore,hits.total",
                        scroll="1m")
                except TransportError as e:
                    print(e.info)
                    continue  # skip this term if the search fails
                docFreq = res['hits']['total']
                total = res
                if docFreq > 0:
                    while len(total['hits']['hits']) != 0:
                        for items in total['hits']['hits']:
                            doc_id = items['_id']
                            # Accumulate the Okapi TF score for this document.
                            stats = float(items['fields']['okapiScore'][0])
                            docDict[doc_id] = docDict.get(doc_id, 0) + stats
                        sid = total['_scroll_id']
                        # print(" preparing for scroll ")
                        total = es.scroll(scroll_id=sid, scroll='1m')

            r = RetrievalModel()
            r.okapiTfcalculate(queryNo, docDict)
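The inline Groovy script above computes an Okapi-style TF weight per document. A standalone Python sketch of the same arithmetic (441.5 is the average-document-length constant taken from the script; the function name and sample numbers are illustrative):

def okapi_tf(tf, doc_len, avg_doc_len=441.5):
    # Okapi TF weighting, as in the inline script: tf / (tf + 0.5 + 1.5 * (docLen / avgDocLen))
    return tf / (tf + 0.5 + 1.5 * (doc_len / avg_doc_len))

# e.g. a term occurring 3 times in a 500-word document
print(okapi_tf(3, 500))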
Example #8
def analyze_text(text, es_client, analyzer_index='analisis', analyzer_name='clean'):
    analyze_client = IndicesClient(client=es_client)
    res = analyze_client.analyze(index=analyzer_index, analyzer=analyzer_name, body=text)

    return [x['token'] for x in res['tokens']]
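A usage sketch for analyze_text, assuming a reachable cluster and an index named 'analisis' that defines the 'clean' analyzer (both come from the defaults above); the sample text is illustrative:

from elasticsearch import Elasticsearch

es = Elasticsearch()
print(analyze_text("Some raw text to clean and tokenize", es))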
Example #9
    def __init__(self):
        # Get all queries
        directory = "D:\\IR\\IR_data\\AP_DATA" + "\\" + "query_desc.51-100.short.txt"
        with open(directory, "r") as r:
            allQueries = r.readlines()
        # Establish connection
        es = Elasticsearch()
        q = IndicesClient(es)
        allDoc = DocIDProvider()
        allDocFinal = allDoc.getDoc()
        # Get all queries in ascending order in dictionary
        queryDict = {}
        for each_query in allQueries:
            queryDict[int(each_query.split()[0].replace(".", ""))] = ' '.join(
                each_query.split()[1:])

        # Iterate through each query.
        for queryNo, query in queryDict.items():
            docDict = {}
            query = query.replace("Document", "")
            query = query.replace("report", "")

            # Elastic search query for stopwords removal and stemming
            queryResult = q.analyze(index="ap_dataset",
                                    analyzer="my_english",
                                    text=query)
            totalTermInQuery = len(queryResult['tokens'])

            for docId, default in allDocFinal.items():
                docDict[docId] = totalTermInQuery * default
            # Iterate through each term
            print(" ------------ into query -----------------")
            for term in queryResult['tokens']:
                # Get each term
                finalTerm = term['token']
                inlineQuery = "double tf =_index['text']" + "[\"" + finalTerm + "\"].tf(); double docLen = _source.text.split().size();double plaplace = (tf + 1.0)/(docLen + 178097.0); return Math.log10(plaplace)"
                termQuery = {
                    "size": 10000,
                    "_source": "false",
                    "query": {
                        "match": {
                            "text": finalTerm
                        }
                    },
                    "script_fields": {
                        "plaplace": {
                            "script": {
                                "lang": "groovy",
                                "inline": inlineQuery
                            }
                        }
                    }
                }
                try:
                    res = es.search(
                        index="ap_dataset",
                        doc_type="HW1",
                        body=termQuery,
                        filter_path=
                        "_scroll_id,hits.hits._id,hits.hits.fields.plaplace,hits.total",
                        scroll="1m")
                except TransportError as e:
                    print(e.info)
                    continue  # skip this term if the search fails
                docFreq = res['hits']['total']
                total = res
                if docFreq > 0:
                    while len(total['hits']['hits']) != 0:
                        for items in total['hits']['hits']:
                            doc_id = items['_id']
                            if doc_id in docDict:
                                # Swap this term's unseen-term background estimate for the
                                # Laplace-smoothed score returned by the script field.
                                stats = float(items['fields']['plaplace'][0]) \
                                    - math.log10(1.0 / 178097.0)
                                docDict[doc_id] = docDict[doc_id] + stats
                        sid = total['_scroll_id']
                        total = es.scroll(scroll_id=sid, scroll='1m')

            r = RetrievalModel()
            r.laplaceSmoothing(queryNo, docDict)
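The script field above returns a Laplace-smoothed unigram log-probability. A standalone Python sketch of the same arithmetic (178097.0 is the vocabulary-size constant taken from the script; the function name and sample numbers are illustrative):

import math

def laplace_log_prob(tf, doc_len, vocab_size=178097.0):
    # Add-one (Laplace) smoothing, as in the inline script: log10((tf + 1) / (docLen + V))
    return math.log10((tf + 1.0) / (doc_len + vocab_size))

# Unseen term vs. a term seen 3 times in a 500-word document
print(laplace_log_prob(0, 500), laplace_log_prob(3, 500))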