Ejemplo n.º 1
0
class GenerationDispatcher:
  
  def __init__(self, config, trainingDataset, holdOutDataset, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.totalPhrasesDispatched = 0
    self.phrasesGenerated = 0
    self.phrasesNotGenerated = 0
    self.timeout = 86400000
    self.dispatcherName = "bayzee.generation.dispatcher"
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    self.workerName = "bayzee.generation.worker"
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    self.processingPageSize = config["processingPageSize"]
    config["processor_phrase_type"] = self.processorPhraseType
    
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

    # creating generation dispatcher
    self.generationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    
    # creating controle channel
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToGenerate(self):
    processorIndex = self.config["processor"]["index"]
    phraseProcessorType = self.config["processor"]["type"] + "__phrase"
    nextPhraseIndex = 0
    if self.config["processingStartIndex"] != None: nextPhraseIndex = self.config["processingStartIndex"]
    endPhraseIndex = -1
    if self.config["processingEndIndex"] != None: endPhraseIndex = self.config["processingEndIndex"]

    if endPhraseIndex != -1 and self.processingPageSize > (endPhraseIndex - nextPhraseIndex):
      self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1
    
    while True:
      phrases = self.esClient.search(index=processorIndex, doc_type=phraseProcessorType, body={"from": nextPhraseIndex,"size": self.processingPageSize, "query":{"match_all":{}},"sort":[{"phrase__not_analyzed":{"order":"asc"}}]}, fields=["_id"])
      if len(phrases["hits"]["hits"]) == 0: break
      self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
      floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
      self.logger.info("Generating features from " + str(nextPhraseIndex) + " to " + str(nextPhraseIndex+len(phrases["hits"]["hits"])) + " phrases...")
      for phraseData in phrases["hits"]["hits"]:
        self.logger.info("Dispatching phrase " + phraseData["_id"])
        content = {"phraseId": phraseData["_id"], "type": "generate", "count": 1, "from": self.dispatcherName}
        self.generationDispatcher.send(content, self.workerName, self.timeout)
      nextPhraseIndex += len(phrases["hits"]["hits"])
      if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex: break
    
    while True:
      message = self.generationDispatcher.receive()
      if "phraseId" in message["content"] and message["content"]["phraseId"] > 0:
        self.phrasesGenerated += 1
        self.generationDispatcher.close(message)
        self.logger.info("Generated for " + message["content"]["phraseId"] + str(self.phrasesGenerated) + "/" + str(self.totalPhrasesDispatched))
      
      if (self.phrasesGenerated + self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
        self.controlChannel.send("dying")
        break

    self.__terminate()
    
  def timeoutCallback(self, message):
    config.logger.info("Message timed out: " + str(message))
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.generationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired phrases
      self.phrasesNotGenerated += 1
      if self.phrasesNotGenerated == self.totalPhrasesDispatched or (self.phrasesGenerated + self.phrasesNotGenerated) == self.totalPhrasesDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalPhrasesDispatched) + " total dispatched")
    self.logger.info(str(self.phrasesGenerated) + " generated")
    self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
    self.logger.info("Generation complete")
    self.logger.info("Terminating generation dispatcher")
Ejemplo n.º 2
0
class GenerationDispatcher:
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating controle channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToGenerate(self):
        processorIndex = self.config["processor"]["index"]
        phraseProcessorType = self.config["processor"]["type"] + "__phrase"
        nextPhraseIndex = 0
        if self.config["processingStartIndex"] != None:
            nextPhraseIndex = self.config["processingStartIndex"]
        endPhraseIndex = -1
        if self.config["processingEndIndex"] != None:
            endPhraseIndex = self.config["processingEndIndex"]

        if endPhraseIndex != -1 and self.processingPageSize > (
                endPhraseIndex - nextPhraseIndex):
            self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1

        while True:
            phrases = self.esClient.search(index=processorIndex,
                                           doc_type=phraseProcessorType,
                                           body={
                                               "from":
                                               nextPhraseIndex,
                                               "size":
                                               self.processingPageSize,
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "phrase__not_analyzed": {
                                                       "order": "asc"
                                                   }
                                               }]
                                           },
                                           fields=["_id"])
            if len(phrases["hits"]["hits"]) == 0: break
            self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
            floatPrecision = "{0:." + str(
                self.config["generator"]["floatPrecision"]) + "f}"
            self.logger.info("Generating features from " +
                             str(nextPhraseIndex) + " to " +
                             str(nextPhraseIndex +
                                 len(phrases["hits"]["hits"])) + " phrases...")
            for phraseData in phrases["hits"]["hits"]:
                self.logger.info("Dispatching phrase " + phraseData["_id"])
                content = {
                    "phraseId": phraseData["_id"],
                    "type": "generate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.generationDispatcher.send(content, self.workerName,
                                               self.timeout)
            nextPhraseIndex += len(phrases["hits"]["hits"])
            if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex:
                break

        while True:
            message = self.generationDispatcher.receive()
            if "phraseId" in message[
                    "content"] and message["content"]["phraseId"] > 0:
                self.phrasesGenerated += 1
                self.generationDispatcher.close(message)
                self.logger.info("Generated for " +
                                 message["content"]["phraseId"] +
                                 str(self.phrasesGenerated) + "/" +
                                 str(self.totalPhrasesDispatched))

            if (self.phrasesGenerated +
                    self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
                self.controlChannel.send("dying")
                break

        self.__terminate()

    def timeoutCallback(self, message):
        config.logger.info("Message timed out: " + str(message))
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.generationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            #log implementation yet to be done for expired phrases
            self.phrasesNotGenerated += 1
            if self.phrasesNotGenerated == self.totalPhrasesDispatched or (
                    self.phrasesGenerated +
                    self.phrasesNotGenerated) == self.totalPhrasesDispatched:
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalPhrasesDispatched) + " total dispatched")
        self.logger.info(str(self.phrasesGenerated) + " generated")
        self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
        self.logger.info("Generation complete")
        self.logger.info("Terminating generation dispatcher")
Ejemplo n.º 3
0
class AnnotationDispatcher:
  
  def __init__(self, config, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)

    analyzerIndexSettings = {
      "index":{
        "analysis":{
          "analyzer":{
            "analyzer_shingle":{
              "type": "custom",
              "tokenizer": "standard",
              "filter": ["standard", "lowercase", "filter_shingle"]
            }
          },
          "filter":{
            "filter_shingle":{
              "type": "shingle",
              "max_shingle_size": config["generator"]["maxShingleSize"],
              "min_shingle_size": config["generator"]["minShingleSize"],
              "output_unigrams": (config["generator"]["minShingleSize"] == 1)
            },
            "filter_stop":{
              "type": "stop"
            }
          }
        }
      }
    }
    analyzerIndexTypeMapping = {
      "properties":{
        "phrase":{"type":"string"},
        "document_id":{"type":"string", "index": "not_analyzed"},
        "phrase__not_analyzed":{"type":"string","index":"not_analyzed"}
      }
    }
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = corpusSize["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

    if processingStartIndex == 0:
      if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
      data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
        
    if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
      try:
        if self.esClient.indices.exists(self.config["processor"]["index"]):
          self.esClient.indices.delete(self.config["processor"]["index"])
        self.esClient.indices.create(self.config["processor"]["index"])
        self.esClient.indices.put_mapping(index=self.config["processor"]["index"],doc_type=self.processorPhraseType,body=analyzerIndexTypeMapping)
        if self.esClient.indices.exists(self.analyzerIndex):
          self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
      except:
        error = sys.exc_info()
        self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
        sys.exit(1)
      else:
        sleep(1)

    #dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

    #remote channel intialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToAnnotate(self):
    if "indexPhrases" in self.config and self.config["indexPhrases"] == False: return
    nextDocumentIndex = 0
    if self.config["processingStartIndex"] != None: nextDocumentIndex = self.config["processingStartIndex"]
    endDocumentIndex = -1
    if self.config["processingEndIndex"] != None: endDocumentIndex = self.config["processingEndIndex"]
   
    if endDocumentIndex != -1 and self.processingPageSize > (endDocumentIndex - nextDocumentIndex):
      self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

    self.totalDocumentsDispatched = 0

    while True:
      documents = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body={"from": nextDocumentIndex,"size": self.processingPageSize,"query":{"match_all":{}}, "sort":[{"_id":{"order":"asc"}}]}, fields=["_id"])
      if len(documents["hits"]["hits"]) == 0: 
        break
      self.totalDocumentsDispatched += len(documents["hits"]["hits"])
      self.logger.info("Annotating " + str(nextDocumentIndex) + " to " + str(nextDocumentIndex+len(documents["hits"]["hits"])) + " documents...")
      for document in documents["hits"]["hits"]:
        self.logger.info("Dispatching document " + document["_id"])
        content = {"documentId": document["_id"], "type": "annotate", "count": 1, "from":self.dispatcherName}
        self.annotationDispatcher.send(content, self.workerName)
      nextDocumentIndex += len(documents["hits"]["hits"])
      if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex: 
        break
    
    self.logger.info(str(self.totalDocumentsDispatched) + " documents dispatched")
    while True:
      message = self.annotationDispatcher.receive()
      if "documentId" in message["content"] and message["content"]["documentId"] > 0:
        self.documentsAnnotated += 1
        self.annotationDispatcher.close(message)
        self.logger.info("Annotated document " + message["content"]["documentId"] + " - " + str(self.documentsAnnotated) + "/" + str(self.totalDocumentsDispatched))
      
      if (self.documentsAnnotated + self.documentsNotAnnotated) >= self.totalDocumentsDispatched and not self.lastDispatcher:
        self.controlChannel.send("dying")
        self.annotationDispatcher.end()
        break
    
    self.__terminate()

  def timeoutCallback(self, message):
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.annotationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired documents
      self.documentsNotAnnotated += 1
      if self.documentsNotAnnotated == self.totalDocumentsDispatched or (self.documentsAnnotated + self.documentsNotAnnotated) == self.totalDocumentsDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalDocumentsDispatched) + " total dispatched")
    self.logger.info(str(self.documentsAnnotated) + " annotated")
    self.logger.info(str(self.documentsNotAnnotated) + " failed to annotate")
    self.logger.info("Annotation complete")
    self.logger.info("Terminating annotation dispatcher")

  def __deleteAnalyzerIndex(self):
    if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
Ejemplo n.º 4
0
class AnnotationDispatcher:
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        self.analyzerIndex = self.corpusIndex + "__analysis__"
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.config["processingPageSize"] = self.processingPageSize
        self.totalDocumentsDispatched = 0
        self.documentsAnnotated = 0
        self.documentsNotAnnotated = 0
        self.lastDispatcher = False
        self.endProcess = False
        self.dispatcherName = "bayzee.annotation.dispatcher"
        self.workerName = "bayzee.annotation.worker"
        self.timeout = 86400000
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)

        analyzerIndexSettings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter":
                            ["standard", "lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type":
                            "shingle",
                            "max_shingle_size":
                            config["generator"]["maxShingleSize"],
                            "min_shingle_size":
                            config["generator"]["minShingleSize"],
                            "output_unigrams":
                            (config["generator"]["minShingleSize"] == 1)
                        },
                        "filter_stop": {
                            "type": "stop"
                        }
                    }
                }
            }
        }
        analyzerIndexTypeMapping = {
            "properties": {
                "phrase": {
                    "type": "string"
                },
                "document_id": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "phrase__not_analyzed": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
        corpusSize = self.esClient.count(index=self.corpusIndex,
                                         doc_type=self.corpusType,
                                         body={"query": {
                                             "match_all": {}
                                         }})
        self.corpusSize = corpusSize["count"]
        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        if processingStartIndex == 0:
            if self.esClient.indices.exists(self.analyzerIndex):
                self.esClient.indices.delete(self.analyzerIndex)
            data = self.esClient.indices.create(self.analyzerIndex,
                                                analyzerIndexSettings)

        if "annotateFromScratch" not in self.config or self.config[
                "annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(
                        self.config["processor"]["index"]):
                    self.esClient.indices.delete(
                        self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(
                    index=self.config["processor"]["index"],
                    doc_type=self.processorPhraseType,
                    body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex,
                                                    analyzerIndexSettings)
            except:
                error = sys.exc_info()
                self.logger.error(
                    "Error occurred during initialization of analyzer index: "
                    + str(error))
                sys.exit(1)
            else:
                sleep(1)

        #dispatcher creation
        self.annotationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        #remote channel intialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToAnnotate(self):
        if "indexPhrases" in self.config and self.config[
                "indexPhrases"] == False:
            return
        nextDocumentIndex = 0
        if self.config["processingStartIndex"] != None:
            nextDocumentIndex = self.config["processingStartIndex"]
        endDocumentIndex = -1
        if self.config["processingEndIndex"] != None:
            endDocumentIndex = self.config["processingEndIndex"]

        if endDocumentIndex != -1 and self.processingPageSize > (
                endDocumentIndex - nextDocumentIndex):
            self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

        self.totalDocumentsDispatched = 0

        while True:
            documents = self.esClient.search(index=self.corpusIndex,
                                             doc_type=self.corpusType,
                                             body={
                                                 "from":
                                                 nextDocumentIndex,
                                                 "size":
                                                 self.processingPageSize,
                                                 "query": {
                                                     "match_all": {}
                                                 },
                                                 "sort": [{
                                                     "_id": {
                                                         "order": "asc"
                                                     }
                                                 }]
                                             },
                                             fields=["_id"])
            if len(documents["hits"]["hits"]) == 0:
                break
            self.totalDocumentsDispatched += len(documents["hits"]["hits"])
            self.logger.info("Annotating " + str(nextDocumentIndex) + " to " +
                             str(nextDocumentIndex +
                                 len(documents["hits"]["hits"])) +
                             " documents...")
            for document in documents["hits"]["hits"]:
                self.logger.info("Dispatching document " + document["_id"])
                content = {
                    "documentId": document["_id"],
                    "type": "annotate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.annotationDispatcher.send(content, self.workerName)
            nextDocumentIndex += len(documents["hits"]["hits"])
            if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex:
                break

        self.logger.info(
            str(self.totalDocumentsDispatched) + " documents dispatched")
        while True:
            message = self.annotationDispatcher.receive()
            if "documentId" in message[
                    "content"] and message["content"]["documentId"] > 0:
                self.documentsAnnotated += 1
                self.annotationDispatcher.close(message)
                self.logger.info("Annotated document " +
                                 message["content"]["documentId"] + " - " +
                                 str(self.documentsAnnotated) + "/" +
                                 str(self.totalDocumentsDispatched))

            if (self.documentsAnnotated + self.documentsNotAnnotated
                ) >= self.totalDocumentsDispatched and not self.lastDispatcher:
                self.controlChannel.send("dying")
                self.annotationDispatcher.end()
                break

        self.__terminate()

    def timeoutCallback(self, message):
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.annotationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            #log implementation yet to be done for expired documents
            self.documentsNotAnnotated += 1
            if self.documentsNotAnnotated == self.totalDocumentsDispatched or (
                    self.documentsAnnotated + self.documentsNotAnnotated
            ) == self.totalDocumentsDispatched:
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalDocumentsDispatched) + " total dispatched")
        self.logger.info(str(self.documentsAnnotated) + " annotated")
        self.logger.info(
            str(self.documentsNotAnnotated) + " failed to annotate")
        self.logger.info("Annotation complete")
        self.logger.info("Terminating annotation dispatcher")

    def __deleteAnalyzerIndex(self):
        if self.esClient.indices.exists(self.analyzerIndex):
            self.esClient.indices.delete(self.analyzerIndex)