# Shared imports for the snippets in this listing. DurableChannel and
# RemoteChannel are the project's own messaging classes and orange is the
# legacy Orange 2.x library used by the classification worker; their import
# paths are not shown in these snippets, so they are only noted in comments.
import json
import sys
from time import sleep

from elasticsearch import Elasticsearch
# import orange
# DurableChannel, RemoteChannel: imported from the project's messaging module

def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}

        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [feature["name"] for feature in module["features"]]

        self.totalPhrasesDispatched = 0
        self.phrasesClassified = 0
        self.phrasesNotClassified = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.classification.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
        self.workerName = "bayzee.classification.worker"

        # creating classification dispatcher
        self.classificationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

        # remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
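
# The constructors in this listing only read a handful of keys from `config`.
# Below is a minimal sketch of that dictionary, reconstructed from the keys
# accessed in these snippets; every concrete value (host, index names, feature
# names, page size) is a placeholder assumption, not part of the original
# project.
import logging

exampleConfig = {
    "logger": logging.getLogger("bayzee"),
    "elasticsearch": {"host": "localhost", "port": 9200},
    "corpus": {
        "index": "corpus-index",            # assumed index name
        "type": "document",                 # assumed document type
        "text_fields": ["title", "body"]    # assumed text fields
    },
    "processor": {
        "index": "processor-index",         # assumed index name
        "type": "phrase-processor",         # "__phrase" is appended for phrase docs
        "modules": []                        # each module may contribute its own "features" list
    },
    "generator": {
        "features": [{"name": "max_score"}, {"name": "doc_count"}],
        "floatPrecision": 2,
        "maxShingleSize": 3,
        "minShingleSize": 2
    },
    "processingPageSize": 100,
    "processor_instances": []               # objects exposing annotate()/extractFeatures()
}
# optional flags read later in the listing: "annotateFromScratch", "indexPhrases"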
# Example 2
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = [feature["name"]
                             for feature in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [feature["name"]
                                  for feature in module["features"]]

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating control channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
# Example 3
  def annotate(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "annotate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        documentId = message["content"]["documentId"]
        document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id = documentId, fields=self.corpusFields)
        if "fields" in document:  
          for field in self.corpusFields:
            shingles = []
            if field in document["fields"]:
              if type(document["fields"][field]) is list:
                for element in document["fields"][field]:
                  if len(element) > 0:
                    shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle")
                    shingles += shingleTokens["tokens"]
              else:
                if len(document["fields"][field]) > 0:
                  shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"]
              shingles = [self.__replaceUnderscore(shingle) for shingle in shingles]
              shingles = [shingle for shingle in shingles if self.__filterTokens(shingle)]
            if len(shingles) > 0:
              for shingle in shingles:
                phrase = shingle["token"]
                key = self.__keyify(phrase)
                if len(key) > 0:
                  data = {"phrase": phrase,"phrase__not_analyzed": phrase,"document_id": document["_id"]}
                  if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key):
                    self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data)
        sleep(1)
        for processorInstance in self.config["processor_instances"]:
          processorInstance.annotate(self.config, documentId)
        self.worker.reply(message, {"documentId": documentId, "status" : "processed", "type" : "reply"}, self.timeout)

    self.logger.info("Terminating annotation worker")
  def classify(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "classify":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        self.phraseId = message["content"]["phraseId"]
        if self.classifier == None:
          self.trainD = self.__loadDataFromES("train", None)
          self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
          self.__train()

        self.trainD = self.__loadDataFromES("train", None)
        testD = self.__loadDataFromES("test", self.trainD.domain)
      
        self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
        testD = orange.ExampleTable(self.trainD.domain, testD)

        for row in testD:
          phrase = row.getmetas().values()[0].value
          featureSet = {}
          for i,feature in enumerate(self.features):
            featureSet[feature["name"]] = row[i].value

          prob = self.classifier.prob_classify(featureSet).prob("1")
          classType = self.classifier.classify(featureSet)
          self.phraseData["_source"]["prob"] = prob
          self.phraseData["_source"]["class_type"] = classType
          self.logger.info("Classified '" + phrase + "' as " + classType + " with probability " + str(prob))
          self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId, body=self.phraseData["_source"])
          self.worker.reply(message, {"phraseId": self.phraseId, "status" : "classified", "type" : "reply"}, 120000000)   

    self.logger.info("Terminating classification worker")
class GenerationDispatcher:
  
  def __init__(self, config, trainingDataset, holdOutDataset, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.totalPhrasesDispatched = 0
    self.phrasesGenerated = 0
    self.phrasesNotGenerated = 0
    self.timeout = 86400000
    self.dispatcherName = "bayzee.generation.dispatcher"
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    self.workerName = "bayzee.generation.worker"
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    self.processingPageSize = config["processingPageSize"]
    config["processor_phrase_type"] = self.processorPhraseType
    
    self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
      self.featureNames += [feature["name"] for feature in module["features"]]

    # creating generation dispatcher
    self.generationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    
    # creating control channel
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToGenerate(self):
    processorIndex = self.config["processor"]["index"]
    phraseProcessorType = self.config["processor"]["type"] + "__phrase"
    nextPhraseIndex = 0
    if self.config["processingStartIndex"] != None: nextPhraseIndex = self.config["processingStartIndex"]
    endPhraseIndex = -1
    if self.config["processingEndIndex"] != None: endPhraseIndex = self.config["processingEndIndex"]

    if endPhraseIndex != -1 and self.processingPageSize > (endPhraseIndex - nextPhraseIndex):
      self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1
    
    while True:
      phrases = self.esClient.search(index=processorIndex, doc_type=phraseProcessorType, body={"from": nextPhraseIndex,"size": self.processingPageSize, "query":{"match_all":{}},"sort":[{"phrase__not_analyzed":{"order":"asc"}}]}, fields=["_id"])
      if len(phrases["hits"]["hits"]) == 0: break
      self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
      floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
      self.logger.info("Generating features from " + str(nextPhraseIndex) + " to " + str(nextPhraseIndex+len(phrases["hits"]["hits"])) + " phrases...")
      for phraseData in phrases["hits"]["hits"]:
        self.logger.info("Dispatching phrase " + phraseData["_id"])
        content = {"phraseId": phraseData["_id"], "type": "generate", "count": 1, "from": self.dispatcherName}
        self.generationDispatcher.send(content, self.workerName, self.timeout)
      nextPhraseIndex += len(phrases["hits"]["hits"])
      if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex: break
    
    while True:
      message = self.generationDispatcher.receive()
      if "phraseId" in message["content"] and message["content"]["phraseId"] > 0:
        self.phrasesGenerated += 1
        self.generationDispatcher.close(message)
        self.logger.info("Generated for " + message["content"]["phraseId"] + str(self.phrasesGenerated) + "/" + str(self.totalPhrasesDispatched))
      
      if (self.phrasesGenerated + self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
        self.controlChannel.send("dying")
        break

    self.__terminate()
    
  def timeoutCallback(self, message):
    self.logger.info("Message timed out: " + str(message))
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.generationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired phrases
      self.phrasesNotGenerated += 1
      if self.phrasesNotGenerated == self.totalPhrasesDispatched or (self.phrasesGenerated + self.phrasesNotGenerated) == self.totalPhrasesDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalPhrasesDispatched) + " total dispatched")
    self.logger.info(str(self.phrasesGenerated) + " generated")
    self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
    self.logger.info("Generation complete")
    self.logger.info("Terminating generation dispatcher")
# Example 6
    def __extractFeatures(self):
        while True:
            message = self.worker.receive()
            if message["content"] == "kill":
                message["responseId"] = message["requestId"]
                self.worker.close(message)
                if len(self.dispatchers) == 0:
                    self.worker.end()
                    break
                else:
                    self.worker.send(content="kill", to=self.workerName)
                    continue
            elif message["content"]["type"] == "generate":
                if message["content"]["from"] not in self.dispatchers:
                    self.dispatchers[
                        message["content"]["from"]] = RemoteChannel(
                            message["content"]["from"], self.config)
                    self.dispatchers[message["content"]["from"]].listen(
                        self.unregisterDispatcher)
                phraseId = message["content"]["phraseId"]
                phraseData = self.esClient.get(
                    index=self.processorIndex,
                    doc_type=self.processorPhraseType,
                    id=phraseId)
                floatPrecision = "{0:." + str(
                    self.config["generator"]["floatPrecision"]) + "f}"
                token = phraseData["_source"]["phrase"]
                documentId = phraseData["_source"]["document_id"]
                self.logger.info("Extracted common features for phrase '" +
                                 token + "'")
                entry = {}
                shouldMatch = [{"match_phrase": {field: token}}
                               for field in self.corpusFields]
                query = {"query": {"bool": {"should": shouldMatch}}}
                data = self.esClient.search(index=self.corpusIndex,
                                            doc_type=self.corpusType,
                                            body=query,
                                            explain=True,
                                            size=self.corpusSize)
                entry["max_score"] = 0
                maxScore = 0
                avgScore = 0
                maxTermFrequency = 0
                avgTermFrequency = 0
                for hit in data["hits"]["hits"]:
                    avgScore += float(hit["_score"])
                    numOfScores = 0
                    hitTermFrequency = 0
                    explanation = json.dumps(hit["_explanation"])
                    while len(explanation) > len(token):
                        indexOfToken = explanation.find("tf(") + len("tf(")
                        if indexOfToken < len("tf("):
                            break
                        explanation = explanation[indexOfToken:]
                        freqToken = explanation.split(")")[0]
                        explanation = explanation.split(")")[1]
                        if freqToken.find("freq=") >= 0:
                            numOfScores += 1
                            hitTermFrequency += float(freqToken.split("=")[1])
                    if numOfScores > 0:
                        hitTermFrequency = hitTermFrequency / numOfScores
                    if maxTermFrequency < hitTermFrequency:
                        maxTermFrequency = hitTermFrequency
                    avgTermFrequency += hitTermFrequency

                if len(data["hits"]["hits"]) > 0:
                    avgTermFrequency = avgTermFrequency * 1.0 / len(
                        data["hits"]["hits"])

                if int(data["hits"]["total"]) > 0:
                    avgScore = (avgScore * 1.0) / int(data["hits"]["total"])

                if data["hits"]["max_score"] != None:
                    maxScore = data["hits"]["max_score"]

                if "max_score" in self.featureNames:
                    entry["max_score"] = floatPrecision.format(float(maxScore))
                if "doc_count" in self.featureNames:
                    entry["doc_count"] = floatPrecision.format(
                        float(data["hits"]["total"]))
                if "avg_score" in self.featureNames:
                    entry["avg_score"] = floatPrecision.format(float(avgScore))
                if "max_term_frequency" in self.featureNames:
                    entry["max_term_frequency"] = floatPrecision.format(
                        float(maxTermFrequency))
                if "avg_term_frequency" in self.featureNames:
                    entry["avg_term_frequency"] = floatPrecision.format(
                        float(avgTermFrequency))
                # get additional features
                for processorInstance in self.config["processor_instances"]:
                    processorInstance.extractFeatures(self.config, token,
                                                      entry)

                phraseData["_source"]["features"] = entry
                if token in self.trainingDataset:
                    phraseData["_source"][
                        "is_training"] = self.trainingDataset[token].strip()
                if token in self.holdOutDataset:
                    phraseData["_source"]["is_holdout"] = self.holdOutDataset[
                        token].strip()
                self.esClient.index(index=self.processorIndex,
                                    doc_type=self.processorPhraseType,
                                    id=phraseId,
                                    body=phraseData["_source"])
                self.worker.reply(message, {
                    "phraseId": phraseId,
                    "status": "generated",
                    "type": "reply"
                }, 120000000)
            if message["content"]["type"] == "stop_dispatcher":
                self.worker.reply(
                    message, {
                        "phraseId": -1,
                        "status": "stop_dispatcher",
                        "type": "stop_dispatcher"
                    }, self.timeout)

        self.logger.info("Terminating generation worker")
class AnnotationDispatcher:
  
  def __init__(self, config, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)

    analyzerIndexSettings = {
      "index":{
        "analysis":{
          "analyzer":{
            "analyzer_shingle":{
              "type": "custom",
              "tokenizer": "standard",
              "filter": ["standard", "lowercase", "filter_shingle"]
            }
          },
          "filter":{
            "filter_shingle":{
              "type": "shingle",
              "max_shingle_size": config["generator"]["maxShingleSize"],
              "min_shingle_size": config["generator"]["minShingleSize"],
              "output_unigrams": (config["generator"]["minShingleSize"] == 1)
            },
            "filter_stop":{
              "type": "stop"
            }
          }
        }
      }
    }
    analyzerIndexTypeMapping = {
      "properties":{
        "phrase":{"type":"string"},
        "document_id":{"type":"string", "index": "not_analyzed"},
        "phrase__not_analyzed":{"type":"string","index":"not_analyzed"}
      }
    }
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = corpusSize["count"]
    self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
      self.featureNames += [feature["name"] for feature in module["features"]]

    if processingStartIndex == 0:
      if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
      data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
        
    if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
      try:
        if self.esClient.indices.exists(self.config["processor"]["index"]):
          self.esClient.indices.delete(self.config["processor"]["index"])
        self.esClient.indices.create(self.config["processor"]["index"])
        self.esClient.indices.put_mapping(index=self.config["processor"]["index"],doc_type=self.processorPhraseType,body=analyzerIndexTypeMapping)
        if self.esClient.indices.exists(self.analyzerIndex):
          self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
      except:
        error = sys.exc_info()
        self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
        sys.exit(1)
      else:
        sleep(1)

    #dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

    #remote channel initialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToAnnotate(self):
    if "indexPhrases" in self.config and self.config["indexPhrases"] == False: return
    nextDocumentIndex = 0
    if self.config["processingStartIndex"] != None: nextDocumentIndex = self.config["processingStartIndex"]
    endDocumentIndex = -1
    if self.config["processingEndIndex"] != None: endDocumentIndex = self.config["processingEndIndex"]
   
    if endDocumentIndex != -1 and self.processingPageSize > (endDocumentIndex - nextDocumentIndex):
      self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

    self.totalDocumentsDispatched = 0

    while True:
      documents = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body={"from": nextDocumentIndex,"size": self.processingPageSize,"query":{"match_all":{}}, "sort":[{"_id":{"order":"asc"}}]}, fields=["_id"])
      if len(documents["hits"]["hits"]) == 0: 
        break
      self.totalDocumentsDispatched += len(documents["hits"]["hits"])
      self.logger.info("Annotating " + str(nextDocumentIndex) + " to " + str(nextDocumentIndex+len(documents["hits"]["hits"])) + " documents...")
      for document in documents["hits"]["hits"]:
        self.logger.info("Dispatching document " + document["_id"])
        content = {"documentId": document["_id"], "type": "annotate", "count": 1, "from":self.dispatcherName}
        self.annotationDispatcher.send(content, self.workerName)
      nextDocumentIndex += len(documents["hits"]["hits"])
      if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex: 
        break
    
    self.logger.info(str(self.totalDocumentsDispatched) + " documents dispatched")
    while True:
      message = self.annotationDispatcher.receive()
      if "documentId" in message["content"] and message["content"]["documentId"] > 0:
        self.documentsAnnotated += 1
        self.annotationDispatcher.close(message)
        self.logger.info("Annotated document " + message["content"]["documentId"] + " - " + str(self.documentsAnnotated) + "/" + str(self.totalDocumentsDispatched))
      
      if (self.documentsAnnotated + self.documentsNotAnnotated) >= self.totalDocumentsDispatched and not self.lastDispatcher:
        self.controlChannel.send("dying")
        self.annotationDispatcher.end()
        break
    
    self.__terminate()

  def timeoutCallback(self, message):
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.annotationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired documents
      self.documentsNotAnnotated += 1
      if self.documentsNotAnnotated == self.totalDocumentsDispatched or (self.documentsAnnotated + self.documentsNotAnnotated) == self.totalDocumentsDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalDocumentsDispatched) + " total dispatched")
    self.logger.info(str(self.documentsAnnotated) + " annotated")
    self.logger.info(str(self.documentsNotAnnotated) + " failed to annotate")
    self.logger.info("Annotation complete")
    self.logger.info("Terminating annotation dispatcher")

  def __deleteAnalyzerIndex(self):
    if self.esClient.indices.exists(self.analyzerIndex):
      self.esClient.indices.delete(self.analyzerIndex)
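
# analyzer_shingle above lowercases standard tokens and runs them through a
# shingle filter. The function below is a rough pure-Python approximation of
# the word n-grams ("shingles") that filter emits, shown only to illustrate
# the phrase candidates the annotation worker indexes; the real analysis is
# done by Elasticsearch via esClient.indices.analyze(...).
def approximateShingles(text, minSize=2, maxSize=3):
    tokens = text.lower().split()
    shingles = []
    for size in range(minSize, maxSize + 1):
        for start in range(len(tokens) - size + 1):
            shingles.append(" ".join(tokens[start:start + size]))
    return shingles

print(approximateShingles("Machine learning on Elasticsearch corpora"))
# ['machine learning', 'learning on', 'on elasticsearch', 'elasticsearch corpora',
#  'machine learning on', 'learning on elasticsearch', 'on elasticsearch corpora']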
# Example 9
class GenerationDispatcher:
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = [feature["name"]
                             for feature in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [feature["name"]
                                  for feature in module["features"]]

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating control channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToGenerate(self):
        processorIndex = self.config["processor"]["index"]
        phraseProcessorType = self.config["processor"]["type"] + "__phrase"
        nextPhraseIndex = 0
        if self.config["processingStartIndex"] != None:
            nextPhraseIndex = self.config["processingStartIndex"]
        endPhraseIndex = -1
        if self.config["processingEndIndex"] != None:
            endPhraseIndex = self.config["processingEndIndex"]

        if endPhraseIndex != -1 and self.processingPageSize > (
                endPhraseIndex - nextPhraseIndex):
            self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1

        while True:
            phrases = self.esClient.search(index=processorIndex,
                                           doc_type=phraseProcessorType,
                                           body={
                                               "from":
                                               nextPhraseIndex,
                                               "size":
                                               self.processingPageSize,
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "phrase__not_analyzed": {
                                                       "order": "asc"
                                                   }
                                               }]
                                           },
                                           fields=["_id"])
            if len(phrases["hits"]["hits"]) == 0: break
            self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
            floatPrecision = "{0:." + str(
                self.config["generator"]["floatPrecision"]) + "f}"
            self.logger.info("Generating features from " +
                             str(nextPhraseIndex) + " to " +
                             str(nextPhraseIndex +
                                 len(phrases["hits"]["hits"])) + " phrases...")
            for phraseData in phrases["hits"]["hits"]:
                self.logger.info("Dispatching phrase " + phraseData["_id"])
                content = {
                    "phraseId": phraseData["_id"],
                    "type": "generate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.generationDispatcher.send(content, self.workerName,
                                               self.timeout)
            nextPhraseIndex += len(phrases["hits"]["hits"])
            if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex:
                break

        while True:
            message = self.generationDispatcher.receive()
            if "phraseId" in message[
                    "content"] and message["content"]["phraseId"] > 0:
                self.phrasesGenerated += 1
                self.generationDispatcher.close(message)
                self.logger.info("Generated for " +
                                 message["content"]["phraseId"] +
                                 str(self.phrasesGenerated) + "/" +
                                 str(self.totalPhrasesDispatched))

            if (self.phrasesGenerated +
                    self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
                self.controlChannel.send("dying")
                break

        self.__terminate()

    def timeoutCallback(self, message):
        self.logger.info("Message timed out: " + str(message))
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.generationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            #log implementation yet to be done for expired phrases
            self.phrasesNotGenerated += 1
            if self.phrasesNotGenerated == self.totalPhrasesDispatched or (
                    self.phrasesGenerated +
                    self.phrasesNotGenerated) == self.totalPhrasesDispatched:
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalPhrasesDispatched) + " total dispatched")
        self.logger.info(str(self.phrasesGenerated) + " generated")
        self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
        self.logger.info("Generation complete")
        self.logger.info("Terminating generation dispatcher")
class AnnotationDispatcher:
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        self.analyzerIndex = self.corpusIndex + "__analysis__"
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.config["processingPageSize"] = self.processingPageSize
        self.totalDocumentsDispatched = 0
        self.documentsAnnotated = 0
        self.documentsNotAnnotated = 0
        self.lastDispatcher = False
        self.endProcess = False
        self.dispatcherName = "bayzee.annotation.dispatcher"
        self.workerName = "bayzee.annotation.worker"
        self.timeout = 86400000
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)

        analyzerIndexSettings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter":
                            ["standard", "lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type":
                            "shingle",
                            "max_shingle_size":
                            config["generator"]["maxShingleSize"],
                            "min_shingle_size":
                            config["generator"]["minShingleSize"],
                            "output_unigrams":
                            (config["generator"]["minShingleSize"] == 1)
                        },
                        "filter_stop": {
                            "type": "stop"
                        }
                    }
                }
            }
        }
        analyzerIndexTypeMapping = {
            "properties": {
                "phrase": {
                    "type": "string"
                },
                "document_id": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "phrase__not_analyzed": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
        corpusSize = self.esClient.count(index=self.corpusIndex,
                                         doc_type=self.corpusType,
                                         body={"query": {
                                             "match_all": {}
                                         }})
        self.corpusSize = corpusSize["count"]
        self.featureNames = [feature["name"]
                             for feature in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [feature["name"]
                                  for feature in module["features"]]

        if processingStartIndex == 0:
            if self.esClient.indices.exists(self.analyzerIndex):
                self.esClient.indices.delete(self.analyzerIndex)
            data = self.esClient.indices.create(self.analyzerIndex,
                                                analyzerIndexSettings)

        if "annotateFromScratch" not in self.config or self.config[
                "annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(
                        self.config["processor"]["index"]):
                    self.esClient.indices.delete(
                        self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(
                    index=self.config["processor"]["index"],
                    doc_type=self.processorPhraseType,
                    body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex,
                                                    analyzerIndexSettings)
            except:
                error = sys.exc_info()
                self.logger.error(
                    "Error occurred during initialization of analyzer index: "
                    + str(error))
                sys.exit(1)
            else:
                sleep(1)

        #dispatcher creation
        self.annotationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        #remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToAnnotate(self):
        if "indexPhrases" in self.config and self.config[
                "indexPhrases"] == False:
            return
        nextDocumentIndex = 0
        if self.config["processingStartIndex"] != None:
            nextDocumentIndex = self.config["processingStartIndex"]
        endDocumentIndex = -1
        if self.config["processingEndIndex"] != None:
            endDocumentIndex = self.config["processingEndIndex"]

        if endDocumentIndex != -1 and self.processingPageSize > (
                endDocumentIndex - nextDocumentIndex):
            self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

        self.totalDocumentsDispatched = 0

        while True:
            documents = self.esClient.search(index=self.corpusIndex,
                                             doc_type=self.corpusType,
                                             body={
                                                 "from":
                                                 nextDocumentIndex,
                                                 "size":
                                                 self.processingPageSize,
                                                 "query": {
                                                     "match_all": {}
                                                 },
                                                 "sort": [{
                                                     "_id": {
                                                         "order": "asc"
                                                     }
                                                 }]
                                             },
                                             fields=["_id"])
            if len(documents["hits"]["hits"]) == 0:
                break
            self.totalDocumentsDispatched += len(documents["hits"]["hits"])
            self.logger.info("Annotating " + str(nextDocumentIndex) + " to " +
                             str(nextDocumentIndex +
                                 len(documents["hits"]["hits"])) +
                             " documents...")
            for document in documents["hits"]["hits"]:
                self.logger.info("Dispatching document " + document["_id"])
                content = {
                    "documentId": document["_id"],
                    "type": "annotate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.annotationDispatcher.send(content, self.workerName)
            nextDocumentIndex += len(documents["hits"]["hits"])
            if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex:
                break

        self.logger.info(
            str(self.totalDocumentsDispatched) + " documents dispatched")
        while True:
            message = self.annotationDispatcher.receive()
            if "documentId" in message[
                    "content"] and message["content"]["documentId"] > 0:
                self.documentsAnnotated += 1
                self.annotationDispatcher.close(message)
                self.logger.info("Annotated document " +
                                 message["content"]["documentId"] + " - " +
                                 str(self.documentsAnnotated) + "/" +
                                 str(self.totalDocumentsDispatched))

            if (self.documentsAnnotated + self.documentsNotAnnotated
                ) >= self.totalDocumentsDispatched and not self.lastDispatcher:
                self.controlChannel.send("dying")
                self.annotationDispatcher.end()
                break

        self.__terminate()

    def timeoutCallback(self, message):
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.annotationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            #log implementation yet to be done for expired documents
            self.documentsNotAnnotated += 1
            if self.documentsNotAnnotated == self.totalDocumentsDispatched or (
                    self.documentsAnnotated + self.documentsNotAnnotated
            ) == self.totalDocumentsDispatched:
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalDocumentsDispatched) + " total dispatched")
        self.logger.info(str(self.documentsAnnotated) + " annotated")
        self.logger.info(
            str(self.documentsNotAnnotated) + " failed to annotate")
        self.logger.info("Annotation complete")
        self.logger.info("Terminating annotation dispatcher")

    def __deleteAnalyzerIndex(self):
        if self.esClient.indices.exists(self.analyzerIndex):
            self.esClient.indices.delete(self.analyzerIndex)
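
# For reference, the phrase documents that flow through this pipeline are
# plain dicts stored in the processor index. Below is a sketch of one such
# document after all three stages, assembled from the fields written by
# annotate(), __extractFeatures() and classify() in this listing; every value
# is a made-up example.
examplePhraseDocument = {
    # written by the annotation worker
    "phrase": "machine learning",
    "phrase__not_analyzed": "machine learning",
    "document_id": "doc-42",
    # added by the generation worker
    "features": {
        "max_score": "7.31",
        "doc_count": "12.00",
        "avg_score": "3.05",
        "max_term_frequency": "4.00",
        "avg_term_frequency": "2.50"
    },
    "is_training": "1",       # only if the phrase appears in trainingDataset
    "is_holdout": "0",        # only if the phrase appears in holdOutDataset
    # added by the classification worker
    "prob": 0.87,
    "class_type": "1"
}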