Code example #1
File: generation_worker.py Project: rvaughan/bayzee
    def __init__(self, config, trainingDataset, holdOutDataset):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.timeout = 6000000
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        count = self.esClient.count(index=self.corpusIndex,
                                    doc_type=self.corpusType,
                                    body={"query": {
                                        "match_all": {}
                                    }})
        self.corpusSize = count["count"]
        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        self.workerName = "bayzee.generation.worker"
        self.dispatchers = {}

        #creating worker
        self.worker = DurableChannel(self.workerName, config)
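
Note that the featureNames construction above relies on Python 2, where map() returns a list that can be concatenated with "+". A minimal equivalent that behaves the same on Python 3, written as a standalone helper (the function name is illustrative, not part of bayzee):

def feature_names(config):
    # Collect feature names from the generator section and from every
    # processor module, mirroring the map()-based construction above.
    names = [f["name"] for f in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
        names += [f["name"] for f in module["features"]]
    return names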
Code example #2
File: annotation_worker.py Project: rvaughan/bayzee
 def __init__(self, config):
   self.config = config
   self.logger = config["logger"]
   self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
   self.corpusIndex = config["corpus"]["index"]
   self.corpusType = config["corpus"]["type"]
   self.corpusFields = config["corpus"]["text_fields"]
   self.corpusSize = 0
   self.workerName = "bayzee.annotation.worker"
   self.timeout = 6000
   self.processorIndex = config["processor"]["index"]
   self.processorType = config["processor"]["type"]
   self.processorPhraseType = config["processor"]["type"] + "__phrase"
   self.analyzerIndex = self.corpusIndex + "__analysis__"
   self.worker = DurableChannel(self.workerName, config)
   self.dispatchers = {}
Code example #3
 def __init__(self, config, trainingDataset, holdOutDataset):
   self.config = config
   self.logger = config["logger"]
   self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
   self.trainingDataset = trainingDataset
   self.holdOutDataset = holdOutDataset
   self.bagOfPhrases = {}
   self.corpusIndex = config["corpus"]["index"]
   self.corpusType = config["corpus"]["type"]
   self.corpusFields = config["corpus"]["text_fields"]
   self.corpusSize = 0
   self.timeout = 6000000
   self.processorIndex = config["processor"]["index"]
   self.processorType = config["processor"]["type"]
   self.processorPhraseType = config["processor"]["type"]+"__phrase"
   count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
   self.corpusSize = count["count"]
   self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
   for module in config["processor"]["modules"]:
     self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
   
   self.workerName = "bayzee.generation.worker"
   self.dispatchers = {}
   
   #creating worker
   self.worker = DurableChannel(self.workerName, config)
Code example #4
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}

        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

        self.totalPhrasesDispatched = 0
        self.phrasesClassified = 0
        self.phrasesNotClassified = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.classification.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
        self.workerName = "bayzee.classification.worker"

        # creating classification dispatcher
        self.classificationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

        # remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
Code example #5
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating control channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
Code example #6
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    self.features = self.config["generator"]["features"]
    for module in self.config["processor"]["modules"]:
      self.features = self.features + module["features"]

    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    
    #creating worker
    self.worker = DurableChannel(self.workerName, config)
Code example #7
class ClassificationWorker:

  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    self.features = self.config["generator"]["features"]
    for module in self.config["processor"]["modules"]:
      self.features = self.features + module["features"]

    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    
    #creating worker
    self.worker = DurableChannel(self.workerName, config)

  def classify(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "classify":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        self.phraseId = message["content"]["phraseId"]
        if self.classifier == None:
          self.trainD = self.__loadDataFromES("train", None)
          self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
          self.__train()

        self.trainD = self.__loadDataFromES("train", None)
        testD = self.__loadDataFromES("test", self.trainD.domain)
      
        self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
        testD = orange.ExampleTable(self.trainD.domain, testD)

        for row in testD:
          phrase = row.getmetas().values()[0].value
          featureSet = {}
          for i,feature in enumerate(self.features):
            featureSet[feature["name"]] = row[i].value

          prob = self.classifier.prob_classify(featureSet).prob("1")
          classType = self.classifier.classify(featureSet)
          self.phraseData["_source"]["prob"] = prob
          self.phraseData["_source"]["class_type"] = classType
          self.logger.info("Classified '" + phrase + "' as " + classType + " with probability " + str(prob))
          self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId, body=self.phraseData["_source"])
          self.worker.reply(message, {"phraseId": self.phraseId, "status" : "classified", "type" : "reply"}, 120000000)   

    self.logger.info("Terminating classification worker")

  def __getOrangeVariableForFeature(self, feature):
    if feature["isNumerical"]: 
      return orange.FloatVariable(feature["name"])
    else:
      return orange.EnumVariable(feature["name"])

  def __loadDataFromES(self, dataType, domain):
    table = None
    if dataType != "train":
      table = orange.ExampleTable(domain)
    else:
      attributes = map(self.__getOrangeVariableForFeature, self.features)
      classAttribute = orange.EnumVariable("is_good", values = ["0", "1"])
      domain = orange.Domain(attributes, classAttribute)
      domain.addmeta(orange.newmetaid(), orange.StringVariable("phrase"))
      table = orange.ExampleTable(domain)
    phrases = []
    if dataType == "train":
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    elif dataType == "holdout":
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    else:
      self.phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId)
      phrases = [self.phraseData]

    for row in phrases:
      try:
        row = row["_source"]
        featureValues = []
        classType = "?"
        for feature in self.features:
          featureValues.append(row["features"][feature["name"]].encode("ascii"))
        if dataType == "train":
          classType = row["is_training"].encode("ascii", "ignore")
        elif dataType == "holdout":
          classType = row["is_holdout"].encode("ascii")
        example = None
        for i,featureValue in enumerate(featureValues):
          attr = domain.attributes[i]
          if type(attr) is orange.EnumVariable: 
            attr.addValue(featureValue)
        example = orange.Example(domain, (featureValues + [classType]))
        example[domain.getmetas().items()[0][0]] = row["phrase"].encode("ascii")
        table.append(example)
      except:
        self.logger.error("Error classifying phrase '" + row["phrase"] + "'")
    return table

  def __train(self):
    for a in self.trainD.domain.attributes:
      self.logger.info("%s: %s" % (a.name,reduce(lambda x,y: x+', '+y, [i for i in a.values])))
    trainSet = []
    for row in self.trainD:
      phrase = row.getmetas().values()[0].value
      classType = row[-1].value

      featureSet = {}
      for i,feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value

      trainSet.append((featureSet, classType))

    self.logger.info("\nTraining Naive Bayes Classifier with " + str(len(trainSet)) + " phrases...")
    self.classifier = nltk.NaiveBayesClassifier.train(trainSet)
    
    self.classifier.show_most_informative_features(50)

  def __calculateMeasures(self):
  
    falsePositives = 0
    falseNegatives = 0
    truePositives = 0
    trueNegatives = 0
    totalPositives = 0
    totalNegatives = 0
    totalHoldOutGoodPhrases = 0
    totalHoldOutBadPhrases = 0

    self.trainD = self.__loadDataFromES("train", None)
    self.holdOutD = self.__loadDataFromES("holdout", self.trainD.domain)
    self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
    self.holdOutD = orange.ExampleTable(self.trainD.domain, self.holdOutD)
    
    for row in self.holdOutD:
      actualClassType = row[-1].value
      phrase = row.getmetas().values()[0].value

      featureSet = {}
      for i,feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value

      if self.classifier == None:
        classifierFile = open(self.classifierFilePath)
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()  
      prob = self.classifier.prob_classify(featureSet).prob("1")
      classType = self.classifier.classify(featureSet)

      if classType == "1":
        totalPositives += 1
        if classType == actualClassType:
          truePositives += 1
      else:
        totalNegatives += 1
        if classType == actualClassType:
          trueNegatives += 1

      if actualClassType == "1":
        totalHoldOutGoodPhrases += 1
      else:
        totalHoldOutBadPhrases += 1

    precisionOfGood = 100.0 * truePositives/totalPositives
    recallOfGood = 100.0 * truePositives/totalHoldOutGoodPhrases
    fMeasureOfGood = 2.0 * precisionOfGood * recallOfGood / (precisionOfGood + recallOfGood)
    precisionOfBad = 100.0 * trueNegatives/totalNegatives
    recallOfBad = 100.0*trueNegatives/totalHoldOutBadPhrases
    fMeasureOfBad = 2.0 * precisionOfBad * recallOfBad / (precisionOfBad + recallOfBad)
    self.logger.info("\nPrecision of Good: " + str(round(precisionOfGood, 2)) + "%")
    self.logger.info("Recall of Good: " + str(round(recallOfGood, 2)) + "%")
    self.logger.info("Balanced F-measure of Good: " + str(round(fMeasureOfGood, 2)) + "%")
    self.logger.info("Precision of Bad: " + str(round(precisionOfBad, 2)) + "%")
    self.logger.info("Recall of Bad: " + str(round(recallOfBad, 2)) + "%")
    self.logger.info("Balanced F-measure of Bad: " + str(round(fMeasureOfBad, 2)) + "%")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)

    if len(self.dispatchers) == 0:
      self.worker.send(content="kill", to=self.workerName)
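
For context, a minimal sketch of how such a worker might be wired up. The config keys are inferred from the attribute reads in __init__ above; the concrete values, the logger setup, and whatever transport DurableChannel needs are assumptions, not bayzee's actual run script.

import logging

logging.basicConfig(level=logging.INFO)

config = {
  "logger": logging.getLogger("bayzee"),
  "elasticsearch": {"host": "localhost", "port": 9200},
  "processor": {"index": "myapp__processor", "type": "phrase", "modules": []},
  "generator": {"features": [{"name": "max_score", "isNumerical": True}]},
  # ... plus whatever DurableChannel expects for its message transport
}

# worker = ClassificationWorker(config)
# worker.classify()  # blocks, consuming "classify" messages until a "kill" arrives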
Code example #8
File: annotation_worker.py Project: rvaughan/bayzee
class AnnotationWorker:
  
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}

  def annotate(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "annotate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        documentId = message["content"]["documentId"]
        document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id = documentId, fields=self.corpusFields)
        if "fields" in document:  
          for field in self.corpusFields:
            shingles = []
            if field in document["fields"]:
              if type(document["fields"][field]) is list:
                for element in document["fields"][field]:
                  if len(element) > 0:
                    shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle")
                    shingles += shingleTokens["tokens"]
              else:
                if len(document["fields"][field]) > 0:
                  shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"]
              shingles = map(self.__replaceUnderscore, shingles)
              shingles = filter(self.__filterTokens, shingles)
            if shingles != None and len(shingles) > 0:
              for shingle in shingles:
                phrase = shingle["token"]
                key = self.__keyify(phrase)
                if len(key) > 0:
                  data = {"phrase": phrase,"phrase__not_analyzed": phrase,"document_id": document["_id"]}
                  if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key):
                    self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data)
        sleep(1)
        for processorInstance in self.config["processor_instances"]:
          processorInstance.annotate(self.config, documentId)
        self.worker.reply(message, {"documentId": documentId, "status" : "processed", "type" : "reply"}, self.timeout)

    self.logger.info("Terminating annotation worker")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)

    if len(self.dispatchers) == 0:
      self.worker.send(content="kill", to=self.workerName)

  def __keyify(self, phrase):
    phrase = phrase.strip()
    if len(phrase) == 0:
      return ""
    key = re.sub("[^A-Za-z0-9]", " ", phrase)
    key = " ".join(key.split())
    key = key.lower()
    key = "-".join(key.split())
    return key

  def __replaceUnderscore(self,shingle):
    token = shingle["token"]
    token = token.replace("_","")
    token = re.sub('\s+', ' ', token).strip()
    shingle["token"] = token
    return shingle
    
  def __filterTokens(self, shingle):
    global esStopWords
    tokens = shingle["token"].split(" ")
    firstToken = tokens[0]
    lastToken = tokens[-1]
    isValid = True
    isValid = (isValid and lastToken != None)
    isValid = (isValid and len(lastToken) > 1)
    isValid = (isValid and not firstToken.replace(".","",1).isdigit())
    isValid = (isValid and not lastToken.replace(".","",1).isdigit())
    isValid = (isValid and firstToken not in esStopWords)
    isValid = (isValid and lastToken not in esStopWords)
    return isValid
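
As a usage note, __keyify is what turns a raw shingle into the deterministic document id under which a phrase is indexed. A standalone sketch of the same transformation, with illustrative inputs:

import re

def keyify(phrase):
  # Same steps as AnnotationWorker.__keyify: strip, drop non-alphanumerics,
  # collapse whitespace, lowercase, then hyphenate.
  phrase = phrase.strip()
  if len(phrase) == 0:
    return ""
  key = re.sub("[^A-Za-z0-9]", " ", phrase)
  key = " ".join(key.split())
  key = key.lower()
  return "-".join(key.split())

print(keyify("  Machine Learning!  "))  # -> machine-learning
print(keyify("C++ runtime errors"))     # -> c-runtime-errors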
Code example #9
class GenerationWorker:
  
  def __init__(self, config, trainingDataset, holdOutDataset):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.timeout = 6000000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = count["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    
    self.workerName = "bayzee.generation.worker"
    self.dispatchers = {}
    
    #creating worker
    self.worker = DurableChannel(self.workerName, config)
  
  def generate(self):
    self.__extractFeatures()

  def __extractFeatures(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "generate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        phraseId = message["content"]["phraseId"]
        phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id = phraseId)
        floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
        token = phraseData["_source"]["phrase"]
        documentId = phraseData["_source"]["document_id"]
        self.logger.info("Extracted common features for phrase '" + token + "'")
        entry = {}
        shouldMatch = map(lambda x: {"match_phrase":{x:token}}, self.corpusFields)
        query = {"query":{"bool":{"should":shouldMatch}}}
        data = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body=query, explain=True, size=self.corpusSize)
        entry["max_score"] = 0
        maxScore = 0
        avgScore = 0
        maxTermFrequency = 0
        avgTermFrequency = 0
        for hit in data["hits"]["hits"]:
          avgScore += float(hit["_score"])
          numOfScores = 0
          hitTermFrequency = 0
          explanation = json.dumps(hit["_explanation"])
          while len(explanation) > len(token):
            indexOfToken = explanation.find("tf(") + len("tf(")
            if indexOfToken < len("tf("):
              break
            explanation = explanation[indexOfToken:]
            freqToken = explanation.split(")")[0]
            explanation = explanation.split(")")[1]
            if freqToken.find("freq=") >= 0:
              numOfScores += 1
              hitTermFrequency += float(freqToken.split("=")[1])
          if numOfScores > 0 : hitTermFrequency = hitTermFrequency / numOfScores
          if maxTermFrequency < hitTermFrequency: maxTermFrequency = hitTermFrequency 
          avgTermFrequency += hitTermFrequency

        if len(data["hits"]["hits"]) > 0:
          avgTermFrequency = avgTermFrequency * 1.0 / len(data["hits"]["hits"])
        
        if int(data["hits"]["total"]) > 0:
          avgScore = (avgScore * 1.0) / int(data["hits"]["total"])
        
        if data["hits"]["max_score"] != None: 
          maxScore = data["hits"]["max_score"]
        
        if "max_score" in self.featureNames:
          entry["max_score"] = floatPrecision.format(float(maxScore))
        if "doc_count" in self.featureNames:
          entry["doc_count"] = floatPrecision.format(float(data["hits"]["total"]))
        if "avg_score" in self.featureNames:
          entry["avg_score"] = floatPrecision.format(float(avgScore))
        if "max_term_frequency" in self.featureNames:
          entry["max_term_frequency"] = floatPrecision.format(float(maxTermFrequency))
        if "avg_term_frequency" in self.featureNames:
          entry["avg_term_frequency"] = floatPrecision.format(float(avgTermFrequency))
        # get additional features
        for processorInstance in self.config["processor_instances"]:
          processorInstance.extractFeatures(self.config, token, entry)

        phraseData["_source"]["features"] = entry
        if token in self.trainingDataset:
          phraseData["_source"]["is_training"] = self.trainingDataset[token].strip()
        if token in self.holdOutDataset:
          phraseData["_source"]["is_holdout"] = self.holdOutDataset[token].strip()
        self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId, body=phraseData["_source"])
        self.worker.reply(message, {"phraseId": phraseId, "status" : "generated", "type" : "reply"}, 120000000)   
      if message["content"]["type"] == "stop_dispatcher":
        self.worker.reply(message, {"phraseId": -1, "status" : "stop_dispatcher", "type" : "stop_dispatcher"}, self.timeout)        

    self.logger.info("Terminating generation worker")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)

    if len(self.dispatchers) == 0:
      self.worker.send(content="kill", to=self.workerName)
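
The __extractFeatures loop above assumes a particular message shape on the DurableChannel. The payloads below are reconstructed from the fields the worker reads and the reply it sends (and from the dispatcher in the next example); the phrase id value is illustrative:

# Request sent by a dispatcher to "bayzee.generation.worker":
generate_request = {
  "phraseId": "machine-learning",         # _id of the phrase document to featurize
  "type": "generate",
  "count": 1,                             # retry counter, bumped on timeout
  "from": "bayzee.generation.dispatcher"  # used to register the reply channel
}

# Reply sent back by the worker once the features are indexed:
generate_reply = {
  "phraseId": "machine-learning",
  "status": "generated",
  "type": "reply"
}

# A shutdown request carries the plain string "kill" as message["content"]; the
# worker forwards it to its own name so any remaining worker instances also exit.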
Code example #10
class GenerationDispatcher:
  
  def __init__(self, config, trainingDataset, holdOutDataset, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.totalPhrasesDispatched = 0
    self.phrasesGenerated = 0
    self.phrasesNotGenerated = 0
    self.timeout = 86400000
    self.dispatcherName = "bayzee.generation.dispatcher"
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    self.workerName = "bayzee.generation.worker"
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"]+"__phrase"
    self.processingPageSize = config["processingPageSize"]
    config["processor_phrase_type"] = self.processorPhraseType
    
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

    # creating generation dispatcher
    self.generationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    
    # creating control channel
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToGenerate(self):
    processorIndex = self.config["processor"]["index"]
    phraseProcessorType = self.config["processor"]["type"] + "__phrase"
    nextPhraseIndex = 0
    if self.config["processingStartIndex"] != None: nextPhraseIndex = self.config["processingStartIndex"]
    endPhraseIndex = -1
    if self.config["processingEndIndex"] != None: endPhraseIndex = self.config["processingEndIndex"]

    if endPhraseIndex != -1 and self.processingPageSize > (endPhraseIndex - nextPhraseIndex):
      self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1
    
    while True:
      phrases = self.esClient.search(index=processorIndex, doc_type=phraseProcessorType, body={"from": nextPhraseIndex,"size": self.processingPageSize, "query":{"match_all":{}},"sort":[{"phrase__not_analyzed":{"order":"asc"}}]}, fields=["_id"])
      if len(phrases["hits"]["hits"]) == 0: break
      self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
      floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
      self.logger.info("Generating features from " + str(nextPhraseIndex) + " to " + str(nextPhraseIndex+len(phrases["hits"]["hits"])) + " phrases...")
      for phraseData in phrases["hits"]["hits"]:
        self.logger.info("Dispatching phrase " + phraseData["_id"])
        content = {"phraseId": phraseData["_id"], "type": "generate", "count": 1, "from": self.dispatcherName}
        self.generationDispatcher.send(content, self.workerName, self.timeout)
      nextPhraseIndex += len(phrases["hits"]["hits"])
      if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex: break
    
    while True:
      message = self.generationDispatcher.receive()
      if "phraseId" in message["content"] and message["content"]["phraseId"] > 0:
        self.phrasesGenerated += 1
        self.generationDispatcher.close(message)
        self.logger.info("Generated for " + message["content"]["phraseId"] + str(self.phrasesGenerated) + "/" + str(self.totalPhrasesDispatched))
      
      if (self.phrasesGenerated + self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
        self.controlChannel.send("dying")
        break

    self.__terminate()
    
  def timeoutCallback(self, message):
    self.logger.info("Message timed out: " + str(message))
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.generationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired phrases
      self.phrasesNotGenerated += 1
      if self.phrasesNotGenerated == self.totalPhrasesDispatched or (self.phrasesGenerated + self.phrasesNotGenerated) == self.totalPhrasesDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalPhrasesDispatched) + " total dispatched")
    self.logger.info(str(self.phrasesGenerated) + " generated")
    self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
    self.logger.info("Generation complete")
    self.logger.info("Terminating generation dispatcher")
Code example #11
File: generation_worker.py Project: rvaughan/bayzee
class GenerationWorker:
    def __init__(self, config, trainingDataset, holdOutDataset):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.timeout = 6000000
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        count = self.esClient.count(index=self.corpusIndex,
                                    doc_type=self.corpusType,
                                    body={"query": {
                                        "match_all": {}
                                    }})
        self.corpusSize = count["count"]
        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        self.workerName = "bayzee.generation.worker"
        self.dispatchers = {}

        #creating worker
        self.worker = DurableChannel(self.workerName, config)

    def generate(self):
        self.__extractFeatures()

    def __extractFeatures(self):
        while True:
            message = self.worker.receive()
            if message["content"] == "kill":
                message["responseId"] = message["requestId"]
                self.worker.close(message)
                if len(self.dispatchers) == 0:
                    self.worker.end()
                    break
                else:
                    self.worker.send(content="kill", to=self.workerName)
                    continue
            elif message["content"]["type"] == "generate":
                if message["content"]["from"] not in self.dispatchers:
                    self.dispatchers[
                        message["content"]["from"]] = RemoteChannel(
                            message["content"]["from"], self.config)
                    self.dispatchers[message["content"]["from"]].listen(
                        self.unregisterDispatcher)
                phraseId = message["content"]["phraseId"]
                phraseData = self.esClient.get(
                    index=self.processorIndex,
                    doc_type=self.processorPhraseType,
                    id=phraseId)
                floatPrecision = "{0:." + str(
                    self.config["generator"]["floatPrecision"]) + "f}"
                token = phraseData["_source"]["phrase"]
                documentId = phraseData["_source"]["document_id"]
                self.logger.info("Extracted common features for phrase '" +
                                 token + "'")
                entry = {}
                shouldMatch = map(lambda x: {"match_phrase": {
                    x: token
                }}, self.corpusFields)
                query = {"query": {"bool": {"should": shouldMatch}}}
                data = self.esClient.search(index=self.corpusIndex,
                                            doc_type=self.corpusType,
                                            body=query,
                                            explain=True,
                                            size=self.corpusSize)
                entry["max_score"] = 0
                maxScore = 0
                avgScore = 0
                maxTermFrequency = 0
                avgTermFrequency = 0
                for hit in data["hits"]["hits"]:
                    avgScore += float(hit["_score"])
                    numOfScores = 0
                    hitTermFrequency = 0
                    explanation = json.dumps(hit["_explanation"])
                    while len(explanation) > len(token):
                        indexOfToken = explanation.find("tf(") + len("tf(")
                        if indexOfToken < len("tf("):
                            break
                        explanation = explanation[indexOfToken:]
                        freqToken = explanation.split(")")[0]
                        explanation = explanation.split(")")[1]
                        if freqToken.find("freq=") >= 0:
                            numOfScores += 1
                            hitTermFrequency += float(freqToken.split("=")[1])
                    if numOfScores > 0:
                        hitTermFrequency = hitTermFrequency / numOfScores
                    if maxTermFrequency < hitTermFrequency:
                        maxTermFrequency = hitTermFrequency
                    avgTermFrequency += hitTermFrequency

                if len(data["hits"]["hits"]) > 0:
                    avgTermFrequency = avgTermFrequency * 1.0 / len(
                        data["hits"]["hits"])

                if int(data["hits"]["total"]) > 0:
                    avgScore = (avgScore * 1.0) / int(data["hits"]["total"])

                if data["hits"]["max_score"] != None:
                    maxScore = data["hits"]["max_score"]

                if "max_score" in self.featureNames:
                    entry["max_score"] = floatPrecision.format(float(maxScore))
                if "doc_count" in self.featureNames:
                    entry["doc_count"] = floatPrecision.format(
                        float(data["hits"]["total"]))
                if "avg_score" in self.featureNames:
                    entry["avg_score"] = floatPrecision.format(float(avgScore))
                if "max_term_frequency" in self.featureNames:
                    entry["max_term_frequency"] = floatPrecision.format(
                        float(maxTermFrequency))
                if "avg_term_frequency" in self.featureNames:
                    entry["avg_term_frequency"] = floatPrecision.format(
                        float(avgTermFrequency))
                # get additional features
                for processorInstance in self.config["processor_instances"]:
                    processorInstance.extractFeatures(self.config, token,
                                                      entry)

                phraseData["_source"]["features"] = entry
                if token in self.trainingDataset:
                    phraseData["_source"][
                        "is_training"] = self.trainingDataset[token].strip()
                if token in self.holdOutDataset:
                    phraseData["_source"]["is_holdout"] = self.holdOutDataset[
                        token].strip()
                self.esClient.index(index=self.processorIndex,
                                    doc_type=self.processorPhraseType,
                                    id=phraseId,
                                    body=phraseData["_source"])
                self.worker.reply(message, {
                    "phraseId": phraseId,
                    "status": "generated",
                    "type": "reply"
                }, 120000000)
            if message["content"]["type"] == "stop_dispatcher":
                self.worker.reply(
                    message, {
                        "phraseId": -1,
                        "status": "stop_dispatcher",
                        "type": "stop_dispatcher"
                    }, self.timeout)

        self.logger.info("Terminating generation worker")

    def unregisterDispatcher(self, dispatcher, message):
        if message == "dying":
            self.dispatchers.pop(dispatcher, None)

        if len(self.dispatchers) == 0:
            self.worker.send(content="kill", to=self.workerName)
Code example #12
  def __init__(self, config, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)

    analyzerIndexSettings = {
      "index":{
        "analysis":{
          "analyzer":{
            "analyzer_shingle":{
              "type": "custom",
              "tokenizer": "standard",
              "filter": ["standard", "lowercase", "filter_shingle"]
            }
          },
          "filter":{
            "filter_shingle":{
              "type": "shingle",
              "max_shingle_size": config["generator"]["maxShingleSize"],
              "min_shingle_size": config["generator"]["minShingleSize"],
              "output_unigrams": (config["generator"]["minShingleSize"] == 1)
            },
            "filter_stop":{
              "type": "stop"
            }
          }
        }
      }
    }
    analyzerIndexTypeMapping = {
      "properties":{
        "phrase":{"type":"string"},
        "document_id":{"type":"string", "index": "not_analyzed"},
        "phrase__not_analyzed":{"type":"string","index":"not_analyzed"}
      }
    }
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = corpusSize["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

    if processingStartIndex == 0:
      if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
      data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
        
    if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
      try:
        if self.esClient.indices.exists(self.config["processor"]["index"]):
          self.esClient.indices.delete(self.config["processor"]["index"])
        self.esClient.indices.create(self.config["processor"]["index"])
        self.esClient.indices.put_mapping(index=self.config["processor"]["index"],doc_type=self.processorPhraseType,body=analyzerIndexTypeMapping)
        if self.esClient.indices.exists(self.analyzerIndex):
          self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
      except:
        error = sys.exc_info()
        self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
        sys.exit(1)
      else:
        sleep(1)

    #dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

    #remote channel initialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)
Code example #13
class AnnotationDispatcher:
  
  def __init__(self, config, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)

    analyzerIndexSettings = {
      "index":{
        "analysis":{
          "analyzer":{
            "analyzer_shingle":{
              "type": "custom",
              "tokenizer": "standard",
              "filter": ["standard", "lowercase", "filter_shingle"]
            }
          },
          "filter":{
            "filter_shingle":{
              "type": "shingle",
              "max_shingle_size": config["generator"]["maxShingleSize"],
              "min_shingle_size": config["generator"]["minShingleSize"],
              "output_unigrams": (config["generator"]["minShingleSize"] == 1)
            },
            "filter_stop":{
              "type": "stop"
            }
          }
        }
      }
    }
    analyzerIndexTypeMapping = {
      "properties":{
        "phrase":{"type":"string"},
        "document_id":{"type":"string", "index": "not_analyzed"},
        "phrase__not_analyzed":{"type":"string","index":"not_analyzed"}
      }
    }
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = corpusSize["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])

    if processingStartIndex == 0:
      if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
      data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
        
    if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
      try:
        if self.esClient.indices.exists(self.config["processor"]["index"]):
          self.esClient.indices.delete(self.config["processor"]["index"])
        self.esClient.indices.create(self.config["processor"]["index"])
        self.esClient.indices.put_mapping(index=self.config["processor"]["index"],doc_type=self.processorPhraseType,body=analyzerIndexTypeMapping)
        if self.esClient.indices.exists(self.analyzerIndex):
          self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings) 
      except:
        error = sys.exc_info()
        self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
        sys.exit(1)
      else:
        sleep(1)

    #dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)

    #remote channel initialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToAnnotate(self):
    if "indexPhrases" in self.config and self.config["indexPhrases"] == False: return
    nextDocumentIndex = 0
    if self.config["processingStartIndex"] != None: nextDocumentIndex = self.config["processingStartIndex"]
    endDocumentIndex = -1
    if self.config["processingEndIndex"] != None: endDocumentIndex = self.config["processingEndIndex"]
   
    if endDocumentIndex != -1 and self.processingPageSize > (endDocumentIndex - nextDocumentIndex):
      self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

    self.totalDocumentsDispatched = 0

    while True:
      documents = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body={"from": nextDocumentIndex,"size": self.processingPageSize,"query":{"match_all":{}}, "sort":[{"_id":{"order":"asc"}}]}, fields=["_id"])
      if len(documents["hits"]["hits"]) == 0: 
        break
      self.totalDocumentsDispatched += len(documents["hits"]["hits"])
      self.logger.info("Annotating " + str(nextDocumentIndex) + " to " + str(nextDocumentIndex+len(documents["hits"]["hits"])) + " documents...")
      for document in documents["hits"]["hits"]:
        self.logger.info("Dispatching document " + document["_id"])
        content = {"documentId": document["_id"], "type": "annotate", "count": 1, "from":self.dispatcherName}
        self.annotationDispatcher.send(content, self.workerName)
      nextDocumentIndex += len(documents["hits"]["hits"])
      if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex: 
        break
    
    self.logger.info(str(self.totalDocumentsDispatched) + " documents dispatched")
    while True:
      message = self.annotationDispatcher.receive()
      if "documentId" in message["content"] and message["content"]["documentId"] > 0:
        self.documentsAnnotated += 1
        self.annotationDispatcher.close(message)
        self.logger.info("Annotated document " + message["content"]["documentId"] + " - " + str(self.documentsAnnotated) + "/" + str(self.totalDocumentsDispatched))
      
      if (self.documentsAnnotated + self.documentsNotAnnotated) >= self.totalDocumentsDispatched and not self.lastDispatcher:
        self.controlChannel.send("dying")
        self.annotationDispatcher.end()
        break
    
    self.__terminate()

  def timeoutCallback(self, message):
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.annotationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      #log implementation yet to be done for expired documents
      self.documentsNotAnnotated += 1
      if self.documentsNotAnnotated == self.totalDocumentsDispatched or (self.documentsAnnotated + self.documentsNotAnnotated) == self.totalDocumentsDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalDocumentsDispatched) + " total dispatched")
    self.logger.info(str(self.documentsAnnotated) + " annotated")
    self.logger.info(str(self.documentsNotAnnotated) + " failed to annotate")
    self.logger.info("Annotation complete")
    self.logger.info("Terminating annotation dispatcher")

  def __deleteAnalyzerIndex(self):
    if self.esClient.indices.exists(self.analyzerIndex):
      self.esClient.indices.delete(self.analyzerIndex)
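
The dispatcher above creates a throwaway "<corpus>__analysis__" index purely so that workers can call indices.analyze with the analyzer_shingle analyzer defined in analyzerIndexSettings. A hedged sketch of that call, assuming an Elasticsearch node on localhost:9200 and that the analysis index has already been created with those settings; the exact tokens depend on the configured shingle sizes:

from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")
result = es.indices.analyze(index="mycorpus__analysis__",
                            body="quick brown fox",
                            analyzer="analyzer_shingle")
for token in result["tokens"]:
  print(token["token"])
# With min_shingle_size=2 and max_shingle_size=2 this yields shingles such as
# "quick brown" and "brown fox"; unigrams appear only if output_unigrams is true.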
Code example #14
class GenerationDispatcher:
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating control channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToGenerate(self):
        processorIndex = self.config["processor"]["index"]
        phraseProcessorType = self.config["processor"]["type"] + "__phrase"
        nextPhraseIndex = 0
        if self.config["processingStartIndex"] != None:
            nextPhraseIndex = self.config["processingStartIndex"]
        endPhraseIndex = -1
        if self.config["processingEndIndex"] != None:
            endPhraseIndex = self.config["processingEndIndex"]

        if endPhraseIndex != -1 and self.processingPageSize > (
                endPhraseIndex - nextPhraseIndex):
            self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1

        while True:
            phrases = self.esClient.search(index=processorIndex,
                                           doc_type=phraseProcessorType,
                                           body={
                                               "from":
                                               nextPhraseIndex,
                                               "size":
                                               self.processingPageSize,
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "phrase__not_analyzed": {
                                                       "order": "asc"
                                                   }
                                               }]
                                           },
                                           fields=["_id"])
            if len(phrases["hits"]["hits"]) == 0: break
            self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
            floatPrecision = "{0:." + str(
                self.config["generator"]["floatPrecision"]) + "f}"
            self.logger.info("Generating features from " +
                             str(nextPhraseIndex) + " to " +
                             str(nextPhraseIndex +
                                 len(phrases["hits"]["hits"])) + " phrases...")
            for phraseData in phrases["hits"]["hits"]:
                self.logger.info("Dispatching phrase " + phraseData["_id"])
                content = {
                    "phraseId": phraseData["_id"],
                    "type": "generate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.generationDispatcher.send(content, self.workerName,
                                               self.timeout)
            nextPhraseIndex += len(phrases["hits"]["hits"])
            if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex:
                break

        while True:
            message = self.generationDispatcher.receive()
            if "phraseId" in message[
                    "content"] and message["content"]["phraseId"] > 0:
                self.phrasesGenerated += 1
                self.generationDispatcher.close(message)
                self.logger.info("Generated for " +
                                 message["content"]["phraseId"] +
                                 str(self.phrasesGenerated) + "/" +
                                 str(self.totalPhrasesDispatched))

            if (self.phrasesGenerated +
                    self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
                self.controlChannel.send("dying")
                break

        self.__terminate()

    def timeoutCallback(self, message):
        self.logger.info("Message timed out: " + str(message))
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.generationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            # logging of expired phrases is yet to be implemented
            self.phrasesNotGenerated += 1
            if (self.phrasesNotGenerated == self.totalPhrasesDispatched or
                    (self.phrasesGenerated + self.phrasesNotGenerated) == self.totalPhrasesDispatched):
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalPhrasesDispatched) + " total dispatched")
        self.logger.info(str(self.phrasesGenerated) + " generated")
        self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
        self.logger.info("Generation complete")
        self.logger.info("Terminating generation dispatcher")
Code Example #15
0
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        self.analyzerIndex = self.corpusIndex + "__analysis__"
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.config["processingPageSize"] = self.processingPageSize
        self.totalDocumentsDispatched = 0
        self.documentsAnnotated = 0
        self.documentsNotAnnotated = 0
        self.lastDispatcher = False
        self.endProcess = False
        self.dispatcherName = "bayzee.annotation.dispatcher"
        self.workerName = "bayzee.annotation.worker"
        self.timeout = 86400000
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)

        analyzerIndexSettings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["standard", "lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type": "shingle",
                            "max_shingle_size": config["generator"]["maxShingleSize"],
                            "min_shingle_size": config["generator"]["minShingleSize"],
                            "output_unigrams": (config["generator"]["minShingleSize"] == 1)
                        },
                        "filter_stop": {
                            "type": "stop"
                        }
                    }
                }
            }
        }
        analyzerIndexTypeMapping = {
            "properties": {
                "phrase": {
                    "type": "string"
                },
                "document_id": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "phrase__not_analyzed": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
        corpusSize = self.esClient.count(index=self.corpusIndex,
                                         doc_type=self.corpusType,
                                         body={"query": {
                                             "match_all": {}
                                         }})
        self.corpusSize = corpusSize["count"]
        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        if processingStartIndex == 0:
            if self.esClient.indices.exists(self.analyzerIndex):
                self.esClient.indices.delete(self.analyzerIndex)
            data = self.esClient.indices.create(self.analyzerIndex,
                                                analyzerIndexSettings)

        if "annotateFromScratch" not in self.config or self.config[
                "annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(
                        self.config["processor"]["index"]):
                    self.esClient.indices.delete(
                        self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(
                    index=self.config["processor"]["index"],
                    doc_type=self.processorPhraseType,
                    body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex,
                                                    analyzerIndexSettings)
            except:
                error = sys.exc_info()
                self.logger.error(
                    "Error occurred during initialization of analyzer index: "
                    + str(error))
                sys.exit(1)
            else:
                sleep(1)

        #dispatcher creation
        self.annotationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
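
The analyzerIndexSettings built in this constructor define a custom analyzer_shingle analyzer whose shingle filter emits word n-grams between minShingleSize and maxShingleSize tokens long; those shingles are the candidate phrases the rest of the pipeline indexes and scores. A quick way to inspect what the analyzer produces is to run it directly against the analysis index, as sketched below; the index name and sample text are made up, and the exact indices.analyze signature differs between elasticsearch-py versions, so treat this as an illustration rather than the project's own tooling.

from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")

# Assuming the "__analysis__" index was created with the settings above and
# min/max shingle sizes of 2/3. Older clients take analyzer/text as keyword
# arguments; newer ones expect them inside the request body as shown here.
result = es.indices.analyze(index="corpus__analysis__",
                            body={"analyzer": "analyzer_shingle",
                                  "text": "fast fourier transform"})
print([t["token"] for t in result["tokens"]])
# e.g. ['fast fourier', 'fast fourier transform', 'fourier transform']
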
Code Example #16
0
class AnnotationDispatcher:
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        self.analyzerIndex = self.corpusIndex + "__analysis__"
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.config["processingPageSize"] = self.processingPageSize
        self.totalDocumentsDispatched = 0
        self.documentsAnnotated = 0
        self.documentsNotAnnotated = 0
        self.lastDispatcher = False
        self.endProcess = False
        self.dispatcherName = "bayzee.annotation.dispatcher"
        self.workerName = "bayzee.annotation.worker"
        self.timeout = 86400000
        if processingEndIndex != None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)

        analyzerIndexSettings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["standard", "lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type": "shingle",
                            "max_shingle_size": config["generator"]["maxShingleSize"],
                            "min_shingle_size": config["generator"]["minShingleSize"],
                            "output_unigrams": (config["generator"]["minShingleSize"] == 1)
                        },
                        "filter_stop": {
                            "type": "stop"
                        }
                    }
                }
            }
        }
        analyzerIndexTypeMapping = {
            "properties": {
                "phrase": {
                    "type": "string"
                },
                "document_id": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "phrase__not_analyzed": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
        corpusSize = self.esClient.count(index=self.corpusIndex,
                                         doc_type=self.corpusType,
                                         body={"query": {
                                             "match_all": {}
                                         }})
        self.corpusSize = corpusSize["count"]
        self.featureNames = map(lambda x: x["name"],
                                config["generator"]["features"])
        for module in config["processor"]["modules"]:
            self.featureNames = self.featureNames + map(
                lambda x: x["name"], module["features"])

        if processingStartIndex == 0:
            if self.esClient.indices.exists(self.analyzerIndex):
                self.esClient.indices.delete(self.analyzerIndex)
            data = self.esClient.indices.create(self.analyzerIndex,
                                                analyzerIndexSettings)

        if "annotateFromScratch" not in self.config or self.config[
                "annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(
                        self.config["processor"]["index"]):
                    self.esClient.indices.delete(
                        self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(
                    index=self.config["processor"]["index"],
                    doc_type=self.processorPhraseType,
                    body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex,
                                                    analyzerIndexSettings)
            except:
                error = sys.exc_info()
                self.logger.error(
                    "Error occurred during initialization of analyzer index: "
                    + str(error))
                sys.exit(1)
            else:
                sleep(1)

        #dispatcher creation
        self.annotationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)

    def dispatchToAnnotate(self):
        if "indexPhrases" in self.config and self.config[
                "indexPhrases"] == False:
            return
        nextDocumentIndex = 0
        if self.config["processingStartIndex"] != None:
            nextDocumentIndex = self.config["processingStartIndex"]
        endDocumentIndex = -1
        if self.config["processingEndIndex"] != None:
            endDocumentIndex = self.config["processingEndIndex"]

        if endDocumentIndex != -1 and self.processingPageSize > (
                endDocumentIndex - nextDocumentIndex):
            self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1

        self.totalDocumentsDispatched = 0

        while True:
            documents = self.esClient.search(
                index=self.corpusIndex,
                doc_type=self.corpusType,
                body={
                    "from": nextDocumentIndex,
                    "size": self.processingPageSize,
                    "query": {"match_all": {}},
                    "sort": [{"_id": {"order": "asc"}}]
                },
                fields=["_id"])
            if len(documents["hits"]["hits"]) == 0:
                break
            self.totalDocumentsDispatched += len(documents["hits"]["hits"])
            self.logger.info("Annotating " + str(nextDocumentIndex) + " to " +
                             str(nextDocumentIndex +
                                 len(documents["hits"]["hits"])) +
                             " documents...")
            for document in documents["hits"]["hits"]:
                self.logger.info("Dispatching document " + document["_id"])
                content = {
                    "documentId": document["_id"],
                    "type": "annotate",
                    "count": 1,
                    "from": self.dispatcherName
                }
                self.annotationDispatcher.send(content, self.workerName)
            nextDocumentIndex += len(documents["hits"]["hits"])
            if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex:
                break

        self.logger.info(
            str(self.totalDocumentsDispatched) + " documents dispatched")
        while True:
            message = self.annotationDispatcher.receive()
            if "documentId" in message[
                    "content"] and message["content"]["documentId"] > 0:
                self.documentsAnnotated += 1
                self.annotationDispatcher.close(message)
                self.logger.info("Annotated document " +
                                 message["content"]["documentId"] + " - " +
                                 str(self.documentsAnnotated) + "/" +
                                 str(self.totalDocumentsDispatched))

            if ((self.documentsAnnotated + self.documentsNotAnnotated) >= self.totalDocumentsDispatched
                    and not self.lastDispatcher):
                self.controlChannel.send("dying")
                self.annotationDispatcher.end()
                break

        self.__terminate()

    def timeoutCallback(self, message):
        if message["content"]["count"] < 5:
            message["content"]["count"] += 1
            self.annotationDispatcher.send(message["content"], self.workerName,
                                           self.timeout)
        else:
            # logging of expired documents is yet to be implemented
            self.documentsNotAnnotated += 1
            if (self.documentsNotAnnotated == self.totalDocumentsDispatched or
                    (self.documentsAnnotated + self.documentsNotAnnotated) == self.totalDocumentsDispatched):
                self.__terminate()

    def __terminate(self):
        self.logger.info(
            str(self.totalDocumentsDispatched) + " total dispatched")
        self.logger.info(str(self.documentsAnnotated) + " annotated")
        self.logger.info(
            str(self.documentsNotAnnotated) + " failed to annotate")
        self.logger.info("Annotation complete")
        self.logger.info("Terminating annotation dispatcher")

    def __deleteAnalyzerIndex(self):
        if self.esClient.indices.exists(self.analyzerIndex):
            self.esClient.indices.delete(self.analyzerIndex)
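
Driving this class follows the same pattern as the generation dispatcher: build the configuration dictionary, start worker processes on the bayzee.annotation.worker channel, and let dispatchToAnnotate page through the corpus. The sketch below is a hypothetical launch script; the module name, hosts, index names, and shingle sizes are assumptions, and the annotateFromScratch and indexPhrases flags simply exercise the branches visible in the class above.

# Hypothetical launch script: only the config keys mirror what
# AnnotationDispatcher reads above; every value here is illustrative.
import logging

from annotation_dispatcher import AnnotationDispatcher  # assumed module name

config = {
    "logger": logging.getLogger("bayzee"),
    "elasticsearch": {"host": "localhost", "port": 9200},
    "corpus": {"index": "corpus", "type": "document", "text_fields": ["title", "body"]},
    "processor": {"index": "bayzee", "type": "bayzee", "modules": []},
    "generator": {"features": [], "maxShingleSize": 3, "minShingleSize": 2},
    "processingPageSize": 100,
    "annotateFromScratch": True,  # recreate the processor and analysis indices
    "indexPhrases": True,         # False makes dispatchToAnnotate return immediately
}

# annotate documents 0..999; workers listening on "bayzee.annotation.worker"
# must acknowledge each message for the receive loop to finish
dispatcher = AnnotationDispatcher(config, 0, 1000)
dispatcher.dispatchToAnnotate()
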