def __init__(self, config, processingStartIndex, processingEndIndex):
  self.config = config
  self.logger = config["logger"]
  self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
  self.config["processingStartIndex"] = processingStartIndex
  self.config["processingEndIndex"] = processingEndIndex
  self.bagOfPhrases = {}
  self.corpusSize = 0
  self.processorIndex = config["processor"]["index"]
  self.processorType = config["processor"]["type"]
  self.processorPhraseType = config["processor"]["type"] + "__phrase"
  self.processingPageSize = config["processingPageSize"]
  config["processor_phrase_type"] = self.processorPhraseType
  self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
  for module in config["processor"]["modules"]:
    self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
  self.totalPhrasesDispatched = 0
  self.phrasesClassified = 0
  self.phrasesNotClassified = 0
  self.timeout = 86400000
  self.dispatcherName = "bayzee.classification.dispatcher"
  if processingEndIndex != None:
    self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
  self.workerName = "bayzee.classification.worker"
  # creating classification dispatcher
  self.classificationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
  # remote channel initialisation
  self.controlChannel = RemoteChannel(self.dispatcherName, config)
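# --- Illustrative usage sketch (not part of the original source) ---
# A minimal sketch of how a dispatcher with the constructor above might be
# instantiated. Only the config keys read by __init__ are shown; the logger
# setup and the processor/generator values are placeholder assumptions, and
# DurableChannel/RemoteChannel may require additional channel configuration
# that is not shown here.
import logging

exampleConfig = {
  "logger": logging.getLogger("bayzee"),
  "elasticsearch": {"host": "localhost", "port": 9200},
  "processor": {"index": "bayzee__processor", "type": "processor", "modules": []},
  "generator": {"features": [{"name": "max_score"}, {"name": "doc_count"}]},
  "processingPageSize": 100
}
# classificationDispatcher = ClassificationDispatcher(exampleConfig, 0, 1000)  # assumed class name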
class ClassificationWorker:
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.features = self.config["generator"]["features"]
    for module in self.config["processor"]["modules"]:
      self.features = self.features + module["features"]
    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    # creating worker
    self.worker = DurableChannel(self.workerName, config)

  def classify(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "classify":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        self.phraseId = message["content"]["phraseId"]
        if self.classifier == None:
          self.trainD = self.__loadDataFromES("train", None)
          self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
          self.__train()
        self.trainD = self.__loadDataFromES("train", None)
        testD = self.__loadDataFromES("test", self.trainD.domain)
        self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
        testD = orange.ExampleTable(self.trainD.domain, testD)
        for row in testD:
          phrase = row.getmetas().values()[0].value
          featureSet = {}
          for i, feature in enumerate(self.features):
            featureSet[feature["name"]] = row[i].value
          prob = self.classifier.prob_classify(featureSet).prob("1")
          classType = self.classifier.classify(featureSet)
          self.phraseData["_source"]["prob"] = prob
          self.phraseData["_source"]["class_type"] = classType
          self.logger.info("Classified '" + phrase + "' as " + classType + " with probability " + str(prob))
          self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId, body=self.phraseData["_source"])
        self.worker.reply(message, {"phraseId": self.phraseId, "status": "classified", "type": "reply"}, 120000000)
    self.logger.info("Terminating classification worker")

  def __getOrangeVariableForFeature(self, feature):
    if feature["isNumerical"]:
      return orange.FloatVariable(feature["name"])
    else:
      return orange.EnumVariable(feature["name"])

  def __loadDataFromES(self, dataType, domain):
    table = None
    if dataType != "train":
      table = orange.ExampleTable(domain)
    else:
      attributes = map(self.__getOrangeVariableForFeature, self.features)
      classAttribute = orange.EnumVariable("is_good", values=["0", "1"])
      domain = orange.Domain(attributes, classAttribute)
      domain.addmeta(orange.newmetaid(), orange.StringVariable("phrase"))
      table = orange.ExampleTable(domain)
    phrases = []
    if dataType == "train":
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_training":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    elif dataType == "holdout":
      phrasesCount = self.esClient.count(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}})
      size = phrasesCount["count"]
      phrases = self.esClient.search(index=self.processorIndex, doc_type=self.processorPhraseType, body={"query":{"terms":{"is_holdout":["1","0"]}}}, size=size)
      phrases = phrases["hits"]["hits"]
    else:
      self.phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId)
      phrases = [self.phraseData]
    for row in phrases:
      try:
        row = row["_source"]
        featureValues = []
        classType = "?"
        for feature in self.features:
          featureValues.append(row["features"][feature["name"]].encode("ascii"))
        if dataType == "train":
          classType = row["is_training"].encode("ascii", "ignore")
        elif dataType == "holdout":
          classType = row["is_holdout"].encode("ascii")
        example = None
        for i, featureValue in enumerate(featureValues):
          attr = domain.attributes[i]
          if type(attr) is orange.EnumVariable:
            attr.addValue(featureValue)
        example = orange.Example(domain, (featureValues + [classType]))
        example[domain.getmetas().items()[0][0]] = row["phrase"].encode("ascii")
        table.append(example)
      except:
        self.logger.error("Error classifying phrase '" + row["phrase"] + "'")
    return table

  def __train(self):
    for a in self.trainD.domain.attributes:
      self.logger.info("%s: %s" % (a.name, reduce(lambda x, y: x + ', ' + y, [i for i in a.values])))
    trainSet = []
    for row in self.trainD:
      phrase = row.getmetas().values()[0].value
      classType = row[-1].value
      featureSet = {}
      for i, feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value
      trainSet.append((featureSet, classType))
    self.logger.info("\nTraining Naive Bayes Classifier with " + str(len(trainSet)) + " phrases...")
    self.classifier = nltk.NaiveBayesClassifier.train(trainSet)
    self.classifier.show_most_informative_features(50)

  def __calculateMeasures(self):
    falsePositives = 0
    falseNegatives = 0
    truePositives = 0
    trueNegatives = 0
    totalPositives = 0
    totalNegatives = 0
    totalHoldOutGoodPhrases = 0
    totalHoldOutBadPhrases = 0
    self.trainD = self.__loadDataFromES("train", None)
    self.holdOutD = self.__loadDataFromES("holdout", self.trainD.domain)
    self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
    self.holdOutD = orange.ExampleTable(self.trainD.domain, self.holdOutD)
    for row in self.holdOutD:
      actualClassType = row[-1].value
      phrase = row.getmetas().values()[0].value
      featureSet = {}
      for i, feature in enumerate(self.features):
        featureSet[feature["name"]] = row[i].value
      if self.classifier == None:
        classifierFile = open(self.classifierFilePath)
        self.classifier = pickle.load(classifierFile)
        classifierFile.close()
      prob = self.classifier.prob_classify(featureSet).prob("1")
      classType = self.classifier.classify(featureSet)
      if classType == "1":
        totalPositives += 1
        if classType == actualClassType:
          truePositives += 1
      else:
        totalNegatives += 1
        if classType == actualClassType:
          trueNegatives += 1
      if actualClassType == "1":
        totalHoldOutGoodPhrases += 1
      else:
        totalHoldOutBadPhrases += 1
    precisionOfGood = 100.0 * truePositives / totalPositives
    recallOfGood = 100.0 * truePositives / totalHoldOutGoodPhrases
    fMeasureOfGood = 2.0 * precisionOfGood * recallOfGood / (precisionOfGood + recallOfGood)
    precisionOfBad = 100.0 * trueNegatives / totalNegatives
    recallOfBad = 100.0 * trueNegatives / totalHoldOutBadPhrases
    fMeasureOfBad = 2.0 * precisionOfBad * recallOfBad / (precisionOfBad + recallOfBad)
    self.logger.info("\nPrecision of Good: " + str(round(precisionOfGood, 2)) + "%")
    self.logger.info("Recall of Good: " + str(round(recallOfGood, 2)) + "%")
    self.logger.info("Balanced F-measure of Good: " + str(round(fMeasureOfGood, 2)) + "%")
    self.logger.info("Precision of Bad: " + str(round(precisionOfBad, 2)) + "%")
    self.logger.info("Recall of Bad: " + str(round(recallOfBad, 2)) + "%")
    self.logger.info("Balanced F-measure of Bad: " + str(round(fMeasureOfBad, 2)) + "%")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)
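# --- Illustrative sketch (not part of the original source) ---
# The nltk calls used by classify() and __train() above, shown on a tiny
# hand-made training set. Feature values are strings because the worker feeds
# discretized (binned) feature values to nltk; the feature names and values
# here are placeholder assumptions.
import nltk

exampleTrainSet = [
  ({"doc_count": "high", "max_score": "high"}, "1"),
  ({"doc_count": "low", "max_score": "low"}, "0"),
  ({"doc_count": "high", "max_score": "low"}, "1"),
  ({"doc_count": "low", "max_score": "high"}, "0"),
]
exampleClassifier = nltk.NaiveBayesClassifier.train(exampleTrainSet)

exampleFeatureSet = {"doc_count": "high", "max_score": "high"}
print(exampleClassifier.classify(exampleFeatureSet))                 # predicted class label
print(exampleClassifier.prob_classify(exampleFeatureSet).prob("1"))  # probability of the "good" class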
class AnnotationWorker:
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}

  def annotate(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "annotate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        documentId = message["content"]["documentId"]
        document = self.esClient.get(index=self.corpusIndex, doc_type=self.corpusType, id=documentId, fields=self.corpusFields)
        if "fields" in document:
          for field in self.corpusFields:
            shingles = []
            if field in document["fields"]:
              if type(document["fields"][field]) is list:
                for element in document["fields"][field]:
                  if len(element) > 0:
                    shingleTokens = self.esClient.indices.analyze(index=self.analyzerIndex, body=element, analyzer="analyzer_shingle")
                    shingles += shingleTokens["tokens"]
              else:
                if len(document["fields"][field]) > 0:
                  shingles = self.esClient.indices.analyze(index=self.analyzerIndex, body=document["fields"][field], analyzer="analyzer_shingle")["tokens"]
              shingles = map(self.__replaceUnderscore, shingles)
              shingles = filter(self.__filterTokens, shingles)
            if shingles != None and len(shingles) > 0:
              for shingle in shingles:
                phrase = shingle["token"]
                key = self.__keyify(phrase)
                if len(key) > 0:
                  data = {"phrase": phrase, "phrase__not_analyzed": phrase, "document_id": document["_id"]}
                  if not self.esClient.exists(index=self.processorIndex, doc_type=self.processorPhraseType, id=key):
                    self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=key, body=data)
        sleep(1)
        for processorInstance in self.config["processor_instances"]:
          processorInstance.annotate(self.config, documentId)
        self.worker.reply(message, {"documentId": documentId, "status": "processed", "type": "reply"}, self.timeout)
    self.logger.info("Terminating annotation worker")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)

  def __keyify(self, phrase):
    phrase = phrase.strip()
    if len(phrase) == 0:
      return ""
    key = re.sub("[^A-Za-z0-9]", " ", phrase)
    key = " ".join(key.split())
    key = key.lower()
    key = "-".join(key.split())
    return key

  def __replaceUnderscore(self, shingle):
    token = shingle["token"]
    token = token.replace("_", "")
    token = re.sub('\s+', ' ', token).strip()
    shingle["token"] = token
    return shingle

  def __filterTokens(self, shingle):
    global esStopWords
    tokens = shingle["token"].split(" ")
    firstToken = tokens[0]
    lastToken = tokens[-1]
    isValid = True
    isValid = (isValid and lastToken != None)
    isValid = (isValid and len(lastToken) > 1)
    isValid = (isValid and not firstToken.replace(".", "", 1).isdigit())
    isValid = (isValid and not lastToken.replace(".", "", 1).isdigit())
    isValid = (isValid and firstToken not in esStopWords)
    isValid = (isValid and lastToken not in esStopWords)
    return isValid
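# --- Illustrative sketch (not part of the original source) ---
# The same key derivation used by __keyify above, shown standalone so the
# transformation from phrase to document id is easy to follow.
import re

def exampleKeyify(phrase):
  phrase = phrase.strip()
  if len(phrase) == 0:
    return ""
  key = re.sub("[^A-Za-z0-9]", " ", phrase)  # replace punctuation with spaces
  key = " ".join(key.split())                # collapse repeated whitespace
  key = key.lower()
  return "-".join(key.split())               # join words with hyphens

assert exampleKeyify("  Naive Bayes, classifier! ") == "naive-bayes-classifier"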
class GenerationWorker:
  def __init__(self, config, trainingDataset, holdOutDataset):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.timeout = 6000000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = count["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    self.workerName = "bayzee.generation.worker"
    self.dispatchers = {}
    # creating worker
    self.worker = DurableChannel(self.workerName, config)

  def generate(self):
    self.__extractFeatures()

  def __extractFeatures(self):
    while True:
      message = self.worker.receive()
      if message["content"] == "kill":
        message["responseId"] = message["requestId"]
        self.worker.close(message)
        if len(self.dispatchers) == 0:
          self.worker.end()
          break
        else:
          self.worker.send(content="kill", to=self.workerName)
          continue
      elif message["content"]["type"] == "generate":
        if message["content"]["from"] not in self.dispatchers:
          self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
          self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
        phraseId = message["content"]["phraseId"]
        phraseData = self.esClient.get(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId)
        floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
        token = phraseData["_source"]["phrase"]
        documentId = phraseData["_source"]["document_id"]
        self.logger.info("Extracting common features for phrase '" + token + "'")
        entry = {}
        shouldMatch = map(lambda x: {"match_phrase": {x: token}}, self.corpusFields)
        query = {"query": {"bool": {"should": shouldMatch}}}
        data = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body=query, explain=True, size=self.corpusSize)
        entry["max_score"] = 0
        maxScore = 0
        avgScore = 0
        maxTermFrequency = 0
        avgTermFrequency = 0
        for hit in data["hits"]["hits"]:
          avgScore += float(hit["_score"])
          numOfScores = 0
          hitTermFrequency = 0
          explanation = json.dumps(hit["_explanation"])
          while len(explanation) > len(token):
            indexOfToken = explanation.find("tf(") + len("tf(")
            if indexOfToken < len("tf("):
              break
            explanation = explanation[indexOfToken:]
            freqToken = explanation.split(")")[0]
            explanation = explanation.split(")")[1]
            if freqToken.find("freq=") >= 0:
              numOfScores += 1
              hitTermFrequency += float(freqToken.split("=")[1])
          if numOfScores > 0:
            hitTermFrequency = hitTermFrequency / numOfScores
          if maxTermFrequency < hitTermFrequency:
            maxTermFrequency = hitTermFrequency
          avgTermFrequency += hitTermFrequency
        if len(data["hits"]["hits"]) > 0:
          avgTermFrequency = avgTermFrequency * 1.0 / len(data["hits"]["hits"])
        if int(data["hits"]["total"]) > 0:
          avgScore = (avgScore * 1.0) / int(data["hits"]["total"])
        if data["hits"]["max_score"] != None:
          maxScore = data["hits"]["max_score"]
        if "max_score" in self.featureNames:
          entry["max_score"] = floatPrecision.format(float(maxScore))
        if "doc_count" in self.featureNames:
          entry["doc_count"] = floatPrecision.format(float(data["hits"]["total"]))
        if "avg_score" in self.featureNames:
          entry["avg_score"] = floatPrecision.format(float(avgScore))
        if "max_term_frequency" in self.featureNames:
          entry["max_term_frequency"] = floatPrecision.format(float(maxTermFrequency))
        if "avg_term_frequency" in self.featureNames:
          entry["avg_term_frequency"] = floatPrecision.format(float(avgTermFrequency))
        # get additional features
        for processorInstance in self.config["processor_instances"]:
          processorInstance.extractFeatures(self.config, token, entry)
        phraseData["_source"]["features"] = entry
        if token in self.trainingDataset:
          phraseData["_source"]["is_training"] = self.trainingDataset[token].strip()
        if token in self.holdOutDataset:
          phraseData["_source"]["is_holdout"] = self.holdOutDataset[token].strip()
        self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=phraseId, body=phraseData["_source"])
        self.worker.reply(message, {"phraseId": phraseId, "status": "generated", "type": "reply"}, 120000000)
      if message["content"]["type"] == "stop_dispatcher":
        self.worker.reply(message, {"phraseId": -1, "status": "stop_dispatcher", "type": "stop_dispatcher"}, self.timeout)
    self.logger.info("Terminating generation worker")

  def unregisterDispatcher(self, dispatcher, message):
    if message == "dying":
      self.dispatchers.pop(dispatcher, None)
      if len(self.dispatchers) == 0:
        self.worker.send(content="kill", to=self.workerName)
class GenerationDispatcher:
  def __init__(self, config, trainingDataset, holdOutDataset, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.totalPhrasesDispatched = 0
    self.phrasesGenerated = 0
    self.phrasesNotGenerated = 0
    self.timeout = 86400000
    self.dispatcherName = "bayzee.generation.dispatcher"
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    self.workerName = "bayzee.generation.worker"
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    config["processor_phrase_type"] = self.processorPhraseType
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    # creating generation dispatcher
    self.generationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    # creating control channel
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToGenerate(self):
    processorIndex = self.config["processor"]["index"]
    phraseProcessorType = self.config["processor"]["type"] + "__phrase"
    nextPhraseIndex = 0
    if self.config["processingStartIndex"] != None:
      nextPhraseIndex = self.config["processingStartIndex"]
    endPhraseIndex = -1
    if self.config["processingEndIndex"] != None:
      endPhraseIndex = self.config["processingEndIndex"]
    if endPhraseIndex != -1 and self.processingPageSize > (endPhraseIndex - nextPhraseIndex):
      self.processingPageSize = endPhraseIndex - nextPhraseIndex + 1
    while True:
      phrases = self.esClient.search(index=processorIndex, doc_type=phraseProcessorType, body={"from": nextPhraseIndex, "size": self.processingPageSize, "query": {"match_all": {}}, "sort": [{"phrase__not_analyzed": {"order": "asc"}}]}, fields=["_id"])
      if len(phrases["hits"]["hits"]) == 0:
        break
      self.totalPhrasesDispatched += len(phrases["hits"]["hits"])
      floatPrecision = "{0:." + str(self.config["generator"]["floatPrecision"]) + "f}"
      self.logger.info("Generating features from " + str(nextPhraseIndex) + " to " + str(nextPhraseIndex + len(phrases["hits"]["hits"])) + " phrases...")
      for phraseData in phrases["hits"]["hits"]:
        self.logger.info("Dispatching phrase " + phraseData["_id"])
        content = {"phraseId": phraseData["_id"], "type": "generate", "count": 1, "from": self.dispatcherName}
        self.generationDispatcher.send(content, self.workerName, self.timeout)
      nextPhraseIndex += len(phrases["hits"]["hits"])
      if endPhraseIndex != -1 and nextPhraseIndex >= endPhraseIndex:
        break
    while True:
      message = self.generationDispatcher.receive()
      if "phraseId" in message["content"] and message["content"]["phraseId"] > 0:
        self.phrasesGenerated += 1
        self.generationDispatcher.close(message)
        self.logger.info("Generated features for " + message["content"]["phraseId"] + " - " + str(self.phrasesGenerated) + "/" + str(self.totalPhrasesDispatched))
      if (self.phrasesGenerated + self.phrasesNotGenerated) >= self.totalPhrasesDispatched:
        self.controlChannel.send("dying")
        break
    self.__terminate()

  def timeoutCallback(self, message):
    self.logger.info("Message timed out: " + str(message))
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.generationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      # logging of expired phrases is yet to be implemented
      self.phrasesNotGenerated += 1
      if self.phrasesNotGenerated == self.totalPhrasesDispatched or (self.phrasesGenerated + self.phrasesNotGenerated) == self.totalPhrasesDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalPhrasesDispatched) + " total dispatched")
    self.logger.info(str(self.phrasesGenerated) + " generated")
    self.logger.info(str(self.phrasesNotGenerated) + " failed to generate")
    self.logger.info("Generation complete")
    self.logger.info("Terminating generation dispatcher")
class AnnotationDispatcher:
  def __init__(self, config, processingStartIndex, processingEndIndex):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    if processingEndIndex != None:
      self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    analyzerIndexSettings = {
      "index": {
        "analysis": {
          "analyzer": {
            "analyzer_shingle": {
              "type": "custom",
              "tokenizer": "standard",
              "filter": ["standard", "lowercase", "filter_shingle"]
            }
          },
          "filter": {
            "filter_shingle": {
              "type": "shingle",
              "max_shingle_size": config["generator"]["maxShingleSize"],
              "min_shingle_size": config["generator"]["minShingleSize"],
              "output_unigrams": (config["generator"]["minShingleSize"] == 1)
            },
            "filter_stop": {
              "type": "stop"
            }
          }
        }
      }
    }
    analyzerIndexTypeMapping = {
      "properties": {
        "phrase": {"type": "string"},
        "document_id": {"type": "string", "index": "not_analyzed"},
        "phrase__not_analyzed": {"type": "string", "index": "not_analyzed"}
      }
    }
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query":{"match_all":{}}})
    self.corpusSize = corpusSize["count"]
    self.featureNames = map(lambda x: x["name"], config["generator"]["features"])
    for module in config["processor"]["modules"]:
      self.featureNames = self.featureNames + map(lambda x: x["name"], module["features"])
    if processingStartIndex == 0:
      if self.esClient.indices.exists(self.analyzerIndex):
        self.esClient.indices.delete(self.analyzerIndex)
      data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings)
    if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
      try:
        if self.esClient.indices.exists(self.config["processor"]["index"]):
          self.esClient.indices.delete(self.config["processor"]["index"])
        self.esClient.indices.create(self.config["processor"]["index"])
        self.esClient.indices.put_mapping(index=self.config["processor"]["index"], doc_type=self.processorPhraseType, body=analyzerIndexTypeMapping)
        if self.esClient.indices.exists(self.analyzerIndex):
          self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings)
      except:
        error = sys.exc_info()
        self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
        sys.exit(1)
    else:
      sleep(1)
    # dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    # remote channel initialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)

  def dispatchToAnnotate(self):
    if "indexPhrases" in self.config and self.config["indexPhrases"] == False:
      return
    nextDocumentIndex = 0
    if self.config["processingStartIndex"] != None:
      nextDocumentIndex = self.config["processingStartIndex"]
    endDocumentIndex = -1
    if self.config["processingEndIndex"] != None:
      endDocumentIndex = self.config["processingEndIndex"]
    if endDocumentIndex != -1 and self.processingPageSize > (endDocumentIndex - nextDocumentIndex):
      self.processingPageSize = endDocumentIndex - nextDocumentIndex + 1
    self.totalDocumentsDispatched = 0
    while True:
      documents = self.esClient.search(index=self.corpusIndex, doc_type=self.corpusType, body={"from": nextDocumentIndex, "size": self.processingPageSize, "query": {"match_all": {}}, "sort": [{"_id": {"order": "asc"}}]}, fields=["_id"])
      if len(documents["hits"]["hits"]) == 0:
        break
      self.totalDocumentsDispatched += len(documents["hits"]["hits"])
      self.logger.info("Annotating " + str(nextDocumentIndex) + " to " + str(nextDocumentIndex + len(documents["hits"]["hits"])) + " documents...")
      for document in documents["hits"]["hits"]:
        self.logger.info("Dispatching document " + document["_id"])
        content = {"documentId": document["_id"], "type": "annotate", "count": 1, "from": self.dispatcherName}
        self.annotationDispatcher.send(content, self.workerName)
      nextDocumentIndex += len(documents["hits"]["hits"])
      if endDocumentIndex != -1 and endDocumentIndex <= nextDocumentIndex:
        break
    self.logger.info(str(self.totalDocumentsDispatched) + " documents dispatched")
    while True:
      message = self.annotationDispatcher.receive()
      if "documentId" in message["content"] and message["content"]["documentId"] > 0:
        self.documentsAnnotated += 1
        self.annotationDispatcher.close(message)
        self.logger.info("Annotated document " + message["content"]["documentId"] + " - " + str(self.documentsAnnotated) + "/" + str(self.totalDocumentsDispatched))
      if (self.documentsAnnotated + self.documentsNotAnnotated) >= self.totalDocumentsDispatched and not self.lastDispatcher:
        self.controlChannel.send("dying")
        self.annotationDispatcher.end()
        break
    self.__terminate()

  def timeoutCallback(self, message):
    if message["content"]["count"] < 5:
      message["content"]["count"] += 1
      self.annotationDispatcher.send(message["content"], self.workerName, self.timeout)
    else:
      # logging of expired documents is yet to be implemented
      self.documentsNotAnnotated += 1
      if self.documentsNotAnnotated == self.totalDocumentsDispatched or (self.documentsAnnotated + self.documentsNotAnnotated) == self.totalDocumentsDispatched:
        self.__terminate()

  def __terminate(self):
    self.logger.info(str(self.totalDocumentsDispatched) + " total dispatched")
    self.logger.info(str(self.documentsAnnotated) + " annotated")
    self.logger.info(str(self.documentsNotAnnotated) + " failed to annotate")
    self.logger.info("Annotation complete")
    self.logger.info("Terminating annotation dispatcher")

  def __deleteAnalyzerIndex(self):
    if self.esClient.indices.exists(self.analyzerIndex):
      self.esClient.indices.delete(self.analyzerIndex)