def run(self, productid, datapath):
    trainingId = self.trainingId
    try:
        caseTitlesFetcher = CaseTitlesFetcher(self.trainingConfig, self.trainingId)
        caseTitlesFetcher.fetchCaseTitles(productid, datapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "CaseTitlesFetcher: Successfully fetched & extracted case titles from Kusto")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]CaseTitlesFetcher: " + str(e))
        raise TrainingException("CaseTitlesFetcher: " + str(e))
    try:
        if self.trainingConfig.stackoverflowKey:
            sfFetcher = StackOverFlowFetcher(self.trainingConfig.stackoverflowKey, self.trainingConfig, self.trainingId)
            sfFetcher.fetchStackOverflowTitles(productid, datapath)
            loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Successfully fetched stack overflow question titles")
        else:
            loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Stackoverflow API Key not provided")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: " + str(e))
        raise TrainingException("StackOverFlowFetcher: " + str(e))
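
# --- Sketch (not part of the original module): minimal stand-ins for loggerInstance and
# TrainingException, which are defined elsewhere in this repo. They exist only so the
# snippets in this file can be exercised in isolation; the real logger presumably does
# more than append lines to a file.
import os


class TrainingException(Exception):
    """Raised when any stage of the training pipeline fails."""
    pass


class _FileLogger:
    def __init__(self, logdir="."):
        self.logdir = logdir

    def logToFile(self, filename, message):
        # Append one message per line to the given log file.
        with open(os.path.join(self.logdir, filename), "a") as f:
            f.write(message + "\n")


loggerInstance = _FileLogger()
# --- End of sketch.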
def get_Tag_Questions(self, tag):
    trainingId = self.trainingId
    pagenum = 1
    items = []
    while True:
        try:
            url = "http://api.stackexchange.com/2.2/questions?key={0}&site=stackoverflow&page={1}&order=desc&sort=votes&tagged={2}&filter=default".format(self.key, pagenum, tag)
            req = requests.get(url=url)
            if req.status_code != 200:
                loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]TagDownloader: Tag " + str(tag) + " - " + str(req.json()['error_message']))
                raise TrainingException("TagDownloader: Tag " + str(tag) + " - " + str(req.json()['error_message']))
            content = req.json()
            # Keep only questions that have at least one upvote or one answer
            items += [{"text": x["title"], "links": [x["link"]], "qid": x["question_id"]} for x in content["items"] if (x["score"] > 0 or x["answer_count"] > 0)]
            if len(items) > self.trainingConfig.stackoverFlowTopN:
                break
            if not content["has_more"]:
                break
            pagenum += 1
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: Tag " + str(tag) + " - " + str(e))
            raise TrainingException("StackOverFlowFetcher: " + str(e))
    loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Fetched " + str(len(items)) + " questions for tag " + str(tag))
    return items
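
# --- Sketch (not part of the original module): a standalone version of the pagination
# loop in get_Tag_Questions, runnable without the surrounding class. The endpoint,
# filters, and stop conditions mirror the method above; the key parameter is optional
# for low request volumes, and max_items stands in for trainingConfig.stackoverFlowTopN.
import requests


def fetch_tag_questions(tag, key="", max_items=100):
    items, pagenum = [], 1
    while True:
        url = ("https://api.stackexchange.com/2.2/questions?key={0}&site=stackoverflow"
               "&page={1}&order=desc&sort=votes&tagged={2}&filter=default").format(key, pagenum, tag)
        content = requests.get(url).json()
        # Keep only questions with at least one upvote or one answer, as above.
        items += [{"text": q["title"], "links": [q["link"]], "qid": q["question_id"]}
                  for q in content.get("items", [])
                  if q["score"] > 0 or q["answer_count"] > 0]
        if len(items) > max_items or not content.get("has_more"):
            return items
        pagenum += 1
# --- End of sketch.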
def runCaseTitlesExtraction(self, df, productid, datapath):
    trainingId = self.trainingId
    if self.trainingConfig.downloadCaseTitlesEnabled and df is not None and not df.empty:
        df["Incidents_SupportTopicL2Current"] = df["Incidents_SupportTopicL2Current"].fillna("NOSELECTION")
        df["Incidents_SupportTopicL3Current"] = df["Incidents_SupportTopicL3Current"].fillna("NOSELECTION")
        groups = df.groupby(["Incidents_SupportTopicL2Current", "Incidents_SupportTopicL3Current"])
        loggerInstance.logToFile("{0}.log".format(trainingId), "RunCaseTitlesExtraction: Processing " + str(df.shape[0]) + " case titles across " + str(len(groups)) + " categories")
        results = sorted(list(itertools.chain.from_iterable([self.extractor(key, group) for key, group in groups])), key=lambda x: x["text"])
    else:
        results = []
    try:
        sampleUtterances = json.loads(open(os.path.join(datapath, "SampleUtterances.json"), "r").read())
        # Merge the newly extracted titles into the existing incident titles (kept sorted
        # by text), de-duplicating case links for titles that already exist
        for x in results:
            found = False
            for y in sampleUtterances["incidenttitles"]:
                if x["text"] < y["text"]:
                    break
                elif x["text"] == y["text"]:
                    y["links"] += x["links"]
                    y["links"] = list(set(y["links"]))
                    found = True
                    break
            if not found:
                sampleUtterances["incidenttitles"].append(x)
        open(os.path.join(datapath, "SampleUtterances.json"), "w").write(json.dumps(sampleUtterances, indent=4))
        loggerInstance.logToFile("{0}.log".format(trainingId), "RunCaseTitlesExtraction: Successfully written extracted case titles to file SampleUtterances.json")
    except FileNotFoundError:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]RunCaseTitlesExtraction: File SampleUtterances.json does not exist, creating new file.")
        open(os.path.join(datapath, "SampleUtterances.json"), "w").write(json.dumps({"incidenttitles": results, "stackoverflowtitles": []}, indent=4))
def fetchDetectors(self, productid, datapath):
    trainingId = self.trainingId
    try:
        content = json.loads(requests.get(self.detectorsUrl).content)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Fetched " + str(len(content)) + " detectors")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
        raise TrainingException("DetectorsFetcher: " + str(e))
    # Keep only detectors whose resource filter maps to the requested product id
    detectors = [detector for detector in content if productid in getProductId(detector["resourceFilter"])]
    loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Shortlisted " + str(len(detectors)) + " detectors for training based on productId " + str(productid))
    for detector in detectors:
        if detector["metadata"]:
            md = json.loads(detector["metadata"])
            detector["utterances"] = md["utterances"] if "utterances" in md else []
        else:
            detector["utterances"] = []
    if len(content) > 0:
        try:
            open(os.path.join(datapath, "Detectors.json"), "w").write(json.dumps(detectors, indent=4))
            loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Written detectors to file Detectors.json")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
            raise TrainingException("DetectorsFetcher: " + str(e))
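
# --- Sketch (not part of the original module): the shape of one record in the
# Detectors.json file written above, as later consumed by trainModel. All field values
# are illustrative; "utterances" is populated from the detector's metadata, and
# "resourceFilter" is the object passed to getProductId() when shortlisting.
EXAMPLE_DETECTOR_RECORD = {
    "id": "example-detector-id",            # hypothetical id
    "name": "Example detector",             # hypothetical name
    "description": "What the detector checks for",
    "resourceFilter": {},
    "metadata": "{\"utterances\": []}",
    "utterances": [{"text": "sample utterance"}],
}
# --- End of sketch.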
def prepareDataForTraining(self, productid):
    config = json.loads(open("resourceConfig/config.json", "r").read())
    trainingId = self.trainingId
    # check product in training config
    rawdatapath = "rawdata_{0}".format(productid)
    try:
        os.makedirs(rawdatapath)
    except FileExistsError:
        pass
    loggerInstance.logToFile("{0}.log".format(trainingId), "Created folders for raw data")
    try:
        sampleUtterancesFetcher = SampleUtterancesFetcher(self.trainingConfig, self.trainingId)
        sampleUtterancesFetcher.run(productid, rawdatapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "SampleUtterancesFetcher: Successfully fetched & extracted sample utterances")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]SampleUtterancesFetcher: " + str(e))
        raise TrainingException("SampleUtterancesFetcher: " + str(e))
    try:
        detectorsFetcher = DetectorsFetcher("http://localhost:{0}/internal/detectors".format(config["internalApiPort"]), self.trainingId)
        detectorsFetcher.fetchDetectors(productid, rawdatapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Successfully fetched detectors")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
        raise TrainingException("DetectorsFetcher: " + str(e))
    gc.collect()
def fetchStackOverflowTitles(self, productid, datapath):
    trainingId = self.trainingId
    questions = []
    try:
        questions = json.loads(open(os.path.join(datapath, "SampleUtterances.json"), "r").read())["stackoverflowtitles"]
    except Exception:
        questions = []
    print("TAG DOWNLOAD SET TO --", self.trainingConfig.downloadStackoverflowEnabled)
    if self.trainingConfig.downloadStackoverflowEnabled:
        # Get tags for product id
        tags = self.trainingConfig.stackoverflowTags
        # Fetch questions for each tag, skipping question ids that are already present
        for tag in tags:
            qids = [x["qid"] for x in questions]
            questions += [q for q in self.get_Tag_Questions(tag) if q["qid"] not in qids]
    try:
        sampleUtterances = json.loads(open(os.path.join(datapath, "SampleUtterances.json"), "r").read())
        sampleUtterances["stackoverflowtitles"] = questions
        open(os.path.join(datapath, "SampleUtterances.json"), "w").write(json.dumps(sampleUtterances))
        loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Successfully written stackoverflow questions to file SampleUtterances.json")
    except FileNotFoundError:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: File SampleUtterances.json does not exist, creating new file.")
        sampleUtterances = {"incidenttitles": [], "stackoverflowtitles": questions}
        open(os.path.join(datapath, "SampleUtterances.json"), "w").write(json.dumps(sampleUtterances))
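
# --- Sketch (not part of the original module): the layout of SampleUtterances.json as
# written by fetchStackOverflowTitles and runCaseTitlesExtraction. Entry contents are
# illustrative; "incidenttitles" is kept sorted by "text" so the merge loop in
# runCaseTitlesExtraction can stop early.
EXAMPLE_SAMPLE_UTTERANCES = {
    "incidenttitles": [
        {"text": "app keeps restarting.",
         "links": ["https://azuresupportcenter.msftcloudes.com/caseoverview?srId=<incident-id>"],
         "category": "<SupportTopicL2>--<SupportTopicL3>"},
    ],
    "stackoverflowtitles": [
        {"text": "How do I configure X?",
         "links": ["https://stackoverflow.com/q/<question-id>"],
         "qid": 123456},  # illustrative question id
    ],
}
# --- End of sketch.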
def extractor(self, key, group):
    trainingId = self.trainingId
    category = key[0] + "--" + key[1]
    lines = [(self.endSentence(row["CleanCaseTitles"]), row["SupportCenterCaseLink"]) for ind, row in group.iterrows()]
    resultTitles = []
    if self.trainingConfig.runExtractionEnabled and len(lines) > 10:
        numsentences = group.shape[0]
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Running extractor on category " + category + " containing " + str(numsentences) + " case titles")
        doc = " ".join([x[0] for x in lines])
        keysentences = retrieveSentences(doc, max([10, int(numsentences * self.trainingConfig.extractionRatio)]) * 10)
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Extracted " + str(len(keysentences)) + " sentences.")
        for sent in keysentences:
            # Map each extracted sentence back to its support-center case links:
            # first an exact match on the squeezed text, then a substring match,
            # then a match after stripping all non-alphanumeric characters
            caselinks = [x[1] for x in lines if self.squeeze(x[0]) == self.squeeze(sent)]
            if not caselinks:
                caselinks = [x[1] for x in lines if self.squeeze(sent) in self.squeeze(x[0])]
            if not caselinks:
                caselinks = [x[1] for x in lines if re.sub('[^0-9a-zA-Z]+', '', sent) == re.sub('[^0-9a-zA-Z]+', '', x[0])]
            if caselinks:
                resultTitles.append({"text": sent, "links": caselinks, "category": category})
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Disabled or not enough lines for summarization")
        # Wrap each link in a list so "links" has the same shape as in the extraction branch
        resultTitles = [{"text": x[0], "links": [x[1]], "category": category} for x in lines]
    return resultTitles
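
# --- Sketch (not part of the original module): hypothetical versions of the helpers
# used by extractor(). The real endSentence() and squeeze() are defined elsewhere in
# the repo; these stand-ins only illustrate the kind of normalisation the matching
# logic above appears to rely on (consistent sentence terminators, case and whitespace
# folding), and are assumptions rather than the actual implementations.
import re


def endSentence(text):
    # Ensure every title ends with a sentence terminator before titles are joined.
    text = text.strip()
    return text if text.endswith((".", "?", "!")) else text + "."


def squeeze(text):
    # Fold case and collapse whitespace so near-identical titles compare equal.
    return re.sub(r"\s+", " ", text.lower()).strip()
# --- End of sketch.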
def fetchCaseTitles(self, productid, datapath):
    trainingId = self.trainingId
    if self.trainingConfig.downloadCaseTitlesEnabled:
        ndays = int(self.trainingConfig.caseTitlesDaysSince)
        try:
            db = "Product360"
            query = """cluster('usage360').database('Product360').AllCloudSupportIncidentDataWithP360MetadataMapping
            | where DerivedProductIDStr in ('{0}')
            | where Incidents_CreatedTime >= ago({1}d)
            | summarize IncidentTime = any(Incidents_CreatedTime) by Incidents_IncidentId, Incidents_Severity, Incidents_ProductName, Incidents_SupportTopicL2Current, Incidents_SupportTopicL3Current, Incidents_Title
            | extend SupportCenterCaseLink = strcat('https://azuresupportcenter.msftcloudes.com/caseoverview?srId=', Incidents_IncidentId)
            | order by Incidents_SupportTopicL3Current asc""".format(productid, ndays)
            response = self.kustoClient.execute(db, query)
        except Exception as e:
            raise TrainingException("KustoFetcher: " + str(e))
        try:
            df = dataframe_from_result_table(response.primary_results[0])
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df.shape[0]) + " incidents fetched")

            # Remove all non-English cases
            df["isEnglish"] = df["Incidents_Title"].map(self.isEnglish)
            df_eng = df[df["isEnglish"] == True].copy()
            del df_eng["isEnglish"]
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df.shape[0] - df_eng.shape[0]) + " non English language cases removed")

            # Drop all cases with character length 3 or less
            mask = (df_eng["Incidents_Title"].str.len() > 3)
            df_eng_1 = df_eng[mask].copy()

            # Extract case title from piped sentences
            df_eng_1["Incidents_Title_PipeCleansed"] = df_eng_1["Incidents_Title"].map(self.pipeCleansing)

            # Remove any content in square brackets
            df_eng_1["Incidents_Title_PipeCleansed"] = df_eng_1["Incidents_Title_PipeCleansed"].map(lambda x: re.sub(r"\[.*?\]", "", x))

            # Remove any remaining titles with character length 3 or less
            mask = (df_eng_1["Incidents_Title_PipeCleansed"].str.len() > 3)
            df_eng_2 = df_eng_1[mask]

            # Remove any garbage phrases (defined in garbage list)
            mask = (df_eng_2["Incidents_Title_PipeCleansed"].isin(self.garbageList))
            df_eng_clean = df_eng_2[~mask].copy()
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df_eng.shape[0] - df_eng_clean.shape[0]) + " garbage case title incidents removed")

            # Remove any cases with two or fewer words (except for short phrases that make sense)
            df_eng_clean["wordcount"] = df_eng_clean["Incidents_Title_PipeCleansed"].map(lambda x: len([a for a in x.split() if len(a) > 2]))
            df_eng_clean["drop"] = df_eng_clean[["Incidents_Title_PipeCleansed", "wordcount"]].apply(lambda x: (x["Incidents_Title_PipeCleansed"] not in self.shortPhrases) and (x["wordcount"] < 2), axis=1)
            df_eng_clean = df_eng_clean[df_eng_clean["drop"] == False]
            del df_eng_clean["drop"]
            del df_eng_clean["wordcount"]
            df_eng_clean["CleanCaseTitles"] = df_eng_clean["Incidents_Title_PipeCleansed"]
            del df_eng_clean["Incidents_Title_PipeCleansed"]
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df_eng_clean.shape[0]) + " incidents will be processed for summarization")
        except Exception as e:
            raise TrainingException("DataCleansing: " + str(e))
        try:
            self.runCaseTitlesExtraction(df_eng_clean, productid, datapath)
        except Exception as e:
            raise TrainingException("CaseTitleExtraction: " + str(e))
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId), "CaseTitleExtraction: Disabled")
        try:
            self.runCaseTitlesExtraction(None, productid, datapath)
        except Exception as e:
            raise TrainingException("CaseTitleExtraction: " + str(e))
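
# --- Sketch (not part of the original module): the bracket-stripping, length, and
# word-count filters from the cleansing pipeline above, applied to a tiny in-memory
# DataFrame. shortPhrases stands in for self.shortPhrases and the titles are
# illustrative; the isEnglish and pipeCleansing steps are omitted here.
import re
import pandas as pd

shortPhrases = {"vm down"}
df = pd.DataFrame({"Incidents_Title_PipeCleansed": ["[Urgent] App keeps restarting", "ok", "vm down", "db slow"]})

# Remove any content in square brackets
df["Incidents_Title_PipeCleansed"] = df["Incidents_Title_PipeCleansed"].map(lambda x: re.sub(r"\[.*?\]", "", x).strip())
# Drop titles with 3 characters or fewer
df = df[df["Incidents_Title_PipeCleansed"].str.len() > 3].copy()
# Drop titles with fewer than two "real" words unless they are whitelisted short phrases
df["wordcount"] = df["Incidents_Title_PipeCleansed"].map(lambda x: len([a for a in x.split() if len(a) > 2]))
df = df[(df["wordcount"] >= 2) | (df["Incidents_Title_PipeCleansed"].isin(shortPhrases))]
print(df["Incidents_Title_PipeCleansed"].tolist())  # ['App keeps restarting', 'vm down']
# --- End of sketch.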
def trainModel(trainingId, productid, trainingConfig):
    loggerInstance.logToFile("{0}.log".format(trainingId), json.dumps(trainingConfig.__dict__))
    datapath = "rawdata_{0}".format(productid)
    outpath = "{0}".format(productid)
    try:
        os.mkdir(datapath)
    except FileExistsError:
        pass
    try:
        os.mkdir(outpath)
    except FileExistsError:
        pass
    loggerInstance.logToFile("{0}.log".format(trainingId), "Created folders for raw data and processed models")
    try:
        dataProcessor = DataProcessor(trainingConfig, trainingId)
        dataProcessor.prepareDataForTraining(productid)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DataFetcher: Successfully fetched and processed data for training")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DataFetcher: " + str(e))
        raise TrainingException("DataFetcher: " + str(e))
    try:
        detectorsdata = open(os.path.join(datapath, "Detectors.json"), "r").read()
        detectors = json.loads(detectorsdata)
        if trainingConfig.detectorContentSplitted:
            # Keep name, description, and each utterance as separate token lists and
            # remember which index range belongs to which detector
            detector_mappings = []
            detector_tokens = []
            i = 0
            for x in detectors:
                detector_mappings.append({"startindex": i, "endindex": i + len(x["utterances"]) + 1, "id": x["id"]})
                detector_tokens += [getAllNGrams(x["name"], trainingConfig.textNGrams)] + [getAllNGrams(x["description"], trainingConfig.textNGrams)] + [getAllNGrams(y["text"], trainingConfig.textNGrams) for y in x["utterances"]]
                i += (len(x["utterances"]) + 2)
            open(os.path.join(outpath, "Mappings.json"), "w").write(json.dumps(detector_mappings))
        else:
            # One token list per detector: name + description + all utterances
            detector_tokens = [getAllNGrams(x["name"] + " " + x["description"] + " " + " ".join([y["text"] for y in x["utterances"]]), trainingConfig.textNGrams) for x in detectors]
        loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorProcessor: Successfully processed detectors data into tokens")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorProcessor: " + str(e))
        raise TrainingException("DetectorProcessor: " + str(e))
    try:
        # Stackoverflow and case incidents data load
        sampleUtterancesContent = json.loads(open(os.path.join(datapath, "SampleUtterances.json"), "r").read())
        sampleUtterances = (sampleUtterancesContent["incidenttitles"] if trainingConfig.includeCaseTitles else []) + (sampleUtterancesContent["stackoverflowtitles"] if trainingConfig.includeStackoverflow else [])
        sampleUtterances_tokens = [getAllNGrams(sampleUtterances[i]["text"], trainingConfig.textNGrams) for i in range(len(sampleUtterances))]
        loggerInstance.logToFile("{0}.log".format(trainingId), "CaseTitlesProcessor: Successfully processed sample utterances into tokens")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]CaseTitlesProcessor: " + str(e))
        raise TrainingException("CaseTitlesProcessor: " + str(e))
    try:
        trainDictionary(detector_tokens + sampleUtterances_tokens, productid, outpath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DictionaryTrainer: Successfully trained dictionary")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DictionaryTrainer: " + str(e))
        raise TrainingException("DictionaryTrainer: " + str(e))
    if trainingConfig.trainDetectors:
        try:
            trainModelM1([], detector_tokens, sampleUtterances_tokens, productid, outpath)
            loggerInstance.logToFile("{0}.log".format(trainingId), "ModelM1Trainer: Successfully trained model m1")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]ModelM1Trainer: " + str(e))
            raise TrainingException("ModelM1Trainer: " + str(e))
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId), "ModelM1Trainer: Training is disabled")
    if trainingConfig.trainUtterances:
        try:
            trainModelM2([], detector_tokens, sampleUtterances_tokens, productid, outpath)
            loggerInstance.logToFile("{0}.log".format(trainingId), "ModelM2Trainer: Successfully trained model m2")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]ModelM2Trainer: " + str(e))
            raise TrainingException("ModelM2Trainer: " + str(e))
    open(os.path.join(outpath, "Detectors.json"), "w").write(json.dumps(detectors))
    open(os.path.join(outpath, "SampleUtterances.json"), "w").write(json.dumps(sampleUtterances))
    modelInfo = {"detectorContentSplitted": trainingConfig.detectorContentSplitted, "textNGrams": trainingConfig.textNGrams}
    open(os.path.join(outpath, "ModelInfo.json"), "w").write(json.dumps(modelInfo))
    modelPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), outpath)
    try:
        publishModels(productid, modelPath, trainingId)
        loggerInstance.logToFile("{0}.log".format(trainingId), "ModelPublisher: Successfully published models")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]ModelPublisher: " + str(e))
        raise TrainingException("ModelPublisher: " + str(e))
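
# --- Sketch (not part of the original module): how trainModel might be driven from a
# training job. The trainingId and productid values and the TrainingConfig construction
# are hypothetical; the real per-product training configuration is loaded elsewhere in
# the repo (alongside resourceConfig/config.json).
if __name__ == "__main__":
    trainingConfig = TrainingConfig()          # hypothetical construction
    trainingId = "training-001"                # hypothetical training id
    productid = "14748"                        # hypothetical product id
    try:
        trainModel(trainingId, productid, trainingConfig)
    except TrainingException as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]TrainingDriver: " + str(e))
# --- End of sketch.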