def run(self, productid, datapath):
    trainingId = self.trainingId
    try:
        caseTitlesFetcher = CaseTitlesFetcher(self.trainingConfig, self.trainingId)
        caseTitlesFetcher.fetchCaseTitles(productid, datapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "CaseTitlesFetcher: Successfully fetched & extracted case titles from Kusto")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]CaseTitlesFetcher: " + str(e))
        raise TrainingException("CaseTitlesFetcher: " + str(e))
    try:
        if self.trainingConfig.stackoverflowKey:
            sfFetcher = StackOverFlowFetcher(self.trainingConfig.stackoverflowKey, self.trainingConfig, self.trainingId)
            sfFetcher.fetchStackOverflowTitles(productid, datapath)
            loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Successfully fetched stack overflow question titles")
        else:
            loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Stackoverflow API key not provided")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: " + str(e))
        raise TrainingException("StackOverFlowFetcher: " + str(e))
def get_Tag_Questions(self, tag):
    trainingId = self.trainingId
    pagenum = 1
    items = []
    while True:
        try:
            url = "http://api.stackexchange.com/2.2/questions?key={0}&site=stackoverflow&page={1}&order=desc&sort=votes&tagged={2}&filter=default".format(self.key, pagenum, tag)
            req = requests.get(url=url)
            if req.status_code != 200:
                loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]TagDownloader: Tag " + str(tag) + " - " + str(req.json()['error_message']))
                raise TrainingException("TagDownloader: Tag " + str(tag) + " - " + str(req.json()['error_message']))
            content = req.json()
            # Keep only questions with a positive score or at least one answer.
            items += [{"text": x["title"], "links": [x["link"]], "qid": x["question_id"]} for x in content["items"] if (x["score"] > 0 or x["answer_count"] > 0)]
            if len(items) > self.trainingConfig.stackoverFlowTopN:
                break
            # has_more is a JSON boolean; stop paging once the API is exhausted.
            if not content["has_more"]:
                break
            pagenum += 1
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: Tag " + str(tag) + " - " + str(e))
            raise TrainingException("StackOverFlowFetcher: " + str(e))
    loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Fetched " + str(len(items)) + " questions for tag " + str(tag))
    return items
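# A hypothetical invocation of get_Tag_Questions (key, config and tag are
# illustrative; the item shape matches the list comprehension above):
#
#   fetcher = StackOverFlowFetcher(apiKey, trainingConfig, trainingId)
#   items = fetcher.get_Tag_Questions("azure-web-app-service")
#   # items[0] == {"text": "<question title>",
#   #              "links": ["https://stackoverflow.com/..."],
#   #              "qid": 12345678}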
def runCaseTitlesExtraction(self, df, productid, datapath):
    trainingId = self.trainingId
    # df.any is a bound method and is always truthy; check for an actual,
    # non-empty dataframe instead (fetchCaseTitles passes None when disabled).
    if self.trainingConfig.downloadCaseTitlesEnabled and df is not None and not df.empty:
        df["Incidents_SupportTopicL2Current"] = df["Incidents_SupportTopicL2Current"].fillna("NOSELECTION")
        df["Incidents_SupportTopicL3Current"] = df["Incidents_SupportTopicL3Current"].fillna("NOSELECTION")
        groups = df.groupby(["Incidents_SupportTopicL2Current", "Incidents_SupportTopicL3Current"])
        loggerInstance.logToFile("{0}.log".format(trainingId), "RunCaseTitlesExtraction: Processing " + str(df.shape[0]) + " case titles across " + str(len(groups)) + " categories")
        results = sorted(itertools.chain.from_iterable(self.extractor(key, group) for key, group in groups), key=lambda x: x["text"])
    else:
        results = []
    try:
        with open(os.path.join(datapath, "SampleUtterances.json"), "r") as f:
            sampleUtterances = json.loads(f.read())
        # Merge new titles into the existing list; the scan assumes both lists
        # are sorted by text, so it stops at the first title comparing greater.
        for x in results:
            found = False
            for y in sampleUtterances["incidenttitles"]:
                if x["text"] < y["text"]:
                    break
                elif x["text"] == y["text"]:
                    y["links"] = list(set(y["links"] + x["links"]))
                    found = True
                    break
            if not found:
                sampleUtterances["incidenttitles"].append(x)
        with open(os.path.join(datapath, "SampleUtterances.json"), "w") as f:
            f.write(json.dumps(sampleUtterances, indent=4))
        loggerInstance.logToFile("{0}.log".format(trainingId), "RunCaseTitlesExtraction: Successfully written extracted case titles to file SampleUtterances.json")
    except FileNotFoundError:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]RunCaseTitlesExtraction: File SampleUtterances.json does not exist, creating new file.")
        with open(os.path.join(datapath, "SampleUtterances.json"), "w") as f:
            f.write(json.dumps({"incidenttitles": results, "stackoverflowtitles": []}, indent=4))
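# Shape of SampleUtterances.json as read and written above (values are
# illustrative; "category" is added by extractor, "qid" by the Stack Overflow
# fetcher):
#
#   {
#       "incidenttitles": [
#           {"text": "...", "links": ["..."], "category": "L2--L3"}
#       ],
#       "stackoverflowtitles": [
#           {"text": "...", "links": ["..."], "qid": 12345678}
#       ]
#   }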
def fetchDetectors(self, productid, datapath):
    trainingId = self.trainingId
    try:
        content = json.loads(requests.get(self.detectorsUrl).content)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Fetched " + str(len(content)) + " detectors")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
        raise TrainingException("DetectorsFetcher: " + str(e))
    detectors = [detector for detector in content if productid in getProductId(detector["resourceFilter"])]
    loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Shortlisted " + str(len(detectors)) + " detectors for training based on productId " + str(productid))
    for detector in detectors:
        if detector["metadata"]:
            md = json.loads(detector["metadata"])
            detector["utterances"] = md.get("utterances", [])
        else:
            detector["utterances"] = []
    if len(content) > 0:
        try:
            with open(os.path.join(datapath, "Detectors.json"), "w") as f:
                f.write(json.dumps(detectors, indent=4))
            loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Written detectors to file Detectors.json")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
            raise TrainingException("DetectorsFetcher: " + str(e))
def prepareDataForTraining(self, productid):
    with open("resourceConfig/config.json", "r") as f:
        config = json.loads(f.read())
    trainingId = self.trainingId
    # Create the raw-data folder for this product if it does not exist yet.
    rawdatapath = "rawdata_{0}".format(productid)
    try:
        os.makedirs(rawdatapath)
    except FileExistsError:
        pass
    loggerInstance.logToFile("{0}.log".format(trainingId), "Created folders for raw data")
    try:
        sampleUtterancesFetcher = SampleUtterancesFetcher(self.trainingConfig, self.trainingId)
        sampleUtterancesFetcher.run(productid, rawdatapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "SampleUtterancesFetcher: Successfully fetched & extracted sample utterances")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]SampleUtterancesFetcher: " + str(e))
        raise TrainingException("SampleUtterancesFetcher: " + str(e))
    try:
        detectorsFetcher = DetectorsFetcher("http://localhost:{0}/internal/detectors".format(config["internalApiPort"]), self.trainingId)
        detectorsFetcher.fetchDetectors(productid, rawdatapath)
        loggerInstance.logToFile("{0}.log".format(trainingId), "DetectorsFetcher: Successfully fetched detectors")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]DetectorsFetcher: " + str(e))
        raise TrainingException("DetectorsFetcher: " + str(e))
    gc.collect()
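# resourceConfig/config.json is only read for internalApiPort here. A minimal
# example of the assumed file (the port value is illustrative; other keys may
# exist):
#
#   {
#       "internalApiPort": 5000
#   }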
def fetchStackOverflowTitles(self, productid, datapath):
    trainingId = self.trainingId
    try:
        with open(os.path.join(datapath, "SampleUtterances.json"), "r") as f:
            questions = json.loads(f.read())["stackoverflowtitles"]
    except (FileNotFoundError, KeyError, ValueError):
        # Missing, malformed, or keyless file: start with an empty list.
        questions = []
    loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Tag download set to " + str(self.trainingConfig.downloadStackoverflowEnabled))
    if self.trainingConfig.downloadStackoverflowEnabled:
        # Fetch questions for each configured tag, skipping qids already seen.
        tags = self.trainingConfig.stackoverflowTags
        for tag in tags:
            qids = [x["qid"] for x in questions]
            questions += [q for q in self.get_Tag_Questions(tag) if q["qid"] not in qids]
    try:
        with open(os.path.join(datapath, "SampleUtterances.json"), "r") as f:
            sampleUtterances = json.loads(f.read())
        sampleUtterances["stackoverflowtitles"] = questions
        with open(os.path.join(datapath, "SampleUtterances.json"), "w") as f:
            f.write(json.dumps(sampleUtterances))
        loggerInstance.logToFile("{0}.log".format(trainingId), "StackOverFlowFetcher: Successfully written stackoverflow questions to file SampleUtterances.json")
    except FileNotFoundError:
        loggerInstance.logToFile("{0}.log".format(trainingId), "[ERROR]StackOverFlowFetcher: File SampleUtterances.json does not exist, creating new file.")
        sampleUtterances = {"incidenttitles": [], "stackoverflowtitles": questions}
        with open(os.path.join(datapath, "SampleUtterances.json"), "w") as f:
            f.write(json.dumps(sampleUtterances))
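# Hypothetical direct invocation (product id and path are illustrative only):
#
#   fetcher = StackOverFlowFetcher(apiKey, trainingConfig, trainingId)
#   fetcher.fetchStackOverflowTitles("14748", "rawdata_14748")
#   # SampleUtterances.json now has its "stackoverflowtitles" list populated.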
def extractor(self, key, group):
    trainingId = self.trainingId
    category = key[0] + "--" + key[1]
    lines = [(self.endSentence(row["CleanCaseTitles"]), row["SupportCenterCaseLink"]) for ind, row in group.iterrows()]
    resultTitles = []
    if self.trainingConfig.runExtractionEnabled and len(lines) > 10:
        numsentences = group.shape[0]
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Running extractor on category " + category + " containing " + str(numsentences) + " case titles")
        doc = " ".join([x[0] for x in lines])
        keysentences = retrieveSentences(doc, max(10, int(numsentences * self.trainingConfig.extractionRatio)) * 10)
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Extracted " + str(len(keysentences)) + " sentences.")
        for sent in keysentences:
            # Map each extracted sentence back to the support-case links it
            # came from, trying progressively looser matches.
            caselinks = [x[1] for x in lines if self.squeeze(x[0]) == self.squeeze(sent)]
            if not caselinks:
                caselinks = [x[1] for x in lines if self.squeeze(sent) in self.squeeze(x[0])]
            if not caselinks:
                caselinks = [x[1] for x in lines if re.sub('[^0-9a-zA-Z]+', '', sent) == re.sub('[^0-9a-zA-Z]+', '', x[0])]
            if caselinks:
                resultTitles.append({"text": sent, "links": caselinks, "category": category})
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId), "Extractor: Disabled or not enough lines for summarization")
        # Wrap the single link in a list so "links" has the same shape as in
        # the extraction branch above.
        resultTitles = [{"text": x[0], "links": [x[1]], "category": category} for x in lines]
    return resultTitles
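# endSentence and squeeze are helpers on the class that are not shown in this
# listing. From their call sites, a plausible sketch (an assumption, not the
# actual implementation):
#
#   def endSentence(self, text):
#       # Make the title read as a full sentence for the summarizer.
#       return text if text.endswith((".", "?", "!")) else text + "."
#
#   def squeeze(self, text):
#       # Normalize for matching: lowercase and strip whitespace.
#       return "".join(text.lower().split())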
def fetchCaseTitles(self, productid, datapath):
    trainingId = self.trainingId
    if self.trainingConfig.downloadCaseTitlesEnabled:
        ndays = int(self.trainingConfig.caseTitlesDaysSince)
        try:
            db = "Product360"
            query = """cluster('usage360').database('Product360').
        AllCloudSupportIncidentDataWithP360MetadataMapping
        | where DerivedProductIDStr in ('{0}')
        | where Incidents_CreatedTime >= ago({1}d)
        | summarize IncidentTime = any(Incidents_CreatedTime) by Incidents_IncidentId, Incidents_Severity, Incidents_ProductName, Incidents_SupportTopicL2Current, Incidents_SupportTopicL3Current, Incidents_Title
        | extend SupportCenterCaseLink = strcat('https://azuresupportcenter.msftcloudes.com/caseoverview?srId=', Incidents_IncidentId)
        | order by Incidents_SupportTopicL3Current asc""".format(productid, ndays)
            response = self.kustoClient.execute(db, query)
        except Exception as e:
            raise TrainingException("KustoFetcher: " + str(e))
        try:
            df = dataframe_from_result_table(response.primary_results[0])
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df.shape[0]) + " incidents fetched")

            # Remove all non-English cases.
            df["isEnglish"] = df["Incidents_Title"].map(self.isEnglish)
            df_eng = df[df["isEnglish"]].copy()
            del df_eng["isEnglish"]
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df.shape[0] - df_eng.shape[0]) + " non English language cases removed")

            # Remove all cases whose title is 3 characters or shorter.
            mask = (df_eng["Incidents_Title"].str.len() > 3)
            df_eng_1 = df_eng[mask].copy()

            # Extract the case title from piped sentences.
            df_eng_1["Incidents_Title_PipeCleansed"] = df_eng_1["Incidents_Title"].map(self.pipeCleansing)

            # Remove any content in square brackets.
            df_eng_1["Incidents_Title_PipeCleansed"] = df_eng_1["Incidents_Title_PipeCleansed"].map(lambda x: re.sub(r"\[.*?\]", "", x))

            # Remove any remaining titles 3 characters or shorter.
            mask = (df_eng_1["Incidents_Title_PipeCleansed"].str.len() > 3)
            df_eng_2 = df_eng_1[mask]

            # Remove garbage phrases (defined in the garbage list).
            mask = (df_eng_2["Incidents_Title_PipeCleansed"].isin(self.garbageList))
            df_eng_clean = df_eng_2[~mask].copy()
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df_eng.shape[0] - df_eng_clean.shape[0]) + " garbage case title incidents removed")

            # Remove titles with fewer than two significant words (words longer
            # than 2 characters), except for whitelisted short phrases.
            df_eng_clean["wordcount"] = df_eng_clean["Incidents_Title_PipeCleansed"].map(lambda x: len([a for a in x.split() if len(a) > 2]))
            df_eng_clean["drop"] = df_eng_clean[["Incidents_Title_PipeCleansed", "wordcount"]].apply(lambda x: (x["Incidents_Title_PipeCleansed"] not in self.shortPhrases) and (x["wordcount"] < 2), axis=1)
            df_eng_clean = df_eng_clean[df_eng_clean["drop"] == False].copy()
            del df_eng_clean["drop"]
            del df_eng_clean["wordcount"]

            df_eng_clean["CleanCaseTitles"] = df_eng_clean["Incidents_Title_PipeCleansed"]
            del df_eng_clean["Incidents_Title_PipeCleansed"]
            loggerInstance.logToFile("{0}.log".format(trainingId), "DataCleansing: " + str(df_eng_clean.shape[0]) + " incidents will be processed for summarization")
        except Exception as e:
            raise TrainingException("DataCleansing: " + str(e))
        try:
            self.runCaseTitlesExtraction(df_eng_clean, productid, datapath)
        except Exception as e:
            raise TrainingException("CaseTitleExtraction: " + str(e))
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId), "CaseTitleExtraction: Disabled")
        try:
            self.runCaseTitlesExtraction(None, productid, datapath)
        except Exception as e:
            raise TrainingException("CaseTitleExtraction: " + str(e))
def trainModel(trainingId, productid, trainingConfig):
    loggerInstance.logToFile("{0}.log".format(trainingId),
                             json.dumps(trainingConfig.__dict__))
    datapath = "rawdata_{0}".format(productid)
    outpath = "{0}".format(productid)
    try:
        os.mkdir(datapath)
    except FileExistsError:
        pass
    try:
        os.mkdir(outpath)
    except FileExistsError:
        pass
    loggerInstance.logToFile(
        "{0}.log".format(trainingId),
        "Created folders for raw data and processed models")
    try:
        dataProcessor = DataProcessor(trainingConfig, trainingId)
        dataProcessor.prepareDataForTraining(productid)
        loggerInstance.logToFile(
            "{0}.log".format(trainingId),
            "DataFetcher: Sucessfully fetched and processed for training")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "[ERROR]DataFetcher: " + str(e))
        raise TrainingException("DataFetcher: " + str(e))
    try:
        detectorsdata = open(os.path.join(datapath, "Detectors.json"),
                             "r").read()
        detectors = json.loads(detectorsdata)
        if trainingConfig.detectorContentSplitted:
            # Each detector contributes one token list for its name, one for
            # its description, and one per utterance; Mappings.json records
            # which slice of detector_tokens belongs to which detector.
            detector_mappings = []
            detector_tokens = []
            i = 0
            for x in detectors:
                detector_mappings.append({
                    "startindex": i,
                    "endindex": i + len(x["utterances"]) + 1,
                    "id": x["id"]
                })
                detector_tokens += [
                    getAllNGrams(x["name"], trainingConfig.textNGrams)
                ] + [
                    getAllNGrams(x["description"], trainingConfig.textNGrams)
                ] + [
                    getAllNGrams(y["text"], trainingConfig.textNGrams)
                    for y in x["utterances"]
                ]
                i += (len(x["utterances"]) + 2)
            open(os.path.join(outpath, "Mappings.json"),
                 "w").write(json.dumps(detector_mappings))
        else:
            # Concatenate name, description and utterances into one document
            # per detector.
            detector_tokens = [
                getAllNGrams(
                    x["name"] + " " + x["description"] + " " +
                    " ".join([y["text"] for y in x["utterances"]]),
                    trainingConfig.textNGrams) for x in detectors
            ]
        loggerInstance.logToFile(
            "{0}.log".format(trainingId),
            "DetectorProcessor: Successfully processed detectors data into tokens"
        )
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "[ERROR]DetectorProcessor: " + str(e))
        raise TrainingException("DetectorProcessor: " + str(e))
    try:
        #Stackoverflow and Case Incidents data load
        sampleUtterancesContent = json.loads(
            open(os.path.join(datapath, "SampleUtterances.json"), "r").read())
        sampleUtterances = (sampleUtterancesContent["incidenttitles"]
                            if trainingConfig.includeCaseTitles else []) + (
                                sampleUtterancesContent["stackoverflowtitles"]
                                if trainingConfig.includeStackoverflow else [])
        sampleUtterances_tokens = [
            getAllNGrams(utterance["text"], trainingConfig.textNGrams)
            for utterance in sampleUtterances
        ]
        loggerInstance.logToFile(
            "{0}.log".format(trainingId),
            "CaseTitlesProcessor: Successfully processed sample utterances into tokens"
        )
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "[ERROR]CaseTitlesProcessor: " + str(e))
        raise TrainingException("CaseTitlesProcessor: " + str(e))
    try:
        trainDictionary(detector_tokens + sampleUtterances_tokens, productid,
                        outpath)
        loggerInstance.logToFile(
            "{0}.log".format(trainingId),
            "DictionaryTrainer: Sucessfully trained dictionary")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "[ERROR]DictionaryTrainer: " + str(e))
        raise TrainingException("DictionaryTrainer: " + str(e))
    if trainingConfig.trainDetectors:
        try:
            trainModelM1([], detector_tokens, sampleUtterances_tokens,
                         productid, outpath)
            loggerInstance.logToFile(
                "{0}.log".format(trainingId),
                "ModelM1Trainer: Sucessfully trained model m1")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId),
                                     "[ERROR]ModelM1Trainer: " + str(e))
            raise TrainingException("ModelM1Trainer: " + str(e))
    else:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "ModelM1Trainer: Training is disabled")
    if trainingConfig.trainUtterances:
        try:
            trainModelM2([], detector_tokens, sampleUtterances_tokens,
                         productid, outpath)
            loggerInstance.logToFile(
                "{0}.log".format(trainingId),
                "ModelM2Trainer: Sucessfully trained model m2")
        except Exception as e:
            loggerInstance.logToFile("{0}.log".format(trainingId),
                                     "[ERROR]ModelM2Trainer: " + str(e))
            raise TrainingException("ModelM2Trainer: " + str(e))
    open(os.path.join(outpath, "Detectors.json"),
         "w").write(json.dumps(detectors))
    open(os.path.join(outpath, "SampleUtterances.json"),
         "w").write(json.dumps(sampleUtterances))
    modelInfo = {
        "detectorContentSplitted": trainingConfig.detectorContentSplitted,
        "textNGrams": trainingConfig.textNGrams
    }
    open(os.path.join(outpath, "ModelInfo.json"),
         "w").write(json.dumps(modelInfo))
    modelPath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             outpath)
    try:
        publishModels(productid, modelPath, trainingId)
        loggerInstance.logToFile(
            "{0}.log".format(trainingId),
            "ModelPublisher: Sucessfully published models")
    except Exception as e:
        loggerInstance.logToFile("{0}.log".format(trainingId),
                                 "[ERROR]ModelPublisher: " + str(e))
        raise TrainingException("ModelPublisher: " + str(e))