def loadData():
    """Load data for the first time: read the DB dump and build the dataset
    based on the 'Apertura' (ticket opening) section."""
    config = cg.Config()
    dataL = dt.Data(config)
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")

    # Load data
    targets, tickets, complete_labels, first_level_targets, first_level_labels = dataL.load_data()
    labels = dataL.getfirstLevelTargets(map_labels['map'])
    # Leftover absolute path kept for reference; the config-based load above is used instead.
    # map_labels = dataL.loadMapFromJson(
    #     "C:/Users/anton/Desktop/Lavoro/Ticket_Classification/map_labels.json")

    # Filter data
    filtdata = fd.FilterData(config, labels)
    tickets, targets = filtdata.filterOutTickets(tickets, targets)
    #tickets, targets = filtdata.filterOutTicketsWithTargetsMap(map_labels['map'], targets, tickets)

    # Map data
    new_targets = dataL.mapTargets(map_labels['map'], targets)
    tt_complete = [old + " --> " + new for old, new in zip(targets, new_targets)]

    #level_zero_map_label = ["Generico", "Amministrazione", "Tecnico", "Commerciale", "Magazzino"]
    dataL.writeArrayInFile(tickets, 'onlyApertura/tickets.txt', "utf-8")
    dataL.writeArrayInFile(targets, 'onlyApertura/complete_targets.txt', "utf-8")
    dataL.writeArrayInFile(new_targets, 'onlyApertura/targets_mapped.txt', "utf-8")
    dataL.writeArrayInFile(tt_complete, 'onlyApertura/tt_multilevel.txt', "utf-8")
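# Illustration only: a minimal sketch of the map_labels.json layout assumed by
# loadData, where the 'map' key pairs each raw DB label with its first-level
# category. The label names below are hypothetical examples, not the real map.
EXAMPLE_MAP_LABELS = {
    "map": {
        "Fattura non ricevuta": "Amministrazione",
        "Router guasto": "Tecnico",
        "Nuovo preventivo": "Commerciale",
    }
}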
def mainTrainModel():
    print("============ Start =============\n")
    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)

    print("2 - Load Data and Targets\n")
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    tickets = dataL.loadDataInArray(
        config.data_path + "tickets_balanced_15000.txt", config.csv_encoding)
    targets = dataL.loadDataInArray(config.data_path +
                                    "target_balanced_15000.txt")
    labels = dataL.getfirstLevelTargets(map_labels['map'])

    print("3 - Preprocess Data\n")
    tickets, targets = ut.removeIdenticalTickets(tickets, targets)
    tickets_to_lower, targets, words = preprocessData(tickets, targets, labels)

    print("4 - Build Vocabulary\n")
    # Create vocabulary
    voc = vc.Vocabulary(config)
    dictionary, reverse_dict = voc.build_dictionary(words, labels)
    voc.saveDictionary(dictionary, "vocabulary")

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Create sequences and one-hot vectors for the targets
    tickets_sequences = dataL.createDataSequence(tickets_to_lower, dictionary)
    oneHotVectorTarget = dataL.transformInOneHotVector(labels, targets)

    print("6 - Filter Data - Removing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_sequences, oneHotVectorTarget, trash = filtdata.removeTokenOOV(
        tickets_sequences, oneHotVectorTarget, dictionary)
    print(" *** Trash class size: " + str(len(trash)))

    print("7 - Generate Training and Testing Dataset\n")
    X_train, X_test, y_train, y_test = ut.get_train_and_test(
        tickets_sequences, oneHotVectorTarget, test_size=0.2)
    dataL.writeArrayStringInFile(
        X_train, 'parsed_sequences_15000/tickets_training.txt', "utf-8")
    dataL.writeArrayStringInFile(X_test,
                                 'parsed_sequences_15000/tickets_test.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(y_train,
                                 'parsed_sequences_15000/target_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(y_test,
                                 'parsed_sequences_15000/target_test.txt',
                                 "utf-8")
    print(" *** Training Size : " + str(len(X_train)) + "\n")

    if config.use_pretrained_embs:
        print(" *** Using pretrained word embeddings\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTraining(config, X_train, y_train, labels)
    print("============ End =============\n")
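# Illustration only: the one-hot encoding assumed from
# dataL.transformInOneHotVector(labels, targets): each target string becomes a
# vector with a single 1 at the index of its label. This standalone helper
# (one_hot_sketch) is hypothetical and not part of the Data class.
def one_hot_sketch(labels, targets):
    vectors = []
    for target in targets:
        vec = [0] * len(labels)
        vec[labels.index(target)] = 1  # ValueError for labels not in the list
        vectors.append(vec)
    return vectors

# one_hot_sketch(["Tecnico", "Amministrazione"], ["Amministrazione"])
# -> [[0, 1]]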
def testPriority():
    print("============ Start =============\n")
    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)

    print("2 - Load Data and Targets Sequences\n")
    tickets = dataL.loadDataInArray(
        config.main_path + "parsed_sequences/tickets_test.txt",
        config.csv_encoding)
    targets = dataL.loadDataInArray(config.main_path +
                                    "parsed_sequences/target_test.txt")
    labels = ["1", "2", "3", "4", "5"]
    oneHotVectorTarget = dataL.transformListStringInOneHot(targets)
    print("*** Test Size : " + str(len(tickets)) + "\n")
    tickets_parsed = dataL.fromSequenceStringToSequenceArray(tickets)

    print("3 - Load Vocabulary\n")
    voc = vc.Vocabulary(config)
    dictionary = voc.loadDictionary("vocabulary")
    # Create sequences
    # tickets_sequences = dataL.createDataSequence(tickets_to_lower, dictionary)
    # Create one-hot vectors for the targets
    # oneHotVectorTarget = dataL.transformInOneHotVector(labels, targets)

    print("4 - Filter Data - Removing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    # Assign the filtered data back so the evaluation below actually uses it
    tickets_parsed, oneHotVectorTarget, trash = filtdata.removeTokenOOV(
        tickets_parsed, oneHotVectorTarget, dictionary)
    print("*** Trash class size: " + str(len(trash)))

    print("5 - Restore and Eval Model - " + str(config.model_to_restore))
    # tm.restoreModel(config, tickets_parsed, oneHotVectorTarget, labels, dictionary)
    # tm.restoreModelAndPredict(config, tickets_parsed, oneHotVectorTarget, labels, dictionary)
    ml.runEvaluation(config, tickets_parsed, oneHotVectorTarget, labels,
                     dictionary)
    print("============ End =============\n")
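# Illustration only: a sketch of how the stored one-hot target lines might be
# parsed back into vectors, assuming dataL.transformListStringInOneHot reads
# lines such as "[0, 1, 0, 0, 0]" written by the training script. The helper
# name and file format are assumptions, not the real Data API.
import ast

def parse_one_hot_lines_sketch(lines):
    # Each line is a Python-literal list; literal_eval turns it back into ints.
    return [ast.literal_eval(line.strip()) for line in lines]

# parse_one_hot_lines_sketch(["[0, 1, 0, 0, 0]"]) -> [[0, 1, 0, 0, 0]]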
def trainPriority():
    print("============ Start =============\n")
    print("1 - Load Configuration\n")
    config = cg.Config()
    config.configFromFile("config/priority_config.json")
    dataL = dt.Data(config)

    print("2 - Load Data and Targets\n")
    tickets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_training.txt",
        config.csv_encoding)
    tickets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_test.txt",
        config.csv_encoding)
    targets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_training.txt",
        config.csv_encoding)
    targets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_test.txt",
        config.csv_encoding)
    labels = ["1", "2", "3", "4", "5"]

    print("3 - Preprocess Data\n")
    tickets_training_tl, targets_training, words = preprocessData(
        tickets_training, targets_training, labels)
    tickets_test_tl, targets_test, w_ = preprocessData(tickets_test,
                                                       targets_test, labels)

    if config.loadOrbuild_dictionary == "build":
        print("4 - Build Vocabulary\n")
        # Create vocabulary
        voc = vc.Vocabulary(config)
        dictionary, reverse_dict = voc.build_dictionary(words, labels)
        voc.saveDictionary(dictionary, "vocabulary")
        print("*** Vocabulary saved \n")
    else:
        print("4 - Load Vocabulary\n")
        # Load existing vocabulary
        voc = vc.Vocabulary(config)
        dictionary = voc.loadDictionary("vocabulary")
        reverse_dict = voc.getReverseDictionary(dictionary)

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Create sequences and one-hot vectors for the targets
    tickets_training_sequences = dataL.createDataSequence(
        tickets_training_tl, dictionary)
    oneHotVectorTarget_training = dataL.transformInOneHotVector(
        labels, targets_training)
    tickets_test_sequences = dataL.createDataSequence(tickets_test_tl,
                                                      dictionary)
    oneHotVectorTarget_test = dataL.transformInOneHotVector(
        labels, targets_test)

    print("6 - Filter Data - Removing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
        tickets_training_sequences, oneHotVectorTarget_training, dictionary)
    print(" *** Trash class size in training: " + str(len(trash)) + "\n")
    tickets_test_sequences, oneHotVectorTarget_test, trash = filtdata.removeTokenOOV(
        tickets_test_sequences, oneHotVectorTarget_test, dictionary)
    print(" *** Trash class size in test: " + str(len(trash)) + "\n")

    print("7 - Generate Training and Testing Dataset\n")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_training_sequences,
        config.data_sequences_path + '/tickets_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_test_sequences,
        config.data_sequences_path + '/tickets_test.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_training,
        config.data_sequences_path + '/target_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_test,
        config.data_sequences_path + '/target_test.txt', "utf-8")
    print(" *** Training Size : " + str(len(tickets_training_sequences)) +
          "\n")
    print(" *** Test Size : " + str(len(tickets_test_sequences)) + "\n")

    if config.use_pretrained_embs:
        print(" *** Using pretrained word embeddings\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTraining(config, tickets_training_sequences,
                   oneHotVectorTarget_training, labels)
    print("============ End =============\n")
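# Illustration only: a plausible shape for config/priority_config.json,
# inferred from the fields trainPriority reads (main_path,
# loadOrbuild_dictionary, data_sequences_path, use_pretrained_embs).
# All keys and values here are assumptions.
EXAMPLE_PRIORITY_CONFIG = {
    "main_path": "/path/to/project/",
    "data_sequences_path": "parsed_sequences_priority",
    "loadOrbuild_dictionary": "build",
    "use_pretrained_embs": True,
}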
def mainTrainModelOnApertureWithSequenceFeatures():
    print("============ Start =============\n")
    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)

    print("2 - Load Data and Targets\n")
    tickets_training, tickets_test, targets_training, targets_test = loadAndSplit()
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    labels = dataL.getfirstLevelTargets(map_labels['map'])

    print("3 - Preprocess Data\n")
    tickets_training_tl, targets_training, words = preprocessData(
        tickets_training, targets_training, labels)
    tickets_test_tl, targets_test, w_ = preprocessData(tickets_test,
                                                       targets_test, labels)

    print("4 - Build Vocabulary\n")
    # Create vocabulary
    voc = vc.Vocabulary(config)
    dictionary, reverse_dict = voc.build_dictionary(words, labels)
    voc.saveDictionary(dictionary, "vocabulary")

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Create sequences and one-hot vectors for the targets
    tickets_training_sequences = dataL.createDataSequence(
        tickets_training_tl, dictionary)
    oneHotVectorTarget_training = dataL.transformInOneHotVector(
        labels, targets_training)
    tickets_test_sequences = dataL.createDataSequence(tickets_test_tl,
                                                      dictionary)
    oneHotVectorTarget_test = dataL.transformInOneHotVector(
        labels, targets_test)

    print("6 - Create Ticket Feature Sequences\n")
    # Create feature sequences for both splits with the same extractor
    tickets_feature_sequences = dataL.extractFeatures(tickets_training_tl,
                                                      dictionary)
    tickets_feature_test_sequences = dataL.extractFeatures(
        tickets_test_tl, dictionary)

    print("7 - Filter Data - Removing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_training_sequences, oneHotVectorTarget_training, tickets_feature_sequences_training, trash = filtdata.removeTokenOOVwithSequenceFeatures(
        tickets_training_sequences, oneHotVectorTarget_training,
        tickets_feature_sequences, dictionary)
    print("*** Trash class size in training: " + str(len(trash)))
    tickets_test_sequences, oneHotVectorTarget_test, tickets_feature_test_sequences, trash = filtdata.removeTokenOOVwithSequenceFeatures(
        tickets_test_sequences, oneHotVectorTarget_test,
        tickets_feature_test_sequences, dictionary)
    print("*** Trash class size in test: " + str(len(trash)))

    print("8 - Generate Training and Testing Dataset\n")
    dataL.writeArrayStringInFile(tickets_training_sequences,
                                 'parsed_sequences/tickets_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(tickets_test_sequences,
                                 'parsed_sequences/tickets_test.txt', "utf-8")
    dataL.writeArrayStringInFile(oneHotVectorTarget_training,
                                 'parsed_sequences/target_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(oneHotVectorTarget_test,
                                 'parsed_sequences/target_test.txt', "utf-8")
    print("*** Training Size : " + str(len(tickets_training_sequences)) + "\n")

    if config.use_pretrained_embs:
        print("*** Using pretrained word embeddings\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("9 - Start Training\n")
    ml.runTrainingWithFeatureSequence(config, tickets_training_sequences,
                                      oneHotVectorTarget_training, labels,
                                      tickets_feature_sequences_training)
    print("============ End =============\n")
def training_model(main_path, type, config_file, from_date, to_date, customer):
    #logging.basicConfig(filename=logCustomer, level=logging.INFO)
    #lg.configureLogger(QIUserLogger, customer, "training")
    QIUserLogger.info(
        "-----------------------------------------------------------------")
    QIUserLogger.info(
        "------------------------Training Start---------------------------")
    QIUserLogger.info("** Initialization start... **")
    QIUserLogger.info("  MainPath - " + str(main_path))
    QIUserLogger.info("  Type - " + str(type))
    QIUserLogger.info("  ConfigFile - " + str(config_file))
    QIUserLogger.info("  FromDate - " + str(from_date))
    QIUserLogger.info("  ToDate - " + str(to_date))
    QIUserLogger.info("** Initialization End **")

    try:
        QIUserLogger.info("1 - Load Configurations")
        QIUserLogger.info("  ** Config for Classification")
        # Load config files
        configModel = cg.Config()
        configModel.configFromFile(config_file)
        configModel.main_path = main_path
        configModel.updateDataOfMainPath(config_file, main_path)
        dataL = dt.Data(configModel)

        QIUserLogger.info("2 - Login In API")
        # Log in to the API
        configConnection = con.ConfigConnection()
        dir_path = os.path.dirname(os.path.realpath(__file__))
        configConnection.configFromFile(dir_path + "/config/" + customer +
                                        "/connector_config.json")
        connector = con.Connector(configConnection)
        # Create a persistent session and log in
        Reqsess = requests.session()
        connector.login(Reqsess)

        QIUserLogger.info("3 - GET TICKETS FROM API")
        params = "closedfrom=" + str(from_date) + "&closedto=" + str(
            to_date) + "&maxnum=" + str(configConnection.max_tickets_to_get)
        #params = {"closedfrom": from_date, "closedto": to_date, "maxnum": configConnection.max_tickets_to_get}
        responseTicket = connector.getTickets(Reqsess, params)

        if len(responseTicket) > 0:
            rTicket = []
            for t in responseTicket:
                rTicket.append(t['description'])

            id2lab = dict(
                zip(configModel.labels_map.values(),
                    configModel.labels_map.keys()))
            gather_tickets, gather_targets = gatherData(
                type, responseTicket, configModel, id2lab)

            QIUserLogger.info("4 - REMOVE STOP WORDS FROM NEW TICKETS")
            tok = tk.Tokenizer(gather_tickets)
            tok.tokenizeTickets()
            tickets_to_lower = tok.toLower()
            gather_tickets, gather_targets = tok.removeStopWordsToString(
                tickets_to_lower, gather_targets)

            QIUserLogger.info("5 - GET STORED DATA TICKETS")
            tickets_train = dataL.loadDataInArray(
                configModel.data_path + "/tickets.txt",
                configModel.csv_encoding)
            targets_train = dataL.loadDataInArray(configModel.data_path +
                                                  "/targets.txt")

            # Check whether the merged data exceeds the size threshold
            QIUserLogger.info("6 - MERGE THE DATA - STORED AND GATHERED")
            max_length = configModel.max_num_tickets
            len_gather_tickets = len(gather_tickets)
            len_tickets = len(tickets_train)
            # Retrain on the full dataset rather than doing transfer
            # learning, so that the vocabulary is always up to date.
            tickets = tickets_train + gather_tickets
            targets = targets_train + gather_targets
            reached_dim = len_gather_tickets + len_tickets
            if reached_dim > max_length:
                elem_to_cut = reached_dim - max_length
                # Cut out the oldest elem_to_cut elements, keeping tickets
                # and targets aligned
                tickets = tickets[elem_to_cut:]
                targets = targets[elem_to_cut:]
                reached_dim = max_length

            QIUserLogger.info("7 - REMOVE IDENTICAL TICKETS")
            #tickets, targets = ut.removeIdenticalTickets(tickets, targets)
            tickets, targets = ut.removeIdenticalTicketsFromNew(
                tickets, targets, len_tickets, reached_dim)

            QIUserLogger.info("8 - SAVING MERGED DATA")
            dataL.writeArrayInFileCompleteDataPath(
                tickets, configModel.data_path + '/tickets.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                targets, configModel.data_path + '/targets.txt', "utf-8")

            QIUserLogger.info("9 - EXTRACT WORDS FROM TICKETS")
            words = tok.extractWordsTicketString(tickets)

            QIUserLogger.info("10 - BUILD NEW VOCABULARY")
            # Create vocabulary
            voc = vc.Vocabulary(configModel)
            dictionary, reverse_dict = voc.build_dictionary(
                words, configModel.labels)
            voc.saveDictionary(dictionary, "vocabulary")
            QIUserLogger.info("*** Vocabulary saved")

            QIUserLogger.info("11 - SPLIT DATA IN TRAINING AND TEST DATASET")
            tickets_training, tickets_test, Target_training, Target_test = ut.get_train_and_test(
                tickets, targets)
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training,
                configModel.data_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_training,
                configModel.data_path + '/targets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_test, configModel.data_path + '/tickets_test.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_test, configModel.data_path + '/targets_test.txt',
                "utf-8")

            QIUserLogger.info("12 - CREATE TICKETS AND TARGETS SEQUENCES")
            # Create sequences and one-hot vectors for the targets
            tickets_training_sequences = dataL.createDataSequenceTicketsString(
                tickets_training, dictionary)
            oneHotVectorTarget_training = dataL.transformInOneHotVector(
                configModel.labels, Target_training)

            QIUserLogger.info("13 - FILTER OUT DATA - Removing Token OOV")
            filtdata = fd.FilterData(configModel, configModel.labels)
            tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
                tickets_training_sequences, oneHotVectorTarget_training,
                dictionary)
            QIUserLogger.info(" *** Trash class size in training: " +
                              str(len(trash)))

            #QIUserLogger.info(" -- Split Training | Test Dataset")
            #tickets_training_sequences, tickets_test_sequences, oneHotVectorTarget_training, oneHotVectorTarget_test = ut.get_train_and_test(tickets_training_sequences, oneHotVectorTarget_training)

            QIUserLogger.info("14 - SAVING TRAINING SEQUENCES")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training_sequences,
                configModel.data_sequences_path + '/tickets_training.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                oneHotVectorTarget_training,
                configModel.data_sequences_path + '/target_training.txt',
                "utf-8")
            QIUserLogger.info(" *** Training Size : " +
                              str(len(tickets_training_sequences)) + "\n")

            if configModel.use_pretrained_embs:
                QIUserLogger.info(" *** Use pretrained word embeddings")
                skip = sk.SkipgramModel(configModel)
                skipgramModel = skip.get_skipgram()
                skipgramEmbedding = skip.getCustomEmbeddingMatrix(
                    skipgramModel, reverse_dict)
                configModel.skipgramEmbedding = skipgramEmbedding

            # Start training
            QIUserLogger.info("15 - START TRAINING")
            ml.runTraining(configModel, tickets_training_sequences,
                           oneHotVectorTarget_training, configModel.labels)
            QIUserLogger.info("============ End =============")
        else:
            QIUserLogger.info(
                "No new tickets found. There is no need for a new training.")

        # Log out
        connector.logout(Reqsess)

    except Exception as e:
        print(str(e))
        QIUserLogger.error("Error in training_model " + str(e))
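# Illustration only: the dataset-capping rule from step 6 of training_model,
# shown on toy lists. When the merged data exceeds max_length, the oldest
# items are dropped so tickets and targets stay aligned. Hypothetical
# standalone sketch, not part of the module's API.
def cap_dataset_sketch(tickets, targets, max_length):
    if len(tickets) > max_length:
        cut = len(tickets) - max_length
        tickets, targets = tickets[cut:], targets[cut:]
    return tickets, targets

# cap_dataset_sketch(["t1", "t2", "t3"], ["a", "b", "c"], 2)
# -> (["t2", "t3"], ["b", "c"])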
descriptions, sequences, categoryIDS, priorityIDS = [], [], [], []
# Filter used to trash tickets with too many out-of-vocabulary words
# (hoisted out of the loop; one instance serves every ticket)
filtdata = filter_data.FilterData(configModelClass, labelsClass)
for ticket in tickets:
    ticket_cleaned = cleanData(ticket)
    ticket_array = ticket_cleaned.split(" ")
    # Create the token sequence for the ticket
    tickets_sequences = dataMC.createDataSequence([ticket_array], dictionary)
    ticket_sequence = tickets_sequences[0]
    descriptions.append(ticket)
    sequences.append(ticket_sequence)
    # Trash the ticket if too many of its words are out of vocabulary
    trashIT = filtdata.trashingTicket(ticket_sequence, dictionary)
    if trashIT:
        # Send the ticket with the label: Trash
        params = {"categoryId": "Trash", "priorityId": 0}
        categoryIDS.append("Trash")
        priorityIDS.append(1)
    else:
        categoryIDS.append(0)
        priorityIDS.append(1)
# Aggregate the tickets
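# Illustration only: the out-of-vocabulary trashing rule assumed for
# filtdata.trashingTicket above: discard a ticket when the share of tokens
# mapped to the OOV id exceeds a threshold. The OOV id (0) and the threshold
# are assumptions; the real FilterData implementation may differ.
def trashing_ticket_sketch(sequence, oov_id=0, max_oov_ratio=0.5):
    if not sequence:
        return True  # an empty sequence carries no usable signal
    oov_count = sum(1 for token_id in sequence if token_id == oov_id)
    return oov_count / len(sequence) > max_oov_ratio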