Example #1
def mainTrainModel():
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)
    print("2 - Load Data and Targets\n")
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    tickets = dataL.loadDataInArray(
        config.data_path + "tickets_balanced_15000.txt", config.csv_encoding)
    targets = dataL.loadDataInArray(config.data_path +
                                    "target_balanced_15000.txt")
    labels = dataL.getfirstLevelTargets(map_labels['map'])
    print("3 - Preprocess Data\n")
    tickets, targets = ut.removeIdenticalTickets(tickets, targets)
    tickets_to_lower, targets, words = preprocessData(tickets, targets, labels)
    print("4 - Build Vocabulary\n")
    # Create Vocabulary
    voc = vc.Vocabulary(config)
    dictionary, reverse_dict = voc.build_dictionary(words, labels)
    voc.saveDictionary(dictionary, "vocabulary")
    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    #Create Sequences and HotVectors for the Target
    tickets_sequences = dataL.createDataSequence(tickets_to_lower, dictionary)
    oneHotVectorTarget = dataL.transformInOneHotVector(labels, targets)
    print("6 - Filter Data - Removeing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_sequences, oneHotVectorTarget, trash = filtdata.removeTokenOOV(
        tickets_sequences, oneHotVectorTarget, dictionary)
    print("	*** Class Trash len : " + str(len(trash)))
    print("7 - Generate Training and Testing Dataset\n")
    X_train, X_test, y_train, y_test = ut.get_train_and_test(
        tickets_sequences, oneHotVectorTarget, test_size=0.2)
    dataL.writeArrayStringInFile(
        X_train, 'parsed_sequences_15000/tickets_training.txt', "utf-8")
    dataL.writeArrayStringInFile(X_test,
                                 'parsed_sequences_15000/tickets_test.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(y_train,
                                 'parsed_sequences_15000/target_training.txt',
                                 "utf-8")
    dataL.writeArrayStringInFile(y_test,
                                 'parsed_sequences_15000/target_test.txt',
                                 "utf-8")
    print("	*** Training Size : " + str(len(X_train)) + "\n")
    if config.use_pretrained_embs:
        print("	*** Uso pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTraining(config, X_train, y_train, labels)
    print("============ End =============\n")
Example #2
def mainTrainModelOnPreprocessedData():
    print("============ Start =============\n")
    print("1 - Load Configuration\n")
    config = cg.Config()
    dataL = dt.Data(config)
    print("2 - Load Data and Targets Sequences\n")
    map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json")
    tickets = dataL.loadDataInArray(
        config.main_path + "parsed_sequences_b/tickets_training.txt",
        config.csv_encoding)
    targets = dataL.loadDataInArray(config.main_path +
                                    "parsed_sequences_b/target_training.txt")
    labels = dataL.getfirstLevelTargets(map_labels['map'])
    oneHotVectorTarget = dataL.transformListStringInOneHot(targets)
    print("*** Training Size : " + str(len(tickets)) + "\n")
    tickets_parsed = []
    for t in tickets:
        # Each stored line looks like "[1, 2, 3]": take the text between
        # the brackets and turn each comma-separated token into an int.
        inner = re.split(r"\]", re.split(r"\[", t)[1])[0]
        tickets_work = [int(token) for token in re.split(r",", inner)]
        tickets_parsed.append(tickets_work)

    print("3 - Load Vocabulary\n")
    voc = vc.Vocabulary(config)
    dictionary = voc.loadDictionary("vocabulary")
    reverse_dict = voc.getReverseDictionary(dictionary)

    print("*** Training Size : " + str(len(tickets)) + "\n")
    if config.use_pretrained_embs:
        print("*** Uso pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("4 - Start Training\n")
    ml.runTraining(config, tickets_parsed, oneHotVectorTarget, labels)
    print("============ End =============\n")
Example #3
def trainPriority():
    print("============ Start =============\n")

    print("1 - Load Configuration\n")
    config = cg.Config()
    config.configFromFile("config/priority_config.json")
    dataL = dt.Data(config)
    print("2 - Load Data and Targets\n")
    tickets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_training.txt",
        config.csv_encoding)
    tickets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/tickets_test.txt",
        config.csv_encoding)
    targets_training = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_training.txt",
        config.csv_encoding)
    targets_test = dataL.loadDataInArray(
        config.main_path + "onlyAperturaPriority/targets_test.txt",
        config.csv_encoding)
    labels = ["1", "2", "3", "4", "5"]
    print("3 - Preprocess Data\n")
    tickets_training_tl, targets_training, words = preprocessData(
        tickets_training, targets_training, labels)
    tickets_test_tl, targets_test, w_ = preprocessData(tickets_test,
                                                       targets_test, labels)
    if config.loadOrbuild_dictionary == "build":
        print("4 - Build Vocabulary\n")
        # Create Vocabulary
        voc = vc.Vocabulary(config)
        dictionary, reverse_dict = voc.build_dictionary(words, labels)
        voc.saveDictionary(dictionary, "vocabulary")
        print("*** Vocabulary saved \n")
    else:
        print("4 - Load Vocabulary\n")
        # Load Existing Vocabulary
        voc = vc.Vocabulary(config)
        dictionary = voc.loadDictionary("vocabulary")
        reverse_dict = voc.getReverseDictionary(dictionary)

    print("5 - Create Ticket Sequences and Targets Hot Vectors\n")
    # Create Sequences and HotVectors for the Target
    tickets_training_sequences = dataL.createDataSequence(
        tickets_training_tl, dictionary)
    oneHotVectorTarget_training = dataL.transformInOneHotVector(
        labels, targets_training)
    tickets_test_sequences = dataL.createDataSequence(tickets_test_tl,
                                                      dictionary)
    oneHotVectorTarget_test = dataL.transformInOneHotVector(
        labels, targets_test)
    print("6 - Filter Data - Removeing Token OOV\n")
    filtdata = fd.FilterData(config, labels)
    tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
        tickets_training_sequences, oneHotVectorTarget_training, dictionary)
    print("	*** Classe Cestino in Training : " + str(len(trash)) + "\n")
    tickets_test_sequences, oneHotVectorTarget_test, trash = filtdata.removeTokenOOV(
        tickets_test_sequences, oneHotVectorTarget_test, dictionary)
    print("	*** Classe Cestino in Test : " + str(len(trash)) + "\n")
    print("7 - Generate Training and Testing Dataset\n")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_training_sequences,
        config.data_sequences_path + '/tickets_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        tickets_test_sequences,
        config.data_sequences_path + '/tickets_test.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_training,
        config.data_sequences_path + '/target_training.txt', "utf-8")
    dataL.writeArrayInFileCompleteDataPath(
        oneHotVectorTarget_test,
        config.data_sequences_path + '/target_test.txt', "utf-8")
    print("	*** Training Size : " + str(len(tickets_training_sequences)) +
          "\n")
    print("	*** Test Size : " + str(len(tickets_test_sequences)) + "\n")
    if config.use_pretrained_embs:
        print("	*** Use pretrained Words Embedding\n")
        skip = sk.SkipgramModel(config)
        skipgramModel = skip.get_skipgram()
        skipgramEmbedding = skip.getCustomEmbeddingMatrix(
            skipgramModel, reverse_dict)
        config.skipgramEmbedding = skipgramEmbedding

    print("8 - Start Training\n")
    ml.runTraining(config, tickets_training_sequences,
                   oneHotVectorTarget_training, labels)
    print("============ End =============\n")
Example #4
def training_model(main_path, type, config_file, from_date, to_date, customer):

    #logging.basicConfig(filename=logCustomer, level=logging.INFO)
    #lg.configureLogger(QIUserLogger, customer, "training")
    #
    QIUserLogger.info(
        "-----------------------------------------------------------------")
    QIUserLogger.info(
        "------------------------Training Start---------------------------")
    #
    QIUserLogger.info("** Initialization start... **")

    QIUserLogger.info("	MainPath - " + str(main_path))
    QIUserLogger.info("	Type - " + str(type))
    QIUserLogger.info("	ConfigFile - " + str(config_file))
    QIUserLogger.info("	FromDate - " + str(from_date))
    QIUserLogger.info("	ToDate - " + str(to_date))
    #
    QIUserLogger.info("** Initialization End **")

    try:
        QIUserLogger.info("1 - Load Configurations")
        QIUserLogger.info("	** Config for Classification")
        # Load Config files
        configModel = cg.Config()
        configModel.configFromFile(config_file)
        configModel.main_path = main_path
        configModel.updateDataOfMainPath(config_file, main_path)
        dataL = dt.Data(configModel)
        #
        QIUserLogger.info("2 - Login In API")
        # Login to API
        configConnection = con.ConfigConnection()

        dir_path = os.path.dirname(os.path.realpath(__file__))
        configConnection.configFromFile(dir_path + "/config/" + customer +
                                        "/connector_config.json")
        connector = con.Connector(configConnection)
        # Create Persistent Session
        Reqsess = requests.session()
        # Log in
        connector.login(Reqsess)
        QIUserLogger.info("3 - GET TICKETS FROM API")
        #
        params = "closedfrom=" + str(from_date) + "&closedto=" + str(
            to_date) + "&maxnum=" + str(configConnection.max_tickets_to_get)
        #params = {"closedfrom": from_date, "closedto": to_date, "maxnum" : configConnection.max_tickets_to_get}
        responseTicket = connector.getTickets(Reqsess, params)
        if len(responseTicket) > 0:
            rTicket = []
            for t in responseTicket:
                rTicket.append(t['description'])
            #
            id2lab = dict(
                zip(configModel.labels_map.values(),
                    configModel.labels_map.keys()))
            #
            gather_tickets, gather_targets = gatherData(
                type, responseTicket, configModel, id2lab)
            #
            QIUserLogger.info("4 - REMOVE STOP WORDS FROM NEW TICKETS")
            tok = tk.Tokenizer(gather_tickets)
            tok.tokenizeTickets()
            tickets_to_lower = tok.toLower()
            gather_tickets, gather_targets = tok.removeStopWordsToString(
                tickets_to_lower, gather_targets)

            QIUserLogger.info("5 - GET STORED DATA TICKETS")
            tickets_train = dataL.loadDataInArray(
                configModel.data_path + "/tickets.txt",
                configModel.csv_encoding)
            targets_train = dataL.loadDataInArray(configModel.data_path +
                                                  "/targets.txt")
            #
            # Count if we reached the threshold
            QIUserLogger.info("6 - MERGE THE DATA - STORED AND GATHERED")
            max_length = configModel.max_num_tickets
            len_gather_tickets = len(gather_tickets)
            len_tickets = len(tickets_train)
            # Retrain on the full dataset rather than doing transfer
            # learning, because we always want an up-to-date vocabulary.
            tickets = tickets_train + gather_tickets
            targets = targets_train + gather_targets
            reached_dim = len_gather_tickets + len_tickets
            if reached_dim > max_length:
                elem_to_cut = reached_dim - max_length
                # cut out the first elem_to_cut (oldest) elements, slicing
                # tickets and targets identically so the pairs stay aligned
                merged_tickets = tickets[elem_to_cut:]
                merged_targets = targets[elem_to_cut:]
                tickets = merged_tickets
                targets = merged_targets
                reached_dim = max_length

            QIUserLogger.info("7 - REMOVE IDENTICAL TICKETS")
            #tickets, targets = ut.removeIdenticalTickets(tickets, targets)
            tickets, targets = ut.removeIdenticalTicketsFromNew(
                tickets, targets, len_tickets, reached_dim)

            QIUserLogger.info("8 - SAVING MERGED DATA")
            dataL.writeArrayInFileCompleteDataPath(
                tickets, configModel.data_path + '/tickets.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                targets, configModel.data_path + '/targets.txt', "utf-8")
            #
            QIUserLogger.info("9 - EXTRACT WORDS FROM TICKETS")
            words = tok.extractWordsTicketString(tickets)
            #
            QIUserLogger.info("10 - BUILD NEW VOCABULARY")
            # Create Vocabulary
            voc = vc.Vocabulary(configModel)
            dictionary, reverse_dict = voc.build_dictionary(
                words, configModel.labels)
            voc.saveDictionary(dictionary, "vocabulary")
            QIUserLogger.info("*** Vocabulary saved")
            #
            QIUserLogger.info("11 -- SPLIT DATA IN TRAINING AND TEST DATASET")
            tickets_training, tickets_test, Target_training, Target_test = ut.get_train_and_test(
                tickets, targets)
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training,
                configModel.data_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_training,
                configModel.data_path + '/targets_training.txt', "utf-8")

            dataL.writeArrayInFileCompleteDataPath(
                tickets_test, configModel.data_path + '/tickets_test.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                Target_test, configModel.data_path + '/targets_test.txt',
                "utf-8")

            #
            QIUserLogger.info("12 - CREATE TICKETS AND TARGETS SEQUENCES")
            # Create sequences and one-hot vectors for the targets
            tickets_training_sequences = dataL.createDataSequenceTicketsString(
                tickets_training, dictionary)
            oneHotVectorTarget_training = dataL.transformInOneHotVector(
                configModel.labels, Target_training)
            #
            QIUserLogger.info("13 - FILTER OUT  DATA - Removing Token OOV")
            filtdata = fd.FilterData(configModel, configModel.labels)
            tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
                tickets_training_sequences, oneHotVectorTarget_training,
                dictionary)
            QIUserLogger.info("	*** Classe Cestino in Training : " +
                              str(len(trash)))
            #
            #QIUserLogger.info("	-- Split Training | Test Dataset")
            #tickets_training_sequences, tickets_test_sequences, oneHotVectorTarget_training, oneHotVectorTarget_test = ut.get_train_and_test(tickets_training_sequences, oneHotVectorTarget_training)
            #
            QIUserLogger.info("14 - SAVING TRAINING SEQUENCES")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training_sequences,
                configModel.data_sequences_path + '/tickets_training.txt',
                "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                oneHotVectorTarget_training,
                configModel.data_sequences_path + '/target_training.txt',
                "utf-8")

            QIUserLogger.info("	*** Training Size : " +
                              str(len(tickets_training_sequences)) + "\n")
            if configModel.use_pretrained_embs:
                QIUserLogger.info("	*** Use pretrained Words Embedding")
                skip = sk.SkipgramModel(configModel)
                skipgramModel = skip.get_skipgram()
                skipgramEmbedding = skip.getCustomEmbeddingMatrix(
                    skipgramModel, reverse_dict)
                configModel.skipgramEmbedding = skipgramEmbedding
            # Start Training
            QIUserLogger.info("15 - START TRAINING")
            ml.runTraining(configModel, tickets_training_sequences,
                           oneHotVectorTarget_training, configModel.labels)
            QIUserLogger.info("============ End =============")
        else:
            QIUserLogger.info(
                "No New Tickets found. There is no need of a new training.")

        # Log out
        connector.logout(Reqsess)
        #
    except Exception as e:
        print(str(e))
        QIUserLogger.error("Error in training_model " + str(e))
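The merge-and-cap step (point 6 above) is the delicate part of this function: the tickets and targets lists must be sliced with the same offset, or the ticket/label pairs fall out of alignment. Pulled out as a standalone helper with hypothetical names, the intended logic is:

def merge_and_cap(old_tickets, old_targets, new_tickets, new_targets,
                  max_length):
    # Append the newly gathered data, then drop the oldest entries so the
    # corpus never exceeds max_length; identical slices keep pairs aligned.
    tickets = old_tickets + new_tickets
    targets = old_targets + new_targets
    if len(tickets) > max_length:
        cut = len(tickets) - max_length
        tickets, targets = tickets[cut:], targets[cut:]
    return tickets, targets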