Example 1
    def predict_on_test_files(self, estimator, csv_files_path):
        # Predict on every CSV file in the given directory and write the
        # tagged copies under the estimator's model directory.
        out_dir = estimator.model_dir + "/predictions/"
        check_n_makedirs(out_dir)

        for csv_file in tqdm(os.listdir(csv_files_path), desc="predicting"):
            csv_file = os.path.join(csv_files_path, csv_file)
            if csv_file.endswith(".csv"):
                print_info("processing ====> {}".format(csv_file))
                df = pd.read_csv(csv_file).fillna(UNKNOWN_WORD)
                df = self.predict_on_test_file(estimator, df)
                df.to_csv(out_dir + ntpath.basename(csv_file), index=False)

        return out_dir
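A hedged usage sketch: here `estimator` stands for a trained TensorFlow Estimator, and the `tagger` instance and test-folder path are hypothetical, for illustration only.

    # Hypothetical usage; `tagger` is an instance of the class that
    # defines predict_on_test_files.
    predictions_dir = tagger.predict_on_test_files(estimator, "data/test/")
    print_info("predictions written to {}".format(predictions_dir))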
Example 2
    def predict_on_csv_files(self, estimator, csv_files_path):
        # Predict on a single preprocessed CSV file and write the tagged
        # copy under the estimator's model directory.
        df = None
        csv_file = csv_files_path
        if csv_file.endswith(".csv"):
            print_info("processing ====> {}".format(csv_file))
            df = pd.read_csv(csv_file).fillna(UNKNOWN_WORD)
            sentence = " ".join(df[self.preprocessed_data_info.TEXT_COL].values)

            # Map each character of each word to its id; unknown characters map to 0.
            char_ids = [[self.preprocessed_data_info.char_2_id_map.get(c, 0) for c in word]
                        for word in sentence.split(" ")]
            char_ids, char_ids_length = self._pad_sequences([char_ids], pad_tok=0, nlevels=2)

            # TODO add batch support
            (predicted_tags, confidence,
             pred_1, pred_1_confidence,
             pred_2, pred_2_confidence,
             pred_3, pred_3_confidence) = self.get_tags(
                estimator, sentence, char_ids,
                self.preprocessed_data_info.ENTITY_VOCAB_FILE)
            df["predictions"] = predicted_tags
            df["confidence"] = confidence
            df["pred_1"] = pred_1
            df["pred_1_confidence"] = pred_1_confidence
            df["pred_2"] = pred_2
            df["pred_2_confidence"] = pred_2_confidence
            df["pred_3"] = pred_3
            df["pred_3_confidence"] = pred_3_confidence

            out_dir = estimator.model_dir + "/predictions/"
            check_n_makedirs(out_dir)
            df.to_csv(out_dir + ntpath.basename(csv_file), index=False)

        return df
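The nested comprehension that builds `char_ids` is the heart of the character-feature pipeline. A self-contained toy version, with a made-up `char_2_id_map` (unknown characters fall back to id 0):

    # Toy illustration only; the real map comes from preprocessed_data_info.
    char_2_id_map = {"h": 1, "i": 2, "a": 3}
    sentence = "hi ha"
    char_ids = [[char_2_id_map.get(c, 0) for c in word]
                for word in sentence.split(" ")]
    # char_ids == [[1, 2], [1, 3]]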
Example 3
    def __init__(self, name, experiment_dir, batch_size):
        '''
        Data iterators with different feature types are expected to
        implement this interface, exposing the input functions and their hooks.
        :param name: name of this iterator, used as its output sub-directory
        :param experiment_dir: root directory of the experiment
        :param batch_size: number of examples per batch
        '''

        self.NAME = name
        self.EXPERIMENT_ROOT_DIR = experiment_dir
        self.OUT_DIR = self.EXPERIMENT_ROOT_DIR + "/" + self.NAME + "/"

        self._load_ini()
        # self.preprocessed_data_info = PreprocessedDataInfo.load(experiment_dir)

        # This path is assumed to be correct when the previous stage implements IPreprocessorInterface
        self.PREPROCESSED_DATA_DIR = self.EXPERIMENT_ROOT_DIR + "/" + self.config.get_item(
            "OutputDirectories", "preprocessed_data_dir")
        self.TRAIN_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/train/"
        self.VAL_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/val/"
        self.TEST_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/test/"

        self.TEXT_COL = self.config.get_item("Schema", "text_column")
        self.ENTITY_COL = self.config.get_item("Schema", "entity_column")
        self.WORDS_VOCAB_FILE = self.OUT_DIR + "/" + self.TEXT_COL + "_" + "vocab.tsv"
        self.CHARS_VOCAB_FILE = self.OUT_DIR + "/" + self.TEXT_COL + "_" + "chars_vocab.tsv"
        self.ENTITY_VOCAB_FILE = self.OUT_DIR + "/" + self.ENTITY_COL + "_vocab.tsv"

        check_n_makedirs(self.OUT_DIR)

        self.BATCH_SIZE = batch_size
        self.NUM_TAGS = None
        self.VOCAB_SIZE = None
        self.CHAR_VOCAB_SIZE = None

        self._train_data_input_fn = None
        self._train_data_init_hook = None

        self._val_data_input_fn = None
        self._val_data_init_hook = None

        self._test_data_input_fn = None
        self._test_data_init_hook = None
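For reference, how these paths resolve for a hypothetical iterator named `csv_data_iterator` rooted at `experiments/run1`, with `text_column` set to `word` (note the harmless double slash that the plain string concatenation produces):

    # Hypothetical values, for illustration only.
    # OUT_DIR          -> "experiments/run1/csv_data_iterator/"
    # WORDS_VOCAB_FILE -> "experiments/run1/csv_data_iterator//word_vocab.tsv"
    # CHARS_VOCAB_FILE -> "experiments/run1/csv_data_iterator//word_chars_vocab.tsv"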
Example 4
    def predict_on_test_files(self, estimator, csv_files_path):
        # Batched variant: predict on all CSV files in a directory,
        # `batchsize` files at a time.
        out_dir = estimator.model_dir + "/predictions/"
        check_n_makedirs(out_dir)

        files = [
            file for file in os.listdir(csv_files_path)
            if file.endswith('.csv')
        ]
        batchsize = 12
        index = 0
        remaining = len(files)
        progress_bar = tqdm(total=len(files))

        while remaining > 0:
            batch = min(remaining, batchsize)

            print('NEW BATCH\n')
            dfs = []

            for csv_file in files[index:index + batch]:
                df = pd.read_csv(os.path.join(csv_files_path,
                                              csv_file)).fillna(UNKNOWN_WORD)
                # Stash the source file name on the frame so it can be written
                # back out after prediction (note: a plain attribute like this
                # is not preserved by most pandas operations).
                df.file_name = csv_file
                dfs.append(df)

            dfs = self.predict_on_dataframes(estimator, dfs)

            for predicted_df in dfs:
                print_info(predicted_df.file_name)
                predicted_df.to_csv(out_dir +
                                    ntpath.basename(predicted_df.file_name),
                                    index=False)

            index += batch
            remaining -= batch
            progress_bar.update(batch)

        progress_bar.close()
        return out_dir
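The index/remaining bookkeeping can also be written as a stepped range; a minimal sketch of the same chunking, assuming the same `files` list and `batchsize`:

    # Equivalent chunking with range(); produces the same batches as the
    # while-loop above.
    for start in range(0, len(files), batchsize):
        batch_files = files[start:start + batchsize]
        # ... read, predict on, and write batch_files as above ...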
Example 5
    def create_target_directories(self):
        if os.path.exists(self.OUT_DIR):
            if self.OVER_WRITE == "yes":
                print_info("Deleting data folder: {}".format(self.OUT_DIR))
                shutil.rmtree(self.OUT_DIR)
                print_info("Recreating data folder: {}".format(self.OUT_DIR))
                os.makedirs(self.OUT_DIR)
                check_n_makedirs(self.TRAIN_CSV_INTERMEDIATE_PATH)
                check_n_makedirs(self.VAL_CSV_INTERMEDIATE_PATH)
                check_n_makedirs(self.TEST_CSV_INTERMEDIATE_PATH)
            else:
                print_info(
                    "Skipping preprocessing step, since the data is already available"
                )
                return "skip"
        else:
            print_info("Creating data folder: {}".format(self.OUT_DIR))
            os.makedirs(self.OUT_DIR)
            check_n_makedirs(self.TRAIN_CSV_INTERMEDIATE_PATH)
            check_n_makedirs(self.VAL_CSV_INTERMEDIATE_PATH)
            check_n_makedirs(self.TEST_CSV_INTERMEDIATE_PATH)
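`check_n_makedirs` itself is not shown in these examples; from its usage it plausibly creates a directory only when it does not already exist. A minimal sketch under that assumption:

    import os

    def check_n_makedirs(path):
        # Assumed behaviour, inferred from usage: create `path` (and any
        # missing parents) only if it does not already exist.
        if not os.path.exists(path):
            os.makedirs(path)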
Example 6
    def with_user_hyperparamaters(experiment_root_dir, data_dir):

        preprocessed_data_info = PreprocessedDataInfo.load(data_dir)

        use_crf = True  # TODO: expose as a user prompt
        use_char_embedding = False
        char_level_lstm_hidden_size = 128  # default
        char_emd_size = 128  # default

        use_char_embedding_option = input("use_char_embedding (y/n): ") or "y"
        # Apply the default *before* conversion: float("") raises ValueError,
        # so `float(input(...)) or default` can never fall back to the default.
        learning_rate = float(input("learning_rate (0.001): ") or 0.001)
        num_lstm_layers = int(input("num_word_lstm_layers (2): ") or 2)

        if use_char_embedding_option == 'y':
            use_char_embedding = True
            char_level_lstm_hidden_size = int(
                input("char_level_lstm_hidden_size (32): ") or 32)
            char_emd_size = int(input("char_emd_size (32): ") or 32)
        else:
            use_char_embedding = False

        word_level_lstm_hidden_size = int(
            input("word_level_lstm_hidden_size (32): ") or 32)
        word_emd_size = int(input("word_emd_size (128): ") or 128)
        out_keep_propability = float(
            input("out_keep_propability (0.5): ") or 0.5)

        # Does this sound logical? review please
        model_dir = experiment_root_dir + "/bilstm_crf_v1/" + \
                    "charembd_{}_lr_{}_lstmsize_{}-{}-{}_wemb_{}_cemb_{}_outprob_{}".format(
                        str(use_char_embedding),
                        learning_rate,
                        num_lstm_layers,
                        word_level_lstm_hidden_size,
                        char_level_lstm_hidden_size,
                        word_emd_size,
                        char_emd_size,
                        out_keep_propability)

        model_config = BiLSTMCRFConfigV1(
            model_dir=model_dir,
            vocab_size=preprocessed_data_info.VOCAB_SIZE,
            char_vocab_size=preprocessed_data_info.CHAR_VOCAB_SIZE,
            number_tags=preprocessed_data_info.NUM_TAGS,
            unknown_word=UNKNOWN_WORD,
            pad_word=PAD_WORD,
            tags_vocab_file=preprocessed_data_info.ENTITY_VOCAB_FILE,
            words_vocab_file=preprocessed_data_info.WORDS_VOCAB_FILE,
            chars_vocab_file=preprocessed_data_info.CHARS_VOCAB_FILE,
            # hyper parameters
            use_char_embedding=use_char_embedding,
            learning_rate=learning_rate,
            word_level_lstm_hidden_size=word_level_lstm_hidden_size,
            char_level_lstm_hidden_size=char_level_lstm_hidden_size,
            word_emd_size=word_emd_size,
            char_emd_size=char_emd_size,
            num_lstm_layers=num_lstm_layers,
            out_keep_propability=out_keep_propability,
            use_crf=use_crf)
        check_n_makedirs(model_dir)
        IModelConfig.save(model_dir=model_dir, config=model_config)

        return model_config
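The prompt fix above deserves emphasis: `float(input(...)) or default` can never fall back to the default, because `float("")` raises ValueError before `or` is evaluated. Applying the default to the raw string first is safe:

    # float("") raises ValueError, so the default must be applied first:
    float("" or 0.001)      # -> 0.001 (empty input falls back to the default)
    float("0.01" or 0.001)  # -> 0.01  (non-empty input wins)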
Example 7
    def with_user_hyperparamaters(experiment_root_dir, data_iterator):

        use_crf = True  # TODO: expose as a user prompt
        use_char_embedding = False
        char_level_lstm_hidden_size = 32  # default
        char_emd_size = 32  # default

        use_char_embedding_option = input("use_char_embedding (y/n): ") or "y"
        learning_rate = input("learning_rate (0.001): ") or 0.001
        learning_rate = float(learning_rate)
        num_lstm_layers = input("num_word_lstm_layers (2): ") or 2
        num_lstm_layers = int(num_lstm_layers)

        if use_char_embedding_option == 'y':
            use_char_embedding = True
            char_level_lstm_hidden_size = input(
                "char_level_lstm_hidden_size (32): ") or 32
            char_level_lstm_hidden_size = int(char_level_lstm_hidden_size)
            char_emd_size = input("char_emd_size (32): ") or 32
            char_emd_size = int(char_emd_size)
        else:
            use_char_embedding = False

        word_level_lstm_hidden_size = input(
            "word_level_lstm_hidden_size (48): ") or 48
        word_level_lstm_hidden_size = int(word_level_lstm_hidden_size)
        word_emd_size = input("word_emd_size (48): ") or 48
        word_emd_size = int(word_emd_size)
        out_keep_propability = input("out_keep_propability (0.5): ") or 0.5
        out_keep_propability = float(out_keep_propability)

        # Does this sound logical? review please
        '''
        experiment_root_dir/
            - data_iterator/
                - model_name/
                    - user_hyper_params/
        '''
        model_dir = experiment_root_dir + "/" + data_iterator.NAME + "/bilstm_crf_v0/" + \
                    "charembd_{}_lr_{}_lstmsize_{}-{}-{}_wemb_{}_cemb_{}_outprob_{}".format(
                        str(use_char_embedding),
                        learning_rate,
                        num_lstm_layers,
                        word_level_lstm_hidden_size,
                        char_level_lstm_hidden_size,
                        word_emd_size,
                        char_emd_size,
                        out_keep_propability)

        model_config = BiLSTMCRFConfigV0(
            model_dir=model_dir,
            vocab_size=data_iterator.VOCAB_SIZE,
            char_vocab_size=data_iterator.CHAR_VOCAB_SIZE,
            number_tags=data_iterator.NUM_TAGS,
            unknown_word=UNKNOWN_WORD,
            pad_word=PAD_WORD,
            tags_vocab_file=data_iterator.ENTITY_VOCAB_FILE,
            words_vocab_file=data_iterator.WORDS_VOCAB_FILE,
            chars_vocab_file=data_iterator.CHARS_VOCAB_FILE,
            # hyper parameters
            use_char_embedding=use_char_embedding,
            learning_rate=learning_rate,
            word_level_lstm_hidden_size=word_level_lstm_hidden_size,
            char_level_lstm_hidden_size=char_level_lstm_hidden_size,
            word_emd_size=word_emd_size,
            char_emd_size=char_emd_size,
            num_lstm_layers=num_lstm_layers,
            out_keep_propability=out_keep_propability,
            use_crf=use_crf)
        check_n_makedirs(model_dir)
        IModelConfig.save(model_dir=model_dir, config=model_config)

        return model_config
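For concreteness, with the defaults above (char embedding on, learning rate 0.001, 2 word-LSTM layers, word/char LSTM sizes 48/32, word/char embedding sizes 48/32, keep probability 0.5) and a hypothetical iterator named `csv_data_iterator`, the generated model directory is:

    <experiment_root_dir>/csv_data_iterator/bilstm_crf_v0/charembd_True_lr_0.001_lstmsize_2-48-32_wemb_48_cemb_32_outprob_0.5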