from load_data import Data
from model import Model
from train import Train

if __name__ == "__main__":
    data = Data()
    data.load()
    data.data_augment()
    data.data_splitting()
    data.print()

    dataset = data.get_dataset()
    testset = data.get_testset()

    models = Model(dataset[0].shape[1:], 50)
    m = models.ResNet()
    m.summary()

    # train = Train(m, dataset, testset, 50, 32, 'adam',
    #               'sparse_categorical_crossentropy')
    train = Train(m, dataset, testset, 100, 200, 'adam',
                  'categorical_crossentropy')
    train.training()
    train.evaluate()

    # tensorboard --logdir logs/scalars --port=7000
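# A minimal sketch of what the Train wrapper used above is assumed to do:
# compile the Keras model with the given optimizer/loss, fit it with a
# TensorBoard callback writing under logs/scalars (matching the tensorboard
# command in the comment above), and evaluate on the held-out test set.
# The real train.py may differ; reading the positional arguments (100, 200)
# as epochs and batch size is an assumption, as is validation_split.
import datetime

import tensorflow as tf


class TrainSketch:
    """Hypothetical stand-in for train.Train, for illustration only."""

    def __init__(self, model, dataset, testset, epochs, batch_size,
                 optimizer, loss):
        self.model = model
        self.x_train, self.y_train = dataset
        self.x_test, self.y_test = testset
        self.epochs = epochs
        self.batch_size = batch_size
        self.model.compile(optimizer=optimizer, loss=loss,
                           metrics=['accuracy'])

    def training(self):
        # one timestamped run directory per training, so TensorBoard
        # can overlay multiple runs
        log_dir = ('logs/scalars/'
                   + datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
        tb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
        self.model.fit(self.x_train, self.y_train,
                       epochs=self.epochs, batch_size=self.batch_size,
                       validation_split=0.1, callbacks=[tb])

    def evaluate(self):
        loss, acc = self.model.evaluate(self.x_test, self.y_test)
        print(f'test loss: {loss:.4f}, test accuracy: {acc:.4f}')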
import argparse
from math import ceil, log2
from typing import Set

import yaml
from scipy import sparse
from sklearn.feature_selection import mutual_info_classif

# Project-local names (Data, DataGraph, FeatureSetEnum, SampleTypeEnum,
# LikeTypeEnum, preprocessors, classifiers, featurematrixconversion,
# PreprocessorBase, ClassifierBase, compute_evaluation_scores) are assumed
# to be imported elsewhere in this module.


def main(config: argparse.Namespace) -> None:
    """Run all experiments defined in the YAML config file.

    Loads the Yelp and Geneea data, optionally dumps mutual information
    of the individual feature sets, then trains and evaluates every task
    from the config on successive dataset chunks and plots the results.
    """
    with open(config.config_file, 'r') as cfg:
        experiments: dict = yaml.safe_load(cfg)

    print('loading data')
    data = Data(config.yelp_file, config.geneea_file)
    data.print(f'Processing file {config.config_file}')

    print('generating samples')
    datasize: int = data.generate_sample(experiments['config']['chunks'],
                                         LikeTypeEnum.USEFUL)

    stats: DataGraph = DataGraph('', 'number of instances', 'percentage')

    # texts_tokenized = (self._tokenize(row.text) for index, row
    #                    in self.data.iterrows())
    # words_freqs = nltk.FreqDist(w.lower() for tokens in texts_tokenized
    #                             for w in tokens)
    #
    # # TODO statistics
    # # for x in all_words:
    # #     print(all_words[x])
    # #
    # # self.print('total number of words:', sum(all_words.values()))
    # # self.print('unique words:', len(all_words))
    # # self.print('words present only once:',
    # #            sum(c for c in all_words.values() if c == 1))
    # # all_words.plot(30)
    #
    # # only the right frequencies
    # self.gram_words = words_freqs.copy()
    # for w, count in words_freqs.items():
    #     if count > 200 or count == 20:  # TODO Measure
    #         del self.gram_words[w]
    #
    # self.gram_words = frozenset(self.gram_words.keys())

    # calculate mutual information of all features if wanted
    # and dump it into text files
    if experiments['config']['mi']:
        for x in FeatureSetEnum:
            if x in (FeatureSetEnum.BIGRAMS, FeatureSetEnum.TRIGRAMS,
                     FeatureSetEnum.FOURGRAMS):
                continue
            if x == FeatureSetEnum.UNIGRAMS:  # TODO REMOVE
                continue

            # get data
            data.set_statfile(f'mi_{x}')
            data.print(f'Mutual Information of {x}.')

            train = data.get_feature_dict(SampleTypeEnum.TRAIN, {x})
            test = data.get_feature_dict(SampleTypeEnum.TEST, {x})
            instances = train + test

            # convert the feature dicts to a sparse matrix
            matrix_convertor = featurematrixconversion.Preprocessor({})
            vector_instances = matrix_convertor.process(instances,
                                                        SampleTypeEnum.TRAIN)

            # calculate mutual info
            matrix_gen, labels_gen = zip(*vector_instances)
            matrix = sparse.vstack(matrix_gen)
            labels = list(labels_gen)
            mi = mutual_info_classif(matrix, labels)

            # dump data
            for f_name, f_mi in zip(matrix_convertor.all_fs, mi):
                data.print(f'{f_name} {f_mi}')

    data.set_statfile('statistics')

    first_run: bool = True

    while True:
        train_size: int \
            = int(datasize - datasize / experiments['config']['chunks'])
        train_size_log: int = int(ceil(log2(train_size)) + 1)

        data.max_tfidf = experiments['config']['max_tfidf']
        data.max_ngrams = experiments['config']['max_ngrams']

        for ex in experiments['tasks']:
            # convert features to a set
            features: Set[FeatureSetEnum] \
                = {FeatureSetEnum[f] for f in ex['features']}
            train_set = data.get_feature_dict(SampleTypeEnum.TRAIN, features,
                                              ex['extra_data'])
            test_set = data.get_feature_dict(SampleTypeEnum.TEST, features,
                                             ex['extra_data'])

            if first_run:
                unique_features: set = set()
                for inst in train_set:
                    unique_features = unique_features.union(
                        set(inst[0].keys()))
                data.print(f'Number of unique features for {ex["name"]}: '
                           f'{len(unique_features)}')
                unique_features = set()

            l_curves = experiments['config']['l_curves']
            start_size: int = 1 if l_curves else train_size_log - 1

            # with learning curves enabled, train on growing subsets
            # (powers of two) up to the full training size
            for t_size in map(lambda x: min(2**x, train_size),
                              range(start_size, train_size_log)):
                if l_curves:
                    train_set_copy = train_set[:t_size]
                    test_set_copy = test_set[:]
                else:
                    train_set_copy = train_set
                    test_set_copy = test_set

                # preprocess data
                for pp in ex['preprocessing']:
                    prep: PreprocessorBase \
                        = getattr(preprocessors, pp).Preprocessor(ex['config'])
                    train_set_copy = prep.process(train_set_copy,
                                                  SampleTypeEnum.TRAIN)
                    test_set_copy = prep.process(test_set_copy,
                                                 SampleTypeEnum.TEST)
                if first_run and hasattr(train_set[0][0], 'keys'):
                    unique_features = set()
                    for inst in train_set:
                        unique_features = unique_features.union(
                            set(inst[0].keys()))
                    data.print(f'Number of unique features after '
                               f'preprocessing for {ex["name"]}: '
                               f'{len(unique_features)}')
                    unique_features = set()

                cls: ClassifierBase \
                    = getattr(classifiers,
                              ex['classificator']).Classifier(ex['config'])
                cls.train(train_set_copy)

                evaluation: dict \
                    = compute_evaluation_scores(cls, test_set_copy,
                                                LikeTypeEnum.USEFUL)
                stats.add_points(len(train_set_copy), ex['name'], evaluation)

                # with learning curves, also evaluate on the training set
                if l_curves:
                    evaluation = compute_evaluation_scores(cls,
                                                           train_set_copy,
                                                           LikeTypeEnum.USEFUL)
                    stats.add_points(len(train_set_copy),
                                     ex['name'] + '-train', evaluation)

        first_run = False

        if not data.prepare_next_dataset():
            break

    # aggregate results here
    for g in experiments['graphs']:
        stats.name = g['name']
        stats.set_view(g['data'])
        data.plot(stats)
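# A minimal example of the YAML experiment file main() expects, reconstructed
# from the keys the function reads above. All concrete values, feature names,
# and module names below are placeholders, not the project's real
# configuration; only the key structure is grounded in the code.
def write_example_config(path: str = 'experiments.yaml') -> None:
    example_experiments = {
        'config': {
            'chunks': 10,       # dataset chunks; train size = datasize - datasize/chunks
            'mi': False,        # dump mutual information of feature sets
            'l_curves': False,  # compute learning curves over growing train sizes
            'max_tfidf': 100,
            'max_ngrams': 100,
        },
        'tasks': [{
            'name': 'baseline',
            'features': ['UNIGRAMS'],       # names of FeatureSetEnum members
            'extra_data': {},
            'preprocessing': [],            # module names under preprocessors
            'classificator': 'naivebayes',  # module name under classifiers (placeholder)
            'config': {},
        }],
        'graphs': [{
            'name': 'accuracy',
            'data': ['baseline'],  # assumed: task names to show in the view
        }],
    }
    with open(path, 'w') as f:
        yaml.safe_dump(example_experiments, f)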
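# A possible command-line entry point for main(); the argument names mirror
# the attributes accessed on the Namespace (config_file, yelp_file,
# geneea_file). Flag spellings and help texts are assumptions.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Run usefulness-prediction experiments.')
    parser.add_argument('--config_file', required=True,
                        help='YAML experiment definition')
    parser.add_argument('--yelp_file', required=True,
                        help='Yelp reviews data file')
    parser.add_argument('--geneea_file', required=True,
                        help='Geneea annotations file')
    main(parser.parse_args())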