Esempio n. 1
0
from load_data import Data
from model import Model
from train import Train


if __name__ == "__main__":
    # Run the data pipeline steps in order, then fetch both sample sets.
    loader = Data()
    loader.load()
    loader.data_augment()
    loader.data_splitting()
    loader.print()
    train_data, test_data = loader.get_dataset(), loader.get_testset()

    # The model is built from the shape of the first dataset element with
    # its leading axis dropped, plus the constant 50.
    input_shape = train_data[0].shape[1:]
    network = Model(input_shape, 50).ResNet()
    network.summary()

    # Alternative setup kept for reference:
    # Train(network, train_data, test_data, 50, 32, 'adam',
    #       'sparse_categorical_crossentropy')
    trainer = Train(network, train_data, test_data, 100, 200,
                    'adam', 'categorical_crossentropy')
    trainer.training()
    trainer.evaluate()

    


# To open TensorBoard, run: tensorboard --logdir logs/scalars --port=7000
Esempio n. 2
0
def _count_unique_features(instances) -> int:
    """Return how many distinct feature keys occur across *instances*.

    Every instance is indexed with ``[0]`` to obtain a mapping; the keys of
    all those mappings are merged into one set.
    """
    unique = set()
    for inst in instances:
        # In-place update avoids copying the growing set for every instance.
        unique.update(inst[0].keys())
    return len(unique)


def _dump_mutual_information(data) -> None:
    """Print the mutual information of each feature through ``data.print``.

    For every member of ``FeatureSetEnum`` except the n-gram sets, the train
    and test instances are merged, converted to a feature matrix and scored
    with ``mutual_info_classif``. Each set is written to its own stat file
    (``mi_<set>``); afterwards the stat file is switched to ``statistics``.
    """
    skipped = (
        FeatureSetEnum.BIGRAMS,
        FeatureSetEnum.TRIGRAMS,
        FeatureSetEnum.FOURGRAMS,
        FeatureSetEnum.UNIGRAMS,  # TODO REMOVE
    )

    for feature_set in FeatureSetEnum:
        if feature_set in skipped:
            continue

        # get data
        data.set_statfile(f'mi_{feature_set}')
        data.print(f'Mutual Information of {feature_set}.')
        train = data.get_feature_dict(SampleTypeEnum.TRAIN, {feature_set})
        test = data.get_feature_dict(SampleTypeEnum.TEST, {feature_set})
        instances = train + test

        # get matrix
        matrix_convertor = featurematrixconversion.Preprocessor({})
        vector_instances = matrix_convertor.process(
            instances, SampleTypeEnum.TRAIN)

        # calculate mutual info
        matrix_gen, labels_gen = zip(*vector_instances)
        matrix = sparse.vstack(matrix_gen)
        labels = list(labels_gen)
        mi = mutual_info_classif(matrix, labels)

        # dump data
        for f_name, f_mi in zip(matrix_convertor.all_fs, mi):
            data.print(f'{f_name}	{f_mi}')

    data.set_statfile('statistics')


def main(config: argparse.Namespace) -> None:
    """Run every experiment described in a YAML file and plot the results.

    Args:
        config: Parsed command-line arguments. The attributes read here are
            ``config_file`` (path of the YAML experiment description),
            ``yelp_file`` and ``geneea_file`` (both passed to ``Data``).

    The YAML document is expected to provide three top-level keys:

    * ``config`` -- ``chunks``, ``mi``, ``max_tfidf``, ``max_ngrams`` and
      ``l_curves``;
    * ``tasks`` -- a list of experiments, each with ``name``, ``features``,
      ``extra_data``, ``preprocessing``, ``classificator`` and ``config``;
    * ``graphs`` -- a list of plots, each with ``name`` and ``data``.

    For each dataset produced by ``data.prepare_next_dataset()`` every task
    is trained and evaluated; the scores are collected in a ``DataGraph``
    and finally plotted once per entry of ``graphs``.
    """
    with open(config.config_file, 'r') as cfg:
        # safe_load instead of yaml.load: calling yaml.load without a Loader
        # is rejected by PyYAML >= 6 and can construct arbitrary Python
        # objects on older versions. The experiment file only needs plain
        # mappings, lists and scalars.
        experiments: dict = yaml.safe_load(cfg)

    settings = experiments['config']

    print('loading data')
    data = Data(config.yelp_file, config.geneea_file)
    data.print(f'Processing file {config.config_file}')

    print('generating samples')
    datasize: int = data.generate_sample(settings['chunks'],
                                         LikeTypeEnum.USEFUL)

    stats: DataGraph = DataGraph('', 'number of instances', 'percentage')

    # calculate mutual information of all features if wanted
    # and dump it into text files
    if settings['mi']:
        _dump_mutual_information(data)

    first_run: bool = True

    while True:
        # One chunk out of `chunks` is held out, the rest is used to train.
        train_size: int = int(datasize - datasize / settings['chunks'])
        train_size_log: int = int(ceil(log2(train_size)) + 1)

        data.max_tfidf = settings['max_tfidf']
        data.max_ngrams = settings['max_ngrams']

        for ex in experiments['tasks']:
            # convert features to set:
            features: Set[FeatureSetEnum] \
                = {FeatureSetEnum[f] for f in ex['features']}
            train_set = data.get_feature_dict(SampleTypeEnum.TRAIN, features,
                                              ex['extra_data'])
            test_set = data.get_feature_dict(SampleTypeEnum.TEST, features,
                                             ex['extra_data'])

            if first_run:
                data.print(
                    f'Number of unique features for {ex["name"]}: {_count_unique_features(train_set)}'
                )

            l_curves = settings['l_curves']
            # With learning curves the training size doubles each round
            # (2, 4, 8, ... capped at train_size); without them the range
            # below has a single element and the full training set is used.
            start_size: int = 1 if l_curves else train_size_log - 1

            for t_size in (min(2 ** exp, train_size)
                           for exp in range(start_size, train_size_log)):
                if l_curves:
                    train_set_copy = train_set[:t_size]
                    test_set_copy = test_set[:]
                else:
                    train_set_copy = train_set
                    test_set_copy = test_set

                # preprocess data
                for pp in ex['preprocessing']:
                    prep: PreprocessorBase \
                        = getattr(preprocessors, pp).Preprocessor(ex['config'])
                    train_set_copy = prep.process(train_set_copy,
                                                  SampleTypeEnum.TRAIN)
                    test_set_copy = prep.process(test_set_copy,
                                                 SampleTypeEnum.TEST)

                # Count features on the *preprocessed* instances. The
                # original code iterated the raw train_set here, so this
                # statistic merely repeated the pre-preprocessing number.
                # Instances whose first element is not a mapping (no
                # `keys`) are skipped.
                if first_run and hasattr(train_set_copy[0][0], 'keys'):
                    data.print(
                        f'Number of unique features after preprocessing for {ex["name"]}: {_count_unique_features(train_set_copy)}'
                    )

                cls: ClassifierBase \
                    = getattr(classifiers, ex['classificator']).Classifier(ex['config'])
                cls.train(train_set_copy)

                evaluation: dict \
                    = compute_evaluation_scores(cls, test_set_copy, LikeTypeEnum.USEFUL)

                stats.add_points(len(train_set_copy), ex['name'], evaluation)

                if l_curves:
                    # Also score on the training data itself so the plot can
                    # show a '<name>-train' curve next to the test curve.
                    train_evaluation: dict \
                        = compute_evaluation_scores(cls, train_set_copy, LikeTypeEnum.USEFUL)

                    stats.add_points(len(train_set_copy),
                                     ex['name'] + '-train', train_evaluation)

                # NOTE(review): the flag is cleared after the very first
                # training round, so the unique-feature counts above are
                # printed for the first task only -- confirm this is intended.
                first_run = False

        if not data.prepare_next_dataset():
            break

    # aggregate results here
    for g in experiments['graphs']:
        stats.name = g['name']
        stats.set_view(g['data'])
        data.plot(stats)