Exemple #1
0
    def __init__(self, meta_file, root_dir, transform=None,
                 pre_processor=None, max_word_length=32):
        """Store the configuration and run the start-up steps.

        The arguments are kept on the instance as given. Afterwards the
        meta file is processed and checked for availability, a health
        check is run and the statistics are created. Each of these phases
        is wrapped in a ``TimeMeasure`` that reports via ``logger.debug``.

        :param meta_file: stored as-is (presumably the path of the meta
            file -- confirm against ``__process_meta_file``).
        :param root_dir: stored as-is.
        :param transform: optional transform, stored as-is.
        :param pre_processor: optional pre-processor; its ``name``
            attribute is logged when one is given.
        :param max_word_length: stored as-is (default 32).
        """
        # Values handed in by the caller.
        self.__meta_file = meta_file
        self.__root_dir = root_dir
        self.__transform = transform
        self.__pre_processor = pre_processor
        self.__max_word_length = max_word_length
        # Initial values of the internal state.
        self.__words = list()
        self.__statistics = None

        if self.__pre_processor is not None:
            logger.info(f"Selected pre-processor: {pre_processor.name}")
        else:
            logger.info("No pre-processor selected.")

        with TimeMeasure(enter_msg="Begin meta data loading.",
                         exit_msg="Finished meta data loading after {}.",
                         writer=logger.debug):
            self.__process_meta_file()
            self.__availability_check()

        with TimeMeasure(enter_msg="Begin health check.",
                         exit_msg="Finished health check after {}.",
                         writer=logger.debug):
            self.__health_check()

        with TimeMeasure(enter_msg="Begin creating statistics.",
                         exit_msg="Finished creating statistics after {}.",
                         writer=logger.debug):
            self.__create_statistics()
def text_preprocessor(input_filename, *,
                      preprocessor_cls='TextPreprocessor',
                      custom_stop_words=None,
                      lem_ignore_patterns=None,
                      remove_duplicates=False):
    """Preprocess a CSV data source and save the preprocessed result.

    :param input_filename: passed through ``data_source_file`` to obtain
        the file that is actually read.
    :param preprocessor_cls: name of the preprocessor class; it is looked
        up in this module's globals.
    :param custom_stop_words: forwarded to ``preprocess_csv``.
    :param lem_ignore_patterns: forwarded to ``preprocess_csv``.
    :param remove_duplicates: when true, the preprocessed items are passed
        through ``remove_duplicate_text`` before being saved.
    """
    cl.section('Text Preprocessor')

    source_path = data_source_file(input_filename)
    # The class is given by name, so resolve it among the module globals.
    resolved_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        records = preprocess_csv(source_path,
                                 preprocessor_cls=resolved_cls,
                                 custom_stop_words=custom_stop_words,
                                 lem_ignore_patterns=lem_ignore_patterns)
        if remove_duplicates:
            records = remove_duplicate_text(records)

        # Materialize the result so that its size can be reported.
        records = tuple(records)
        cl.info('Effective data size: %d' % len(records))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(records, source_path)
Exemple #3
0
def get_data_loaders(meta_path, images_path, transformation, augmentation,
                     data_loading_config, pre_processor=None,
                     max_word_length=32):
    """Create the train, train-evaluation and test data loaders.

    A ``WordsDataSet`` is built and divided into a train and a test part.
    If the configured restore path exists, the split is restored from it;
    otherwise a random split is drawn and handed to
    ``__save_train_test_split`` together with the configured save path.

    :param meta_path: first argument of ``WordsDataSet``.
    :param images_path: second argument of ``WordsDataSet``.
    :param transformation: passed to ``WordsDataSet`` as ``transform``.
    :param augmentation: when not ``None``, the train part is wrapped in
        an ``AugmentedDataSet`` for the training loader.
    :param data_loading_config: provides ``train_size``, ``batch_size``
        and the optional "restore_path" / "save_path" entries.
    :param pre_processor: passed to ``WordsDataSet``.
    :param max_word_length: passed to ``WordsDataSet``.
    :return: ``(train_loader, train_eval_loader, test_loader)``; the first
        loader reads the (possibly augmented) train part, the second the
        plain train part and the third the test part.
    """
    relative_train_size = data_loading_config.train_size
    batch_size = data_loading_config.batch_size
    restore_path = data_loading_config.get("restore_path", default=None)
    save_path = data_loading_config.get("save_path", default=None)

    with TimeMeasure(enter_msg="Begin initialization of data set.",
                     exit_msg="Finished initialization of data set after {}.",
                     writer=logger.debug):
        data_set = WordsDataSet(meta_path,
                                images_path,
                                transform=transformation,
                                pre_processor=pre_processor,
                                max_word_length=max_word_length)

    with TimeMeasure(enter_msg="Splitting data set", writer=logger.debug):
        loaded = restore_path is not None and os.path.exists(restore_path)
        if loaded:
            train_data_set, test_data_set = __restore_train_test_split(
                restore_path, data_set)
        else:
            total_size = len(data_set)
            train_size = int(relative_train_size * total_size)
            # The test part receives whatever the train part leaves over.
            split_sizes = (train_size, total_size - train_size)
            train_data_set, test_data_set = random_split(data_set,
                                                         split_sizes)

        if augmentation is None:
            augmented_data_set = train_data_set
        else:
            augmented_data_set = AugmentedDataSet(train_data_set, augmentation)

    # Only a freshly drawn split is handed to the save routine.
    if not loaded:
        __save_train_test_split(save_path, train_data_set, test_data_set)

    with TimeMeasure(enter_msg="Init data loader", writer=logger.debug):
        # All three loaders are configured identically.
        loader_options = dict(batch_size=batch_size,
                              shuffle=True,
                              num_workers=8,
                              drop_last=False)
        train_loader, train_eval_loader, test_loader = [
            DataLoader(subset, **loader_options)
            for subset in (augmented_data_set, train_data_set, test_data_set)
        ]

    return train_loader, train_eval_loader, test_loader
def model_analyzer(modeldesc, sourcedesc, *, num_top_words=30,
                   num_top_docs=30, debug=False):
    """Analyze an LDA model against its source texts and export a report.

    For every topic the top words, an accumulated popularity and a
    per-document score are collected. The topics are then ordered by
    popularity (highest first) and passed to ``export_markdown``.

    :param modeldesc: model description, passed to ``load_all``.
    :param sourcedesc: source description, passed to ``load_all``.
    :param num_top_words: passed to ``get_topic_words`` for each topic.
    :param num_top_docs: passed to ``get_topic_top_docs`` for each topic.
    :param debug: when true, the topics of every document are written to
        an 'ldadoctopics-<modeldesc>.txt' model file.
    """
    cl.section('LDA Model Analyzer')
    cl.info('Model description: %s' % modeldesc)
    cl.info('Source description: %s' % sourcedesc)

    with TimeMeasure('load_all'):
        loaded = load_all(modeldesc, sourcedesc)
        ldamodel, corpus, prep_items, source_texts = loaded

    with TimeMeasure('analyzing'):
        prep_ids = tuple(item[0] for item in prep_items)
        dictionary = ldamodel.id2word
        num_topics = ldamodel.num_topics

        topics = []
        for topic_id in range(num_topics):
            topics.append({
                'topic_id': topic_id,
                'words': get_topic_words(ldamodel, topic_id, num_top_words),
                'popularity': 0.0,
                'documents': collections.defaultdict(float)
            })

        if debug:
            debugfilename = model_file('ldadoctopics-%s.txt' % modeldesc)
            with open(debugfilename, 'w', encoding='utf-8') as debugfile:
                for doc_index, bow in enumerate(corpus):
                    text_id = prep_ids[doc_index]
                    doc_topics = ldamodel.get_document_topics(bow)
                    text = source_texts[text_id].strip()
                    line = '%s -> %r, %s\n' % (text_id, doc_topics, text)
                    debugfile.write(line)

        # Look up the topics of every dictionary entry once, up front.
        term_topics_cache = {
            word: ldamodel.get_term_topics(word) for word in dictionary
        }

        for doc_index, bow in enumerate(corpus):
            # Popularity: sum of the document-topic probabilities.
            for topic_id, prob in ldamodel.get_document_topics(bow):
                topics[topic_id]['popularity'] += prob

            # Document score: term-topic probability weighted by frequency.
            for word, freq in bow:
                if word in dictionary:
                    for topic_id, prob in term_topics_cache[word]:
                        topics[topic_id]['documents'][doc_index] += prob * freq

        for topic in topics:
            topic['documents'] = get_topic_top_docs(topic['documents'],
                                                    num_top_docs,
                                                    prep_ids, source_texts)

        topics.sort(key=lambda topic: topic['popularity'], reverse=True)

    with TimeMeasure('export_markdown'):
        export_markdown(modeldesc, sourcedesc, topics)
Exemple #5
0
def main():
    """Run the 1D and the 2D convolution check, each in its own TimeMeasure.

    Any ``Exception`` raised on the way is caught here: after a pause of
    one second its traceback is printed to stderr.
    """
    try:
        for check in (check_convolution1D, check_convolution2D):
            with TimeMeasure():
                check()
    except Exception:
        sleep(1)
        print(traceback.format_exc(), file=sys.stderr)
def text_preprocessor_user(sourcedesc):
    """Preprocess '<sourcedesc>.csv' and export the result as CSV.

    The output file name is the data source file name with the title
    suffix '-user' applied by ``name_with_title_suffix``.

    :param sourcedesc: source description; asserted to consist only of
        letters, digits, '-', '_' and '+'.
    """
    cl.section('Text Preprocessor Grouping By User')

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    source_csv = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        rows = list(preprocess_csv(source_csv))

    with TimeMeasure('save_preprocessed'):
        export_csv(rows, name_with_title_suffix(source_csv, '-user'))
Exemple #7
0
 def train_single_model(self, model, loaders, total_epochs, device,
                        de_en_coder, model_id):
     """Train *model* for ``self.__environment.max_epochs`` epochs.

     In every epoch the learning rate is obtained from the learning rate
     adaptor for the current ``total_epochs`` value, ``core_training`` is
     run on the train loader and the loss is logged. Every
     ``save_interval`` epochs the model is evaluated on the
     train-evaluation loader and on the test loader, and both metrics are
     passed to ``self.__stats.save_per_period``.

     :param model: the model to train and evaluate.
     :param loaders: indexable with at least three entries, in the order
         (train loader, train-evaluation loader, test loader).
     :param total_epochs: running epoch counter at the start; it is
         incremented locally after each epoch.
     :param device: forwarded to ``core_training`` and ``evaluate_model``.
     :param de_en_coder: forwarded to ``core_training`` and
         ``evaluate_model``.
     :param model_id: integer used in the saved model name
         ("<ModelClass>_<model_id:03d>").
     :return: ``None``.
     """
     train_loader = loaders[0]
     train_eval_loader = loaders[1]
     test_loader = loaders[2]
     for epoch_idx in range(1, self.__environment.max_epochs + 1):
         enter_msg = f"Train Epoch: {epoch_idx: 4d} (total: {total_epochs + 1: 4d})"
         with TimeMeasure(enter_msg=enter_msg,
                          writer=logger.info,
                          print_enabled=True):
             current_learning_rate = self.__learning_rate_adaptor(
                 total_epochs)
             # Only the loss is needed here; the second result is unused.
             loss, _ = self.core_training(model, train_loader,
                                          current_learning_rate, device,
                                          de_en_coder)
             logger.info(f"loss: {loss}")
             total_epochs += 1
             # Compare by value: an identity test ("is 0") on an integer
             # only works by accident of the interpreter's int caching.
             if epoch_idx % self.__environment.save_interval == 0:
                 train_metrics = evaluate_model(
                     de_en_coder=de_en_coder,
                     word_prediction=self.__word_prediction,
                     model=model,
                     data_loader=train_eval_loader,
                     device=device)
                 test_metrics = evaluate_model(
                     de_en_coder=de_en_coder,
                     word_prediction=self.__word_prediction,
                     model=model,
                     data_loader=test_loader,
                     device=device)
                 model_data = {
                     "name": f"{model.__class__.__name__}_{model_id:03d}"
                 }
                 self.__stats.save_per_period(total_epochs, train_metrics,
                                              test_metrics, model_data)
Exemple #8
0
def main():
    """Build everything, then run every configured benchmark.

    The benchmarks are grouped by folder ("xtensor", "xvigra"); each
    folder has its own parameter set, which is handed to
    ``call_benchmark`` together with the benchmark name and the folder.
    """

    def parameters_for(min_time):
        # Both folders share every option except "min_time".
        return {
            "format": "console",
            "min_time": min_time,
            "repetitions": 10,
            "report_aggregates_only": True
        }

    benchmark_parameters = {
        "xtensor": parameters_for(0.25),
        "xvigra": parameters_for(1)
    }

    # The copy benchmarks exist once per axis for each of these variants.
    copy_variants = (
        "tensor_copy_complete",
        "strided-view_copy_complete",
        "strided-view_copy_paddingStride",
        "view_copy_complete",
        "view_copy_paddingStride",
    )
    xtensor_benchmarks = (
        "benchmark_normalizing",
        "benchmark_transpose-view",
        "benchmark_reshape-view",
    ) + tuple(f"benchmark_{variant}_{axis}"
              for variant in copy_variants
              for axis in ("X", "Y", "Z"))

    xvigra_benchmarks = (
        "benchmark_convolve1D_inputSize_channelFirst",
        "benchmark_convolve1D_inputSize_channelLast",
        # Currently disabled:
        #   "benchmark_convolve2D_inputSize_channelFirst",
        #   "benchmark_convolve2D_inputSize_channelLast",
        #   "benchmark_separableConvolve1D_inputSize",
        #   "benchmark_separableConvolve2D_inputSize",
        #   "benchmark_separableConvolve1D_kernelSize",
        #   "benchmark_separableConvolve2D_kernelSize"
    )

    benchmark_folders = {
        "xtensor": xtensor_benchmarks,
        "xvigra": xvigra_benchmarks
    }

    build_all()

    for folder_name, benchmark_files in benchmark_folders.items():
        parameters = benchmark_parameters[folder_name]
        for file_name in benchmark_files:
            with TimeMeasure(f"{'─' * 100}\nRunning {file_name}:", f"Total time: {{}}\n{'─' * 100}\n"):
                call_benchmark(file_name, parameters, folder=folder_name)
Exemple #9
0
 def __save_progress(self, total_epochs, model, loss):
     """Save a checkpoint for the given epoch count.

     The checkpoint path is "trained_models/<name>/epoch-<NNNNN>.pt",
     joined with ``p_join``, where <NNNNN> is *total_epochs* zero-padded
     to five digits. The path, *total_epochs*, *model*, *loss* and the
     environment are handed to ``save_checkpoint``.
     """
     with TimeMeasure(enter_msg="Saving progress...",
                      writer=logger.debug,
                      print_enabled=self.__print_enabled):
         file_name = "epoch-{:05d}.pt".format(total_epochs)
         checkpoint_path = p_join("trained_models", self.__name, file_name)
         save_checkpoint(checkpoint_path, total_epochs, model, loss,
                         self.__environment)
def text_preprocessor_twlda(sourcedesc, *,
                            tweet_min_length=3,
                            user_min_tweets=1,
                            remove_duplicates=False):
    """Preprocess '<sourcedesc>.csv' for Twitter-LDA and save the result.

    :param sourcedesc: source description; asserted to consist only of
        letters, digits, '-', '_' and '+'.
    :param tweet_min_length: forwarded to ``preprocess_csv``.
    :param user_min_tweets: forwarded to ``preprocess_csv``.
    :param remove_duplicates: forwarded to ``preprocess_csv``.
    """
    cl.section('Text Preprocessor For Twitter-LDA')

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    source_csv = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        preprocessed = preprocess_csv(source_csv, tweet_min_length,
                                      user_min_tweets, remove_duplicates)
        # preprocess_csv yields a pair: preprocessed data and source data.
        prepdata, sourcedata = preprocessed

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(prepdata, sourcedata)
Exemple #11
0
def get_data_loaders_cv(meta_path, images_path, transformation, augmentation,
                        data_loading_config, pre_processor=None,
                        number_of_splits=3):
    """Create one triple of data loaders per cross-validation split.

    :param meta_path: first argument of ``WordsDataSet``.
    :param images_path: second argument of ``WordsDataSet``.
    :param transformation: passed to ``WordsDataSet`` as ``transform``.
    :param augmentation: forwarded to ``cv_split``.
    :param data_loading_config: provides ``batch_size``.
    :param pre_processor: passed to ``WordsDataSet``.
    :param number_of_splits: forwarded to ``cv_split``.
    :return: list of ``(train_loader, train_eval_loader, test_loader)``
        tuples, one per entry returned by ``cv_split``.
    """
    batch_size = data_loading_config.batch_size

    with TimeMeasure(enter_msg="Begin initialization of data set.",
                     exit_msg="Finished initialization of data set after {}.",
                     writer=logger.debug):
        data_set = WordsDataSet(meta_path,
                                images_path,
                                transform=transformation,
                                pre_processor=pre_processor)

    with TimeMeasure(enter_msg="Splitting data set", writer=logger.debug):
        train_test_array = cv_split(data_set, number_of_splits, augmentation)

    def make_loader(subset):
        # Every loader in every split is configured identically.
        return DataLoader(subset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=8,
                          drop_last=False)

    with TimeMeasure(enter_msg="Init data loader", writer=logger.debug):
        loader_array = []
        for train_set, test_set, augmented_set in train_test_array:
            train_eval_loader = make_loader(train_set)
            test_loader = make_loader(test_set)
            train_loader = make_loader(augmented_set)
            loader_array.append((train_loader, train_eval_loader, test_loader))

    return loader_array
Exemple #12
0
def twitter_lda(*, output_desc, topics, iteration, alpha_g=None,
                beta_word=0.01, beta_b=0.01, gamma=20,
                show_console_output=True):
    """Configure and run Twitter-LDA, then move its result.

    :param output_desc: output description; asserted to consist only of
        letters, digits, '-' and '_'. Passed to ``move_result``.
    :param topics: number of topics.
    :param iteration: passed to ``set_parameters``.
    :param alpha_g: when ``None``, ``50 / topics`` is used instead.
    :param beta_word: passed to ``set_parameters``.
    :param beta_b: passed to ``set_parameters``.
    :param gamma: passed to ``set_parameters``.
    :param show_console_output: forwarded to ``run_twlda``.
    """
    cl.section('Twitter-LDA Runner')
    cl.info('Output description: %s' % output_desc)

    assert re.fullmatch(r'[-_0-9a-zA-Z]+', output_desc)

    effective_alpha_g = 50 / topics if alpha_g is None else alpha_g

    set_parameters(topics, effective_alpha_g, beta_word, beta_b, gamma,
                   iteration)

    with TimeMeasure('Twitter-LDA training'):
        run_twlda(show_console_output=show_console_output)

    move_result(output_desc)
Exemple #13
0
    def train(self, train_loader, de_en_coder, current_epoch=0, device="cpu"):
        """Train the model for ``self.__environment.max_epochs`` epochs.

        When warm start is enabled in the environment, the saved progress
        is loaded first; a ``RuntimeError`` during that step is logged as
        a warning and training starts from the current state instead.

        After every epoch the per-epoch statistics are saved. Every
        ``save_interval`` epochs, and once more at the end if the last
        epoch was not saved, the progress and the period statistics are
        saved.

        :param train_loader: forwarded to ``core_training``.
        :param de_en_coder: forwarded to ``core_training``.
        :param current_epoch: initial value of the running epoch counter.
        :param device: forwarded to ``core_training``.
        :return: the trained model (``self.__model``).
        """
        logger.info("Enter training mode.")
        total_epochs = current_epoch
        last_save, loss = 0, None
        stats = Statistics.get_instance(self.__name)

        logger.info(
            f"Try warm start? - {'Yes' if self.__environment.warm_start else 'No'}"
        )
        if self.__environment.warm_start:
            try:
                total_epochs, state_dict, loss = self.__load_progress()
                self.__environment.update_max_epochs(total_epochs)
                self.__model.load_state_dict(state_dict)
            except RuntimeError:
                logger.warning("Warm start was not possible!")

        logger.info(
            f"Begin training for {self.__environment.max_epochs} epochs")

        for epoch_idx in range(1, self.__environment.max_epochs + 1):
            enter_msg = f"Train Epoch: {epoch_idx: 4d} (total: {total_epochs + 1: 4d})"
            with TimeMeasure(enter_msg=enter_msg,
                             writer=logger.info,
                             print_enabled=self.__print_enabled) as tm:
                current_learning_rate = self.__learning_rate_adaptor(
                    total_epochs)
                loss, words = self.core_training(train_loader,
                                                 current_learning_rate, device,
                                                 de_en_coder)
                logger.info("loss: {}".format(loss))
                total_epochs += 1

                stats.save_per_epoch(total_epochs, tm.delta, loss, words)
                # Compare by value: an identity test ("is 0") on an integer
                # only works by accident of the interpreter's int caching.
                if epoch_idx % self.__environment.save_interval == 0:
                    last_save = total_epochs
                    self.__save_progress(total_epochs, self.__model, loss)
                    self.__save_period_stats(total_epochs)

        # Make sure the final state is saved even when the last epoch did
        # not fall on a save interval.
        if last_save < total_epochs:
            logger.info("final save")
            self.__save_progress(total_epochs, self.__model, loss)
            self.__save_period_stats(total_epochs)

        return self.__model
Exemple #14
0
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    """Train an LDA topic model and produce its corpus, model and report.

    Steps, each timed with ``TimeMeasure``: load the preprocessed texts,
    build the dictionary and corpus (the corpus is saved as JSON), train
    an ``LdaMulticore`` model, save the model, measure topic coherence
    and save a pyLDAvis HTML visualization.

    :param input_filename: passed through ``data_source_file``; read as
        JSON whose items carry the text at index 1.
    :param keyword: used in the description; asserted to consist only of
        letters, digits, '-', '_' and '+'.
    :param size: data size, used for logging and in the description.
    :param num_topics: number of topics for the model.
    :param iterations: forwarded to ``LdaMulticore``.
    :param passes: forwarded to ``LdaMulticore``.
    :param chunksize: forwarded to ``LdaMulticore``.
    :param eval_every: forwarded to ``LdaMulticore``.
    :param verbose: when true, logging is configured at DEBUG level into
        an 'ldalog-<description>.log' log file.
    :param gamma_threshold: forwarded to ``LdaMulticore``.
    :param filter_no_below: ``no_below`` for ``filter_extremes``.
    :param filter_no_above: ``no_above`` for ``filter_extremes``.
    :param filter_keep_n: ``keep_n`` for ``filter_extremes``.
    :param open_browser: when true, the HTML report is opened at the end.
    """
    cl.section('LDA Topic Model Training')

    # Report the effective settings, one line per value.
    settings = (
        ('Keyword: %s', keyword),
        ('Data size: %d', size),
        ('Number of topics: %d', num_topics),
        ('Iterations: %d', iterations),
        ('Passes: %d', passes),
        ('Chunk size: %d', chunksize),
        ('Eval every: %s', eval_every),
        ('Verbose: %s', verbose),
        ('Gamma Threshold: %f', gamma_threshold),
        ('Filter no below: %d', filter_no_below),
        ('Filter no above: %f', filter_no_above),
        ('Filter keep n: %d', filter_keep_n),
    )
    for template, value in settings:
        cl.info(template % value)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    source_path = data_source_file(input_filename)
    # The description identifies every artifact written by this run.
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = [item[1] for item in file_read_json(source_path)]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = report_file('ldavis-%s.html' % description)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
    # Train (with different number of topics)
    for topics in num_topics_range:
        cl.info('Running with %d topics' % topics)
        retry_until_success(twitter_lda,
                            output_desc='java-%d' % topics,
                            topics=topics,
                            iteration=ITERATIONS,
                            show_console_output=True)

    # Analyze (Perplexity Plot + HTML Reports + Compress)
    report_files = []
    plot_file, minima_points = plot_diff_topics(num_topics_range, 'java',
                                                r'Perplexity is ([\d.]+)',
                                                pipe_encoding)
    report_files.append(plot_file)
    report_points = minima_points if REPORT_ONLY_MINIMA else num_topics_range

    for topics in report_points:
        report_files.append(
            visualization_twlda(KEYWORD,
                                'java-%d' % topics,
                                '%s-%d' % (tag, topics),
                                userinfo_file,
                                open_browser=False))
    compress_report_files(tag, report_files)


# Script entry point: run main() inside a TimeMeasure labelled 'train_task'.
if __name__ == '__main__':
    with TimeMeasure('train_task'):
        main()
# Proxy used to collect Twitter data: None or a URL such as 'http://xxx'.
PROXY = None  # The proxy used to collect Twitter data. (None or 'http://xxx')
# Number of processes for twitterscraper.
TWSCRAPE_POOLSIZE = 20  # Number of processes for twitterscraper.


def get_usernames(tweets_file):
    """Return the distinct 'user' values of the rows in *tweets_file*.

    The file name is passed through ``data_source_file`` and read with
    ``csv_reader``. The order of the returned list is not defined,
    because duplicates are removed via a set.
    """
    rows = csv_reader(data_source_file(tweets_file))
    unique_users = {row['user'] for row in rows}
    return list(unique_users)


def main():
    """Retrieve tweets, recover retweets and fetch the users' info.

    Tweets matching ``QUERY`` are collected into 'twdata-java.csv',
    starting ``DATEBACK`` before today. Retweets are then recovered, the
    distinct user names are extracted from the recovered file and their
    info is retrieved into 'twusers-java.csv'.
    """
    tweets_file = 'twdata-java.csv'
    userinfo_file = 'twusers-java.csv'

    # Retrieve (Scrape + Recover Retweets + Get User Info)
    begin_date = datetime.date.today() - DATEBACK
    data_retriever('twitterscraper',
                   QUERY,
                   tweets_file,
                   lang='en',
                   proxy=PROXY,
                   remove_duplicates=False,
                   twscrape_poolsize=TWSCRAPE_POOLSIZE,
                   twscrape_begindate=begin_date)
    recovered_file = retweets_recover(tweets_file)
    user_info_retriever(get_usernames(recovered_file), userinfo_file)


# Script entry point: run main() inside a TimeMeasure labelled 'crawl_task'.
if __name__ == '__main__':
    with TimeMeasure('crawl_task'):
        main()
Exemple #17
0
def main(config_name):
    """Evaluate the latest trained model on the test set and log the results.

    The configuration "../configs/<config_name>.json" is loaded, the model,
    data loaders and trainer are set up, and the latest model is loaded via
    the trainer. Every test sample is then predicted, its character error
    rate (Levenshtein distance divided by the target length) is computed,
    and all (target, prediction, error) rows are logged, worst first.

    :param config_name: name of the configuration file without the
        ".json" extension.
    """
    logger.info(f"Run with config '{config_name}'.")
    with TimeMeasure(enter_msg="Setup everything",
                     exit_msg="Setup finished after {}.",
                     writer=logger.debug):
        device = get_available_device()
        logger.info(f"Active device: {device}")

        config = Configuration(f"../configs/{config_name}.json")

        prediction_config = config["prediction"]
        data_set_config = config["data_set"]
        data_loading_config = config["data_loading"]
        training_config = config["training"]
        environment_config = config["training/environment"]
        model_config = config["model"]

        # In the char list, '|' is used as the symbol for the CTC blank.
        de_en_coder = WordDeEnCoder(list(prediction_config.char_list))
        word_predictor = setup_decoder_from_config(prediction_config, "eval")
        word_predictor_debug = setup_decoder_from_config(
            prediction_config, "debug")

        model = get_model_by_name(model_config.name)(
            model_config.parameters).to(device)

        # The local names defined so far are handed to the builders below.
        # NOTE(review): presumably they resolve values by local name, so
        # renaming or reordering the locals above may change behavior --
        # confirm before refactoring.
        main_locals = locals()
        transformations = build_transformations(
            data_loading_config.transformations, main_locals)
        augmentations = data_loading_config.if_exists(
            path="augmentations",
            runner=lambda augms: build_augmentations(augms, main_locals),
            default=None)
        augmentation = transforms.Compose(
            augmentations) if augmentations is not None else None

        # Only test_loader is used further down in this function.
        train_loader, train_eval_loader, test_loader = get_data_loaders(
            meta_path=data_set_config.meta_path,
            images_path=data_set_config.images_path,
            transformation=transforms.Compose(transformations),
            augmentation=augmentation,
            data_loading_config=data_loading_config,
            pre_processor=pre_processor(config),
            max_word_length=data_set_config("max_word_length"))

        environment = TrainingEnvironment.from_config(environment_config)

        # The learning rate schedule is chosen by the model name.
        if "small" in model_config.name.lower():
            learning_rate = dynamic_learning_rate_small
        else:
            learning_rate = dynamic_learning_rate_big

        # The trainer receives the "debug" decoder; the evaluation loop
        # below uses the "eval" decoder (word_predictor).
        trainer = Trainer(name=training_config.name,
                          model=model,
                          word_prediction=word_predictor_debug,
                          dynamic_learning_rate=learning_rate,
                          environment=environment)

    with TimeMeasure(enter_msg="Load pre-trained model.",
                     exit_msg="Finished loading after {}.",
                     writer=logger.debug):
        model = trainer.load_latest_model()

    with TimeMeasure(writer=logger.debug):
        # Rows of (target, prediction, character error rate).
        result = list()

        with torch.no_grad():
            for batch_idx, (feature_batch,
                            label_batch) in enumerate(test_loader):
                feature_batch = feature_batch.to(device)
                # Strip trailing 1.0 values from each label, then decode
                # it to text. (presumably 1.0 is the padding value --
                # TODO confirm)
                label_batch = [
                    right_strip(word, 1.0)
                    for word in word_tensor_to_list(label_batch)
                ]
                label_batch = [
                    de_en_coder.decode_word(word) for word in label_batch
                ]
                # The hidden state is re-initialized for every batch, using
                # the size of the first dimension as batch size.
                model.init_hidden(batch_size=feature_batch.size()[0],
                                  device=device)

                output = F.softmax(model(feature_batch), dim=-1)
                output = np.array(output.cpu())
                prediction = word_predictor(output)

                for i in range(len(prediction)):
                    token, target = prediction[i], label_batch[i]
                    # NOTE(review): raises ZeroDivisionError if a decoded
                    # target is empty -- confirm that this cannot happen.
                    character_error_rate = Levenshtein.distance(
                        token, target) / len(target)
                    result.append((target, token, character_error_rate))

        # Sort by error rate, highest first.
        result = sorted(result, key=lambda row: -row[2])
        for idx, (expectation, prediction, error) in enumerate(result):
            logger.info(
                f"{idx:05d} | {expectation:20s} | {prediction:20s} | {error:6.4f}"
            )