def __init__(self, meta_file, root_dir, transform=None, pre_processor=None, max_word_length=32):
    """Initialize the word-image data set.

    Loads the meta file, checks sample availability/health and builds
    statistics, each phase wrapped in a timed log block.

    :param meta_file: path to the meta data file describing the samples
    :param root_dir: root directory containing the word images
    :param transform: optional transformation applied to each sample
    :param pre_processor: optional pre-processing step (must expose ``.name``)
    :param max_word_length: upper bound for the word length, default 32
    """
    self.__meta_file = meta_file
    self.__root_dir = root_dir
    self.__transform = transform
    self.__pre_processor = pre_processor
    self.__max_word_length = max_word_length
    self.__words = []
    self.__statistics = None

    if self.__pre_processor is None:
        logger.info("No pre-processor selected.")
    else:
        logger.info(f"Selected pre-processor: {pre_processor.name}")

    # Run the three setup phases in order; each one is timed and logged.
    phases = (
        ("meta data loading", lambda: (self.__process_meta_file(), self.__availability_check())),
        ("health check", self.__health_check),
        ("creating statistics", self.__create_statistics),
    )
    for label, action in phases:
        with TimeMeasure(enter_msg=f"Begin {label}.",
                         exit_msg=f"Finished {label} after {{}}.",
                         writer=logger.debug):
            action()
def text_preprocessor(input_filename, *, preprocessor_cls='TextPreprocessor', custom_stop_words=None, lem_ignore_patterns=None, remove_duplicates=False):
    """Preprocess a CSV of raw texts and persist the result.

    :param input_filename: name of the CSV inside the data-source directory
    :param preprocessor_cls: name of a preprocessor class looked up in this
        module's globals (keyword-only)
    :param custom_stop_words: extra stop words forwarded to the preprocessor
    :param lem_ignore_patterns: lemmatizer ignore patterns, forwarded as-is
    :param remove_duplicates: drop duplicate texts before saving
    """
    cl.section('Text Preprocessor')
    source_path = data_source_file(input_filename)
    # Resolve the class by name from module globals (string-based dispatch).
    resolved_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        processed = preprocess_csv(source_path,
                                   preprocessor_cls=resolved_cls,
                                   custom_stop_words=custom_stop_words,
                                   lem_ignore_patterns=lem_ignore_patterns)
        if remove_duplicates:
            processed = remove_duplicate_text(processed)
        processed = tuple(processed)
        cl.info('Effective data size: %d' % len(processed))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(processed, source_path)
def get_data_loaders(meta_path, images_path, transformation, augmentation, data_loading_config, pre_processor=None, max_word_length=32):
    """Create train / train-eval / test data loaders.

    Restores a previous train/test split from ``restore_path`` if available,
    otherwise splits randomly and saves the new split to ``save_path``.

    :returns: tuple ``(train_loader, train_eval_loader, test_loader)``
    """
    relative_train_size = data_loading_config.train_size
    batch_size = data_loading_config.batch_size
    restore_path = data_loading_config.get("restore_path", default=None)
    save_path = data_loading_config.get("save_path", default=None)

    with TimeMeasure(enter_msg="Begin initialization of data set.",
                     exit_msg="Finished initialization of data set after {}.",
                     writer=logger.debug):
        data_set = WordsDataSet(meta_path, images_path,
                                transform=transformation,
                                pre_processor=pre_processor,
                                max_word_length=max_word_length)

    with TimeMeasure(enter_msg="Splitting data set", writer=logger.debug):
        loaded = restore_path is not None and os.path.exists(restore_path)
        if loaded:
            train_data_set, test_data_set = __restore_train_test_split(restore_path, data_set)
        else:
            train_size = int(relative_train_size * len(data_set))
            train_data_set, test_data_set = random_split(
                data_set, (train_size, len(data_set) - train_size))

        # Augmentation only wraps the training subset; evaluation uses raw data.
        augmented_data_set = (train_data_set if augmentation is None
                              else AugmentedDataSet(train_data_set, augmentation))

        if not loaded:
            __save_train_test_split(save_path, train_data_set, test_data_set)

    with TimeMeasure(enter_msg="Init data loader", writer=logger.debug):
        loader_kwargs = dict(batch_size=batch_size, shuffle=True,
                             num_workers=8, drop_last=False)
        train_loader = DataLoader(augmented_data_set, **loader_kwargs)
        train_eval_loader = DataLoader(train_data_set, **loader_kwargs)
        test_loader = DataLoader(test_data_set, **loader_kwargs)

    return train_loader, train_eval_loader, test_loader
def model_analyzer(modeldesc, sourcedesc, *, num_top_words=30, num_top_docs=30, debug=False):
    """Analyze a trained LDA model and export a markdown report.

    Loads the model, corpus and source texts, computes per-topic popularity
    and the most relevant documents per topic, then writes a markdown report.

    :param modeldesc: description string identifying the trained model files
    :param sourcedesc: description string identifying the source data
    :param num_top_words: number of top words listed per topic (keyword-only)
    :param num_top_docs: number of top documents listed per topic (keyword-only)
    :param debug: if True, dump per-document topic distributions to a text file
    """
    cl.section('LDA Model Analyzer')
    cl.info('Model description: %s' % modeldesc)
    cl.info('Source description: %s' % sourcedesc)

    with TimeMeasure('load_all'):
        ldamodel, corpus, prep_items, source_texts = load_all(modeldesc, sourcedesc)

    with TimeMeasure('analyzing'):
        # prep_items rows start with the text id; corpus index i maps to prep_ids[i].
        prep_ids = tuple(item[0] for item in prep_items)
        dictionary = ldamodel.id2word
        num_topics = ldamodel.num_topics
        # One record per topic; 'documents' accumulates doc-index -> relevance score.
        topics = [{
            'topic_id': i,
            'words': get_topic_words(ldamodel, i, num_top_words),
            'popularity': 0.0,
            'documents': collections.defaultdict(float)
        } for i in range(num_topics)]

        if debug:
            # Dump "text_id -> topic distribution, text" for every document.
            debugfilename = model_file('ldadoctopics-%s.txt' % modeldesc)
            with open(debugfilename, 'w', encoding='utf-8') as debugfile:
                for index, doc in enumerate(corpus):
                    text_id = prep_ids[index]
                    doc_topics = ldamodel.get_document_topics(doc)
                    text = source_texts[text_id].strip()
                    debugfile.write('%s -> %r, %s\n' % (text_id, doc_topics, text))

        # Cache term->topic probabilities once; get_term_topics is expensive
        # and would otherwise run per (document, word) pair.
        term_topics_cache = {}
        for word in dictionary:
            term_topics_cache[word] = ldamodel.get_term_topics(word)

        for index, doc in enumerate(corpus):
            # Topic popularity: sum of topic probabilities over all documents.
            for topic_id, prob in ldamodel.get_document_topics(doc):
                topics[topic_id]['popularity'] += prob
            # Document relevance per topic: term-topic probability weighted
            # by the term's frequency in the document.
            for word, freq in doc:
                if word not in dictionary:
                    continue
                for topic_id, prob in term_topics_cache[word]:
                    topics[topic_id]['documents'][index] += prob * freq

        # Replace the raw score maps with the top-N documents per topic.
        for topic in topics:
            topic['documents'] = get_topic_top_docs(topic['documents'], num_top_docs, prep_ids, source_texts)

        topics = sorted(topics, key=lambda x: x['popularity'], reverse=True)

    with TimeMeasure('export_markdown'):
        export_markdown(modeldesc, sourcedesc, topics)
def main():
    """Run the 1D and 2D convolution checks, printing any traceback to stderr.

    The short sleep before printing lets pending stdout output flush first,
    so the traceback is not interleaved with regular output.
    """
    try:
        for check in (check_convolution1D, check_convolution2D):
            with TimeMeasure():
                check()
    except Exception:
        sleep(1)
        print(traceback.format_exc(), file=sys.stderr)
def text_preprocessor_user(sourcedesc):
    """Preprocess tweets grouped by user and export them to a '-user' CSV.

    :param sourcedesc: data-source description; must match ``[-_0-9a-zA-Z+]+``
    """
    cl.section('Text Preprocessor Grouping By User')
    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)
    source_csv = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        grouped = list(preprocess_csv(source_csv))

    with TimeMeasure('save_preprocessed'):
        target_csv = name_with_title_suffix(source_csv, '-user')
        export_csv(grouped, target_csv)
def train_single_model(self, model, loaders, total_epochs, device, de_en_coder, model_id):
    """Train one model for the configured number of epochs.

    Runs the core training loop and, every ``save_interval`` epochs,
    evaluates on the train-eval and test loaders and records the metrics.

    :param model: the model to train
    :param loaders: tuple/list of (train_loader, train_eval_loader, test_loader)
    :param total_epochs: epoch counter carried across calls (e.g. for CV folds)
    :param device: torch device to train on
    :param de_en_coder: word de-/encoder used by training and evaluation
    :param model_id: numeric id used in the saved model name
    """
    train_loader, train_eval_loader, test_loader = loaders

    for epoch_idx in range(1, self.__environment.max_epochs + 1):
        enter_msg = f"Train Epoch: {epoch_idx: 4d} (total: {total_epochs + 1: 4d})"
        with TimeMeasure(enter_msg=enter_msg, writer=logger.info, print_enabled=True):
            current_learning_rate = self.__learning_rate_adaptor(total_epochs)
            loss, words = self.core_training(model, train_loader, current_learning_rate, device, de_en_coder)
            logger.info(f"loss: {loss}")
        total_epochs += 1

        # BUG FIX: was `epoch_idx % save_interval is 0` — identity comparison
        # against an int literal only works by CPython small-int caching and
        # raises SyntaxWarning on Python 3.8+; use equality instead.
        if epoch_idx % self.__environment.save_interval == 0:
            train_metrics = evaluate_model(de_en_coder=de_en_coder,
                                           word_prediction=self.__word_prediction,
                                           model=model,
                                           data_loader=train_eval_loader,
                                           device=device)
            test_metrics = evaluate_model(de_en_coder=de_en_coder,
                                          word_prediction=self.__word_prediction,
                                          model=model,
                                          data_loader=test_loader,
                                          device=device)
            model_data = {
                "name": f"{model.__class__.__name__}_{model_id:03d}"
            }
            self.__stats.save_per_period(total_epochs, train_metrics, test_metrics, model_data)
def main():
    """Build all benchmark targets, then run every configured benchmark.

    Each benchmark run is framed by a horizontal rule and timed.
    """
    rule = '─' * 100

    # Per-suite google-benchmark parameters (dict literal order kept as-is).
    params_xtensor = {
        "format": "console",
        "min_time": 0.25,
        "repetitions": 10,
        "report_aggregates_only": True
    }
    params_xvigra = {
        "format": "console",
        "min_time": 1,
        "repetitions": 10,
        "report_aggregates_only": True
    }
    benchmark_parameters = {
        "xtensor": params_xtensor,
        "xvigra": params_xvigra
    }

    benchmark_folders = {
        "xtensor": (
            "benchmark_normalizing",
            "benchmark_transpose-view",
            "benchmark_reshape-view",
            "benchmark_tensor_copy_complete_X",
            "benchmark_tensor_copy_complete_Y",
            "benchmark_tensor_copy_complete_Z",
            "benchmark_strided-view_copy_complete_X",
            "benchmark_strided-view_copy_complete_Y",
            "benchmark_strided-view_copy_complete_Z",
            "benchmark_strided-view_copy_paddingStride_X",
            "benchmark_strided-view_copy_paddingStride_Y",
            "benchmark_strided-view_copy_paddingStride_Z",
            "benchmark_view_copy_complete_X",
            "benchmark_view_copy_complete_Y",
            "benchmark_view_copy_complete_Z",
            "benchmark_view_copy_paddingStride_X",
            "benchmark_view_copy_paddingStride_Y",
            "benchmark_view_copy_paddingStride_Z",
        ),
        "xvigra": (
            "benchmark_convolve1D_inputSize_channelFirst",
            "benchmark_convolve1D_inputSize_channelLast",
            # "benchmark_convolve2D_inputSize_channelFirst",
            # "benchmark_convolve2D_inputSize_channelLast",
            # "benchmark_separableConvolve1D_inputSize",
            # "benchmark_separableConvolve2D_inputSize",
            # "benchmark_separableConvolve1D_kernelSize",
            # "benchmark_separableConvolve2D_kernelSize"
        )
    }

    build_all()

    for folder_name, benchmark_files in benchmark_folders.items():
        for file_name in benchmark_files:
            with TimeMeasure(f"{rule}\nRunning {file_name}:",
                             f"Total time: {{}}\n{rule}\n"):
                call_benchmark(file_name,
                               benchmark_parameters[folder_name],
                               folder=folder_name)
def __save_progress(self, total_epochs, model, loss):
    """Persist a training checkpoint under ``trained_models/<name>/``.

    :param total_epochs: total epochs trained so far (used in the file name)
    :param model: model whose state is saved
    :param loss: current loss stored alongside the checkpoint
    """
    with TimeMeasure(enter_msg="Saving progress...",
                     writer=logger.debug,
                     print_enabled=self.__print_enabled):
        checkpoint_path = p_join("trained_models", self.__name,
                                 f"epoch-{total_epochs:05d}.pt")
        save_checkpoint(checkpoint_path, total_epochs, model, loss, self.__environment)
def text_preprocessor_twlda(sourcedesc, *, tweet_min_length=3, user_min_tweets=1, remove_duplicates=False):
    """Preprocess tweets into the input format expected by Twitter-LDA.

    :param sourcedesc: data-source description; must match ``[-_0-9a-zA-Z+]+``
    :param tweet_min_length: minimum tweet length to keep (keyword-only)
    :param user_min_tweets: minimum tweets per user to keep (keyword-only)
    :param remove_duplicates: drop duplicate tweets before saving
    """
    cl.section('Text Preprocessor For Twitter-LDA')
    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)
    source_csv = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        prepared, originals = preprocess_csv(source_csv, tweet_min_length,
                                             user_min_tweets, remove_duplicates)

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(prepared, originals)
def get_data_loaders_cv(meta_path, images_path, transformation, augmentation, data_loading_config, pre_processor=None, number_of_splits=3):
    """Create data loaders for cross-validation.

    Splits the data set into ``number_of_splits`` folds and builds one
    (train, train-eval, test) loader triple per fold.

    :returns: list of (train_loader, train_eval_loader, test_loader) tuples
    """
    batch_size = data_loading_config.batch_size

    with TimeMeasure(enter_msg="Begin initialization of data set.",
                     exit_msg="Finished initialization of data set after {}.",
                     writer=logger.debug):
        data_set = WordsDataSet(meta_path, images_path,
                                transform=transformation,
                                pre_processor=pre_processor)

    with TimeMeasure(enter_msg="Splitting data set", writer=logger.debug):
        folds = cv_split(data_set, number_of_splits, augmentation)

    with TimeMeasure(enter_msg="Init data loader", writer=logger.debug):
        loader_kwargs = dict(batch_size=batch_size, shuffle=True,
                             num_workers=8, drop_last=False)
        loader_array = []
        for train_set, test_set, augmented_set in folds:
            train_eval_loader = DataLoader(train_set, **loader_kwargs)
            test_loader = DataLoader(test_set, **loader_kwargs)
            train_loader = DataLoader(augmented_set, **loader_kwargs)
            loader_array.append((train_loader, train_eval_loader, test_loader))

    return loader_array
def twitter_lda(*, output_desc, topics, iteration, alpha_g=None, beta_word=0.01, beta_b=0.01, gamma=20, show_console_output=True):
    """Configure and run Twitter-LDA, then move the results.

    :param output_desc: label for the result directory; ``[-_0-9a-zA-Z]+``
    :param topics: number of topics
    :param iteration: number of Gibbs-sampling iterations
    :param alpha_g: document-topic prior; defaults to the common 50/topics
    :param beta_word: word prior
    :param beta_b: background-word prior
    :param gamma: background switch prior
    :param show_console_output: forward the runner's console output
    """
    cl.section('Twitter-LDA Runner')
    cl.info('Output description: %s' % output_desc)
    assert re.fullmatch(r'[-_0-9a-zA-Z]+', output_desc)

    # Standard heuristic default for the alpha prior when not given.
    alpha_g = 50 / topics if alpha_g is None else alpha_g
    set_parameters(topics, alpha_g, beta_word, beta_b, gamma, iteration)

    with TimeMeasure('Twitter-LDA training'):
        run_twlda(show_console_output=show_console_output)

    move_result(output_desc)
def train(self, train_loader, de_en_coder, current_epoch=0, device="cpu"):
    """Run the main training loop, with optional warm start and periodic saves.

    :param train_loader: data loader providing training batches
    :param de_en_coder: word de-/encoder used by the core training step
    :param current_epoch: epoch count to resume from (default 0)
    :param device: torch device string (default "cpu")
    :returns: the trained model
    """
    logger.info("Enter training mode.")
    total_epochs = current_epoch
    last_save, loss = 0, None
    stats = Statistics.get_instance(self.__name)

    logger.info(f"Try warm start? - {'Yes' if self.__environment.warm_start else 'No'}")
    if self.__environment.warm_start:
        try:
            total_epochs, state_dict, loss = self.__load_progress()
            self.__environment.update_max_epochs(total_epochs)
            self.__model.load_state_dict(state_dict)
        except RuntimeError:
            # No (or incompatible) checkpoint — fall back to a cold start.
            logger.warning("Warm start was not possible!")

    logger.info(f"Begin training for {self.__environment.max_epochs} epochs")
    for epoch_idx in range(1, self.__environment.max_epochs + 1):
        enter_msg = f"Train Epoch: {epoch_idx: 4d} (total: {total_epochs + 1: 4d})"
        with TimeMeasure(enter_msg=enter_msg, writer=logger.info,
                         print_enabled=self.__print_enabled) as tm:
            current_learning_rate = self.__learning_rate_adaptor(total_epochs)
            loss, words = self.core_training(train_loader, current_learning_rate, device, de_en_coder)
            logger.info("loss: {}".format(loss))
        total_epochs += 1
        # tm.delta is read after the with-block so the measured time is final.
        stats.save_per_epoch(total_epochs, tm.delta, loss, words)

        # BUG FIX: was `epoch_idx % save_interval is 0` — identity comparison
        # against an int literal only works by CPython small-int caching and
        # raises SyntaxWarning on Python 3.8+; use equality instead.
        if epoch_idx % self.__environment.save_interval == 0:
            last_save = total_epochs
            self.__save_progress(total_epochs, self.__model, loss)
            self.__save_period_stats(total_epochs)

    # Make sure the final state is persisted even when the last epoch did not
    # fall on a save-interval boundary.
    if last_save < total_epochs:
        logger.info("final save")
        self.__save_progress(total_epochs, self.__model, loss)
        self.__save_period_stats(total_epochs)

    return self.__model
def lda_topic_model(input_filename, keyword, size, *, num_topics, iterations=50, passes=1, chunksize=2000, eval_every=10, verbose=False, gamma_threshold=0.001, filter_no_below=5, filter_no_above=0.5, filter_keep_n=100000, open_browser=True):
    """Train an LDA topic model on preprocessed texts and visualize it.

    Pipeline: load preprocessed texts -> build dictionary/corpus ->
    train LdaMulticore -> save model -> measure coherence -> save
    pyLDAvis HTML report (optionally opening it in a browser).

    :param input_filename: preprocessed-texts JSON inside the data-source dir
    :param keyword: data-set keyword; must match ``[-_0-9a-zA-Z+]+``
    :param size: data size (used only for logging and the description string)
    :param num_topics: number of topics to train (keyword-only, required)
    :param iterations: LDA iterations per pass
    :param passes: passes over the corpus
    :param chunksize: documents per training chunk
    :param eval_every: perplexity-evaluation interval (None disables it)
    :param verbose: also write gensim debug logs to a file
    :param gamma_threshold: convergence threshold for document-topic updates
    :param filter_no_below: drop tokens in fewer than this many documents
    :param filter_no_above: drop tokens in more than this fraction of documents
    :param filter_keep_n: cap on the vocabulary size after filtering
    :param open_browser: open the generated HTML report in a browser
    """
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)
    input_filename = data_source_file(input_filename)
    # Unique run id: keyword, size, topics, iterations x passes, timestamp.
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations, passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        # Route gensim's DEBUG logging into a per-run log file.
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        # Each JSON item is (id, tokens); only the token lists are needed here.
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')
        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below, no_above=filter_no_above, keep_n=filter_keep_n)
        # Reassign word ids to be contiguous after filtering.
        dictionary.compactify()
        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]
        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')
        # LdaMulticore prints to the console; suppress it during training.
        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS, id2word=dictionary,
                                    num_topics=num_topics, iterations=iterations,
                                    passes=passes, chunksize=chunksize,
                                    eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')
        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
# Train (with different number of topics) for topics in num_topics_range: cl.info('Running with %d topics' % topics) retry_until_success(twitter_lda, output_desc='java-%d' % topics, topics=topics, iteration=ITERATIONS, show_console_output=True) # Analyze (Perplexity Plot + HTML Reports + Compress) report_files = [] plot_file, minima_points = plot_diff_topics(num_topics_range, 'java', r'Perplexity is ([\d.]+)', pipe_encoding) report_files.append(plot_file) report_points = minima_points if REPORT_ONLY_MINIMA else num_topics_range for topics in report_points: report_files.append( visualization_twlda(KEYWORD, 'java-%d' % topics, '%s-%d' % (tag, topics), userinfo_file, open_browser=False)) compress_report_files(tag, report_files) if __name__ == '__main__': with TimeMeasure('train_task'): main()
PROXY = None  # Proxy used to collect Twitter data (None or 'http://xxx').
TWSCRAPE_POOLSIZE = 20  # Number of processes for twitterscraper.


def get_usernames(tweets_file):
    """Return the unique user names found in the given tweets CSV."""
    rows = csv_reader(data_source_file(tweets_file))
    return list({row['user'] for row in rows})


def main():
    """Scrape tweets, recover retweets and fetch user info for all authors."""
    tweets_file = 'twdata-java.csv'
    userinfo_file = 'twusers-java.csv'

    # Retrieve (Scrape + Recover Retweets + Get User Info)
    data_retriever('twitterscraper', QUERY, tweets_file, lang='en',
                   proxy=PROXY, remove_duplicates=False,
                   twscrape_poolsize=TWSCRAPE_POOLSIZE,
                   twscrape_begindate=datetime.date.today() - DATEBACK)
    recovered_file = retweets_recover(tweets_file)
    user_info_retriever(get_usernames(recovered_file), userinfo_file)


if __name__ == '__main__':
    with TimeMeasure('crawl_task'):
        main()
def main(config_name):
    """Load a config, build model and data loaders, restore the latest
    checkpoint and report per-word character error rates on the test set.

    :param config_name: name of a JSON config under ``../configs/``
    """
    logger.info(f"Run with config '{config_name}'.")

    with TimeMeasure(enter_msg="Setup everything", exit_msg="Setup finished after {}.", writer=logger.debug):
        device = get_available_device()
        logger.info(f"Active device: {device}")

        config = Configuration(f"../configs/{config_name}.json")
        prediction_config = config["prediction"]
        data_set_config = config["data_set"]
        data_loading_config = config["data_loading"]
        training_config = config["training"]
        environment_config = config["training/environment"]
        model_config = config["model"]

        # In the char list, '|' stands for the CTC-blank symbol.
        de_en_coder = WordDeEnCoder(list(prediction_config.char_list))
        word_predictor = setup_decoder_from_config(prediction_config, "eval")
        word_predictor_debug = setup_decoder_from_config(prediction_config, "debug")

        model = get_model_by_name(model_config.name)(model_config.parameters).to(device)

        # Snapshot of local names; transformations/augmentations may reference
        # objects built above (e.g. the model or configs) by name.
        main_locals = locals()
        transformations = build_transformations(data_loading_config.transformations, main_locals)
        augmentations = data_loading_config.if_exists(path="augmentations", runner=lambda augms: build_augmentations(augms, main_locals), default=None)
        augmentation = transforms.Compose(augmentations) if augmentations is not None else None

        # NOTE(review): `data_set_config("max_word_length")` calls the config
        # object, while other lookups use indexing — confirm Configuration
        # supports __call__, otherwise this should be an item access.
        train_loader, train_eval_loader, test_loader = get_data_loaders(meta_path=data_set_config.meta_path, images_path=data_set_config.images_path, transformation=transforms.Compose(transformations), augmentation=augmentation, data_loading_config=data_loading_config, pre_processor=pre_processor(config), max_word_length=data_set_config("max_word_length"))

        environment = TrainingEnvironment.from_config(environment_config)
        # Smaller models get the schedule tuned for them, larger ones the other.
        if "small" in model_config.name.lower():
            learning_rate = dynamic_learning_rate_small
        else:
            learning_rate = dynamic_learning_rate_big
        trainer = Trainer(name=training_config.name, model=model, word_prediction=word_predictor_debug, dynamic_learning_rate=learning_rate, environment=environment)

    with TimeMeasure(enter_msg="Load pre-trained model.", exit_msg="Finished loading after {}.", writer=logger.debug):
        model = trainer.load_latest_model()

    with TimeMeasure(writer=logger.debug):
        result = list()
        with torch.no_grad():
            for batch_idx, (feature_batch, label_batch) in enumerate(test_loader):
                feature_batch = feature_batch.to(device)
                # Strip the padding value (1.0) from each label, then decode
                # the numeric sequences back into words.
                label_batch = [right_strip(word, 1.0) for word in word_tensor_to_list(label_batch)]
                label_batch = [de_en_coder.decode_word(word) for word in label_batch]
                model.init_hidden(batch_size=feature_batch.size()[0], device=device)
                output = F.softmax(model(feature_batch), dim=-1)
                output = np.array(output.cpu())
                prediction = word_predictor(output)
                for i in range(len(prediction)):
                    token, target = prediction[i], label_batch[i]
                    # NOTE(review): divides by len(target) — an empty decoded
                    # label would raise ZeroDivisionError; verify labels are
                    # always non-empty.
                    character_error_rate = Levenshtein.distance(token, target) / len(target)
                    result.append((target, token, character_error_rate))
        # Worst predictions (highest CER) first.
        result = sorted(result, key=lambda row: -row[2])
        for idx, (expectation, prediction, error) in enumerate(result):
            logger.info(f"{idx:05d} | {expectation:20s} | {prediction:20s} | {error:6.4f}")