def coherence_metrics():
    metric_fcts = load_metric_fcts('coherence')
    if len(metric_fcts) == 0:
        nbprint('No metrics active.')
        return

    topiclist_infos = data.get_all_topiclist_infos()
    if len(topiclist_infos) == 0:
        nbprint('No topics found.')
        return

    # Group them into batches based on token_version and add num_tokens
    topiclist_info_batches = defaultdict(list)
    for info in topiclist_infos:
        for num_tokens in config.metrics['num_tokens']:
            extended_info = info.copy()
            extended_info['num_tokens'] = num_tokens
            if 'second_info' in info:
                token_version = info['second_info']['token_version']
            else:
                token_version = info['token_version']
            topiclist_info_batches[token_version].append(extended_info)

    for token_version, batch in topiclist_info_batches.items():
        nbprint('Batch {}'.format(token_version)).push()
        for metric_id, fct in metric_fcts.items():
            start = time.time()
            nbprint('Metric: {}'.format(
                config.metrics['coherence'][metric_id]['name'])).push()
            coherence_metric_batch(token_version, batch, metric_id, fct)
            end = time.time()
            nbprint('Runtime: {} minutes'.format((end - start) / 60)).pop()
        nbprint.pop()
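# 'load_metric_fcts' is defined elsewhere in the project; the sketch below only
# captures the contract assumed by its call sites here: a dict mapping each
# active metric id of a config section to its callable. The 'active' and
# 'function' keys are assumptions, not the project's actual config layout.
def load_metric_fcts(section):
    metric_fcts = {}
    for metric_id, entry in config.metrics[section].items():
        # Assumed: each entry carries an 'active' flag and the metric callable
        if entry.get('active', False):
            metric_fcts[metric_id] = entry['function']
    return metric_fcts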
def run_importer(info=None):
    nbprint('Importer').push()
    if info is None:
        iterate(["data"], [import_data], depth=0)
    else:
        import_data(info)
    nbprint.pop()
def run_tokenizer(info=None):
    nbprint('Tokenizer').push()
    if info is None:
        iterate(['token:BC', 'data'], tokenize)
    else:
        tokenize(info)
    nbprint.pop()
def run_vocab(info=None):
    nbprint('Vocab').push()
    if info is None:
        iterate(["data", "token", "vocab"], [check_tokens, build_vocab])
    else:
        check_tokens(info)
        build_vocab(info)
    nbprint.pop()
def run_distiller():
    global rejector
    rejector = Rejector(0.99)
    nbprint('Distiller').push()
    iterate(['distiller', 'distillerinputs'], add_second_info, print_iterates=False)
    nbprint.pop()
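# 'Rejector' is not defined in this module; below is a minimal sketch of one
# plausible implementation, assuming it randomly rejects a fixed fraction of
# optional runs (here 99%) to subsample the distiller combinations. The actual
# class may differ.
import random

class Rejector:
    def __init__(self, reject_probability):
        # Fraction of calls to reject, e.g. 0.99 rejects 99% of candidates
        self.reject_probability = reject_probability

    def allow(self):
        # True for roughly (1 - reject_probability) of all calls
        return random.random() >= self.reject_probability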
def add_data(self, filename):
    nbprint("Loading '{}'".format(filename)).push()
    folderpath = join(config.paths["rawdata"], "tweetsodp")
    jsonfilename = join(folderpath, filename + ".json")
    zipfilename = join(folderpath, filename + ".json.zip")
    self.load_id_to_classname(folderpath, filename)
    if isfile(jsonfilename):
        with open(jsonfilename, "r") as jsonfile:
            self.parse_files(jsonfile)
    else:
        with zipfile.ZipFile(zipfilename) as zipf:
            with zipf.open(filename + ".json") as jsonfile:
                self.parse_files(jsonfile)
    nbprint.pop()
def run(self):
    # Open Writer
    with data.document_writer(self.info) as document_writer:
        self.docinfo = DocumentInfo(document_writer)
        # Iterate all archives
        folder = join(config.paths["rawdata"], "tweetsla")
        archives = self.get_archives(folder)
        for idx, archive in enumerate(archives):
            nbprint('{}/{}: {}'.format(idx + 1, len(archives), archive)).push()
            self.archivepath = join(folder, archive)
            self.import_archive()
            nbprint.pop()
    # Print Meta Info
    self.docinfo.save_meta(self.info)
def clustering_metrics():
    metric_fcts = load_metric_fcts('clustering')
    clustering_data = data.load_metric_data('clustering')

    # First evaluate everything by taking the column-wise maximum as cluster idx
    nbprint('H Matrix').push()
    h_mat_infos = data.get_all_h_mat_infos(labeled_only=True)
    for info in ProgressIterator(h_mat_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)
        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists
            if metric_id in metric_data_entry:
                continue
            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = load_class_array_from_h_mat(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)
        # Save everything in between
        data.save_metric_data(clustering_data, 'clustering')
    nbprint.pop()

    # Then take cluster indices directly from the c vector
    nbprint('C Vector').push()
    c_vec_infos = data.get_all_c_vec_infos(labeled_only=True)
    for info in ProgressIterator(c_vec_infos, print_every=1):
        # Grab the corresponding entry from clustering data
        metric_data_entry = grab_metric_data_entry(clustering_data, info)
        # Iterate all metric functions and store result in entry
        for metric_id, metric_fct in metric_fcts.items():
            # Skip metric if it already exists
            if metric_id in metric_data_entry:
                continue
            # Compute the metric
            labels_true = load_ground_truth_classes(info)
            labels_pred = data.load_c_vec(info)
            metric_data_entry[metric_id] = metric_fct(labels_true, labels_pred)
        # Save everything in between
        data.save_metric_data(clustering_data, 'clustering')
    nbprint.pop()
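# 'grab_metric_data_entry' is defined elsewhere in the project; a minimal
# sketch under the assumption that the metric data is a list of dicts, each
# holding the identifying 'info' plus one result per metric id. The real
# helper may match entries on different keys.
def grab_metric_data_entry(metric_data, info):
    # Reuse an existing entry whose info matches, so metrics are not recomputed
    for entry in metric_data:
        if entry.get('info') == info:
            return entry
    # Otherwise create and register a fresh entry for this info
    entry = {'info': info}
    metric_data.append(entry)
    return entry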
def run_models(info=None):
    nbprint('Models').push()
    if info is None:
        iterate(['models', 'modelinputs', 'num_topics'], [check_input_mat, run_model])
    else:
        info['model_info'] = config.models['list'][info['model_name']]
        info['model'] = get_model(info)
        if not info['model'].output_of(info):
            nbprint('Model is not compatible with inputs.')
        else:
            try:
                check_input_mat(info)
                run_model(info)
            except BreakIteration:
                pass
    nbprint.pop()
def call_next(what, callbacks, print_string, new_data, info, depth, print_iterates):
    if print_string and print_iterates:
        nbprint(print_string)
    if print_iterates:
        nbprint.push()
    new_info = {**info, **new_data}
    if len(callbacks) < len(what):
        iterate(what[1:], callbacks, new_info, depth + 1)
    else:
        try:
            if callbacks[0]:
                callbacks[0](new_info)
            if len(what) > 1:
                iterate(what[1:], callbacks[1:], new_info, depth + 1, print_iterates)
        except BreakIteration:
            if print_iterates:
                nbprint('skipping')
    if print_iterates:
        nbprint.pop()
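# 'iterate' and 'BreakIteration' are defined elsewhere in the project; below
# is a rough sketch of the assumed contract, reconstructed only from how
# call_next uses them. Names such as 'config.sections' are assumptions, not
# the project's actual API.
class BreakIteration(Exception):
    # Raised inside a callback to skip the current parameter combination
    pass

def iterate(what, callbacks, info=None, depth=0, print_iterates=True):
    # Assumed driver: expand the first entry of 'what' into all configured
    # versions and hand each combination to call_next, which recurses over
    # the remaining entries
    if not isinstance(callbacks, list):
        callbacks = [callbacks]
    info = {} if info is None else info
    key = what[0].split(':')[0]
    for version, entry in config.sections[key].items():
        new_data = {key + '_version': version, **entry}
        call_next(what, callbacks, version, new_data, info, depth, print_iterates)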
def run_distiller_on(first_info, second_info):
    global rejector
    must_execute = False
    info = first_info.copy()
    if second_info is None:
        must_execute = True
    else:
        must_execute = (first_info.get('token_version', None) == second_info['token_version'] and
                        first_info.get('vocab_version', None) == second_info['vocab_version'] and
                        first_info.get('vector_version', None) == second_info['vector_version'])
        info['second_info'] = second_info
    if must_execute or rejector.allow():
        nbprint('({}), ({})'.format(info_summary_str(first_info), info_summary_str(second_info))).push()
        if config.skip_existing and data.topiclist_exists(info):
            nbprint('Skipping Distiller (file(s) exists)')
        else:
            info['distiller'].run_distiller(info)
            info['distiller'].save()
            nbprint('Distiller: success')
        nbprint.pop()
def run_topic_metrics():
    nbprint('Topic Metrics').push()
    nbprint('Coherence').push()
    coherence_metrics()
    nbprint.pop()
    nbprint('Similarity').push()
    similarity_metrics()
    nbprint.pop()
    nbprint.pop()
def run_model_metrics():
    nbprint('Model Metrics').push()
    nbprint('Clustering').push()
    clustering_metrics()
    nbprint.pop()
    nbprint('Classification').push()
    classification_metrics()
    nbprint.pop()
    nbprint.pop()
def run_vectorizer(info=None):
    nbprint('Vectorizer').push()
    global runvars
    if info is None:
        if config.vectorizer['run_B']:
            nbprint('BoW').push()
            runvars = {}
            iterate(['data', 'token:BC', 'vocab', 'vector:B'], [count_mat, bow])
            nbprint.pop()
        if config.vectorizer['run_C']:
            nbprint('cBoW').push()
            runvars = {}
            iterate(['data', 'token:C', 'vocab', 'vector:C'], [count_mat, cbow])
            nbprint.pop()
        if config.vectorizer['run_P']:
            nbprint('Phrase').push()
            runvars = {}
            iterate(['data', 'vector:P'], [phrase])
            nbprint.pop()
    else:
        runvars = {}
        vector_bcp, vector_id = config.split(info['vector_version'])
        if vector_bcp in ('B', 'C'):
            count_mat(info)
            if vector_bcp == 'B':
                bow(info)
            else:
                cbow(info)
        else:
            phrase(info)
    runvars = None
    nbprint.pop()
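# One possible end-to-end invocation of the pipeline stages defined above, in
# dependency order. This is a sketch only; the surrounding project may drive
# these steps from a notebook or CLI instead.
if __name__ == '__main__':
    run_importer()       # load raw datasets
    run_tokenizer()      # tokenize the imported documents
    run_vocab()          # check tokens and build vocabularies
    run_vectorizer()     # build BoW / cBoW / phrase representations
    run_models()         # fit topic models on the vectorized inputs
    run_distiller()      # distill topic lists from the model outputs
    run_topic_metrics()  # coherence and similarity metrics
    run_model_metrics()  # clustering and classification metrics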