def after_step(context, step):
    """Behave hook run after every step: on failure, append an entry to
    the logfile and, if false GUIDs were collected on the context,
    record them in the SmartView file."""
    # append_logfile and add_smartview are project helpers defined elsewhere.
    if step.status == "failed" and context.create_log:
        append_logfile(context, step)
    if (step.status == "failed" and context.create_smartview
            and hasattr(context, "falseguids")):
        add_smartview(context.smview_file, step.name, context.falseguids)
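# Usage sketch (assumption, not part of the source): after_step follows
# behave's environment.py hook convention, so the flags it reads could be
# initialised in before_all from -D userdata. The option names here are
# hypothetical.
def before_all(context):
    context.create_log = context.config.userdata.getbool("create_log", True)
    context.create_smartview = context.config.userdata.getbool(
        "create_smartview", False)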
import itertools
import os
import timeit
from multiprocessing import Pool, cpu_count


def preprocess_wos_articles_from_dir(self, data_dir, preprocessing,
                                     remove_stopwords, nodes_to_analyze):
    """Extract, load and preprocess article metadata provided by
    Web of Science, with support for multiprocessing.

    Parameters
    ----------
    data_dir : str
        Directory containing the Web of Science export files.
    preprocessing : str
        Preprocessing method to apply. Supported strings:
        'word_tokenize', 'sentences_with_lemmas', 'pos_tag',
        and 'lemmatize'.
    remove_stopwords : str
        Stopword list to apply. Supported strings: Nltk-Stopwords.
    nodes_to_analyze : Nodes
        Nodes that have to be analyzed.

    Returns
    -------
    assets : list(Asset)
    """
    start = timeit.default_timer()

    # Build one preprocessing job per file in the data directory.
    stream_processing_jobs = []
    for root, dirs, files in os.walk(data_dir):
        for name in files:
            # Join root and name so files in subdirectories resolve
            # correctly regardless of trailing separators in data_dir.
            file_path = os.path.join(root, name)
            stream_processing_job = {"preprocessing": preprocessing,
                                     "remove_stopwords": remove_stopwords,
                                     "file_path": file_path,
                                     "nodes_to_analyze": nodes_to_analyze,
                                     "stopwords": self.stop_words}
            stream_processing_jobs.append(stream_processing_job)

    # Fan the jobs out over all cores but one.
    p = Pool(processes=cpu_count() - 1)
    assets = p.map(stream_preprocessing, stream_processing_jobs)
    p.close()
    p.join()

    # p.map returns one list per file; flatten them into a single list.
    assets = list(itertools.chain.from_iterable(assets))

    # Logfile
    stop = timeit.default_timer()
    runtime = stop - start
    event_title = "Load and preprocess Academic Data from Directory"
    event_description = ("Importing " + str(len(assets)) +
                         " academic assets from directory into assetlist." +
                         " Preprocessing = " + str(preprocessing))
    append_logfile(logfile_path=self.logfile_path,
                   event_title=event_title,
                   event_description=event_description,
                   runtime=runtime)
    return assets
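# Minimal runnable sketch of the fan-out/flatten pattern used above.
# demo_file_worker stands in for stream_preprocessing (one job dict in,
# one list of results out); names and data are illustrative only.
import itertools
from multiprocessing import Pool, cpu_count


def demo_file_worker(job):
    # One result list per input file, mirroring the per-file asset lists.
    return [job["file_path"].upper()]


if __name__ == "__main__":
    jobs = [{"file_path": "a.txt"}, {"file_path": "b.txt"}]
    with Pool(processes=max(1, cpu_count() - 1)) as p:
        per_file = p.map(demo_file_worker, jobs)
    print(list(itertools.chain.from_iterable(per_file)))  # ['A.TXT', 'B.TXT']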
import timeit


def stop_timer_and_log(self, details=''):
    """Stop the timer started at self.start_time and append the elapsed
    runtime for this algorithm to the logfile, if one is configured."""
    stop_time = timeit.default_timer()
    runtime = stop_time - self.start_time
    event_title = 'Algorithm: ' + self.alg_name
    event_description = details
    if self.logfile_path is not None:
        append_logfile(logfile_path=self.logfile_path,
                       event_title=event_title,
                       event_description=event_description,
                       runtime=runtime)
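# Runnable sketch of the intended call pattern. TimedAlgorithm is
# hypothetical and only mirrors the attributes the method relies on
# (start_time, alg_name, logfile_path); logging is disabled here so the
# project-specific append_logfile helper is never reached.
class TimedAlgorithm:
    def __init__(self, alg_name):
        self.alg_name = alg_name
        self.logfile_path = None  # no logfile in this sketch
        self.start_time = timeit.default_timer()

    stop_timer_and_log = stop_timer_and_log  # reuse the function above


if __name__ == "__main__":
    algo = TimedAlgorithm("demo")
    sum(range(1_000_000))  # stand-in workload
    algo.stop_timer_and_log(details="demo run")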
import pickle
import timeit


def load_assetlist_from_dir(file_path, logfile_path=None):
    """Load a pickled list of assets from file_path; optionally log the
    load time to logfile_path."""
    start_time = timeit.default_timer()
    with open(file_path, "rb") as fp:
        assetlist = pickle.load(fp)
    event_description = (str(len(assetlist)) + ' Assets loaded from ' +
                         file_path)
    stop_time = timeit.default_timer()
    runtime = stop_time - start_time
    event_title = 'Load list of assets'
    if logfile_path is not None:
        append_logfile(logfile_path=logfile_path,
                       event_title=event_title,
                       event_description=event_description,
                       runtime=runtime)
    return assetlist
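# Round-trip sketch: save_assetlist is a hypothetical counterpart, not part
# of the source; it just pickles a list in the format the loader expects.
def save_assetlist(assetlist, file_path):
    with open(file_path, "wb") as fp:
        pickle.dump(assetlist, fp)


# save_assetlist(assets, "assets.pickle")
# assets = load_assetlist_from_dir("assets.pickle", logfile_path=None)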
import os
import timeit
from multiprocessing import Pool, cpu_count


def preprocess_patent_files_from_dir(self, data_dir, preprocessing,
                                     remove_stopwords, nodes_to_analyze,
                                     filter_patents_by_node):
    """Extract, load and preprocess patent data parsed by our
    uspto_xml_parser, with support for multiprocessing.

    Parameters
    ----------
    data_dir : str
        Directory containing the parsed patent files.
    preprocessing : str
        Preprocessing method to apply. Supported strings:
        'word_tokenize', 'sentences_with_lemmas', 'pos_tag',
        and 'lemmatize'.
    remove_stopwords : str
        Stopword list to apply. Supported strings: Nltk-Stopwords.
    nodes_to_analyze : Nodes
        Nodes that have to be analyzed.
    filter_patents_by_node : bool
        If True, filter the patents by the given nodes.

    Returns
    -------
    None
    """
    start = timeit.default_timer()

    # Build one preprocessing job per non-empty file in the directory.
    stream_processing_jobs = []
    for root, dirs, files in os.walk(data_dir):
        for name in files:
            file_path = os.path.join(root, name)
            if os.path.getsize(file_path) > 0:
                stream_processing_job = {
                    "preprocessing": preprocessing,
                    "remove_stopwords": remove_stopwords,
                    "file_path": file_path,
                    "nodes_to_analyze": nodes_to_analyze,
                    "filter_patents_by_node": filter_patents_by_node}
                stream_processing_jobs.append(stream_processing_job)
            else:
                print("Empty file: " + file_path)

    # maxtasksperchild=1 recycles each worker after one file, a common way
    # to keep per-process memory bounded on large patent dumps.
    p = Pool(processes=cpu_count() - 1, maxtasksperchild=1)
    asset_cnt = 0
    # imap_unordered yields each file's assets as soon as that worker
    # finishes, so nodes can be enriched incrementally.
    for assets in p.imap_unordered(stream_preprocessing,
                                   stream_processing_jobs):
        nodes_to_analyze.enrich_with_assets(assets)
        print("Imported " + str(len(assets)) + " assets into nodes")
        asset_cnt += len(assets)
    p.close()
    p.join()

    # Logfile
    stop = timeit.default_timer()
    runtime = stop - start
    event_title = "Load and preprocess Patent Data from Directory"
    event_description = ("Importing " + str(asset_cnt) +
                         " patents from directory into assetlist." +
                         " Preprocessing = " + str(preprocessing))
    append_logfile(logfile_path=self.logfile_path,
                   event_title=event_title,
                   event_description=event_description,
                   runtime=runtime)
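# Runnable sketch contrasting the streaming consumption above with the
# Pool.map variant used for the WoS articles: imap_unordered hands back each
# file's result as soon as its worker finishes, so results can be folded in
# without buffering everything first. Worker and jobs are illustrative only.
def demo_stream_worker(job):
    return [job["file_path"]] * job["n"]


if __name__ == "__main__":
    jobs = [{"file_path": "p1.xml", "n": 2}, {"file_path": "p2.xml", "n": 3}]
    total = 0
    with Pool(processes=max(1, cpu_count() - 1), maxtasksperchild=1) as p:
        for assets in p.imap_unordered(demo_stream_worker, jobs):
            total += len(assets)  # fold in as results arrive
    print(total)  # 5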