def prepare():
    """Build the configuration for a VECTORIZE run.

    Parses the --config CLI option, loads the configuration, wires up
    logging and derived paths, and refuses to run when the stop word
    list on disk no longer matches the hash recorded in the config.
    """
    parser = argparse.ArgumentParser(description='VECTORIZE the sampled data')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'vectorize'")
    cliArgs = parser.parse_args()

    config = util.loadConfig(cliArgs.config)
    print("Starting vectorize with config {}".format(
        config["vectorize"]["hash"]))
    config["logger"] = util.setupLogging(config, "vectorize")
    config["src"] = os.path.join(
        config["clean"]["baseDir"],
        config["vectorize"]["cleanHash"],
        "useable.csv")
    # The first label entry is deliberately skipped here.
    config["labels"] = util.getLabels(config)[1:]
    config["stop_words"] = util.getStopWords(config)

    # Guard: the stop word list must match the hash pinned in the config.
    usedHash = util.getDictHash(config["stop_words"])
    configuredHash = config["vectorize"]["stopWordsHash"]
    if usedHash != configuredHash:
        config["logger"].error(
            "Hash of used and configured stop words differ:"
            "\n\t{} (used)"
            "\n\t{} (configured)"
            "\n\tRestore old stop word list or change config".format(
                usedHash, configuredHash))
        os.sys.exit(1)

    # "none" selects the raw payload column; otherwise the stemmer name is
    # used as the column name.
    stemming = config["vectorize"]["stemming"]
    config["payload"] = "payload" if stemming == "none" else stemming
    return config
def prepare():
    """Collect CLI options for a CLEAN analysis run.

    Loads the configuration named by --config and extends it with the
    requested display type, label filter and scheme filter.
    """
    parser = argparse.ArgumentParser(
        description='CLEAN analyze cleaned records')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    parser.add_argument(
        '--type',
        default="subjectScheme",
        choices=("subjectScheme", "schemeURI", "scheme2label"),
        help="Display subject schemes")
    parser.add_argument(
        '--label',
        default="1",
        help="Display scheme hits for this label (scheme2label)")
    parser.add_argument(
        '--scheme',
        default="all",
        help="Display scheme hits for this scheme (scheme2label)")
    cliArgs = parser.parse_args()

    config = util.loadConfig(cliArgs.config)
    # Fold the CLI filters into the config so callers only pass one dict.
    for option in ("type", "label", "scheme"):
        config[option] = getattr(cliArgs, option)
    return config
def prepare():
    """Assemble the configuration for an EVALUATE run.

    Reads --config and --device from the command line, sets up logging,
    and derives the evaluation target file and the vectorized-source
    directory from the configuration.
    """
    parser = argparse.ArgumentParser(
        description='EVALUATE a model/param_grid/data bundle')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'evaluate'")
    parser.add_argument(
        '--device',
        default="default",
        help="Device name used to train the models "
             "('/device:GPU:0', '/device:GPU:1')")
    cliArgs = parser.parse_args()

    config = loadConfig(cliArgs.config)
    print("Starting with config {}\n\ttail -f {}".format(
        config["evaluate"]["hash"],
        config["evaluate"]["logFile"]))
    config["logger"] = setupLogging(config, "evaluate")
    config["device"] = cliArgs.device

    # The result file name is prefixed according to the device choice.
    prefix = "0" if config["device"] == "default" else "1"
    config["target"] = os.path.join(
        config["evaluate"]["baseDir"], prefix + "_evaluation.csv")
    config["srcDir"] = os.path.join(
        config["vectorize"]["baseDir"],
        config["evaluate"]["vectorizeHash"])
    return config
def testLoadConfig():
    """Exercise config autocompletion, directory autocreation and backup."""
    config = util.loadConfig(getTestConfig())
    # Autocompletion must have added these keys.
    for expected in ("hash", "rawDataDir", "processedDataDir", "configDir"):
        assert expected in config.keys()

    # Delete the generated artifacts so loadConfig has to recreate them.
    hashedConfig = os.path.join(
        os.path.dirname(getTestConfig()), config["hash"] + ".json")
    if os.path.isfile(hashedConfig):
        os.remove(hashedConfig)
    shutil.rmtree(config["processedDataDir"])
    config = util.loadConfig(getTestConfig())

    # The directories must exist again ...
    for dirKey in ("processedDataDir", "configDir"):
        assert os.path.isdir(config[dirKey])
    for step in ("retrieve", "clean", "sample", "train", "evaluate", "use"):
        assert os.path.isdir(os.path.join(config["processedDataDir"], step))
    # ... and the hashed backup must round-trip to an identical config.
    assert os.path.exists(hashedConfig)
    config2 = util.loadConfig(hashedConfig)
    assert config == config2
def prepare():
    """Build the configuration for a RETRIEVE run.

    Parses --config and --sleep from the command line, loads the
    configuration, stores the sleep interval and attaches a logger.
    """
    parser = argparse.ArgumentParser(
        description='RETRIEVE: retrieve all raw data.')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration, must contain key 'retrieve'")
    parser.add_argument(
        '--sleep',
        default=20,
        help="Time period to sleep until a harvester is checked during harvesting")
    cliArgs = parser.parse_args()

    config = util.loadConfig(cliArgs.config)
    # CLI values arrive as strings; normalize to int for the harvest loop.
    config["retrieve"]["sleep"] = int(cliArgs.sleep)
    config["logger"] = util.setupLogging(config, "retrieve")
    return config
def prepare():
    """
    Builds the configuration for a cleaning run

    # Returns
        config: dict
            A configuration with all paths, compiled regexes and a logger
    """
    parser = argparse.ArgumentParser(
        description='CLEAN retrieved metadata records')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    parser.add_argument('--worker', default=3, help="Number of workers")
    cliArgs = parser.parse_args()

    config = util.loadConfig(cliArgs.config)
    # This should be the only output, allowing to tail the log
    config["logger"] = util.setupLogging(config, "clean")
    config["worker"] = int(cliArgs.worker)

    # Guard: the mapping helpers must match the hash pinned in the config.
    usedMappingHash = util.getFileHash("clean/cleanDataHelpers.py")
    configuredMappingHash = config["clean"]["mappingHash"]
    if usedMappingHash != configuredMappingHash:
        config["logger"].error(
            "Hash of used and configured mapping differ:"
            "\n\t{} (used)"
            "\n\t{} (configured)"
            "\n\tRestore old mapping or change config".format(
                usedMappingHash, configuredMappingHash))
        os.sys.exit(1)

    config["labels"] = util.getLabels(config)
    # Compile every configured expression once, up front.
    config["regex"] = {
        name: re.compile(config["clean"]["regex"][name])
        for name in ("ddcValue", "ddcSchemeURI", "special",
                     "dataInput", "dataOutput")
    }
    return config
def prepare():
    """
    Builds the configuration for a grep over subject fields

    # Returns
        config: dict
            A configuration with the grep parameters and, when available,
            a compiled dataInput regex
    """
    parser = argparse.ArgumentParser(
        description='Grep over subjects of retrieved metadata records')
    parser.add_argument(
        '--config',
        required=True,
        help="File with the configuration for the cleaning run")
    parser.add_argument(
        '--field',
        default="value",
        choices=("value", "subjectScheme", "schemeURI"),
        help="On which field to grep on")
    parser.add_argument('--grep', required=True, help="Grep expression")
    cliArgs = parser.parse_args()

    config = util.loadConfig(cliArgs.config)
    config["field"] = cliArgs.field
    config["grep"] = cliArgs.grep

    # The input pattern may live under either of two config keys;
    # whichever is present wins ("regex" takes precedence).
    cleanSection = config["clean"]
    if "regex" in cleanSection.keys():
        config["regex"] = {
            "dataInput": re.compile(cleanSection["regex"]["dataInput"]),
        }
    elif "dataInputRegex" in cleanSection.keys():
        config["regex"] = {
            "dataInput": re.compile(cleanSection["dataInputRegex"]),
        }
    return config
# Qt widgets and validators used by this tab.
from PyQt5.QtWidgets import QApplication, QWidget
from PyQt5.QtWidgets import QLabel, QPushButton, QLineEdit, QTextBrowser
from PyQt5.QtGui import QValidator, QDoubleValidator
from PyQt5.QtWidgets import QVBoxLayout, QHBoxLayout, QFormLayout
from gui_util.file_handler import FileHandler

# Make the project source tree importable before pulling in local modules.
# NOTE(review): assumes `os` and `sys` are imported earlier in this file.
TABS_DIR = os.path.dirname(os.path.realpath(__file__))
SRC_DIR = os.path.dirname(TABS_DIR)
sys.path.append(SRC_DIR)
sys.path.append(os.path.join(SRC_DIR, 'util'))
sys.path.append(os.path.join(SRC_DIR, 'database'))
from util import util
from database.ft_db import get_firstrade_db

# Module-level side effects: the config is read and the database handle is
# opened at import time.
DATA_DIR = os.path.join(SRC_DIR, 'data')
CONFIG = util.loadConfig(os.path.join(DATA_DIR, 'config.json'))
FTDB = get_firstrade_db(db_fpath=os.path.join(DATA_DIR, CONFIG['database']))

# Default window size in pixels.
STANDARD_H = 600
STANDARD_W = 800


class ImportTab(QWidget):
    # Widget tab for importing data; geometry is fixed at construction.
    def __init__(self, parent=None):
        super(self.__class__, self).__init__(parent)
        self.left = 10
        self.top = 10
        self.width = STANDARD_W
        self.height = STANDARD_H
        self.setGeometry(self.left, self.top, self.width, self.height)
        # setUI is not visible in this chunk — presumably defined further
        # down the class; it builds the actual widget layout. TODO confirm.
        self.setUI()
import shutil from train.mlp import train_ngram_model ################################################################################ # TEST PREPARATION ################################################################################ def getTestConfig(): return os.path.join( os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "config/config.json") payload = {"test": [1, 2, 3], "test2": {"test3": "abc"}} subdir = "retrieve" config = util.loadConfig(getTestConfig()) ################################################################################ # TESTS ################################################################################ def testLoadConfig(): # Check autocompletion of config works config = util.loadConfig(getTestConfig()) for key in ("hash", "rawDataDir", "processedDataDir", "configDir"): assert key in config.keys() # Check autocreations work (therefore delete the dirs first) hashedConfig = os.path.join(os.path.dirname(getTestConfig()), config["hash"] + ".json") if os.path.isfile(hashedConfig):
df = util.cfm2df(cfm, range(len(shortAnzsrc))) df_cfm = pd.DataFrame(data=df.values, index=shortAnzsrc, columns=shortAnzsrc) plt.figure(figsize=(40, 28)) sn.heatmap(df_cfm, annot=True) return plt.plot() parser = argparse.ArgumentParser( description='Plot evaluations for a trained model.') parser.add_argument('--config', required=True, help="File with the configuration for the training run") args = parser.parse_args() config = util.loadConfig(args.config) model_file = os.path.join(config["processedDataDir"], "train", "mlp_model.h5") print("Loading model: {}".format(model_file)) model = models.load_model(model_file) (test_texts, test_labels) = util.loadJsonFromFile(config, "test.json", "train") cfm = util.getConfusionMatrix(config, model, test_texts, test_labels) perCfm = cfm / cfm.sum(axis=1, keepdims=True) plotConfusionMatrix(config, perCfm) savefig(os.path.join(config["processedDataDir"], "evaluate", "cfm.png"))
import os, sys sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) from util.util import loadConfig, loadBinary, loadJsonFromFile from tensorflow.python.keras import models import numpy as np import argparse parser = argparse.ArgumentParser( description='TEST: test a model with a given configuration.') parser.add_argument('--config', required=True, help="File with the configuration for the training run") args = parser.parse_args() config = loadConfig(args.config) model_file = os.path.join(config["processedDataDir"], "train", "mlp_model.h5") print("Loading model: {}".format(model_file)) model = models.load_model(model_file) tests = [ "mathematics proof theorem lemma number topology deduction", "particle physics theoretical physics experimental physics atom mass motion star nova", "chemistry liquid acid protein reaction", "earth science atmosphere geochemistry geology oceanography hydrology", "ecology soil environmental sciences", "biology species population life organism evolution", "agriculture veterinary crop cattle forest animal slaughterhouse", "computer science information science library Memory Computation IT programming language code", "engineering construction electronic structure applied", "technology nanotechnology biotechnology hardware",
import os, sys sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) import util.util as util import json mode = "" config = util.loadConfig("../config/config{}.json".format(mode)) selections = [ { "feature_selection": {"mode": "multipleOfLabels", "value": 1000}, "stemming": "none" }, { "feature_selection": {"mode": "multipleOfLabels", "value": 2500}, "stemming": "none" }, { "feature_selection": {"mode": "multipleOfLabels", "value": 5000}, "stemming": "none", }, { "feature_selection": {"mode": "multipleOfLabels", "value": 1000}, "stemming": "lancaster" }, { "feature_selection": {"mode": "multipleOfLabels", "value": 2500}, "stemming": "lancaster" }, {