def read_overmap_terrain_data() -> None:
    """ Fill the OVERMAP_TERRAIN_DATA global """
    overmap_terrain_data, errors = import_data(
        json_dir=Path('../../data/json/overmap/overmap_terrain/'),
        json_fmatch='*.json',
    )
    if errors:
        print(errors)
    for entry in overmap_terrain_data:
        if entry.get('type') != 'overmap_terrain':
            continue
        entry_ids = entry.get('id')
        if not entry_ids:
            continue
        color_name = entry.get('color')
        COLOR_NAMES.add(color_name)
        color = SCHEME['colors'].get(color_name)
        if not color:
            continue
        if isinstance(entry_ids, str):
            entry_ids = (entry_ids, )
        for terrain_id in entry_ids:
            OVERMAP_TERRAIN_DATA[terrain_id] = color
def read_mapgen_palettes() -> None:
    """ Fill the PALETTES global """
    palette_entries, errors = import_data(
        json_dir=Path('../../data/json/mapgen_palettes/'),
        json_fmatch='*.json',
    )
    if errors:
        print(errors)
    for entry in palette_entries:
        add_palette(entry)
def get_mapgen_data(
    mapgen_dir: Path,
    pattern: str,
) -> list:
    """ Get all mapgen entries """
    mapgen_data, errors = import_data(
        json_dir=mapgen_dir,
        json_fmatch=pattern,
    )
    if errors:
        print(errors)
    return mapgen_data
def read_terrain_color_names() -> None:
    """ Fill the TERRAIN_COLOR_NAMES global """
    terrain_data, errors = import_data(
        json_dir=Path('../../data/json/furniture_and_terrain/'),
        json_fmatch='terrain*.json',
    )
    if errors:
        print(errors)
    for terrain in terrain_data:
        terrain_type = terrain.get('type')
        terrain_id = terrain.get('id')
        terrain_color = terrain.get('color')
        if isinstance(terrain_color, list):
            terrain_color = terrain_color[0]
        if terrain_type == 'terrain' and terrain_id and terrain_color:
            TERRAIN_COLOR_NAMES[terrain_id] = terrain_color
            COLOR_NAMES.add(terrain_color)
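# A minimal sketch of what the import_data() helper these readers rely on could
# look like, reconstructed only from its call sites above (json_dir / json_fmatch
# keyword arguments, returning a (data, errors) pair); it is a hypothetical
# illustration, not the helper's actual implementation in util.
import json
from pathlib import Path


def import_data(json_dir: Path = Path('../../data/json/'),
                json_fmatch: str = '*.json') -> tuple:
    """Load every JSON file under json_dir matching json_fmatch."""
    data, errors = [], []
    for path in sorted(json_dir.rglob(json_fmatch)):
        try:
            with open(path, encoding='utf-8') as fh:
                loaded = json.load(fh)
        except (json.JSONDecodeError, OSError) as err:
            errors.append('%s: %s' % (path, err))
            continue
        # A file may hold one object or a list of objects.
        data.extend(loaded if isinstance(loaded, list) else [loaded])
    return data, errors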
parser.add_argument("--fnmatch", default="*.json", help="override with glob expression to select a smaller fileset.") parser.add_argument("--all", action="store_true", help="if set, includes all matches. if not set, includes first match in the stream.") parser.add_argument("where", action=WhereAction, nargs='+', type=str, help="where exclusions of the form 'where_key=where_val', no quotes.") if __name__ == "__main__": args = parser.parse_args() json_data, load_errors = import_data(json_fmatch=args.fnmatch) if load_errors: # If we start getting unexpected JSON or other things, might need to # revisit quitting on load_errors print("Error loading JSON data.") for e in load_errrors: print(e) sys.exit(1) elif not json_data: print("No data loaded.") sys.exit(1) # Wasteful iteration, but less code to maintain on a tool that will likely # change again. plucked = [item for item in json_data if matches_all_wheres(item, args.where)]
    if not stats:
        print("Sorry, didn't find any stats for '%s' in the JSON." % search_key)
        sys.exit(1)

    title = "Count of values from field '%s'" % search_key
    print("\n\n%s" % title)
    print("(Data from %s out of %s blobs)" % (num_matches, len(json_data)))
    print("-" * len(title))
    ui_counts_to_columns(stats)
elif len(sys.argv) == 3 and sys.argv[2] == "--json":
    # Count values associated with key, machine output.
    search_key = sys.argv[1]
    where_key = None
    where_value = None
    json_data = import_data()[0]
    stats, num_matches = value_counter(json_data, search_key, where_key,
                                       where_value)
    if not stats:
        # Still JSON parser friendly, indicator of fail with emptiness.
        print(json.dumps([]))
        sys.exit(1)
    else:
        print(json.dumps(stats))
elif len(sys.argv) == 4:
    # Count values associated with key, filter, human friendly output.
    search_key = sys.argv[1]
    where_key = sys.argv[2]
    where_value = sys.argv[3]
    json_data = import_data()[0]
    stats, num_matches = value_counter(json_data, search_key, where_key,
                                       where_value)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pylab as plt
from scipy.io import loadmat

import model as m
import params
import util

# Import data
raw_data = util.import_data()

# Remove nans
util.remove_nans(raw_data)

# Normalize
raw_data = util.normalize(raw_data)

# Load in nan_map
nan_map = loadmat(params.NAN_MAP_PATH)['nan_map'].astype(bool)

# Convert nans back to 0
raw_data = util.reset_nan_values(raw_data, nan_map)

# For each window, run the algorithm
# for i in range(int(params.TV_SPLIT * len(raw_data)) - 1 - params.PREDICT_WINDOW_SIZE):
for i in range(1):
    print("SIMULATION WINDOW %d OUT OF %d:" %
          ((i + 1),
           int(params.TV_SPLIT * len(raw_data)) - 1 - params.PREDICT_WINDOW_SIZE))
    def run(self):
        # Load corpus
        corpus = import_data(self.corpus)
        (self.dictionary, self.reverse_dictionary, sent_lengths,
         self.max_sent_len, enc_data, dec_data, dec_lab) = build_dictionary(corpus)

        # Save metadata for visualisation of embedding matrix
        meta_data = sorted(self.dictionary, key=self.dictionary.get)
        print(len(meta_data))
        with open('meta_data.tsv', 'w') as f:
            tsv_writer = csv.writer(f, dialect='excel')
            tsv_writer.writerow(
                str(i.encode('utf-8')) + '\n' for i in meta_data)
        # np.savetxt("meta_data.tsv", meta_data, fmt="%s")

        self.dictionary = sorted(self.dictionary.items(),
                                 key=operator.itemgetter(1))
        self.vocabulary_size = len(self.dictionary)
        self.max_sent_len += 1

        # Create datasets for encoder and decoders
        enc_data = enc_data[1:-1]
        enc_lengths = sent_lengths[1:-1]
        post_lengths = sent_lengths[2:] + 1
        post_data = dec_data[2:]
        post_lab = dec_lab[2:]
        pre_lengths = sent_lengths[:-2] + 1
        pre_data = dec_data[:-2]
        pre_lab = dec_lab[:-2]

        # Print summary statistics
        self.corpus_length = len(enc_data)
        self.corpus_stats()

        self.graph = tf.Graph()
        with self.graph.as_default():
            print('\r~~~~~~~ Building model ~~~~~~~\r')
            self.initializer = tf.random_normal_initializer()

            # Variables
            self.word_embeddings = tf.get_variable(
                'embeddings', [self.vocabulary_size, self.embedding_size],
                tf.float32, initializer=self.initializer)
            self.W_pre = tf.get_variable(
                'precoder/weight', [self.embedding_size, self.vocabulary_size],
                tf.float32, initializer=self.initializer)
            self.b_pre = tf.get_variable(
                'precoder/bias', [self.vocabulary_size],
                tf.float32, initializer=self.initializer)
            self.W_post = tf.get_variable(
                'postcoder/weight', [self.embedding_size, self.vocabulary_size],
                tf.float32, initializer=self.initializer)
            self.b_post = tf.get_variable(
                'postcoder/bias', [self.vocabulary_size],
                tf.float32, initializer=self.initializer)
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Encoder placeholders
            sentences = tf.placeholder(tf.int32, [None, None], "sentences")
            sentences_lengths = tf.placeholder(tf.int32, [None],
                                               "sentences_lengths")

            # Postcoder placeholders
            post_inputs = tf.placeholder(tf.int32, [None, None], "post_inputs")
            post_labels = tf.placeholder(tf.int32, [None, None], "post_labels")
            post_sentences_lengths = tf.placeholder(tf.int32, [None],
                                                    "post_sentences_lengths")

            # Precoder placeholders
            pre_inputs = tf.placeholder(tf.int32, [None, None], "pre_inputs")
            pre_labels = tf.placeholder(tf.int32, [None, None], "pre_labels")
            pre_sentences_lengths = tf.placeholder(tf.int32, [None],
                                                   "pre_sentences_lengths")

            # Embed sentences
            sentences_embedded = self.embed_data(sentences)
            post_inputs_embedded = self.embed_data(post_inputs)
            pre_inputs_embedded = self.embed_data(pre_inputs)

            # Encoder
            encoded_sentences = self.encoder(sentences_embedded,
                                             sentences_lengths,
                                             self.bidirectional)

            # Decoder for following sentence
            post_logits_projected, post_logits = self.decoder(
                decoder_inputs=post_inputs_embedded,
                encoder_state=encoded_sentences,
                name='postcoder',
                lengths=post_sentences_lengths,
                train=True)

            # Decoder for previous sentence
            pre_logits_projected, pre_logits = self.decoder(
                decoder_inputs=pre_inputs_embedded,
                encoder_state=encoded_sentences,
                name='precoder',
                lengths=pre_sentences_lengths,
                train=True)

            # Compute loss
            if self.loss_function == 'softmax':
                post_loss = self.get_softmax_loss(post_labels,
                                                  post_logits_projected)
                pre_loss = self.get_softmax_loss(pre_labels,
                                                 pre_logits_projected)
            else:
                post_loss = self.get_sampled_softmax_loss(post_labels,
                                                          post_logits,
                                                          name='postcoder')
                pre_loss = self.get_sampled_softmax_loss(
                    pre_labels, pre_logits, name='precoder')

            loss = pre_loss + post_loss
            opt_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=global_step,
                learning_rate=self.learning_rate,
                optimizer='Adam',
                clip_gradients=2.0,
                learning_rate_decay_fn=None,
                summaries=['loss'])

            # Decode sentences at prediction time
            pre_predict = self.decoder(decoder_inputs=pre_inputs_embedded,
                                       encoder_state=encoded_sentences,
                                       name='precoder',
                                       lengths=pre_sentences_lengths,
                                       train=False)
            post_predict = self.decoder(decoder_inputs=post_inputs_embedded,
                                        encoder_state=encoded_sentences,
                                        name='postcoder',
                                        lengths=post_sentences_lengths,
                                        train=False)
            predict = [pre_predict, post_predict]

        with tf.Session(graph=self.graph) as session:
            self.a = tf.contrib.graph_editor.get_tensors(self.graph)
            train_loss_writer = tf.summary.FileWriter(
                './tensorboard/train_loss', session.graph)
            # Use the same LOG_DIR where you stored your checkpoint.
            embedding_writer = tf.summary.FileWriter('./tensorboard/',
                                                     session.graph)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = self.word_embeddings.name
            # Link this tensor to its metadata file (e.g. labels).
            embedding.metadata_path = os.path.join('./meta_data.tsv')
            # Saves a configuration file that TensorBoard will read during startup.
            projector.visualize_embeddings(embedding_writer, config)

            merged = tf.summary.merge_all()

            print('\r~~~~~~~ Initializing variables ~~~~~~~\r')
            tf.global_variables_initializer().run()

            print('\r~~~~~~~ Starting training ~~~~~~~\r')
            start_time = time.time()
            try:
                train_summaryIndex = -1
                for epoch in range(self.num_epochs):
                    self.is_train = True
                    epoch_time = time.time()
                    print('----- Epoch', epoch, '-----')
                    print('Shuffling dataset')

                    perm = np.random.permutation(self.corpus_length)
                    enc_lengths_perm = enc_lengths[perm]
                    enc_data_perm = enc_data[perm]
                    post_lengths_perm = post_lengths[perm]
                    post_inputs_perm = np.array(post_data)[perm]
                    post_labels_perm = np.array(post_lab)[perm]
                    pre_lengths_perm = pre_lengths[perm]
                    pre_inputs_perm = np.array(pre_data)[perm]
                    pre_labels_perm = np.array(pre_lab)[perm]

                    total_loss = 0
                    predict_step = 50
                    for step in range(self.corpus_length // self.batch_size):
                        begin = step * self.batch_size
                        end = (step + 1) * self.batch_size
                        batch_enc_lengths = enc_lengths_perm[begin:end]
                        batch_enc_inputs = enc_data_perm[begin:end]
                        batch_post_lengths = post_lengths_perm[begin:end]
                        batch_post_inputs = post_inputs_perm[
                            begin:end, :np.max(batch_post_lengths)]
                        batch_post_labels = post_labels_perm[
                            begin:end, :np.max(batch_post_lengths)]
                        batch_pre_lengths = pre_lengths_perm[begin:end]
                        batch_pre_inputs = pre_inputs_perm[
                            begin:end, :np.max(batch_pre_lengths)]
                        batch_pre_labels = pre_labels_perm[
                            begin:end, :np.max(batch_pre_lengths)]
                        train_dict = {
                            sentences: batch_enc_inputs,
                            sentences_lengths: batch_enc_lengths,
                            post_inputs: batch_post_inputs,
                            post_labels: batch_post_labels,
                            post_sentences_lengths: batch_post_lengths,
                            pre_inputs: batch_pre_inputs,
                            pre_labels: batch_pre_labels,
                            pre_sentences_lengths: batch_pre_lengths
                        }
                        _, loss_val, batch_summary, glob_step = session.run(
                            [opt_op, loss, merged, global_step],
                            feed_dict=train_dict)
                        train_loss_writer.add_summary(
                            batch_summary,
                            step + (self.corpus_length // self.batch_size) * epoch)
                        total_loss += loss_val

                        if glob_step % predict_step == 0:
                            # if step > 0:
                            print("Average loss at step ", glob_step, ": ",
                                  total_loss / predict_step)
                            total_loss = 0
                            print('\nOriginal sequence:\n')
                            print(self.print_sentence(batch_pre_inputs[0, 1:],
                                                      batch_pre_lengths[0] - 1))
                            print(self.print_sentence(batch_enc_inputs[0],
                                                      batch_enc_lengths[0]))
                            print(self.print_sentence(batch_post_inputs[0, 1:],
                                                      batch_post_lengths[0] - 1))

                            test_enc_lengths = np.expand_dims(
                                batch_enc_lengths[0], 0)
                            test_enc_inputs = np.expand_dims(
                                batch_enc_inputs[0], 0)
                            test_post_lengths = np.expand_dims(
                                batch_post_lengths[0], 0)
                            test_post_inputs = np.expand_dims(
                                batch_post_inputs[0], 0)
                            test_post_labels = np.expand_dims(
                                batch_post_labels[0], 0)
                            test_pre_lengths = np.expand_dims(
                                batch_pre_lengths[0], 0)
                            test_pre_inputs = np.expand_dims(
                                batch_pre_inputs[0], 0)
                            test_pre_labels = np.expand_dims(
                                batch_pre_labels[0], 0)
                            test_dict = {
                                sentences_lengths: test_enc_lengths,
                                sentences: test_enc_inputs,
                                post_sentences_lengths: test_post_lengths,
                                post_inputs: test_post_inputs,
                                post_labels: test_post_labels,
                                pre_sentences_lengths: test_pre_lengths,
                                pre_inputs: test_pre_inputs,
                                pre_labels: test_pre_labels
                            }
                            pre_prediction, post_prediction = session.run(
                                [predict], feed_dict=test_dict)[0]
                            print('\nPredicted previous and following sequence '
                                  'around original sentence:\n')
                            print(self.print_sentence(pre_prediction[0],
                                                      len(pre_prediction[0])))
                            print(self.print_sentence(batch_enc_inputs[0],
                                                      batch_enc_lengths[0]))
                            print(self.print_sentence(post_prediction[0],
                                                      len(post_prediction[0])))

                            end_time = time.time()
                            print('\nTime for %d steps: %0.2f seconds' %
                                  (predict_step, end_time - start_time))
                            start_time = time.time()
                            print('\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

                    saver = tf.train.Saver()
                    saver.save(session,
                               os.path.join('./tensorboard/', 'model.ckpt'))
            except KeyboardInterrupt:
                save = input('save?')
                if 'y' in save:
                    self.save_model(session, 0)
def test_auto_encoder_with_data(control_seq, estimator):
    """ Test accuracy when the control_seq source channels are missing """
    X_train_raw, y_train_raw, X_test_raw, y_test_raw = import_data()
    X_train_raw_0 = mask_source_channel(control_seq, X_train_raw, 1)
    X_test_raw_0 = mask_source_channel(control_seq, X_test_raw, 1)

    # split train data into 5 folds
    kf = KFold(n_splits=5, shuffle=True)
    for train_index, validation_index in kf.split(X_train_raw):
        pass

    # raw data
    X_train_raw_train = X_train_raw.iloc[train_index]
    X_train_raw_validation = X_train_raw.iloc[validation_index]

    # missed features data
    X_train_raw_0_train = X_train_raw_0.iloc[train_index]
    X_train_raw_0_validation = X_train_raw_0.iloc[validation_index]

    # drop na
    X_train_raw_0_train_dropna = X_train_raw_0_train.dropna(axis=1)
    X_test_raw_0_dropna = X_test_raw_0.dropna(axis=1)

    # normalize data
    X_train_all_dropna_normalize = normalize_data(X_train_raw_0_train_dropna,
                                                  X_train_raw_train)
    X_train_raw_train_normalize = normalize_data(X_train_raw_train,
                                                 X_train_raw_train)
    X_test_raw_0_dropna_normalize = normalize_data(X_test_raw_0_dropna,
                                                   X_test_raw)

    # this is the size of our encoded representations
    # we know that there are 561 features...
    # reduce to 28 dim (compr rate = 20)
    encoding_dim = 4

    # this is our input placeholder
    input_seq = Input(shape=(X_train_raw_0_train_dropna.shape[1], ))

    # encode layer
    encoded = Dense(128, activation='relu')(input_seq)
    # encoded = Dense(64, activation='relu')(encoded)
    encoded = Dense(16, activation='relu')(encoded)
    encoded = Dense(8, activation='relu')(encoded)
    encoder_output = Dense(encoding_dim)(encoded)

    # decode layer
    decoded = Dense(16, activation='relu')(encoder_output)
    # decoded = Dense(64, activation='relu')(decoded)
    decoded = Dense(128, activation='relu')(decoded)
    decoded = Dense(561, activation='sigmoid')(decoded)

    # construct autoencoder
    autoencoder = Model(inputs=input_seq, outputs=decoded)

    # compile autoencoder
    autoencoder.compile(optimizer=Adam(lr=LEARNING_RATE),
                        loss='mean_squared_error')
    autoencoder.summary()

    # training
    from keras.callbacks import TensorBoard
    autoencoder.fit(
        X_train_all_dropna_normalize,
        X_train_raw_train_normalize,
        epochs=300,
        batch_size=100,
        # shuffle=True,
        # validation_data=(X_train_raw_0_validation, X_train_raw_validation),
        callbacks=[TensorBoard(log_dir='./tmp/autoencoder')])
    # tensorboard --logdir=E:\har\tmp\autoencoder

    # predict and denormalize data
    X_test_raw_refilled = autoencoder.predict(X_test_raw_0_dropna_normalize)
    X_test_raw_refilled_denor = denormalize_data(X_test_raw_refilled, X_test_raw)
    ynew = estimator.predict(X_test_raw_refilled_denor)
    ynew = ynew + 1
    acc_tests = np.trace(confusion_matrix(y_test_raw, ynew)) / len(ynew)
    return acc_tests
#
# # without PCA
# X1 = phishing_X
#
# X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3)
# num_features = X_train.shape[1]
# num_classes = 2
# nodes = (num_classes + num_features) / 2
# momentum1, learning_rate1 = 0.9, 0.25
#
# # end = getEpochCurves(momentum1, learning_rate1, X_train, X_test, y_train, y_test, str(X1.shape[1]))
# print "Time taken with " + str(X1.shape[1]) + " components " + str(end)

# with 25 components
phishing_X, Y1, optdigits_X, Y2 = import_data()

pca = FastICA(n_components=26, random_state=5)
X1 = pca.fit_transform(phishing_X)
X1 /= X1.std(axis=0)
print("original shape: ", phishing_X.shape)
print("transformed shape:", X1.shape)

projected_phishing = np.hstack((X1, Y1[..., None]))
np.savetxt('phishing_ica.csv', projected_phishing, delimiter=',')

# X_train, X_test, y_train, y_test = train_test_split(X1, Y1, test_size=0.3)
# num_features = X_train.shape[1]
# num_classes = 2
# nodes = (num_classes + num_features) / 2
# momentum1, learning_rate1 = 0.9, 0.25
#
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument(
    "--fnmatch",
    default="*.json",
    help="override with glob expression to select a smaller fileset.")
parser.add_argument(
    "where",
    action=WhereAction,
    nargs='+',
    type=str,
    help="where exclusions of the form 'where_key=where_val', no quotes.")

if __name__ == "__main__":
    args = parser.parse_args()
    json_data, load_errors = import_data(json_fmatch=args.fnmatch)

    if load_errors:
        # If we start getting unexpected JSON or other things, might need to
        # revisit quitting on load_errors
        print("Error loading JSON data.")
        for e in load_errors:
            print(e)
        sys.exit(1)
    elif not json_data:
        print("No data loaded.")
        sys.exit(1)

    matched = []
    not_matched = []
    for item in json_data:
        if matches_all_wheres(item, args.where):
from __future__ import print_function

import sys
import os
import json

from util import import_data, matches_where

if __name__ == "__main__":
    if len(sys.argv) == 3:
        # pluck one
        where_key = sys.argv[1]
        where_value = sys.argv[2]

        # TODO: Put the errors back in, someday, maybe.
        json_data, _ = import_data()

        plucked = None
        for item in json_data:
            is_match = matches_where(item, where_key, where_value)
            if is_match:
                plucked = item
                break

        if not plucked:
            sys.exit(1)
        else:
            print(json.dumps(plucked, indent=4))
    elif len(sys.argv) == 4 and sys.argv[3] == "--all":
        # pluck all
        where_key = sys.argv[1]
from __future__ import print_function

import sys
import os
import json

from util import import_data, matches_where, CDDAJSONWriter

if __name__ == "__main__":
    if len(sys.argv) == 3:
        # pluck one
        where_key = sys.argv[1]
        where_value = sys.argv[2]

        # TODO: Put the errors back in, someday, maybe.
        json_data, _ = import_data()

        plucked = None
        for item in json_data:
            is_match = matches_where(item, where_key, where_value)
            if is_match:
                plucked = item
                break

        if not plucked:
            sys.exit(1)
        else:
            print(CDDAJSONWriter(plucked).dumps())
    elif len(sys.argv) == 4 and sys.argv[3] == "--all":
        # pluck all
        where_key = sys.argv[1]
""" # using autoencoder to refill missing data import numpy as np import os os.chdir('E:\\har') from util import * import pandas as pd from util import plot_confusion_matrix, import_data, mask_source_channel import matplotlib.pyplot as plt LEARNING_RATE = 7e-4 X_train_raw, y_train_raw, X_test_raw, y_test_raw = import_data() X_train_raw_0 = mask_source_channel(['Acc'], X_train_raw, 1) X_test_raw_0 = mask_source_channel(['Acc'], X_test_raw, 1) # split train data into 5 folds from sklearn.model_selection import KFold kf = KFold(n_splits=5, shuffle=True) for train_index, validation_index in kf.split(X_train_raw): pass # raw data X_train_raw_train = X_train_raw.iloc[train_index] X_train_raw_validation = X_train_raw.iloc[validation_index] # missed features data X_train_raw_0_train = X_train_raw_0.iloc[train_index]
    def __init__(self, corpus, parameters):
        self.corpus = corpus
        self.para = parameters
        (self.dictionary, self.reverse_dictionary, sent_lengths,
         self.max_sent_len, enc_data, dec_data, dec_lab) = build_dictionary(
            import_data(self.corpus))
        self.dictionary_sorted = sorted(self.dictionary.items(),
                                        key=operator.itemgetter(1))
        self.vocabulary_size = len(self.dictionary_sorted)
        self.max_sent_len += 1
        self.data = autoencoder_data(enc_data=enc_data,
                                     dec_data=dec_data,
                                     dec_lab=dec_lab,
                                     sent_lengths=sent_lengths)

        print('\r~~~~~~~ Building graph ~~~~~~~\r')
        self.graph = tf.get_default_graph()
        self.initializer = tf.random_normal_initializer()

        # Variables
        self.word_embeddings = tf.get_variable(
            'embeddings', [self.vocabulary_size, self.para.embedding_size],
            tf.float32, initializer=self.initializer)
        self.W = tf.get_variable(
            'decoder/weight', [self.para.embedding_size, self.vocabulary_size],
            tf.float32, initializer=self.initializer)
        self.b = tf.get_variable('decoder/bias', [self.vocabulary_size],
                                 tf.float32, initializer=self.initializer)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        # Encoder placeholders
        self.enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
        self.enc_input_lengths = tf.placeholder(tf.int32, [None],
                                                "enc_input_lengths")

        # Decoder placeholders
        self.dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
        self.dec_labels = tf.placeholder(tf.int32, [None, None], "dec_labels")
        self.dec_input_lengths = tf.placeholder(tf.int32, [None],
                                                "dec_input_lengths")

        # Embed sentences
        enc_inputs_embedded = self.embed_data(self.enc_inputs)
        dec_inputs_embedded = self.embed_data(self.dec_inputs)

        # Encoder
        self.encoded_sentences = self.encoder(enc_inputs_embedded,
                                              self.enc_input_lengths,
                                              self.para.bidirectional)

        # Decoder for following sentence
        dec_logits_projected, dec_logits = self.decoder(
            decoder_inputs=dec_inputs_embedded,
            encoder_state=self.encoded_sentences,
            name='decoder',
            lengths=self.dec_input_lengths,
            train=True)

        # Compute loss
        if self.para.loss_function == 'softmax':
            self.loss = self.get_softmax_loss(self.dec_labels,
                                              dec_logits_projected)
        else:
            self.loss = self.get_sampled_softmax_loss(self.dec_labels,
                                                      dec_logits,
                                                      name='decoder')

        self.opt_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            global_step=self.global_step,
            learning_rate=self.para.learning_rate,
            optimizer='Adam',
            clip_gradients=2.0,
            learning_rate_decay_fn=None,
            summaries=['loss'])

        # Decode sentences at prediction time
        self.predict = self.decoder(decoder_inputs=dec_inputs_embedded,
                                    encoder_state=self.encoded_sentences,
                                    name='decoder',
                                    lengths=self.dec_input_lengths,
                                    train=False)
class CDDAValues:
    """Worker class that prints table from provided data"""
    output = None

    def __init__(self, format_string):
        format_class = get_format_class_by_extension(format_string)
        self.output = format_class()

    def print_table(self, data, columns, types_filter, none_string,
                    with_header):
        if with_header:
            self.output.header(columns)
        for item in data:
            if types_filter and item.get('type') not in types_filter:
                continue
            self.output.row(item_values(item, columns, none_string))


if __name__ == "__main__":
    args = parser.parse_args()

    if args.tileset_types_only:
        args.type = TILESET_TYPES

    # Get data (don't care about load errors)
    json_data, _ = util.import_data(json_fmatch=args.fnmatch)

    worker = CDDAValues(args.format)
    worker.print_table(json_data, args.columns, args.type, args.nonestring,
                       args.with_header)
print("Sorry, didn't find any stats for '%s' in the JSON." % search_key) sys.exit(1) title = "List of values from field '%s'" % search_key print("\n\n%s" % title) print("(Data from %s out of %s blobs)" % (num_matches, len(json_data))) print("-" * len(title)) ui_values_to_columns(sorted(stats.keys())) elif len(sys.argv) == 3 and sys.argv[2] == "--json": # Count values associated with key, machine output. search_key = sys.argv[1] where_key = None where_value = None json_data = import_data()[0] stats, num_matches = value_counter(json_data, search_key, where_key, where_value) if not stats: # Still JSON parser friendly, indicator of fail with emptiness. print(json.dumps([])) sys.exit(1) else: print(json.dumps(sorted(stats.keys()))) elif len(sys.argv) == 4: # Count values associated with key, filter, human friendly output. search_key = sys.argv[1] where_key = sys.argv[2] where_value = sys.argv[3] json_data = import_data()[0]
        return KNeighborsClassifier(n_neighbors=n_neighbors)

    def validation_curve(self, X1, Y1, dataset_name):
        myList = list(range(1, 50))
        neighbors = filter(lambda x: x % 2 != 0, myList)
        param_grid = neighbors
        title = "Validation Curve for {} Dataset (KNN)".format(dataset_name)
        cv = StratifiedKFold(n_splits=10, random_state=42)
        X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                            test_size=0.3)
        estimator = KNeighborsClassifier()
        plot_validation_curve(estimator, title, X1, Y1, "n_neighbors",
                              param_grid, ylim=None, xlim=(1, 50), cv=cv)
        plt.show()


if __name__ == '__main__':
    X1, Y1, X2, Y2 = import_data()
    kNN().main(X1, Y1, "Letter Recognition")
    kNN().main(X2, Y2, "Madelon")
#!/usr/bin/env python3
"""Lists duplicates in JSON by `type` and `id` fields"""

from collections import defaultdict

from util import import_data

data = import_data()[0]

all_ids = defaultdict(set)
for obj in data:
    obj_id = obj.get('id')
    obj_type = obj.get('type')
    if obj_id and not isinstance(obj_id, list):
        if obj_id not in all_ids[obj_type]:
            all_ids[obj_type].add(obj_id)
        else:
            print(obj_type, obj_id)
def main():
    core_data, core_errors = util.import_data()
    print('Importing Generic Guns data from %r' % GG_DIR)
    gg_data, gg_errors = util.import_data(GG_DIR)

    if core_errors or gg_errors:
        print('Errors reading json:\n%s' % '\n'.join(core_errors + gg_errors))
        sys.exit(1)

    gg_migrations = get_ids(items_of_type(gg_data, 'MIGRATION'))

    core_guns = items_of_type(core_data, 'GUN')

    def is_not_fake_item(i):
        return i.get('copy-from', '') != 'fake_item'

    def is_not_whitelisted_skill(i):
        return 'skill' in i and i['skill'] not in SKILL_WHITELIST

    def has_pockets(i):
        return 'pocket_data' in i

    def lacks_whitelisted_pocket(i):
        return not any(
            pocket.get('ammo_restriction', {}).keys() & AMMO_TYPE_WHITELIST
            for pocket in i.get('pocket_data', []))

    def can_be_unwielded(i):
        return 'NO_UNWIELD' not in i.get('flags', [])

    core_guns = items_for_which_all_ancestors(core_guns, is_not_fake_item)
    core_guns = items_for_which_any_ancestor(core_guns,
                                             is_not_whitelisted_skill)
    core_guns = items_for_which_any_ancestor(core_guns, has_pockets)
    core_guns = items_for_which_all_ancestors(core_guns,
                                              lacks_whitelisted_pocket)
    core_guns = items_for_which_all_ancestors(core_guns, can_be_unwielded)

    core_magazines = items_of_type(core_data, 'MAGAZINE')
    core_magazines = items_for_which_all_ancestors(core_magazines,
                                                   lacks_whitelisted_pocket)

    core_ammo = items_of_type(core_data, 'AMMO')

    def is_not_whitelisted_ammo_type(i):
        return 'ammo_type' in i and i['ammo_type'] not in AMMO_TYPE_WHITELIST

    def is_bullet(i):
        return i.get('damage', {}).get('damage_type', '') == 'bullet'

    core_bullets = items_for_which_any_ancestor(core_ammo, is_bullet)
    core_bullets = items_for_which_any_ancestor(core_bullets,
                                                is_not_whitelisted_ammo_type)

    if (not gg_migrations or not core_guns or not core_magazines
            or not core_ammo):
        print('One of the collections is empty; something has gone wrong with '
              'data collection')
        return 1

    returncode = 0

    def check_missing(items, name):
        ids = get_ids(items) - ID_WHITELIST
        missing_migrations = ids - gg_migrations
        if missing_migrations:
            print('Missing Generic Guns migrations for these types of %s:' %
                  name)
            print('\n'.join(sorted(missing_migrations)))
            print()
            nonlocal returncode
            returncode = 1

    check_missing(core_bullets, 'ammo')
    check_missing(core_magazines, 'magazine')
    check_missing(core_guns, 'guns')

    if returncode:
        print('The above errors can be resolved by either adding suitable '
              'migrations to Generic Guns or adding to the whitelists of '
              'things not requiring migration in '
              'tools/json_tools/generic_guns_validator.py')

    return returncode
def load_submit(submit_name: str) -> pd.DataFrame:
    try:
        msg = f"failed to decode {submit_name}."
        with open(submit_name, "r") as fin:
            upload = json.load(fin)
        msg = f"{submit_name} has no 'solution' element."
        upload = upload.get("solution")
        msg = f"{submit_name} cannot be converted to a dataframe."
        return pd.DataFrame.from_dict(upload)
    except Exception as e:
        print(msg)
        print(str(e))


if __name__ == '__main__':
    ok, data_total = validator.import_data('jobs.json')
    if not ok:
        print("load environment setting failed.")
        sys.exit(-1)
    js = validator.JobShop(data_total)

    submit_name = sys.argv[1] if len(sys.argv) > 1 else "submit.json"
    df_up = load_submit(submit_name)

    ok, msg = validator.prepare(js, df_up)
    if not ok:
        print(msg)
        sys.exit(-1)

    ok, msg = validator.check(js)
    if not ok:
        for l in msg:
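# A minimal illustration of the submit.json shape load_submit() expects, inferred
# only from the code above (a top-level "solution" list convertible to a
# DataFrame). The column names inside each record are hypothetical, not taken
# from the validator's actual schema.
import json

example_submit = {
    "solution": [
        {"job": 0, "machine": 1, "start": 0, "end": 5},   # hypothetical columns
        {"job": 1, "machine": 0, "start": 2, "end": 9},
    ]
}
with open("submit_example.json", "w") as fout:
    json.dump(example_submit, fout)

# load_submit("submit_example.json") would then return a two-row DataFrame,
# one row per record in the "solution" list.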