def _doImportFromLocal(filename):
    try:
        dp = utils.Progress(TITLE, line1=GETTEXT(30140) % GETTEXT(30000),
                            line2=filename, line3=GETTEXT(30141))
        return extractAll(filename, dp, filename)
    except Exception as e:
        utils.log('Error in _doImportFromLocal %s' % str(e))
def _doImportFromRemote():
    try:
        location = LOCATION.replace(' ', '%20')
        file = os.path.join(HOME, '_sf_temp.zip')
        dp = utils.Progress(TITLE, line1=GETTEXT(30140) % GETTEXT(30000),
                            line2=location.replace('%20', ' '), line3=GETTEXT(30141))

        import download
        import urllib
        download.doDownload(urllib.quote_plus(location), urllib.quote_plus(file),
                            urllib.quote_plus(TITLE), quiet=True)

        if os.path.exists(file):
            success = extractAll(file, dp, location.replace('%20', ' '))
            utils.DeleteFile(file)
            return success
    except Exception as e:
        utils.log('Error in _doImportFromRemote %s' % str(e))

    return False
def dlProgress(count, blockSize, totalSize):
    percent = int(count * blockSize * 100 / totalSize)
    dp = utils.Progress(
        "[COLOR tomato]CerebroTV Checking For Updates[/COLOR]",
        line1="[COLOR yellow]Please Wait Download in Progress[/COLOR].",
        line2="[COLOR gold]CerebroTV Update Service[/COLOR]",
        line3="test")
    dp.update(percent)
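# A minimal sketch of how dlProgress is presumably wired up: its
# (count, blockSize, totalSize) signature matches the reporthook callback of
# urlretrieve, which calls the hook after each downloaded chunk. The URL and
# destination path below are hypothetical, and the Python 2 urllib module is
# assumed to match the add-on code above (urllib.request.urlretrieve on Python 3).
import urllib
UPDATE_URL = 'http://example.com/cerebrotv_update.zip'   # hypothetical URL
urllib.urlretrieve(UPDATE_URL, '_ct_update.zip', reporthook=dlProgress)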
def _doImportFromRemote():
    try:
        location = LOCATION.replace(' ', '%20')
        file = os.path.join(HOME, '_sf_temp.zip')
        dp = utils.Progress(TITLE, line1=GETTEXT(30140) % GETTEXT(30000),
                            line2=location.replace('%20', ' '), line3=GETTEXT(30141))

        import download
        download.doDownload(location, file, TITLE)

        if os.path.exists(file):
            success = extractAll(file, dp, location.replace('%20', ' '))
            utils.DeleteFile(file)
            return success
    except Exception as e:
        utils.log(e)
def gen_model_df(run, model_file):
    '''
    Returns a DataFrame with times as rows and channels as columns. Cells
    are filled with the most likely model number.
    '''
    all_lc = list(itertools.product(range(len(run.time_dirs)), run.channels))
    # Create empty DataFrame
    df = pd.DataFrame(index=run.gps_times, columns=run.channels)
    p = utils.Progress(all_lc, 'Generating best model DataFrame...')
    for i, tup in enumerate(all_lc):
        t, channel = tup
        c = run.get_channel_index(channel)
        # linechain file name
        lc_file = os.path.join(run.time_dirs[t], f'linechain_channel{c}.dat')
        # Find the mode
        model = get_counts(lc_file).argmax()
        df.loc[run.gps_times[t], channel] = model
        # Update progress
        p.update(i)
    # Write to CSV
    df.to_csv(model_file, sep=' ')
    return df
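# A minimal usage sketch for gen_model_df, assuming a Run object initialized
# with utils.init_runs as in the main() functions below; the run directory and
# output file name here are hypothetical.
runs = utils.init_runs([os.path.join('data', 'run_b', 'run_b1')])
model_df = gen_model_df(runs[0], model_file='best_models.dat')
print(model_df.head())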
def save_summary(run):
    '''
    Returns a multi-index DataFrame of PSD summaries across multiple times
    from one run folder. The first index represents channel, the second GPS
    time and the third frequency. Inserts blank rows in place of time gaps.

    Input
    -----
      run : Run object
    '''
    # Set up progress indicator
    p = utils.Progress(run.time_dirs, f'Importing {run.name} psd files...')
    # Concatenate DataFrames of all times; takes a while
    summaries = []
    for i, d in enumerate(run.time_dirs):
        summaries.append(summarize_psd(run, d))
        # Update progress indicator
        p.update(i)
    summaries = pd.concat(summaries)

    # Check for time gaps and fill with NaN DataFrames
    print('Checking for time gaps...')
    frequencies = summaries.index.unique(level='FREQ')
    midx = pd.MultiIndex.from_product(
        [run.channels, run.missing_times, frequencies],
        names=['CHANNEL', 'TIME', 'FREQ']
    )
    filler = pd.DataFrame(columns=summaries.columns, index=midx)
    summaries = summaries.append(filler).sort_index(level=[0, 1, 2])
    print(f'Filled {len(run.missing_times)} missing times with NaN.')

    # Output to file
    print(f'Writing to {run.psd_file}...')
    summaries.to_pickle(run.psd_file)
    return summaries
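# A short, hedged example of reading the pickled summary back and slicing it
# by the (CHANNEL, TIME, FREQ) MultiIndex described in the docstring above,
# assuming a Run object `run` as in save_summary; the channel name 'x' is a
# hypothetical placeholder.
summaries = pd.read_pickle(run.psd_file)
channel_psd = summaries.xs('x', level='CHANNEL')
first_time = channel_psd.index.unique(level='TIME')[0]
psd_at_first_time = channel_psd.xs(first_time, level='TIME')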
def main(args):
  """Main function to train the model.

  Args:
    args: Parsed arguments.

  Returns:
    Execution status defined by `constants.ExitCode`.
  """
  # Validate paths.
  if not validate_paths(args):
    return constants.ExitCode.INVALID_PATH

  # Extract paths.
  input_dir = args.input_dir
  model_dir = args.model_dir
  log_dir = args.log_dir
  existing_model = args.existing_model

  # Extract model parameters.
  batch_size = args.batch_size
  dropout_pkeep = args.dropout_pkeep
  hidden_state_size = args.hidden_state_size
  hidden_layer_size = args.hidden_layer_size
  learning_rate = args.learning_rate

  # Extract additional flags.
  debug = args.debug
  validation = args.validation

  # Split corpus for training and validation.
  # validation_text will be empty if validation is False.
  code_text, validation_text, input_ranges = utils.read_data_files(
      input_dir, validation=validation)

  # Bail out if we don't have enough corpus for training.
  if len(code_text) < batch_size * constants.TRAINING_SEQLEN + 1:
    return constants.ExitCode.CORPUS_TOO_SMALL

  # Get corpus files info. Will be used in debug mode to generate sample text.
  files_info_list = []
  if debug:
    files_info_list = utils.get_files_info(input_dir)
    assert files_info_list

  # Calculate validation batch size. It will be 0 if we choose not to validate.
  validation_batch_size = len(validation_text) // constants.VALIDATION_SEQLEN

  # Display some stats on the data.
  epoch_size = len(code_text) // (batch_size * constants.TRAINING_SEQLEN)
  utils.print_data_stats(len(code_text), len(validation_text), epoch_size)

  # Set graph-level random seed, so any random sequence generated in this
  # graph is repeatable. It could also be removed.
  tf.set_random_seed(0)

  # Define placeholder for learning rate, dropout and batch size.
  lr = tf.placeholder(tf.float32, name='lr')
  pkeep = tf.placeholder(tf.float32, name='pkeep')
  batchsize = tf.placeholder(tf.int32, name='batchsize')

  # Input data.
  input_bytes = tf.placeholder(tf.uint8, [None, None], name='input_bytes')
  input_onehot = tf.one_hot(input_bytes, constants.ALPHA_SIZE, 1.0, 0.0)

  # Expected outputs = same sequence shifted by 1, since we are trying to
  # predict the next character.
  expected_bytes = tf.placeholder(tf.uint8, [None, None], name='expected_bytes')
  expected_onehot = tf.one_hot(expected_bytes, constants.ALPHA_SIZE, 1.0, 0.0)

  # Input state.
  hidden_state = tf.placeholder(
      tf.float32, [None, hidden_state_size * hidden_layer_size],
      name='hidden_state')

  # "naive dropout" implementation.
  cells = [rnn.GRUCell(hidden_state_size) for _ in range(hidden_layer_size)]
  dropcells = [
      rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells
  ]
  multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False)
  multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)

  output_raw, next_state = tf.nn.dynamic_rnn(
      multicell, input_onehot, dtype=tf.float32, initial_state=hidden_state)
  next_state = tf.identity(next_state, name='next_state')

  # Reshape training outputs.
  output_flat = tf.reshape(output_raw, [-1, hidden_state_size])
  output_logits = layers.linear(output_flat, constants.ALPHA_SIZE)

  # Reshape expected outputs.
  expected_flat = tf.reshape(expected_onehot, [-1, constants.ALPHA_SIZE])

  # Compute training loss.
  loss = tf.nn.softmax_cross_entropy_with_logits_v2(
      logits=output_logits, labels=expected_flat)
  loss = tf.reshape(loss, [batchsize, -1])

  # Use softmax to normalize training outputs.
  output_onehot = tf.nn.softmax(output_logits, name='output_onehot')

  # Use argmax to get the max value, which is the predicted bytes.
  output_bytes = tf.argmax(output_onehot, 1)
  output_bytes = tf.reshape(output_bytes, [batchsize, -1], name='output_bytes')

  # Choose Adam optimizer to compute gradients.
  optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

  # Stats for display.
  seqloss = tf.reduce_mean(loss, 1)
  batchloss = tf.reduce_mean(seqloss)
  accuracy = tf.reduce_mean(
      tf.cast(
          tf.equal(expected_bytes, tf.cast(output_bytes, tf.uint8)),
          tf.float32))
  loss_summary = tf.summary.scalar('batch_loss', batchloss)
  acc_summary = tf.summary.scalar('batch_accuracy', accuracy)
  summaries = tf.summary.merge([loss_summary, acc_summary])

  # Init Tensorboard stuff.
  # This will save Tensorboard information in folder specified in command line.
  # Two sets of data are saved so that you can compare training and
  # validation curves visually in Tensorboard.
  timestamp = str(math.trunc(time.time()))
  summary_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-training'))
  validation_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-validation'))

  # Init for saving models.
  # They will be saved into a directory specified in command line.
  saver = tf.train.Saver(max_to_keep=constants.MAX_TO_KEEP)

  # For display: init the progress bar.
  step_size = batch_size * constants.TRAINING_SEQLEN
  frequency = constants.DISPLAY_FREQ * step_size
  progress = utils.Progress(
      constants.DISPLAY_FREQ,
      size=constants.DISPLAY_LEN,
      msg='Training on next {} batches'.format(constants.DISPLAY_FREQ))

  # Set initial state.
  state = np.zeros([batch_size, hidden_state_size * hidden_layer_size])
  session = tf.Session()

  # We continue training on existing model, or start with a new model.
  if existing_model:
    print('Continue training on existing model: {}'.format(existing_model))
    try:
      saver.restore(session, existing_model)
    except:
      print(
          ('Failed to restore existing model since model '
           'parameters do not match.'),
          file=sys.stderr)
      return constants.ExitCode.TENSORFLOW_ERROR
  else:
    print('No existing model provided. Start training with a new model.')
    session.run(tf.global_variables_initializer())

  # Num of bytes we have trained so far.
  steps = 0

  # Training loop.
  for input_batch, expected_batch, epoch in utils.rnn_minibatch_sequencer(
      code_text, batch_size, constants.TRAINING_SEQLEN,
      nb_epochs=constants.EPOCHS):

    # Train on one mini-batch.
    feed_dict = {
        input_bytes: input_batch,
        expected_bytes: expected_batch,
        hidden_state: state,
        lr: learning_rate,
        pkeep: dropout_pkeep,
        batchsize: batch_size
    }
    _, predicted, new_state = session.run(
        [optimizer, output_bytes, next_state], feed_dict=feed_dict)

    # Log training data for Tensorboard display a mini-batch of sequences
    # every `frequency` batches.
    if debug and steps % frequency == 0:
      feed_dict = {
          input_bytes: input_batch,
          expected_bytes: expected_batch,
          hidden_state: state,
          pkeep: 1.0,
          batchsize: batch_size
      }
      predicted, seq_loss, batch_loss, acc_value, summaries_value = session.run(
          [output_bytes, seqloss, batchloss, accuracy, summaries],
          feed_dict=feed_dict)
      utils.print_learning_learned_comparison(
          input_batch, predicted, seq_loss, input_ranges, batch_loss,
          acc_value, epoch_size, steps, epoch)
      summary_writer.add_summary(summaries_value, steps)

    # Run a validation step every `frequency` batches.
    # The validation text should be a single sequence but that's too slow.
    # We cut it up and batch the pieces (slightly inaccurate).
    if validation and steps % frequency == 0 and validation_batch_size:
      utils.print_validation_header(len(code_text), input_ranges)
      validation_x, validation_y, _ = next(
          utils.rnn_minibatch_sequencer(validation_text, validation_batch_size,
                                        constants.VALIDATION_SEQLEN, 1))
      null_state = np.zeros(
          [validation_batch_size, hidden_state_size * hidden_layer_size])
      feed_dict = {
          input_bytes: validation_x,
          expected_bytes: validation_y,
          hidden_state: null_state,
          pkeep: 1.0,
          batchsize: validation_batch_size
      }
      batch_loss, acc_value, summaries_value = session.run(
          [batchloss, accuracy, summaries], feed_dict=feed_dict)
      utils.print_validation_stats(batch_loss, acc_value)

      # Save validation data for Tensorboard.
      validation_writer.add_summary(summaries_value, steps)

    # Display a short text generated with the current weights and biases.
    # If enabled, there will be a large output.
    if debug and steps // 4 % frequency == 0:
      utils.print_text_generation_header()
      file_info = utils.random_element_from_list(files_info_list)
      first_byte, file_size = file_info['first_byte'], file_info['file_size']
      ry = np.array([[first_byte]])
      rh = np.zeros([1, hidden_state_size * hidden_layer_size])
      sample = [first_byte]
      for _ in range(file_size - 1):
        feed_dict = {
            input_bytes: ry,
            pkeep: 1.0,
            hidden_state: rh,
            batchsize: 1
        }
        ryo, rh = session.run([output_onehot, next_state], feed_dict=feed_dict)
        rc = utils.sample_from_probabilities(ryo, topn=10 if epoch <= 1 else 2)
        sample.append(rc)
        ry = np.array([[rc]])
      print(repr(utils.decode_to_text(sample)))
      utils.print_text_generation_footer()

    # Save a checkpoint every `10 * frequency` batches. Each checkpoint is
    # a version of model.
    if steps // 10 % frequency == 0:
      saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp
      saved_model_path = os.path.join(model_dir, saved_model_name)
      saved_model = saver.save(session, saved_model_path, global_step=steps)
      print('Saved model: {}'.format(saved_model))

    # Display progress bar.
    if debug:
      progress.step(reset=steps % frequency == 0)

    # Update state.
    state = new_state
    steps += step_size

  # Save the model after training is done.
  saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp
  saved_model_path = os.path.join(model_dir, saved_model_name)
  saved_model = saver.save(session, saved_model_path, global_step=steps)
  print('Saved model: {}'.format(saved_model))

  return constants.ExitCode.SUCCESS
def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        description='Generate linechain summaries and plots.')
    parser.add_argument(
        'runs', type=str, nargs='*',
        help='run directory name (default: all folders in "data/" directory)')
    parser.add_argument(
        '-c', '--compare', dest='compare', action='store_true',
        help='compare summary plots for different runs side by side')
    parser.add_argument(
        '--overwrite-all', dest='overwrite', action='store_true',
        help='re-generate summary files even if they already exist (default: '
             'ask for each run)')
    parser.add_argument(
        '--keep-all', dest='keep', action='store_true',
        help='do not generate summary file if it already exists (default: '
             'ask for each run)')
    args = parser.parse_args()

    # Add all runs in data directory if none are specified
    if len(args.runs) == 0:
        args.runs = glob(f'data{os.sep}*{os.sep}*{os.sep}')

    # Initialize run objects; skip missing directories
    runs = utils.init_runs(args.runs)

    for run in runs:
        print(f'\n-- {run.mode} {run.name} --')
        # Log output file
        log_file = os.path.join(run.summary_dir, 'linechain.log')

        # Confirm to overwrite if summary already exists
        if args.keep:
            overwrite = False
        elif args.overwrite:
            overwrite = True
        elif os.path.exists(run.linechain_file):
            over = input('Found linechain.pkl for this run. Overwrite? (y/N) ')
            overwrite = True if over == 'y' else False
        else:
            overwrite = True

        if overwrite:
            run.linecounts, run.lc_summary = save_summary(run, log_file)
        else:
            run.lc_summary = pd.read_pickle(run.linechain_file)
            run.linecounts = pd.read_pickle(run.linecounts_file)

        if not args.compare:
            # Plot line parameters
            print('Plotting...')
            # Plot linecount colormaps
            for i, channel in enumerate(run.channels):
                plot_file = os.path.join(run.plot_dir, f'linecounts{i}.png')
                plot.linecounts_cmap(run, channel, plot_file)
                if channel in run.lc_summary.index.unique(level='CHANNEL'):
                    for param in run.lc_summary.index.unique(level='PARAMETER'):
                        plot_file = os.path.join(
                            run.plot_dir, f'linechain_{param.lower()}{i}.png')
                        plot.linechain_scatter(
                            run, channel, param, plot_file=plot_file, show=False)

    if args.compare:
        p = utils.Progress(runs[0].channels, '\nPlotting run comparisons...')
        multirun_dir = os.path.join('out', 'multirun')
        if not os.path.exists(multirun_dir):
            os.makedirs(multirun_dir)
        for i, channel in enumerate(runs[0].channels):
            plot.compare_linecounts(
                runs, channel,
                plot_file=os.path.join(multirun_dir, f'linecounts{i}.png'))
            p.update(i)

    print('Done!')
def save_summary(run, log_file=None):
    '''
    Returns a DataFrame of model counts and a DataFrame of spectral line
    summary statistics for all linechain files in the given run.

    Input
    -----
      run : Run object
      log_file : string, path to log file (if any)
    '''
    # Set up log file
    log = utils.Log(log_file, f'linechain.py log file for {run.name}')

    # Generate iterable of channels and times
    all_lc = list(itertools.product(run.channels, run.time_dirs))
    counts = []
    summaries = []

    # Set up progress indicator
    p = utils.Progress(all_lc, f'Importing {run.name} linechain...')
    for i, t in enumerate(all_lc):
        channel, time_dir = t
        ch_idx = run.get_channel_index(channel)
        # Counts for each viable model
        lc_file = os.path.join(time_dir, f'linechain_channel{ch_idx}.dat')
        time_counts = get_counts(lc_file)
        counts.append(time_counts)
        # Spectral line summary statistics
        summaries.append(
            summarize_linechain(run, time_dir, channel, time_counts, log))
        # Update progress indicator
        p.update(i)

    # Combine counts into one DataFrame
    counts = pd.DataFrame(
        counts,
        index=pd.MultiIndex.from_product(
            [run.channels, run.gps_times], names=['CHANNEL', 'TIME']))
    # Combine with DataFrame of missing times
    missing = pd.DataFrame(
        columns=counts.columns,
        index=pd.MultiIndex.from_product(
            [run.channels, run.missing_times], names=['CHANNEL', 'TIME']))
    counts = pd.concat([counts, missing]).sort_index(level=[0, 1])
    counts = counts.astype('float64')

    # Log final output
    log.log('All line counts:')
    log.log(counts.to_string(max_cols=80))

    # Output to file
    counts.to_pickle(run.linecounts_file)
    print('Model counts written to ' + run.linecounts_file)

    # Combine summaries into one DataFrame
    summaries = pd.concat(summaries, axis=0)
    midx = pd.MultiIndex.from_tuples(
        summaries.index, names=['CHANNEL', 'TIME', 'LINE', 'PARAMETER'])
    summaries.index = midx

    # Log final output
    log.log('All summaries:')
    log.log(summaries.to_string(max_cols=80))

    # Output to file
    summaries.to_pickle(run.linechain_file)
    print('Summary written to ' + run.linechain_file)
    return counts, summaries
def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        description='Generate PSD summaries and plots.')
    parser.add_argument(
        'runs', type=str, nargs='*',
        help='run directory name (default: all folders in "data/" directory)')
    parser.add_argument(
        '-c', '--compare', dest='compare', action='store_true',
        help='compare summary plots for different runs side by side')
    parser.add_argument(
        '--overwrite-all', dest='overwrite', action='store_true',
        help='re-generate summary files even if they already exist (default: '
             'ask for each run)')
    parser.add_argument(
        '--keep-all', dest='keep', action='store_true',
        help='do not generate summary file if it already exists (default: '
             'ask for each run)')
    args = parser.parse_args()

    # Add all runs in data directory if none are specified
    if len(args.runs) == 0:
        args.runs = glob(f'data{os.sep}*{os.sep}*{os.sep}')

    # Initialize run objects; skip missing directories
    runs = utils.init_runs(args.runs)

    # Import impacts file, if any
    impacts_file = 'impacts.dat'
    impacts = np.array([])
    if os.path.exists(impacts_file):
        impacts = get_impacts(impacts_file)

    for run in runs:
        print(f'\n-- {run.mode} {run.name} --')
        # Log output file
        log_file = os.path.join(run.summary_dir, 'psd.log')
        log = utils.Log(log_file, f'psd.py log file for {run.name}')

        # Confirm to overwrite if summary already exists
        if args.keep:
            overwrite = False
        elif args.overwrite:
            overwrite = True
        elif os.path.exists(run.psd_file):
            over = input('Found psd.pkl for this run. Overwrite? (y/N) ')
            overwrite = True if over == 'y' else False
        else:
            overwrite = True

        # Import / generate summary PSD DataFrame
        if overwrite:
            run.psd_summary = save_summary(run)
        else:
            run.psd_summary = pd.read_pickle(run.psd_file)

        # Make plots
        df = run.psd_summary
        # Frequency slices: roughly logarithmic, low-frequency
        plot_frequencies = np.array([1e-3, 3e-3, 5e-3, 1e-2, 3e-2, 5e-2])
        plot_frequencies = get_exact_freq(run.psd_summary, plot_frequencies)
        # Time slices: get even spread of times
        n = 6
        indices = [int(i / (n-1) * len(run.gps_times)) for i in range(1, n-1)]
        slice_times = sorted([run.gps_times[0], run.gps_times[-1]]
                             + [run.gps_times[i] for i in indices])

        if not args.compare:
            p = utils.Progress(run.channels, 'Plotting...')
            for i, channel in enumerate(run.channels):
                # FFT analysis
                fft_file = os.path.join(run.plot_dir, f'fft{i}.png')
                rfftfreq, rfft = fft(run, channel, plot_frequencies, log)
                plot.fft(rfftfreq, rfft, run, channel, plot_frequencies,
                         logfreq=False, plot_file=fft_file)
                # Colormap
                cmap_file = os.path.join(run.plot_dir, f'colormap{i}.png')
                plot.save_colormaps(run, channel, cmap_file)
                # Frequency slices
                fslice_file = os.path.join(run.plot_dir, f'fslice{i}.png')
                plot.save_freq_slices([run], channel, plot_frequencies,
                                      impacts=impacts, plot_file=fslice_file)
                # Time slices
                tslice_file = os.path.join(run.plot_dir, f'tslice{i}.png')
                plot.save_time_slices(run, channel, slice_times, tslice_file)
                # Update progress
                p.update(i)

    # Plot run comparisons
    if args.compare:
        p = utils.Progress(runs[0].channels, '\nPlotting run comparisons...')
        multirun_dir = os.path.join('out', 'multirun')
        if not os.path.exists(multirun_dir):
            os.makedirs(multirun_dir)
        for i, channel in enumerate(runs[0].channels):
            plot.compare_colormaps(
                runs, channel,
                plot_file=os.path.join(multirun_dir, f'colormap{i}.png'))
            plot.save_freq_slices(
                runs, channel, plot_frequencies, impacts=impacts,
                plot_file=os.path.join(multirun_dir, f'fslice{i}.png'))
            fft_freqs = np.array([1e-3, 5e-3, 3e-2])
            fft_freqs = get_exact_freq(runs[0].psd_summary, fft_freqs)
            plot.compare_fft(
                runs, channel, fft_freqs,
                plot_file=os.path.join(multirun_dir, f'fft{i}.png'))
            p.update(i)

    print('Done!')