Example #1
def _doImportFromLocal(filename):
    try:
        dp = utils.Progress(TITLE, line1 = GETTEXT(30140) % GETTEXT(30000), line2 = filename, line3 = GETTEXT(30141))
        return extractAll(filename, dp, filename)

    except Exception as e:
        utils.log('Error in _doImportFromLocal %s' % str(e))
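In these first examples utils.Progress is an add-on helper around Kodi's progress dialog. A minimal sketch of what such a wrapper might look like, assuming the Kodi xbmcgui API (an illustration only, not the add-on's actual utils module):

import xbmcgui

def Progress(title, line1='', line2='', line3=''):
    # Hypothetical wrapper: Kodi 18 and earlier accept three message lines,
    # while Kodi 19+ expects a single message string in create()/update().
    dp = xbmcgui.DialogProgress()
    dp.create(title, line1, line2, line3)
    return dp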
Example #2
def _doImportFromRemote():
    try:
        location = LOCATION.replace(' ', '%20')
        file = os.path.join(HOME, '_sf_temp.zip')

        dp = utils.Progress(TITLE,
                            line1=GETTEXT(30140) % GETTEXT(30000),
                            line2=location.replace('%20', ' '),
                            line3=GETTEXT(30141))

        import download
        import urllib
        download.doDownload(urllib.quote_plus(location),
                            urllib.quote_plus(file),
                            urllib.quote_plus(TITLE),
                            quiet=True)

        if os.path.exists(file):
            success = extractAll(file, dp, location.replace('%20', ' '))
            utils.DeleteFile(file)
            return success
    except Exception as e:
        utils.log('Error in _doImportFromRemote %s' % str(e))

    return False
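Note that urllib.quote_plus is the Python 2 location of that helper; on Python 3 the same function lives in urllib.parse:

# Python 3 equivalent of the quoting used above.
from urllib.parse import quote_plus
quoted = quote_plus('path with spaces')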
Example #3
def dlProgress(count, blockSize, totalSize):
    percent = int(count * blockSize * 100 / totalSize)
    dp = utils.Progress(
        "[COLOR tomato]CerebroTV Checking For Updates[/COLOR]",
        line1="[COLOR yellow]Please Wait Download in Progress[/COLOR].",
        line2="[COLOR gold]CerebroTV Update Service[/COLOR]",
        line3="test")
    dp.update(percent)
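The (count, blockSize, totalSize) signature matches the reporthook callback that urllib's urlretrieve invokes once per downloaded block, which is why the dialog above gets re-created on every call. A minimal sketch of the usual pattern, with a placeholder URL and a hook that guards against a missing Content-Length:

import urllib.request

def report(count, block_size, total_size):
    # total_size is -1 when the server sends no Content-Length header.
    if total_size > 0:
        percent = min(int(count * block_size * 100 / total_size), 100)
        print(f'\rDownloaded {percent}%', end='', flush=True)

# The URL and filename are hypothetical, for illustration only.
urllib.request.urlretrieve('http://example.com/update.zip', 'update.zip', reporthook=report)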
Example #4
def _doImportFromRemote():
    try:
        location = LOCATION.replace(' ', '%20')
        file     = os.path.join(HOME, '_sf_temp.zip')

        dp = utils.Progress(TITLE, line1 = GETTEXT(30140) % GETTEXT(30000), line2 = location.replace('%20', ' '), line3 = GETTEXT(30141))

        import download
        download.doDownload(location, file, TITLE)

        if os.path.exists(file):
            success = extractAll(file, dp, location.replace('%20', ' '))
            utils.DeleteFile(file)
            return success
    except Exception as e:
        utils.log(e)
Example #5
def gen_model_df(run, model_file):
    '''
    Returns a DataFrame with times as rows and channels as columns. Cells
    are filled with the most likely model number
    '''
    all_lc = list(itertools.product(range(len(run.time_dirs)), run.channels))
    # Create empty DataFrame
    df = pd.DataFrame(index=run.gps_times, columns=run.channels)
    p = utils.Progress(all_lc, 'Generating best model DataFrame...')
    for i, tup in enumerate(all_lc):
        t, channel = tup
        c = run.get_channel_index(channel)
        # linechain file name
        lc_file = os.path.join(run.time_dirs[t], f'linechain_channel{c}.dat')
        # Find the mode
        model = get_counts(lc_file).argmax()
        df.loc[run.gps_times[t], channel] = model
        # Update progress
        p.update(i)
    # Write to CSV
    df.to_csv(model_file, sep=' ')
    return df
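From this example on, utils.Progress is a console progress indicator built from an iterable and a message, with update(i) called once per item. A minimal sketch of a class with that interface (an assumption about the project's helper, not its actual code):

class Progress:
    def __init__(self, iterable, message=''):
        # Remember how many items to expect and announce the task.
        self.total = len(iterable)
        if message:
            print(message)

    def update(self, i):
        # Report the fraction of items processed so far.
        percent = int((i + 1) / self.total * 100)
        print(f'\r{percent:3d}%', end='', flush=True)
        if i + 1 == self.total:
            print()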
Example #6
def save_summary(run):
    '''
    Returns a multi-index DataFrame of PSD summaries across multiple times 
    from one run folder. The first index represents channel, the second GPS time
    and the third frequency. Inserts blank rows in place of time gaps.
    
    Input
    -----
      run : Run object
    '''
    # Set up progress indicator
    p = utils.Progress(run.time_dirs, f'Importing {run.name} psd files...')
    # Concatenate DataFrames of all times; takes a while
    summaries = []
    for i, d in enumerate(run.time_dirs):
        summaries.append(summarize_psd(run, d))
        # Update progress indicator
        p.update(i)

    summaries = pd.concat(summaries)

    # Check for time gaps and fill with NaN DataFrames
    print('Checking for time gaps...')
    frequencies = summaries.index.unique(level='FREQ')
    midx = pd.MultiIndex.from_product(
        [run.channels, run.missing_times, frequencies],
        names=['CHANNEL', 'TIME', 'FREQ']
    )
    filler = pd.DataFrame(columns=summaries.columns, index=midx)
    summaries = summaries.append(filler).sort_index(level=[0, 1, 2])
    print(f'Filled {len(run.missing_times)} missing times with NaN.')
    
    # Output to file
    print(f'Writing to {run.psd_file}...')
    summaries.to_pickle(run.psd_file)
    return summaries
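One portability note: DataFrame.append, used for the gap-filling step above, was deprecated in pandas 1.4 and removed in 2.0. On current pandas the same step can be written with pd.concat:

# Equivalent gap filling on pandas >= 2.0, where DataFrame.append no longer exists.
summaries = pd.concat([summaries, filler]).sort_index(level=[0, 1, 2])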
Example #7
def main(args):
  """Main function to train the model.

  Args:
    args: Parsed arguments.

  Returns:
    Execution status defined by `constants.ExitCode`.
  """
  # Validate paths.
  if not validate_paths(args):
    return constants.ExitCode.INVALID_PATH

  # Extract paths.
  input_dir = args.input_dir
  model_dir = args.model_dir
  log_dir = args.log_dir
  existing_model = args.existing_model

  # Extract model parameters.
  batch_size = args.batch_size
  dropout_pkeep = args.dropout_pkeep
  hidden_state_size = args.hidden_state_size
  hidden_layer_size = args.hidden_layer_size
  learning_rate = args.learning_rate

  # Extract additional flags.
  debug = args.debug
  validation = args.validation

  # Split corpus for training and validation.
  # validation_text will be empty if validation is False.
  code_text, validation_text, input_ranges = utils.read_data_files(
      input_dir, validation=validation)

  # Bail out if we don't have enough corpus for training.
  if len(code_text) < batch_size * constants.TRAINING_SEQLEN + 1:
    return constants.ExitCode.CORPUS_TOO_SMALL

  # Get corpus files info. Will be used in debug mode to generate sample text.
  files_info_list = []
  if debug:
    files_info_list = utils.get_files_info(input_dir)
    assert files_info_list

  # Calculate validation batch size. It will be 0 if we choose not to validate.
  validation_batch_size = len(validation_text) // constants.VALIDATION_SEQLEN

  # Display some stats on the data.
  epoch_size = len(code_text) // (batch_size * constants.TRAINING_SEQLEN)
  utils.print_data_stats(len(code_text), len(validation_text), epoch_size)

  # Set graph-level random seed, so any random sequence generated in this
  # graph is repeatable. It could also be removed.
  tf.set_random_seed(0)

  # Define placeholder for learning rate, dropout and batch size.
  lr = tf.placeholder(tf.float32, name='lr')
  pkeep = tf.placeholder(tf.float32, name='pkeep')
  batchsize = tf.placeholder(tf.int32, name='batchsize')

  # Input data.
  input_bytes = tf.placeholder(tf.uint8, [None, None], name='input_bytes')
  input_onehot = tf.one_hot(input_bytes, constants.ALPHA_SIZE, 1.0, 0.0)

  # Expected outputs = same sequence shifted by 1, since we are trying to
  # predict the next character.
  expected_bytes = tf.placeholder(tf.uint8, [None, None], name='expected_bytes')
  expected_onehot = tf.one_hot(expected_bytes, constants.ALPHA_SIZE, 1.0, 0.0)

  # Input state.
  hidden_state = tf.placeholder(
      tf.float32, [None, hidden_state_size * hidden_layer_size],
      name='hidden_state')

  # "naive dropout" implementation.
  cells = [rnn.GRUCell(hidden_state_size) for _ in range(hidden_layer_size)]
  dropcells = [
      rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells
  ]
  multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False)
  multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)

  output_raw, next_state = tf.nn.dynamic_rnn(
      multicell, input_onehot, dtype=tf.float32, initial_state=hidden_state)
  next_state = tf.identity(next_state, name='next_state')

  # Reshape training outputs.
  output_flat = tf.reshape(output_raw, [-1, hidden_state_size])
  output_logits = layers.linear(output_flat, constants.ALPHA_SIZE)

  # Reshape expected outputs.
  expected_flat = tf.reshape(expected_onehot, [-1, constants.ALPHA_SIZE])

  # Compute training loss.
  loss = tf.nn.softmax_cross_entropy_with_logits_v2(
      logits=output_logits, labels=expected_flat)
  loss = tf.reshape(loss, [batchsize, -1])

  # Use softmax to normalize training outputs.
  output_onehot = tf.nn.softmax(output_logits, name='output_onehot')

  # Use argmax to get the max value, which is the predicted bytes.
  output_bytes = tf.argmax(output_onehot, 1)
  output_bytes = tf.reshape(output_bytes, [batchsize, -1], name='output_bytes')

  # Choose Adam optimizer to compute gradients.
  optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

  # Stats for display.
  seqloss = tf.reduce_mean(loss, 1)
  batchloss = tf.reduce_mean(seqloss)
  accuracy = tf.reduce_mean(
      tf.cast(
          tf.equal(expected_bytes, tf.cast(output_bytes, tf.uint8)),
          tf.float32))
  loss_summary = tf.summary.scalar('batch_loss', batchloss)
  acc_summary = tf.summary.scalar('batch_accuracy', accuracy)
  summaries = tf.summary.merge([loss_summary, acc_summary])

  # Init Tensorboard stuff.
  # This will save Tensorboard information in folder specified in command line.
  # Two sets of data are saved so that you can compare training and
  # validation curves visually in Tensorboard.
  timestamp = str(math.trunc(time.time()))
  summary_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-training'))
  validation_writer = tf.summary.FileWriter(
      os.path.join(log_dir, timestamp + '-validation'))

  # Init for saving models.
  # They will be saved into a directory specified in command line.
  saver = tf.train.Saver(max_to_keep=constants.MAX_TO_KEEP)

  # For display: init the progress bar.
  step_size = batch_size * constants.TRAINING_SEQLEN
  frequency = constants.DISPLAY_FREQ * step_size
  progress = utils.Progress(
      constants.DISPLAY_FREQ,
      size=constants.DISPLAY_LEN,
      msg='Training on next {} batches'.format(constants.DISPLAY_FREQ))

  # Set initial state.
  state = np.zeros([batch_size, hidden_state_size * hidden_layer_size])
  session = tf.Session()

  # We continue training on an existing model, or start with a new model.
  if existing_model:
    print('Continue training on existing model: {}'.format(existing_model))
    try:
      saver.restore(session, existing_model)
    except:
      print(
          ('Failed to restore existing model since model '
           'parameters do not match.'),
          file=sys.stderr)
      return constants.ExitCode.TENSORFLOW_ERROR
  else:
    print('No existing model provided. Start training with a new model.')
    session.run(tf.global_variables_initializer())

  # Num of bytes we have trained so far.
  steps = 0

  # Training loop.
  for input_batch, expected_batch, epoch in utils.rnn_minibatch_sequencer(
      code_text,
      batch_size,
      constants.TRAINING_SEQLEN,
      nb_epochs=constants.EPOCHS):

    # Train on one mini-batch.
    feed_dict = {
        input_bytes: input_batch,
        expected_bytes: expected_batch,
        hidden_state: state,
        lr: learning_rate,
        pkeep: dropout_pkeep,
        batchsize: batch_size
    }

    _, predicted, new_state = session.run(
        [optimizer, output_bytes, next_state], feed_dict=feed_dict)

    # Log training data for Tensorboard display a mini-batch of sequences
    # every `frequency` batches.
    if debug and steps % frequency == 0:
      feed_dict = {
          input_bytes: input_batch,
          expected_bytes: expected_batch,
          hidden_state: state,
          pkeep: 1.0,
          batchsize: batch_size
      }
      predicted, seq_loss, batch_loss, acc_value, summaries_value = session.run(
          [output_bytes, seqloss, batchloss, accuracy, summaries],
          feed_dict=feed_dict)
      utils.print_learning_learned_comparison(
          input_batch, predicted, seq_loss, input_ranges, batch_loss, acc_value,
          epoch_size, steps, epoch)
      summary_writer.add_summary(summaries_value, steps)

    # Run a validation step every `frequency` batches.
    # The validation text should be a single sequence but that's too slow.
    # We cut it up and batch the pieces (slightly inaccurate).
    if validation and steps % frequency == 0 and validation_batch_size:
      utils.print_validation_header(len(code_text), input_ranges)
      validation_x, validation_y, _ = next(
          utils.rnn_minibatch_sequencer(validation_text, validation_batch_size,
                                        constants.VALIDATION_SEQLEN, 1))
      null_state = np.zeros(
          [validation_batch_size, hidden_state_size * hidden_layer_size])
      feed_dict = {
          input_bytes: validation_x,
          expected_bytes: validation_y,
          hidden_state: null_state,
          pkeep: 1.0,
          batchsize: validation_batch_size
      }
      batch_loss, acc_value, summaries_value = session.run(
          [batchloss, accuracy, summaries], feed_dict=feed_dict)
      utils.print_validation_stats(batch_loss, acc_value)

      # Save validation data for Tensorboard.
      validation_writer.add_summary(summaries_value, steps)

    # Display a short text generated with the current weights and biases.
    # If enabled, there will be a large output.
    if debug and steps // 4 % frequency == 0:
      utils.print_text_generation_header()
      file_info = utils.random_element_from_list(files_info_list)
      first_byte, file_size = file_info['first_byte'], file_info['file_size']
      ry = np.array([[first_byte]])
      rh = np.zeros([1, hidden_state_size * hidden_layer_size])
      sample = [first_byte]
      for _ in range(file_size - 1):
        feed_dict = {
            input_bytes: ry,
            pkeep: 1.0,
            hidden_state: rh,
            batchsize: 1
        }
        ryo, rh = session.run([output_onehot, next_state], feed_dict=feed_dict)
        rc = utils.sample_from_probabilities(ryo, topn=10 if epoch <= 1 else 2)
        sample.append(rc)
        ry = np.array([[rc]])
      print(repr(utils.decode_to_text(sample)))
      utils.print_text_generation_footer()

    # Save a checkpoint every `10 * frequency` batches. Each checkpoint is
    # a version of model.
    if steps // 10 % frequency == 0:
      saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp
      saved_model_path = os.path.join(model_dir, saved_model_name)
      saved_model = saver.save(session, saved_model_path, global_step=steps)
      print('Saved model: {}'.format(saved_model))

    # Display progress bar.
    if debug:
      progress.step(reset=steps % frequency == 0)

    # Update state.
    state = new_state
    steps += step_size

  # Save the model after training is done.
  saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp
  saved_model_path = os.path.join(model_dir, saved_model_name)
  saved_model = saver.save(session, saved_model_path, global_step=steps)
  print('Saved model: {}'.format(saved_model))

  return constants.ExitCode.SUCCESS
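This training script targets TensorFlow 1.x graph mode (tf.placeholder, contrib-style rnn and layers), so it will not run on TensorFlow 2 as written. A hedged sketch of the imports a port would start from, using the v1 compatibility layer; the cell classes used above have counterparts under that namespace:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# tf.contrib is gone in TF2; the RNN cell classes live here instead.
rnn = tf.nn.rnn_cell          # GRUCell, DropoutWrapper, MultiRNNCell
# layers.linear(x, n) can be approximated with tf.layers.dense(x, n).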
Example #8
def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        description='Generate linechain summaries and plots.')
    parser.add_argument(
        'runs',
        type=str,
        nargs='*',
        help='run directory name (default: all folders in "data/" directory)')
    parser.add_argument(
        '-c',
        '--compare',
        dest='compare',
        action='store_true',
        help='compare summary plots for different runs side by side')
    parser.add_argument(
        '--overwrite-all',
        dest='overwrite',
        action='store_true',
        help='re-generate summary files even if they already exist (default: \
              ask for each run)')
    parser.add_argument(
        '--keep-all',
        dest='keep',
        action='store_true',
        help='do not generate summary file if it already exists (default: ask \
              for each run)')
    args = parser.parse_args()
    # Add all runs in data directory if none are specified
    if len(args.runs) == 0:
        args.runs = glob(f'data{os.sep}*{os.sep}*{os.sep}')

    # Initialize run objects; skip missing directories
    runs = utils.init_runs(args.runs)

    for run in runs:
        print(f'\n-- {run.mode} {run.name} --')
        # Log output file
        log_file = os.path.join(run.summary_dir, 'linechain.log')
        # Confirm to overwrite if summary already exists
        if args.keep: overwrite = False
        elif args.overwrite: overwrite = True
        elif os.path.exists(run.linechain_file):
            over = input('Found linechain.pkl for this run. Overwrite? (y/N) ')
            overwrite = True if over == 'y' else False
        else:
            overwrite = True

        if overwrite:
            run.linecounts, run.lc_summary = save_summary(run, log_file)
        else:
            run.lc_summary = pd.read_pickle(run.linechain_file)
            run.linecounts = pd.read_pickle(run.linecounts_file)

        if not args.compare:
            # Plot line parameters
            print('Plotting...')
            # Plot linecount colormaps
            for i, channel in enumerate(run.channels):
                plot_file = os.path.join(run.plot_dir, f'linecounts{i}.png')
                plot.linecounts_cmap(run, channel, plot_file)
                if channel in run.lc_summary.index.unique(level='CHANNEL'):
                    for param in run.lc_summary.index.unique(
                            level='PARAMETER'):
                        plot_file = os.path.join(
                            run.plot_dir, f'linechain_{param.lower()}{i}.png')
                        plot.linechain_scatter(run,
                                               channel,
                                               param,
                                               plot_file=plot_file,
                                               show=False)

    if args.compare:
        p = utils.Progress(runs[0].channels, '\nPlotting run comparisons...')
        multirun_dir = os.path.join('out', 'multirun')
        if not os.path.exists(multirun_dir): os.makedirs(multirun_dir)
        for i, channel in enumerate(runs[0].channels):
            plot.compare_linecounts(runs,
                                    channel,
                                    plot_file=os.path.join(
                                        multirun_dir, f'linecounts{i}.png'))
            p.update(i)

    print('Done!')
Example #9
def save_summary(run, log_file=None):
    '''
    Returns a summary DataFrame for all linechain files in the given run.
    
    Input
    -----
      run : Run object
      log_file : string, path to log file (if any)
    '''
    # Set up log file
    log = utils.Log(log_file, f'linechain.py log file for {run.name}')

    # Generate iterable of channels and times
    all_lc = list(itertools.product(run.channels, run.time_dirs))
    counts = []
    summaries = []
    # Set up progress indicator
    p = utils.Progress(all_lc, f'Importing {run.name} linechain...')
    for i, t in enumerate(all_lc):
        channel, time_dir = t
        ch_idx = run.get_channel_index(channel)
        # Counts for each viable model
        lc_file = os.path.join(time_dir, f'linechain_channel{ch_idx}.dat')
        time_counts = get_counts(lc_file)
        counts.append(time_counts)
        # Spectral line summary statistics
        summaries.append(
            summarize_linechain(run, time_dir, channel, time_counts, log))
        # Update progress indicator
        p.update(i)

    # Combine counts into one DataFrame
    counts = pd.DataFrame(counts,
                          index=pd.MultiIndex.from_product(
                              [run.channels, run.gps_times],
                              names=['CHANNEL', 'TIME']))
    # Combine with DataFrame of missing times
    missing = pd.DataFrame(columns=counts.columns,
                           index=pd.MultiIndex.from_product(
                               [run.channels, run.missing_times],
                               names=['CHANNEL', 'TIME']))
    counts = pd.concat([counts, missing]).sort_index(level=[0, 1])
    counts = counts.astype('float64')
    # Log final output
    log.log('All line counts:')
    log.log(counts.to_string(max_cols=80))
    # Output to file
    counts.to_pickle(run.linecounts_file)
    print('Model counts written to ' + run.linecounts_file)

    # Combine summaries into one DataFrame
    summaries = pd.concat(summaries, axis=0)
    midx = pd.MultiIndex.from_tuples(
        summaries.index, names=['CHANNEL', 'TIME', 'LINE', 'PARAMETER'])
    summaries.index = midx
    # Log final output
    log.log('All summaries:')
    log.log(summaries.to_string(max_cols=80))
    # Output to file
    summaries.to_pickle(run.linechain_file)
    print('Summary written to ' + run.linechain_file)
    return counts, summaries
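The returned DataFrames are MultiIndexed on CHANNEL, TIME (and LINE/PARAMETER for the summaries), so a single channel's block can be pulled back out with a cross-section; the channel name here is a placeholder:

# 'channel_x' is hypothetical; .xs() selects all rows at one level of the MultiIndex.
one_channel = summaries.xs('channel_x', level='CHANNEL')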
Example #10
def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        description='Generate PSD summaries and plots.'
    )
    parser.add_argument('runs', type=str, nargs='*', 
        help='run directory name (default: all folders in "data/" directory)'
    )
    parser.add_argument('-c', '--compare', dest='compare', action='store_true',
            help='compare summary plots for different runs side by side')
    parser.add_argument('--overwrite-all', dest='overwrite', action='store_true',
        help='re-generate summary files even if they already exist (default: \
              ask for each run)'
    )
    parser.add_argument('--keep-all', dest='keep', action='store_true',
        help='do not generate summary file if it already exists (default: ask \
              for each run)'
    )
    args = parser.parse_args()
    # Add all runs in data directory if none are specified
    if len(args.runs) == 0: 
        args.runs = glob(f'data{os.sep}*{os.sep}*{os.sep}')
    
    # Initialize run objects; skip missing directories
    runs = utils.init_runs(args.runs)
    
    # Import impacts file, if any
    impacts_file = 'impacts.dat'
    impacts = np.array([])
    if os.path.exists(impacts_file):
        impacts = get_impacts(impacts_file)
    
    for run in runs:
        print(f'\n-- {run.mode} {run.name} --')
        # Log output file
        log_file = os.path.join(run.summary_dir, 'psd.log')
        log = utils.Log(log_file, f'psd.py log file for {run.name}')
        # Confirm to overwrite if summary already exists
        if args.keep: overwrite = False
        elif args.overwrite: overwrite = True
        elif os.path.exists(run.psd_file):
            over = input('Found psd.pkl for this run. Overwrite? (y/N) ')
            overwrite = True if over == 'y' else False
        else: overwrite = True

        # Import / generate summary PSD DataFrame
        if overwrite:
            run.psd_summary = save_summary(run)
        else:
            run.psd_summary = pd.read_pickle(run.psd_file)
        
        # Make plots
        df = run.psd_summary
        # Frequency slices: roughly logarithmic, low-frequency
        plot_frequencies = np.array([1e-3, 3e-3, 5e-3, 1e-2, 3e-2, 5e-2])
        plot_frequencies = get_exact_freq(run.psd_summary, plot_frequencies)
        # Time slices: get even spread of times
        n = 6
        indices = [int(i / (n-1) * len(run.gps_times)) for i in range(1,n-1)]
        slice_times = sorted([run.gps_times[0], run.gps_times[-1]] +
            [run.gps_times[i] for i in indices]
        )
        
        if not args.compare:
            p = utils.Progress(run.channels, 'Plotting...')
            for i, channel in enumerate(run.channels):
                # FFT analysis
                fft_file = os.path.join(run.plot_dir, f'fft{i}.png')
                rfftfreq, rfft = fft(run, channel, plot_frequencies, log)
                plot.fft(rfftfreq, rfft, run, channel, plot_frequencies, 
                        logfreq=False, plot_file=fft_file)
                # Colormap
                cmap_file = os.path.join(run.plot_dir, f'colormap{i}.png')
                plot.save_colormaps(run, channel, cmap_file)
                # Frequency slices
                fslice_file = os.path.join(run.plot_dir, f'fslice{i}.png')
                plot.save_freq_slices([run], channel, plot_frequencies, 
                        impacts=impacts, plot_file=fslice_file)
                # Time slices
                tslice_file = os.path.join(run.plot_dir, f'tslice{i}.png')
                plot.save_time_slices(run, channel, slice_times, tslice_file)
                # Update progress
                p.update(i)
        
    # Plot run comparisons
    if args.compare:
        p = utils.Progress(runs[0].channels, '\nPlotting run comparisons...')
        multirun_dir = os.path.join('out', 'multirun')
        if not os.path.exists(multirun_dir): os.makedirs(multirun_dir)
        for i, channel in enumerate(runs[0].channels):
            plot.compare_colormaps(runs, channel, 
                    plot_file=os.path.join(multirun_dir, f'colormap{i}.png'))
            plot.save_freq_slices(runs, channel, plot_frequencies, 
                    impacts=impacts, 
                    plot_file=os.path.join(multirun_dir, f'fslice{i}.png'))
            fft_freqs = np.array([1e-3, 5e-3, 3e-2])
            fft_freqs = get_exact_freq(runs[0].psd_summary, fft_freqs)
            plot.compare_fft(runs, channel, fft_freqs, 
                    plot_file=os.path.join(multirun_dir, f'fft{i}.png'))
            p.update(i)
    
    print('Done!')