Example #1
def parrot_initialization_rgc(dataset, emb_path, dc=None, encoder=None, dddqn=None):
  '''
  Trains the rgc to repeat the input
  '''
  # TODO save optimizer
  if dc is None:
    dc = DataContainer(dataset, emb_path)
    dc.prepare_data()
  x_batch, y_parrot_batch, sl_batch = u.to_batch(dc.x, dc.y_parrot_padded, dc.sl, batch_size=dc.batch_size)

  # initialize rnn cell of the encoder and the dddqn
  rep = input('Load RNN cell pretrained for the encoder & dddqn? (y or n): ')
  if encoder is None:
    encoder = EncoderRNN(num_units=256)
  if rep == 'y' or rep == '':
    encoder.load(name='EncoderRNN-0')
  else:
    choose_best_rnn_pretrained(encoder, encoder.encoder_cell, dc, search_size=1, multiprocessed=False)
  # we do not need to train the dddqn rnn layer since we already trained the encoder rnn layer
  # we just have to initialize the dddqn rnn layer weights with the ones from the encoder
  if dddqn is None:
    dddqn = DDDQN(dc.word2idx, dc.idx2word, dc.idx2emb)
  u.init_rnn_layer(dddqn.lstm)
  u.update_layer(dddqn.lstm, encoder.encoder_cell)

  # define the loss function used to pretrain the rgc
  def get_loss(encoder, dddqn, epoch, x, y, sl, sos, max_steps, verbose=True):
    preds, logits, _, _, _ = pu.full_encoder_dddqn_pass(x, sl, encoder, dddqn, sos, max_steps, training=True)
    logits = tf.nn.softmax(logits)  # normalize logits between 0 & 1 to allow training through cross-entropy
    sl = [end_idx + 1 for end_idx in sl]  # sl = [len(sequence)-1, ...] => +1 to get the len
    loss = u.cross_entropy_cost(logits, y, sequence_lengths=sl)
    if verbose:
      acc_words, acc_sentences = u.get_acc_word_seq(logits, y, sl)
      logging.info('Epoch {} -> loss = {} | acc_words = {} | acc_sentences = {}'.format(epoch, loss, acc_words, acc_sentences))
    return loss

  rep = input('Load pretrained RGC-ENCODER-DDDQN? (y or n): ')
  if rep == 'y' or rep == '':
    encoder.load('RGC/Encoder')
    dddqn.load('RGC/DDDQN')

  rep = input('Train RGC-ENCODER-DDDQN? (y or n): ')
  if rep == 'y' or rep == '':
    optimizer = tf.train.AdamOptimizer()
    # training loop over epochs and batches
    for epoch in range(300):
      verbose = True
      for x, y, sl in zip(x_batch, y_parrot_batch, sl_batch):
        sos = dc.get_sos_batch_size(len(x))
        optimizer.minimize(lambda: get_loss(encoder, dddqn, epoch, x, y, sl, sos, dc.max_tokens, verbose=verbose))
        verbose = False
      encoder.save(name='RGC/Encoder')
      dddqn.save(name='RGC/DDDQN')
      acc = pu.get_acc_full_dataset(dc, encoder, dddqn)
      logging.info('Validation accuracy = {}'.format(acc))
      if acc > 0.95:
        logging.info('Stopping criteria on validation accuracy raised')
        break

  return encoder, dddqn, dc
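
A minimal driver sketch for this function; the dataset and embedding paths below are placeholders (not from the source project), and TF 1.x eager execution is assumed, as in the other examples.

# Hypothetical usage; paths are placeholders and tf.contrib.eager is assumed (TF 1.x).
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()
encoder, dddqn, dc = parrot_initialization_rgc('data/dataset.csv', 'data/embeddings.txt')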
Example #2
File: rgc.py Project: thbeucher/RGC
 def get_dc(self, dc, split_size):
     if dc is None:
         self.dc = DataContainer(self.dataset,
                                 self.emb_path,
                                 test_size=split_size)
         self.dc.prepare_data()
     else:
         self.dc = dc
Example #3
 def __init__(self, debug=False):
     CreepControl.__init__(self)
     DataContainer.__init__(self)
     self.debug = debug
     self.actions = []
     self.add_action = None
     self.unit_commands = [
         BlockExpansions(self),
         DefendWorkerRush(self),
         DefendRushBuildings(self),
         DistributeWorkers(self),
         ArmyControl(self),
         QueensAbilities(self),
         CreepTumor(self),
         Drone(self),
         Overseer(self),
         Overlord(self),
         Buildings(self),
     ]
     self.train_commands = [
         TrainOverlord(self),
         TrainWorker(self),
         TrainQueen(self),
         TrainUltralisk(self),
         TrainZergling(self),
         TrainOverseer(self),
         TrainMutalisk(self),
         TrainHydralisk(self),
     ]
     self.build_commands = [
         BuildPool(self),
         BuildExpansion(self),
         BuildExtractor(self),
         BuildEvochamber(self),
         BuildCavern(self),
         BuildPit(self),
         BuildHive(self),
         BuildLair(self),
         BuildSpines(self),
         BuildSpores(self),
         BuildSpire(self),
         BuildHydraden(self),
     ]
     self.upgrade_commands = [
         UpgradeChitinousPlating(self),
         UpgradeMetabolicBoost(self),
         UpgradeAdrenalGlands(self),
         UpgradeEvochamber(self),
         UpgradePneumatizedCarapace(self),
         UpgradeBurrow(self),
         UpgradeGroovedSpines(self),
         UpgradeMuscularAugments(self),
     ]
     self.locations = []
     self.ordered_expansions = []
     self.building_positions = []
Example #4
    def __init__(self, parent, showRegister):
        super().__init__()

        self.dataContainer = DataContainer(self)
        self.dataContainer.grid(row=0,
                                column=0,
                                columnspan=4,
                                padx=25,
                                pady=(25, 0),
                                sticky='NSEW')

        registerButton = tk.Button(self,
                                   text='REGISTRO',
                                   bg='#ffbf45',
                                   font=font.Font(family='Helvetica',
                                                  size='12',
                                                  weight='bold'),
                                   command=showRegister)
        registerButton.grid(row=1, column=0, padx=25, pady=10, sticky='NSEW')

        searchButton = tk.Button(self,
                                 text="BUSCAR",
                                 bg='#4a5fff',
                                 font=font.Font(family='Helvetica',
                                                size='12',
                                                weight='bold'),
                                 command=self.search)
        searchButton.grid(row=1, column=1, padx=25, pady=10, sticky='NSEW')

        deleteButton = tk.Button(self,
                                 text='ELIMINAR',
                                 bg='#ed3833',
                                 font=font.Font(family='Helvetica',
                                                size='12',
                                                weight='bold'),
                                 command=self.delete)
        deleteButton.grid(row=1, column=2, padx=25, pady=10, sticky='NSEW')

        acknowledgmentButton = tk.Button(self,
                                         text='PREMIAR',
                                         bg='#4ca0ff',
                                         font=font.Font(family='Helvetica',
                                                        size='12',
                                                        weight='bold'),
                                         command=self.acknowledge)
        acknowledgmentButton.grid(row=1,
                                  column=3,
                                  padx=25,
                                  pady=10,
                                  sticky='NSEW')
Example #5
 def create_data_container(self):
     return DataContainer(
         author_list=self.author_list,
         author_dict=self.author_dict,
         commit_list=self.commit_list,
         file_list=self.file_list,
         file_dict=self.file_dict,
         file_change_list=self.file_change_list
     )
Example #6
def get_data():
    dc = DataContainer(os.environ['INPUT'], os.environ['EMB'])
    dc.prepare_data()
    x_a = [sample for batch in dc.x_train for sample in batch] + dc.x_te
    sl_a = [sample for batch in dc.sl_train for sample in batch] + dc.sl_te
    y_parrot_a = [
        sample for batch in dc.y_parrot_padded_batch for sample in batch
    ] + dc.y_p_p_te
    sos = dc.get_sos_batch_size(len(x_a))
    encoder = EncoderRNN()
    decoder = DecoderRNN(dc.word2idx,
                         dc.idx2word,
                         dc.idx2emb,
                         max_tokens=dc.max_tokens,
                         attention=False)
    optimizer = tf.train.AdamOptimizer()
    x_batch = u.create_batch(x_a, batch_size=dc.batch_size)
    y_parrot_batch = u.create_batch(y_parrot_a, batch_size=dc.batch_size)
    sl_batch = u.create_batch(sl_a, batch_size=dc.batch_size)
    return dc, x_a, sl_a, y_parrot_a, sos, encoder, decoder, optimizer, x_batch, y_parrot_batch, sl_batch
Example #7
def get_clean_dataframes(data):
    print('received data')
    io_data = io.StringIO(data)
    df = pd.read_csv(io_data, index_col=0)
    df = clean_df(df)
    print('cleaned data')
    labels = pd.DataFrame()
    labels['target'] = df.apply(
        lambda row: is_popular(row, df['popularity'].mean()), axis=1)
    print('label targets generated')
    features = transform_min_max(df)
    print('min-maxed features')
    return DataContainer(features, labels)
Example #8
def parrot_initialization_encoder_decoder(dataset, emb_path, attention):
  '''
  Trains the encoder-decoder to reproduce the input
  '''
  dc = DataContainer(dataset, emb_path)
  dc.prepare_data()

  x_batch, y_parrot_batch, sl_batch = u.to_batch(dc.x, dc.y_parrot_padded, dc.sl, batch_size=dc.batch_size)

  def get_loss(encoder, decoder, epoch, x, y, sl, sos):
    output, cell_state = encoder.forward(x, sl)
    loss = decoder.get_loss(epoch, sos, (cell_state, output), y, sl, x, encoder.outputs)
    return loss

  if os.path.isdir('models/Encoder-Decoder'):
    rep = input('Load previously trained Encoder-Decoder? (y or n): ')
    if rep == 'y' or rep == '':
      encoder = EncoderRNN()
      decoder = DecoderRNN(dc.word2idx, dc.idx2word, dc.idx2emb, max_tokens=dc.max_tokens, attention=attention)
      encoder.load(name='Encoder-Decoder/Encoder')
      decoder.load(name='Encoder-Decoder/Decoder')
      sos = dc.get_sos_batch_size(len(dc.x))
      see_parrot_results(encoder, decoder, 'final', dc.x, dc.y_parrot_padded, dc.sl, sos, greedy=True)
    else:
      encoder, decoder = choose_coders(dc, attention, search_size=5)
  else:
    encoder, decoder = choose_coders(dc, attention, search_size=5)

  optimizer = tf.train.AdamOptimizer()

  for epoch in range(300):
    for x, y, sl in zip(x_batch, y_parrot_batch, sl_batch):
      sos = dc.get_sos_batch_size(len(x))
      # grad_n_vars = optimizer.compute_gradients(lambda: get_loss(encoder, decoder, epoch, x, y, sl, sos))
      # optimizer.apply_gradients(grad_n_vars)
      optimizer.minimize(lambda: get_loss(encoder, decoder, epoch, x, y, sl, sos))
    if epoch % 30 == 0:
      # to reduce training time, compute global accuracy only every 30 epochs
      sos = dc.get_sos_batch_size(len(dc.x))
      see_parrot_results(encoder, decoder, epoch, dc.x, dc.y_parrot_padded, dc.sl, sos, greedy=True)
      # see_parrot_results(encoder, decoder, epoch, dc.x, dc.y_parrot_padded, dc.sl, sos)
    encoder.save(name='Encoder-Decoder/Encoder')
    decoder.save(name='Encoder-Decoder/Decoder')
    if decoder.parrot_stopping:
      break
    # x_batch, y_parrot_batch, sl_batch = u.shuffle_data(x_batch, y_parrot_batch, sl_batch)
    # strangely, shuffling data between epochs makes the training really noisy

  return encoder, decoder, dc
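
A minimal call sketch, with the same placeholder paths as above (assumptions, not from the source):

# Hypothetical usage; paths are placeholders.
encoder, decoder, dc = parrot_initialization_encoder_decoder('data/dataset.csv', 'data/embeddings.txt', attention=False)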
Example #9
 def __init__(self,
              language='en',
              test_size=0.2,
              dc=None,
              input_file=None,
              prepare_data=False):
     self.test_size = test_size
     if dc is None:
         self.dc = DataContainer(input_file, '')
     else:
         self.dc = dc
     self.vectorizer = TfidfVectorizer(max_df=0.5,
                                       use_idf=True,
                                       smooth_idf=True,
                                       tokenizer=lambda x: x.split(' '))
     self.classifier = LinearSVC(tol=0.5)
     if prepare_data:
         self.prepare_data(self.dc.sources, self.dc.labels)
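
The vectorizer/classifier configuration above (TF-IDF with a whitespace tokenizer feeding a LinearSVC) can be illustrated on toy data; this sketch uses made-up sentences and is only an illustration, not code from the source project.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Made-up training data for illustration only.
sources = ['open the door', 'close the window', 'open the window please']
labels = ['open', 'close', 'open']

vectorizer = TfidfVectorizer(max_df=0.5, use_idf=True, smooth_idf=True,
                             tokenizer=lambda x: x.split(' '))
X = vectorizer.fit_transform(sources)       # rows: documents, columns: TF-IDF weights
classifier = LinearSVC(tol=0.5).fit(X, labels)
print(classifier.predict(vectorizer.transform(['please open the door'])))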
Example #10
def data_analyser(state: str, smooth_data: bool) -> DataContainer:
    # from_date = date(2020, 3, 1)
    # to_date = date.today() - timedelta(days=1)
    # dates = []

    # month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # with open('databases/database.json', 'r') as db:
    #     database = json.load(db)

    # if database['to_date'] != f'{to_date.day:02d}-{month[to_date.month-1]}-{to_date.year}':
    #     json_database_builder.main()
    #     database = None
    #     with open('databases/database.json', 'r') as db:
    #         database = json.load(db)
    if state == 'IN':
        state = 'TT'

    api_url = 'https://data.covid19india.org/v4/min/timeseries.min.json'
    time_series_response = json.loads(requests.get(api_url).text)

    state_data = time_series_response[state]['dates']
    dates_str = list(state_data.keys())

    confirmed_daily = []
    deceased_daily = []
    recovered_daily = []
    dates = []

    for date_str in dates_str:
        dates.append(date.fromisoformat(date_str))
        try:
            confirmed_daily.append(
                int(state_data[date_str]['delta']['confirmed']))
        except KeyError:
            confirmed_daily.append(0)
        try:
            deceased_daily.append(
                int(state_data[date_str]['delta']['deceased']))
        except KeyError:
            deceased_daily.append(0)
        try:
            recovered_daily.append(
                int(state_data[date_str]['delta']['recovered']))
        except KeyError:
            recovered_daily.append(0)

    confirmed_daily = np.array(confirmed_daily)
    recovered_daily = np.array(recovered_daily)
    deceased_daily = np.array(deceased_daily)

    data_len = len(dates_str)

    total_count = np.cumsum(confirmed_daily)
    total_recovered = np.cumsum(recovered_daily)
    total_deceased = np.cumsum(deceased_daily)

    active_cases = total_count - total_recovered - total_deceased

    offset_days = 30

    # beta = np.divide(confirmed_daily[offset_days:], 1)
    beta = np.gradient(total_count[offset_days:]) / active_cases[offset_days:]
    # beta = np.divide(confirmed_daily[offset_days:], active_cases[offset_days:])

    # gamma = np.divide(recovered_daily[offset_days:] + deceased_daily[offset_days:], 1)
    gamma = np.gradient(
        total_recovered[offset_days:] +
        total_deceased[offset_days:]) / active_cases[offset_days:]
    # gamma = np.divide(recovered_daily[offset_days:] + deceased_daily[offset_days:], active_cases[offset_days:])

    if smooth_data:
        gamma[gamma == 0] = np.nan
        beta[beta == 0] = np.nan

        smooth_factor = 0.25
        beta[change_factor_series(beta) > smooth_factor] = np.nan
        smooth_by_interpolation(beta)

        smooth_factor = 0.80
        gamma[change_factor_series(gamma) > smooth_factor] = np.nan
        smooth_by_interpolation(gamma)

    reproductive_number = np.divide(beta, gamma)
    beta_len = len(beta)
    counts = np.zeros((5, data_len))
    counts[0, :] = total_count
    counts[1, :] = active_cases
    counts[2, :][:beta_len] = beta
    counts[3, :][:beta_len] = gamma
    counts[4, :][:beta_len] = reproductive_number

    return DataContainer(dates, offset_days, counts)
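
For reference, the beta/gamma/reproduction-number estimates above follow a simple SIR-style calculation: daily new infections per active case, daily removals per active case, and their ratio. A toy illustration on synthetic numbers (not from the source project):

import numpy as np

# Synthetic cumulative counts, for illustration only.
total = np.array([100., 150., 225., 330., 460.])    # cumulative confirmed
removed = np.array([10., 20., 40., 70., 110.])      # cumulative recovered + deceased
active = total - removed

beta = np.gradient(total) / active        # daily new cases per active case
gamma = np.gradient(removed) / active     # daily removals per active case
r_t = beta / gamma                        # effective reproduction number estimate
print(r_t)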
Example #11
    from data_container import DataContainer

    argparser = argparse.ArgumentParser(prog='dddqn.py', description='')
    argparser.add_argument('--input',
                           metavar='INPUT',
                           default=os.environ['INPUT'],
                           type=str)
    argparser.add_argument('--emb',
                           metavar='EMB',
                           default=os.environ['EMB'],
                           type=str)
    args = argparser.parse_args()

    tfe.enable_eager_execution()

    dc = DataContainer(args.input, args.emb)
    dc.prepare_data()
    mainDQN = DDDQN(dc.word2idx,
                    dc.idx2word,
                    dc.idx2emb,
                    max_tokens=dc.max_tokens)
    targetDQN = DDDQN(dc.word2idx,
                      dc.idx2word,
                      dc.idx2emb,
                      max_tokens=dc.max_tokens)

    x = np.asarray(dc.x_train[0])
    state = mainDQN.lstm.zero_state(dc.batch_size, dtype=tf.float64)
    for mt in range(dc.max_tokens):
        Qvalue, action, state = mainDQN.forward(x[:, mt, :], state)
        targetDQN.forward(x[:, mt, :], state)
Example #12
File: rgc.py Project: thbeucher/RGC
class RGC(object):
    def __init__(self,
                 dataset,
                 emb_path,
                 name='RGC',
                 dc=None,
                 bbc=None,
                 split_size=0.5):
        self.name = name
        self.dataset = dataset
        self.emb_path = emb_path
        self.get_dc(dc, split_size)
        self.encoder = EncoderRNN(num_units=256)
        self.dddqn = DDDQN(self.dc.word2idx,
                           self.dc.idx2word,
                           self.dc.idx2emb,
                           max_tokens=self.dc.max_tokens)
        self.bbc = BlackBoxClassifier(
            dc=self.dc, prepare_data=True) if bbc is None else bbc

    def get_dc(self, dc, split_size):
        if dc is None:
            self.dc = DataContainer(self.dataset,
                                    self.emb_path,
                                    test_size=split_size)
            self.dc.prepare_data()
        else:
            self.dc = dc

    def pretrain(self):
        logging.info('Launch of parrot initialization...')
        self.encoder, self.dddqn, self.dc = parrot_initialization_rgc(
            self.dataset,
            self.emb_path,
            dc=self.dc,
            encoder=self.encoder,
            dddqn=self.dddqn)

    def update(self, rgc, init_layers=False):
        '''
    Updates RGC layers with the given RGC network.
    Set init_layers to True if the RGC network you want to update is freshly instantiated.
    '''
        if init_layers:
            self.encoder.init_layers()
            self.dddqn.init_layers()
        u.update_layer(self.encoder.encoder_cell, rgc.encoder.encoder_cell)
        self.dddqn.update(rgc.dddqn)

    # def get_training_format(self, x, sl, y, sos, lstm_states, preds, Qs):
    #   training = []  # list of tuple (x, sl, lstm_state, e, y, s, a, r, s', t)
    #   # lstm state = (max_step, tuple_size, batch_size, )
    #   for i, p in enumerate(preds):
    #     s = ''
    #     e = sos[i]
    #     for j, a in enumerate(p):
    #       s1 = s + self.dc.idx2word[a]
    #       lstm_state = (lstm_states[j][0][i], lstm_states[j][1][i])
    #       terminal = True if j == len(p) - 1 else False
    #       r = self.bbc.get_reward(s, y[i], terminal=terminal)
    #       experience = (x[i], sl[i], lstm_state, e, y[i], s, a, r, s1, terminal)
    #       s = s1
    #       e = self.dc.idx2emb[a]
    #       training.append(experience)
    #   return training

    def predict(self, x, sl, return_all=True):
        '''
    Performs RGC forward pass

    Inputs:
      -> x, numpy array, shape = [batch_size, input_dim], example: [batch_size, sequence_length, embedding_dim]
      -> sl, list of int, last sequence index for each sample in the given batch

    Outputs:
      -> sentences, list of string, predicted sentences
    '''
        sos = self.dc.get_sos_batch_size(len(x))
        preds, logits, lstm_states, Q, Qs = pu.full_encoder_dddqn_pass(
            x, sl, self.encoder, self.dddqn, sos, self.dc.max_tokens)
        # fill the prediction with a random word when it contains only the eos token, to penalize empty predictions
        preds = [s if s[0] != self.dc.word2idx['eos'] else [np.random.choice(list(self.dc.word2idx.values()))]\
                 for s in preds.numpy().tolist()]
        preds = [
            s[:s.index(self.dc.word2idx['eos'])]
            if self.dc.word2idx['eos'] in s else s for s in preds
        ]
        sentences = [' '.join([self.dc.idx2word[i] for i in s]) for s in preds]
        if return_all:
            return sentences, preds, lstm_states, Q, Qs
        else:
            return sentences

    def test_pretrained(self):
        '''
    Trains the BBC on the training data and gets predictions on the test data,
    then transforms the test data into new sentences with the RGC and gets
    predictions on this transformed test data.
    Prints the classification report for both results.
    '''
        logging.info('Train of the BBC...')
        self.bbc.train(self.bbc.x_train, self.bbc.y_train)
        logging.info('Classification report of BBC on test data...')
        self.bbc.predict_test(self.bbc.x_test, self.bbc.y_test)

        x, sl, _, _ = self.dc.transform_sources(self.bbc.x_test,
                                                emb=self.dc.emb)
        new_x_test = self.predict(x, sl, return_all=False)
        logging.info(
            'Classification report of BBC on test data transformed by RGC...')
        self.bbc.predict_test(new_x_test, self.bbc.y_test)
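
A minimal end-to-end sketch of this class, with placeholder paths (assumptions, not from the source):

# Hypothetical usage; dataset/embedding paths are placeholders.
rgc = RGC('data/dataset.csv', 'data/embeddings.txt')
rgc.pretrain()          # parrot initialization of encoder + DDDQN
rgc.test_pretrained()   # classification reports before/after RGC transformation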
Example #13
import dash
import dash_bootstrap_components as dbc
import plotly.graph_objs as go

from data_container import DataContainer
import dash_d3cloud
import plotly.express as px

import pandas as pd

app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server

navbar = dbc.NavbarSimple(
    brand="Tweets Analysis Demo",
    brand_href="#",
    sticky="top",
)

obj_dc = DataContainer('twitter_small.csv')

x_dates, y_freq = obj_dc.tweets_by_date()
date_ranges = obj_dc.dates_range

date_mark = {i: date_ranges[i] for i in range(0, len(date_ranges))}


trace_1 = go.Scatter(x=x_dates, y=y_freq, name='date', line=dict(width=2, color='rgb(229, 151, 50)'))

layout = go.Layout(hovermode='closest',
                   xaxis={'title': 'Date'},
                   yaxis={'title': 'Tweets Frequency'}, margin={'t': 0})

fig = go.Figure(data=[trace_1], layout=layout)
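
One possible way to finish wiring the figure into the app and serve it; the layout below is an assumption (the original layout is not shown), using the classic dash_core_components/dash_html_components modules consistent with this Dash version.

import dash_core_components as dcc
import dash_html_components as html

# Assumed layout: navbar on top, the tweet-frequency figure below.
app.layout = html.Div([navbar, dcc.Graph(id='tweets-by-date', figure=fig)])

if __name__ == '__main__':
    app.run_server(debug=True)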