def __init__(self, V=10000):
    self.vocab = None
    self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
    self.target_names = None  # set by self.process()

    # Download the dataset if it is not already present.
    if not os.path.isfile(self.zipped_filename):
        data_dir = os.path.dirname(self.zipped_filename)
        print("Downloading treebank to {:s}".format(data_dir))
        self.zipped_filename = download_sst(data_dir)
    print("Loading SST from {:s}".format(self.zipped_filename))

    self.train_trees = self.get_trees("train")
    print("Training set: {:,} trees".format(len(self.train_trees)))
    self.dev_trees = self.get_trees("dev")
    print("Development set: {:,} trees".format(len(self.dev_trees)))
    self.test_trees = self.get_trees("test")
    print("Test set: {:,} trees".format(len(self.test_trees)))

    # Verify that the number of sentences matches the published split sizes.
    assert len(self.train_trees) == 8544
    assert len(self.dev_trees) == 1101
    assert len(self.test_trees) == 2210

    # Build the vocabulary over the training set.
    print("Building vocabulary - ", end="")
    train_words = utils.flatten(
        self.canonicalize(t.leaves()) for t in self.train_trees)
    self.vocab = vocabulary.Vocabulary(train_words, size=V)
    print("{:,} words".format(self.vocab.size))
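# Hedged usage sketch (not part of the original code): a small helper showing
# how the attributes set by the constructor above could be inspected. It only
# assumes an object exposing train_trees/dev_trees/test_trees and vocab.
def summarize_sst(ds):
    """Print split sizes and vocabulary size for a loaded SST dataset object."""
    print("train/dev/test: {:,} / {:,} / {:,} trees".format(
        len(ds.train_trees), len(ds.dev_trees), len(ds.test_trees)))
    print("vocabulary: {:,} words".format(ds.vocab.size))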
def __init__(self, sess, state_space, act_space, lr=1e-2, tau=0.01, name=None,
             agent_id=None, grad_norm_clipping=None, units=64):
    super().__init__(name)

    self._lr = lr
    self._tau = tau
    self.num_units = units
    self.sess = sess
    self.agent_id = agent_id
    self.grad_norm_clipping = grad_norm_clipping

    self._action_space = act_space
    self._observation_space = state_space

    self._loss = None
    self._train_op = None

    self.act_dim = flatten(self._action_space)

    self.obs_input = tf.placeholder(tf.float32,
                                    shape=(None,) + self._observation_space,
                                    name="Obs")
    self.tar_act = tf.placeholder(tf.float32,
                                  shape=(None,) + (self.act_dim,),
                                  name="tar_act")

    # Evaluation (online) and target copies of the actor network.
    with tf.variable_scope("eval"):
        self._eval_scope = tf.get_variable_scope().name
        self._eval_act = self._construct(self.act_dim)

    with tf.variable_scope("target"):
        self._target_scope = tf.get_variable_scope().name
        self._target_act = self._construct(self.act_dim)

    with tf.name_scope("Update"):
        # Hard copy: overwrite target parameters with evaluation parameters.
        self._update_op = [
            tf.assign(t_var, e_var)
            for t_var, e_var in zip(self.t_variables, self.e_variables)
        ]
        # Smooth average (Polyak) update: target <- tau * eval + (1 - tau) * target.
        self._soft_update_op = [
            tf.assign(t_var, self._tau * e_var + (1. - self._tau) * t_var)
            for t_var, e_var in zip(self.t_variables, self.e_variables)
        ]

    with tf.name_scope("BCInit"):
        # Behavior-cloning loss: mean squared error between the target actions
        # and the softmax policy output.
        self._bc_loss = tf.reduce_mean(
            tf.square(self.tar_act - tf.nn.softmax(self.logits)))
        # Alternative: cross-entropy against the target actions.
        # self._bc_loss = tf.reduce_mean(
        #     tf.nn.softmax_cross_entropy_with_logits(
        #         labels=self.tar_act, logits=self.logits))
        bc_optim = tf.train.AdamOptimizer(self._lr)
        self._train_bc_op = bc_optim.minimize(self._bc_loss)
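# Hedged usage sketch (not part of the original code) for the update ops built
# above: `actor` is assumed to be an instance of this class, holding the
# tf.Session passed to its constructor.
def sync_target_network(actor, hard=False):
    """Run a hard copy (eval -> target) or a Polyak soft update."""
    if hard:
        actor.sess.run(actor._update_op)
    else:
        actor.sess.run(actor._soft_update_op)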
def create_batch(csv_files, save_batch_file, batch_size=25, hit_size=5,
                 keep_batched=False):
    list_hits = []
    for csv_file in csv_files:
        print("Processing csv file ", csv_file)
        df_qa_pairs = pd.read_csv(csv_file)

        # Drop pairs that have already been batched.
        if keep_batched:
            df_qa_pairs_unbatched = df_qa_pairs
        else:
            # Copy so the in-place column drop below does not trigger a
            # SettingWithCopyWarning on the filtered view.
            df_qa_pairs_unbatched = df_qa_pairs.loc[
                df_qa_pairs[bc.BATCHED] == False].copy()

        # Get the hearing info, then drop those columns.
        hearing_columns = [
            bc.BATCH_HEARING_ID, bc.BATCH_HEARING_TITLE,
            bc.BATCH_HEARING_DATE, bc.BATCH_HEARING_SUMMARY
        ]
        hearing_info = df_qa_pairs_unbatched.iloc[0][
            hearing_columns].values.tolist()
        drop_columns = hearing_columns + [
            bc.BATCHED, bc.LABELED_TYPE, bc.LABELED_INTENT
        ]
        df_qa_pairs_unbatched.drop(columns=drop_columns, inplace=True)

        # Group QA pairs into HITs of hit_size rows each.
        hits = list(
            map(list,
                utils.grouper_without_fill(
                    df_qa_pairs_unbatched.values.tolist(), hit_size)))
        hits = utils.flatten(hits)

        # Add the hearing info to the beginning of each HIT.
        hits = [hearing_info + hit for hit in hits]
        list_hits.append(hits)

    # Mix HITs across hearings, then trim to the batch size.
    mixed_hits = list(utils.roundrobin(*list_hits))
    mixed_hits = mixed_hits[:batch_size]
    mixed_hits_df = pd.DataFrame(mixed_hits, columns=bc.BATCH_HEADER)
    mixed_hits_df.to_csv(save_batch_file, index=False)

    # Update the original csv files to mark the selected QA pairs as batched.
    for csv_file in csv_files:
        print("Marking qa pairs as batched for file ", csv_file)
        df_qa_pairs = pd.read_csv(csv_file)
        hearing_id = df_qa_pairs[bc.BATCH_HEARING_ID].values[0]
        turn_ids = mixed_hits_df.loc[
            mixed_hits_df[bc.BATCH_HEARING_ID] == hearing_id,
            [
                bc.BATCH_Q1_ID, bc.BATCH_Q2_ID, bc.BATCH_Q3_ID,
                bc.BATCH_Q4_ID, bc.BATCH_Q5_ID
            ]].values.flatten()
        df_qa_pairs.loc[df_qa_pairs[bc.BATCH_Q1_ID].isin(turn_ids),
                        bc.BATCHED] = True
        df_qa_pairs.to_csv(csv_file, index=False)
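# Minimal usage sketch for create_batch; the file paths below are hypothetical
# placeholders, not files from this repository.
if __name__ == "__main__":
    create_batch(
        csv_files=["data/hearing_001_qa_pairs.csv",
                   "data/hearing_002_qa_pairs.csv"],
        save_batch_file="data/batches/batch_001.csv",
        batch_size=25,
        hit_size=5,
        keep_batched=False,
    )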
def TestDataRequirements(self, threeDinstance):
    r = Requirements()
    r.addRequirements(self.itemkindRequirements, threeDinstance.itemkinds)
    r.addRequirements(self.boxkindRequirements, threeDinstance.boxkinds)
    r.addRequirements(self.palletkindRequirements, threeDinstance.palletkinds)
    r.addRequirements(self.containerkindRequirements,
                      threeDinstance.containerkinds)
    r.addRequirements(
        self.loadingspaceRequirements,
        flatten([c.loadingspaces for c in threeDinstance.containerkinds]))
    return r.valid, r.getWarnings()
def __init__(self, sess, state_space, act_space, lr=1e-2, name=None,
             agent_id=None, discrete=True, sample_action=True):
    super().__init__(name)

    self._lr = lr
    self.sess = sess
    self.agent_id = agent_id
    self.discrete = discrete
    self.sample_action = sample_action

    self._action_space = act_space
    self._observation_space = state_space

    self._loss = None
    self._train_op = None

    self.act_dim = flatten(self._action_space)

    self.obs_input = tf.placeholder(tf.float32,
                                    shape=(None,) + self._observation_space,
                                    name="Obs")
    self.target_act = tf.placeholder(tf.float32,
                                     shape=(None,) + self._action_space,
                                     name="TAct")

    self._scope = tf.get_variable_scope().name
    self._logits = self._construct(self.act_dim)
    self._act = tf.nn.softmax(self._logits)

    with tf.variable_scope("optimization"):
        if not discrete:
            # Continuous actions: mean squared error against the target actions.
            self._loss = tf.reduce_mean(
                tf.square(self.target_act - self._act))
        else:
            # Discrete actions: cross-entropy against the target actions.
            self._loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=self.target_act, logits=self._logits))
        optimizer = tf.train.AdamOptimizer(self._lr)
        self._train_op = optimizer.minimize(self._loss)
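# Hedged sketch (not part of the original code) of a supervised training step
# for the policy defined above; the batch arrays are assumed to match the
# placeholder shapes of obs_input and target_act.
def train_step(policy, obs_batch, target_act_batch):
    """Run one optimizer step and return the loss."""
    loss, _ = policy.sess.run(
        [policy._loss, policy._train_op],
        feed_dict={policy.obs_input: obs_batch,
                   policy.target_act: target_act_batch})
    return loss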
def buildVocab(self, vocabSize=None, verbose=False, return_vocab_objects=False):
    """
    Builds the vocabulary from the initial data file.

    vocabSize (int, default: None = all words) - max number of words to use
        for the vocabulary (only used for training)
    verbose (boolean, default: False) - print extra info
    """
    print("----------------------------------------------------")
    print("building vocabulary from TRAINING data...")
    flatData = [w for w in zip(*utils.flatten(self.train_sentences))]

    # Remember these vocabs will contain the <s>, </s>, and <unk> tags, so
    # their sizes need to be interpreted as "-3" - consider replacing...
    self.vocab = vocabulary.Vocabulary(flatData[0], size=vocabSize)
    self.posTags = vocabulary.Vocabulary(flatData[1])
    self.nerTags = vocabulary.Vocabulary(flatData[2])
    self.capitalTags = vocabulary.Vocabulary(flatData[3])

    if verbose:
        print("vocabulary for words, posTags, nerTags built and stored in object")
        print("vocab size =", self.vocab.size)
        print("10 sampled words from vocabulary\n",
              list(self.vocab.wordset)[:10], "\n")
        print("number of unique pos tags in training =", self.posTags.size)
        print("all posTags used\n", list(self.posTags.wordset), "\n")
        print("number of unique NER tags in training =", self.nerTags.size)
        print("all nerTags for prediction", list(self.nerTags.wordset), "\n")
        print("number of unique capitalization tags in training =",
              self.capitalTags.size)
        print("all capitalTags for prediction",
              list(self.capitalTags.wordset), "\n")

    if return_vocab_objects:
        return self.vocab, self.posTags, self.nerTags, self.capitalTags
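# Hedged usage sketch (not part of the original code): `model` stands in for
# whatever object owns buildVocab above; the call mirrors the signature
# defined there.
def build_and_report_vocab(model, vocab_size=10000):
    """Build the vocabularies and return their sizes as a dict."""
    vocab, pos_tags, ner_tags, capital_tags = model.buildVocab(
        vocabSize=vocab_size, verbose=False, return_vocab_objects=True)
    return {"words": vocab.size, "pos": pos_tags.size,
            "ner": ner_tags.size, "capital": capital_tags.size}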
if __name__ == "__main__":
    n_cpu = max(1, multiprocessing.cpu_count() - 1)

    print("Finding hosts...")
    host_data = utils.read_local_host_data(c.LOCAL_HOST_DATA_PATH)
    host_data = utils.resolve_ipv4s(host_data)
    ipv4s = list(map(lambda x: x["ipv4"], host_data))
    assert all(ipv4s), "MissingHostError: Not all hosts have been found"
    print("Hosts found...")

    # Prompt for the sudo password and retry until it authenticates.
    while True:
        try:
            password = getpass.getpass(f"[sudo] password for {c.USERNAME}: ")
            pool = multiprocessing.Pool(n_cpu)
            status = pool.map(install, ((ipv4, password) for ipv4 in ipv4s))
            pool.close()
            pool.join()
        except paramiko.ssh_exception.AuthenticationException:
            print("Incorrect password. Please try again.")
        else:
            break

    # Flatten the per-host results and list successes first.
    status = list(status)
    status = utils.flatten(status)
    status = sorted(status, key=lambda x: "success" in x, reverse=True)
    print(*status, sep="\n", end="")
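# Hedged sketch (purely illustrative): `install` above is mapped over
# (ipv4, password) tuples and is expected to return a list of per-host status
# strings; its real implementation is not shown here, so this stand-in only
# mirrors the expected call signature and return shape.
def install_stub(args):
    ipv4, password = args
    # A real implementation would open an SSH session (e.g. via paramiko)
    # and run the installation commands using the provided sudo password.
    return ["success: placeholder install on {}".format(ipv4)]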
def GetAllLoadingspaces(self):
    return sorted(
        flatten([c.loadingspaces for c in self.containerkinds]),
        key=lambda x: x.id)
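# Hedged sketch (not part of the original code): the utils.flatten / flatten
# calls over lists of lists in the snippets above suggest a one-level flatten
# helper; a minimal version consistent with those uses might look like this.
def flatten_once(nested):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    return [item for sublist in nested for item in sublist]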