Example #1
    def __init__(self, V=10000):
        self.vocab = None
        self.zipped_filename = "data/sst/trainDevTestTrees_PTB.zip"
        self.target_names = None  # set by self.process()

        # Download datasets
        if not os.path.isfile(self.zipped_filename):
            data_dir = os.path.dirname(self.zipped_filename)
            print("Downloading treebank to {:s}".format(data_dir))
            self.zipped_filename = download_sst(data_dir)
        print("Loading SST from {:s}".format(self.zipped_filename))

        self.train_trees = self.get_trees("train")
        print("Training set:     {:,} trees".format(len(self.train_trees)))
        self.dev_trees = self.get_trees("dev")
        print("Development set:  {:,} trees".format(len(self.dev_trees)))
        self.test_trees = self.get_trees("test")
        print("Test set:         {:,} trees".format(len(self.test_trees)))

        # Verify that number of sentences matches the published size.
        assert (len(self.train_trees) == 8544)
        assert (len(self.dev_trees) == 1101)
        assert (len(self.test_trees) == 2210)

        # Build vocabulary over training set
        print("Building vocabulary - ", end="")
        train_words = utils.flatten(
            self.canonicalize(t.leaves()) for t in self.train_trees)
        self.vocab = vocabulary.Vocabulary(train_words, size=V)
        print("{:,} words".format(self.vocab.size))
Example #2
    def __init__(self,
                 sess,
                 state_space,
                 act_space,
                 lr=1e-2,
                 tau=0.01,
                 name=None,
                 agent_id=None,
                 grad_norm_clipping=None,
                 units=64):
        super().__init__(name)

        self._lr = lr
        self._tau = tau
        self.num_units = units

        self.sess = sess
        self.agent_id = agent_id
        self.grad_norm_clipping = grad_norm_clipping

        self._action_space = act_space
        self._observation_space = state_space

        self._loss = None
        self._train_op = None

        self.act_dim = flatten(self._action_space)

        self.obs_input = tf.placeholder(tf.float32,
                                        shape=(None, ) +
                                        self._observation_space,
                                        name="Obs")
        self.tar_act = tf.placeholder(tf.float32,
                                      shape=(None, ) + (self.act_dim, ),
                                      name="tar_act")

        with tf.variable_scope("eval"):
            self._eval_scope = tf.get_variable_scope().name
            self._eval_act = self._construct(self.act_dim)

        with tf.variable_scope("target"):
            self._target_scope = tf.get_variable_scope().name
            self._target_act = self._construct(self.act_dim)

        with tf.name_scope("Update"):  # smooth average update process
            self._update_op = [
                tf.assign(t_var, e_var)
                for t_var, e_var in zip(self.t_variables, self.e_variables)
            ]
            self._soft_update_op = [
                tf.assign(t_var, self._tau * e_var + (1. - self._tau) * t_var)
                for t_var, e_var in zip(self.t_variables, self.e_variables)
            ]

        with tf.name_scope("BCInit"):
            self._bc_loss = tf.reduce_mean(
                tf.square(self.tar_act - tf.nn.softmax(self.logits)))
            #self._bc_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.tar_act, logits=self.logits))
            bc_optim = tf.train.AdamOptimizer(self._lr)
            self._train_bc_op = bc_optim.minimize(self._bc_loss)
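In this actor, flatten(self._action_space) is used as a single integer (it becomes the width of the "tar_act" placeholder), so flatten here presumably collapses a shape tuple into one dimension. A sketch under that assumption (only the call site above is from the source; the implementation is guessed):

import numpy as np

def flatten(space_shape):
    # Collapse a shape tuple such as (5,) or (3, 4) into a single integer
    # dimension, e.g. (3, 4) -> 12, matching how act_dim is used as the
    # last dimension of the placeholders above.
    return int(np.prod(space_shape))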
Example #3
def create_batch(csv_files,
                 save_batch_file,
                 batch_size=25,
                 hit_size=5,
                 keep_batched=False):
    list_hits = []
    for csv_file in csv_files:
        print("Processing csv file ", csv_file)
        df_qa_pairs = pd.read_csv(csv_file)

        # unless keep_batched is set, drop pairs that have already been batched
        if keep_batched:
            df_qa_pairs_unbatched = df_qa_pairs
        else:
            df_qa_pairs_unbatched = df_qa_pairs.loc[
                df_qa_pairs[bc.BATCHED] == False]

        # get the hearing info, then drop
        hearing_columns = [
            bc.BATCH_HEARING_ID, bc.BATCH_HEARING_TITLE, bc.BATCH_HEARING_DATE,
            bc.BATCH_HEARING_SUMMARY
        ]
        hearing_info = df_qa_pairs_unbatched.iloc[0][
            hearing_columns].values.tolist()
        drop_columns = hearing_columns + [
            bc.BATCHED, bc.LABELED_TYPE, bc.LABELED_INTENT
        ]
        df_qa_pairs_unbatched.drop(columns=drop_columns, inplace=True)
        # group into hits
        hits = list(
            map(
                list,
                utils.grouper_without_fill(
                    df_qa_pairs_unbatched.values.tolist(), hit_size)))
        hits = utils.flatten(hits)

        # add hearing info to beginning of hit
        hits = [hearing_info + hit for hit in hits]
        list_hits.append(hits)

    # mix across hearings
    mixed_hits = list(utils.roundrobin(*list_hits))
    mixed_hits = mixed_hits[:batch_size]
    mixed_hits_df = pd.DataFrame(mixed_hits, columns=bc.BATCH_HEADER)
    mixed_hits_df.to_csv(save_batch_file, index=False)

    # update original csv file to mark qa pairs as batched
    for csv_file in csv_files:
        print("Marking qa pairs as batched for file ", csv_file)
        df_qa_pairs = pd.read_csv(csv_file)
        hearing_id = df_qa_pairs[bc.BATCH_HEARING_ID].values[0]
        turn_ids = mixed_hits_df.loc[
            mixed_hits_df[bc.BATCH_HEARING_ID] == hearing_id,
            [bc.BATCH_Q1_ID, bc.BATCH_Q2_ID, bc.BATCH_Q3_ID,
             bc.BATCH_Q4_ID, bc.BATCH_Q5_ID]].values.flatten()
        df_qa_pairs.loc[df_qa_pairs[bc.BATCH_Q1_ID].isin(turn_ids),
                        bc.BATCHED] = True
        df_qa_pairs.to_csv(csv_file, index=False)
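create_batch also depends on utils.grouper_without_fill and utils.roundrobin (with utils.flatten as sketched under Example #1). Plausible implementations in the spirit of the standard itertools recipes; these are assumptions, only the call sites above are from the source:

import itertools

def grouper_without_fill(iterable, n):
    # Yield successive chunks of up to n items without padding the last chunk.
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def roundrobin(*iterables):
    # Interleave items across iterables until all are exhausted,
    # e.g. roundrobin("ABC", "D", "EF") -> A D E B F C.
    iterators = [iter(it) for it in iterables]
    while iterators:
        for it in list(iterators):
            try:
                yield next(it)
            except StopIteration:
                iterators.remove(it)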
Example #4
    def TestDataRequirements(self, threeDinstance):
        r = Requirements()
        r.addRequirements(self.itemkindRequirements, threeDinstance.itemkinds)
        r.addRequirements(self.boxkindRequirements, threeDinstance.boxkinds)
        r.addRequirements(self.palletkindRequirements,
                          threeDinstance.palletkinds)
        r.addRequirements(self.containerkindRequirements,
                          threeDinstance.containerkinds)
        r.addRequirements(
            self.loadingspaceRequirements,
            flatten([c.loadingspaces for c in threeDinstance.containerkinds]))
        return r.valid, r.getWarnings()
Example #5
    def __init__(self,
                 sess,
                 state_space,
                 act_space,
                 lr=1e-2,
                 name=None,
                 agent_id=None,
                 discrete=True,
                 sample_action=True):
        super().__init__(name)

        self._lr = lr

        self.sess = sess
        self.agent_id = agent_id
        self.discrete = discrete
        self.sample_action = sample_action

        self._action_space = act_space
        self._observation_space = state_space

        self._loss = None
        self._train_op = None

        self.act_dim = flatten(self._action_space)

        self.obs_input = tf.placeholder(tf.float32,
                                        shape=(None, ) +
                                        self._observation_space,
                                        name="Obs")
        self.target_act = tf.placeholder(tf.float32,
                                         shape=(None, ) + self._action_space,
                                         name="TAct")

        self._scope = tf.get_variable_scope().name
        self._logits = self._construct(self.act_dim)
        self._act = tf.nn.softmax(self._logits)

        with tf.variable_scope("optimization"):
            if not discrete:
                self._loss = tf.reduce_mean(
                    tf.square(self.target_act - self._act))
            else:
                self._loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(
                        labels=self.target_act, logits=self._logits))
            optimizer = tf.train.AdamOptimizer(self._lr)
            self._train_op = optimizer.minimize(self._loss)
Example #6
    def buildVocab(self,
                   vocabSize=None,
                   verbose=False,
                   return_vocab_objects=False):
        """
        Builds the vocabulary based on the initial data file
        
        vocabSize(int, default: None-all words) - max number of words to use for vocabulary
                                                  (only used for training)
        verbose(boolean, default: False)        - print extra info
        """
        print("----------------------------------------------------")
        print("building vocabulary from TRAINING data...")

        flatData = [w for w in zip(*utils.flatten(self.train_sentences))]

        # Remember: these vocabularies also contain the <s>, </s>, and <unk> tags,
        # so reported sizes are 3 larger than the number of real tokens.
        self.vocab = vocabulary.Vocabulary(flatData[0], size=vocabSize)
        self.posTags = vocabulary.Vocabulary(flatData[1])
        self.nerTags = vocabulary.Vocabulary(flatData[2])
        self.capitalTags = vocabulary.Vocabulary(flatData[3])

        if verbose:
            print(
                "vocabulary for words, posTags, nerTags built and stored in object"
            )
            print("vocab size =", vocabSize)
            print("10 sampled words from vocabulary\n",
                  list(self.vocab.wordset)[:10], "\n")
            print("number of unique pos Tags in training =", self.posTags.size)
            print("all posTags used\n", list(self.posTags.wordset), "\n")
            print("number of unique NER tags in training =", self.nerTags.size)
            print("all nerTags for prediction", list(self.nerTags.wordset),
                  "\n")
            print("number of unique capitalization tags in training =",
                  self.capitalTags.size)
            print('all capitalTags for prediction',
                  list(self.capitalTags.wordset), "\n")

        if return_vocab_objects:
            return self.vocab, self.posTags, self.nerTags, self.capitalTags
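The zip(*utils.flatten(self.train_sentences)) idiom above turns a list of tagged sentences into parallel columns of words, POS tags, NER tags, and capitalization tags. A small self-contained illustration (the token layout is inferred from how flatData is indexed, not taken from the source):

from itertools import chain

# Assume each sentence is a list of (word, pos, ner, cap) tuples.
sentences = [[("John", "NNP", "PER", "CAP"), ("runs", "VBZ", "O", "low")],
             [("Paris", "NNP", "LOC", "CAP")]]

flat = list(chain.from_iterable(sentences))       # one token stream
words, pos_tags, ner_tags, cap_tags = zip(*flat)  # parallel columns
print(words)     # ('John', 'runs', 'Paris')
print(ner_tags)  # ('PER', 'O', 'LOC')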
Example #7

if __name__ == "__main__":

    n_cpu = max(1, multiprocessing.cpu_count() - 1)

    print("Finding hosts...")
    host_data = utils.read_local_host_data(c.LOCAL_HOST_DATA_PATH)
    host_data = utils.resolve_ipv4s(host_data)
    ipv4s = list(map(lambda x: x["ipv4"], host_data))
    assert all(ipv4s), "MissingHostError: Not all hosts have been found"
    print("Hosts found...")

    while True:
        try:
            password = getpass.getpass(f"[sudo] password for {c.USERNAME}: ")
            pool = multiprocessing.Pool(n_cpu)
            status = pool.map(install, ((ipv4, password) for ipv4 in ipv4s))
            pool.close()
            pool.join()
        except paramiko.ssh_exception.AuthenticationException:
            print("Incorrect password. Please try again.")
        else:
            break

    status = list(status)
    status = utils.flatten(status)
    status = sorted(status, key=lambda x: "success" in x, reverse=True)
    print(*status, sep="\n", end="")
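Here pool.map returns one status list per host; utils.flatten merges them and the "success" in x sort key lists successful hosts first. A tiny illustration with made-up messages (the host strings are hypothetical):

from itertools import chain

per_host = [["10.0.0.1 success"], ["10.0.0.2 failed: timeout"], ["10.0.0.3 success"]]
status = sorted(chain.from_iterable(per_host),
                key=lambda x: "success" in x,
                reverse=True)
print(*status, sep="\n")
# 10.0.0.1 success
# 10.0.0.3 success
# 10.0.0.2 failed: timeout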
Example #8
    def GetAllLoadingspaces(self):
        return sorted(flatten([c.loadingspaces for c in self.containerkinds]),
                      key=lambda x: x.id)