def evaluate_dataset_point(model, data, addbase):
    pred_trees = []
    gold_trees = []
    for i, inst in enumerate(data):
        pred_scores = []
        for tree in inst.kbest:
            if tree.size == inst.gold.size:
                pred_scores.append(model.predict(tree))
            else:
                print 'error: k-best tree size does not match gold tree size'
                pred_scores.append(-1000)
        #data_util.normalize(pred_scores)
        if addbase:
            data_util.normalize(inst.scores)
            scores = [p_s + b_s for p_s, b_s in zip(pred_scores, inst.scores)]
        else:
            scores = pred_scores
        max_id = scores.index(max(scores))
        #print max_id,scores[max_id]
        for line in inst.lines[max_id]:
            pred_trees.append(line)
        pred_trees.append('\n')
        for line in inst.gold_lines:
            gold_trees.append(line)
        gold_trees.append('\n')
    res = eval_tool.evaluate(pred_trees,gold_trees)
    print 'f1score: %.4f' % (res[0])
    return res
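A minimal usage sketch for the reranker evaluation above; `num_epochs`, `train_one_epoch`, `save_model`, `train_data`, and `dev_data` are hypothetical names, and `res[0]` is taken to be the F1 score, as the function's own print suggests.

# Hypothetical model-selection loop around evaluate_dataset_point (names are placeholders).
best_f1 = -1.0
for epoch in range(num_epochs):
    train_one_epoch(model, train_data)                        # assumed training step
    res = evaluate_dataset_point(model, dev_data, addbase=True)
    if res[0] > best_f1:                                      # res[0] is the F1 score
        best_f1 = res[0]
        save_model(model, 'best_model.pt')                    # assumed checkpoint helper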
Example no. 2
def evaluate_dataset_point(model, data, addbase, ratio=1):
    pred_trees = []
    gold_trees = []
    for i, inst in enumerate(data):
        pred_scores = [
            model.predict(tree) for tree in inst.kbest
            if tree.size == inst.gold.size
        ]
        data_util.normalize(pred_scores)
        if addbase:
            data_util.normalize(inst.scores)
            scores = [
                ratio * p_s + (1 - ratio) * b_s
                for p_s, b_s in zip(pred_scores, inst.scores)
            ]
        else:
            scores = pred_scores
        max_id = scores.index(max(scores))
        #print "pred: %.4f    base: %.4f" % (pred_scores[max_id],inst.scores[max_id])
        for line in inst.lines[max_id]:
            pred_trees.append(line)
        pred_trees.append('\n')
        for line in inst.gold_lines:
            gold_trees.append(line)
        gold_trees.append('\n')
    res = eval_tool.evaluate(pred_trees, gold_trees)
    print 'ratio: %f f1score: %.4f' % (ratio, res[0])
    return res
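Since this variant interpolates the model score with the baseline score, a natural (hypothetical) use is to sweep `ratio` on a dev set and keep the best weight; `model` and `dev_data` are placeholder names.

# Hypothetical sweep over the interpolation weight; each call prints its own F1.
results = []
for i in range(11):
    ratio = i / 10.0
    res = evaluate_dataset_point(model, dev_data, addbase=True, ratio=ratio)
    results.append((ratio, res[0]))
best_ratio, best_f1 = max(results, key=lambda pair: pair[1])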
Example no. 3
    def add_embedding(self, helper, input_placeholder):
        """添加embedding层,shape (None, max_length, n_features*embed_size)
        :return:
        """

        vocab = open(self.vocab_file)
        vectors = open(self.word2vec_file)
        pre_embeddings = np.array(np.random.randn(
            len(helper.tok2id) + 1, self.embed_size),
                                  dtype=np.float32)

        pre_embeddings[0] = 0.
        for word, vec in load_word_vector_mapping(vocab, vectors).items():
            word = normalize(word)
            if word in helper.tok2id:
                pre_embeddings[helper.tok2id[word]] = vec

        logger.info("初始化 embeddings.")
        vocab.close()
        vectors.close()

        embed = tf.Variable(pre_embeddings, name="embed")
        # input_placeholder: (None, max_length, n_features)
        # lookup output: (None, max_length, n_features, embed_size)
        features = tf.nn.embedding_lookup(embed, input_placeholder)
        self.embeddings = tf.reshape(features,
                                     shape=(-1, self.max_length,
                                            self.n_features * self.embed_size))

        return self.embeddings
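A hedged sketch of a typical call site for this method when the graph is built; `self.input_placeholder` and `add_prediction_op` are assumed names and not confirmed by the snippet above.

    # Hypothetical graph-building step that consumes the embedding layer.
    def build(self, helper):
        x = self.add_embedding(helper, self.input_placeholder)   # (None, max_length, n_features*embed_size)
        self.pred = self.add_prediction_op(x)                     # assumed downstream prediction op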
Example no. 4
def run_policy(args):

    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit

    returns = []
    observations = []
    actions = []

    from policy.model import Net
    model = Net(env.observation_space.shape[0], env.action_space.shape[0])
    # Check whether a GPU is available, but force CPU execution for this run
    use_gpu = torch.cuda.is_available()
    use_gpu = False
    if use_gpu:
        model = model.cuda()

    model.load_state_dict(torch.load(args.model))
    print("Using model: ", args.model)
    model.eval()

    latest_stat = pickle.load(open(data_util.get_latest('stats/*'), 'rb'))

    for i in range(args.num_rollouts):
        print('iteration:', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        while not done:
            obs = np.array(obs, dtype='float32')
            obs = data_util.normalize(obs, *latest_stat)
            if use_gpu:
                obs = torch.from_numpy(obs)
                action = model(Variable(obs.cuda(),
                                        volatile=True)).data.cpu().numpy()
            else:
                action = model(Variable(torch.from_numpy(obs),
                                        volatile=True)).data.numpy()
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            totalr += r
            steps += 1
            if args.render:
                env.render()
            if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)

    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))
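The behavior of `data_util.normalize(obs, *latest_stat)` is not shown here; a common implementation, assuming `latest_stat` is a `(mean, std)` pair saved during data collection, is sketched below.

import numpy as np

def normalize(obs, mean, std, eps=1e-8):
    # Standardize an observation with precomputed statistics (assumed behavior).
    return (np.asarray(obs, dtype='float32') - mean) / (std + eps)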
Example no. 5
    def extract_data(self,
                     filepath,
                     ind_features=_PARAIND_FEAT,
                     dep_features=_PARADEP_FEAT,
                     labels_per_sent=None,
                     labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" +
                                           string.punctuation + "]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"

        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([
                numpy_filepath_pca,
        ]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                     len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)
        targets_path.close()

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                          ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(
            self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)

        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn,
                         self.dictVectorizer.get_feature_names(),
                         arff_filepath,
                         filename + "_RAW",
                         labels_per_window,
                         file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)

        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        # fall back to the raw matrix if PCA is disabled, so the return below is always defined
        matrix_sklearn_pca = matrix_sklearn
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(
                matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca,
                             feature_names,
                             arff_filepath_pca,
                             filename + "_PCA95",
                             labels_per_window,
                             file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)

            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
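Because the method exports both the PCA-reduced matrix and a `.tbs` pickle with labels and window settings, a later run can reload them; the sketch below mirrors the `numpy.save` and `pickle.dump` calls above and assumes the same base `path`.

import pickle
import numpy

matrix_sklearn_pca = numpy.load(path + "_pca95.npy")
with open(path + ".tbs", "rb") as f:
    (labels_per_sent, labels_per_window, boundaries_backup,
     n_sents, window_size, step_size) = pickle.load(f)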
Example no. 6
    def extract_data(self, filepath, ind_features=_PARAIND_FEAT, dep_features=_PARADEP_FEAT, labels_per_sent=None, labels_per_window=None):
        """Extract features, reduce dimensions with a PCA and return data.

        Exports raw- and PCA-reduced data both in arff- and numpy-format.
        """
        start = time.clock()
        self.dictVectorizer = DictVectorizer(sparse=False)
        filename = os.path.split(filepath)[1]
        directory = os.path.split(filepath)[0]
        plain_reader = PlaintextCorpusReader(
            directory, 
            [filename],
            word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|["+string.punctuation+"]"),
            sent_tokenizer=LineTokenizer(blanklines="discard"),
            encoding='utf8')

        # create new subdir for extracted data
        if _NEW_SUBDIR is not None:
            path = os.path.join(directory, _NEW_SUBDIR)
            if not os.path.exists(path):
                os.makedirs(path)
            path = os.path.join(path, os.path.splitext(filename)[0])            
            # print "path {}".format(path)
        else:
            path = os.path.splitext(filepath)[0]
            # print "path {}".format(path)

        # filepaths for weka- and numpy-files
        arff_filepath = path + ".arff"
        arff_filepath_pca = path + "_pca95.arff"
        numpy_filepath = path + ".npy"
        numpy_filepath_pca = path + "_pca95.npy"
        
        # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
        paras = plain_reader.paras()
        # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
        sents = plain_reader.sents()
        # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

        # get paragraph boundaries for sliding-window
        self.boundaries = util.get_boundaries(paras)
        boundaries_backup = self.boundaries

        # check if all files necessary exist, if yes - unpickle/load them and return data
        if util.files_already_exist([numpy_filepath_pca,]):
            print "Features already extracted. Calculating clusters...\n"
            matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
            return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

        # save correct target-labels and additional info of current data
        targets_path = open(path + ".tbs", "wb")
        pickle.dump((labels_per_sent, labels_per_window, boundaries_backup, len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)
        targets_path.close()

        # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
        self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE, ind_features, dep_features)
        # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
        # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
        self.all_features = self.unified_features(self.data)
        # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
        # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
        matrix_sklearn = util.normalize(matrix_sklearn)
        # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)
        
        print "Exporting raw-data..."
        util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(), arff_filepath, filename+"_RAW", labels_per_window, file_info=None)
        numpy.save(numpy_filepath, matrix_sklearn)
        
        # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)
        feature_names, feature_names_part = None, None
        # fall back to the raw matrix if PCA is disabled, so the return below is always defined
        matrix_sklearn_pca = matrix_sklearn
        if _DO_PCA:
            print "PCA calculation..."
            matrix_sklearn_pca, feature_names = util.pca(matrix_sklearn, self.dictVectorizer.get_feature_names())
            util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca, filename+"_PCA95", labels_per_window, file_info=None)
            numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
            
            del matrix_sklearn
        gc.collect()
        return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)