Ejemplo n.º 1
0
    def test_rename(self):
        """Check that ``chainerio.rename`` moves a file to a new name.

        Writes ``'foobar'`` to ``src``, renames it to ``dst``, and verifies
        that the content can be read back from ``dst`` while ``src`` no
        longer exists.
        """
        new_tmp_dir = "testmkdir/"
        chainerio.makedirs("file://" + new_tmp_dir)

        # Run the cleanup in ``finally`` so a failing assertion or I/O error
        # does not leave the scratch directory behind for later test runs.
        try:
            src = os.path.join("file://", new_tmp_dir, 'src')
            dst = os.path.join("file://", new_tmp_dir, 'dst')
            with chainerio.open(src, 'w') as fp:
                fp.write('foobar')

            chainerio.rename(src, dst)
            with chainerio.open(dst, 'r') as fp:
                data = fp.read()
                assert data == 'foobar'

            assert not chainerio.exists(src)
            assert chainerio.exists(dst)
        finally:
            # NOTE(review): the second positional argument is presumably the
            # "recursive" flag -- confirm against the chainerio API.
            chainerio.remove(new_tmp_dir, True)
Ejemplo n.º 2
0
def load_vocabulary(path):
    """Build a word-to-id mapping from the vocabulary file at ``path``.

    Every line, stripped of surrounding whitespace, is treated as one word.
    The word on the i-th line (0-based) gets id ``i + 2``; ids 0 and 1 are
    assigned to ``'<UNK>'`` and ``'<EOS>'`` afterwards.

    Returns:
        dict mapping each word to its integer id.
    """
    word_ids = {}
    # CHAINERIO: the file is opened through chainerio instead of builtin open.
    with chainerio.open(path, mode='r') as vocab_file:
        # Numbering starts at 2 to leave room for UNK and EOS.
        for word_id, raw_line in enumerate(vocab_file, start=2):
            word_ids[raw_line.strip()] = word_id
    word_ids['<UNK>'] = 0
    word_ids['<EOS>'] = 1
    return word_ids
    def maybe_load(self):
        """Resume from the latest checkpoint in ``self.args.output_dir``.

        Looks for files ending in ``.pt.<team>`` in the output directory,
        takes the largest step number encoded in their names (the text
        between the first ``_`` and the ``.pt.<team>`` suffix), and loads
        ``ckpt_<step>.pt.<team>``.

        On return ``self.global_step``, ``self.f_id`` and ``self.files`` are
        ``None`` when no checkpoint was found. Otherwise the model,
        ``another_model`` and optimizer states are loaded,
        ``self.args.resume_step`` holds the checkpoint step, and
        ``self.f_id`` / ``self.files`` are taken from the checkpoint's
        ``'files'`` entry.
        """
        self.global_step = None
        self.f_id = None
        self.files = None
        checkpoint = None

        # Checkpoint files are named "<prefix>_<step>.pt.<team>".
        suffix = ".pt.{}".format(self.team)

        if chio.exists(self.args.output_dir):
            model_names = [f for f in chio.list(self.args.output_dir)
                           if f.endswith(suffix)]
            if len(model_names) != 0:
                self.args.resume_step = max(
                    int(name.split(suffix)[0].split('_')[1].strip())
                    for name in model_names)
                self.global_step = self.args.resume_step

        if self.global_step is not None:
            ckpt_path = os.path.join(
                self.args.output_dir,
                "ckpt_{}.pt.{}".format(self.global_step, self.team))
            print("Load from {}".format(ckpt_path))
            # NOTE(review): torch.load unpickles the file; only point
            # output_dir at checkpoints from a trusted source.
            with chio.open(ckpt_path, "rb") as f:
                checkpoint = torch.load(f, map_location="cpu")
            self.model.load_state_dict(checkpoint['model'],
                                       strict=False)
            self.another_model.load_state_dict(
                checkpoint['another_model'], strict=False)
            if self.args.phase2:
                # Phase 2 counts steps relative to the end of phase 1.
                self.global_step -= self.args.phase1_end_step
            if is_main_process():
                print("resume step from ", self.args.resume_step)

            if self.args.phase2:
                optimizer_state = checkpoint['optimizer']
                # Override hyperparameters from Phase 1
                for param_state in optimizer_state['state'].values():
                    param_state['step'] = self.global_step
                for group in optimizer_state['param_groups']:
                    group['t_total'] = self.args.max_steps
                    group['warmup'] = self.args.warmup_proportion
                    group['lr'] = self.args.learning_rate
            self.optimizer.load_state_dict(checkpoint['optimizer'])

            # Restore the file bookkeeping stored in the checkpoint: the
            # first entry becomes f_id, the remaining entries become files.
            self.f_id = checkpoint['files'][0]
            self.files = checkpoint['files'][1:]
Ejemplo n.º 4
0
    def test_rename(self):
        """Check that ``chainerio.rename`` moves a file inside a temp dir.

        Creates ``src`` with the content ``'foobar'``, confirms only ``src``
        exists, renames it to ``dst``, then confirms the content is readable
        from ``dst`` and that ``src`` is gone.
        """
        # The context manager removes the directory on exit, including when
        # an assertion below fails.
        with tempfile.TemporaryDirectory() as tmp_path:
            # NOTE(review): if tmp_path is absolute, os.path.join discards
            # the leading "file://" component -- confirm that is intended.
            src = os.path.join("file://", tmp_path, 'src')
            dst = os.path.join("file://", tmp_path, 'dst')

            with chainerio.open(src, 'w') as writer:
                writer.write('foobar')

            assert chainerio.exists(src)
            assert not chainerio.exists(dst)

            chainerio.rename(src, dst)

            with chainerio.open(dst, 'r') as reader:
                content = reader.read()
                assert content == 'foobar'

            assert not chainerio.exists(src)
            assert chainerio.exists(dst)
Ejemplo n.º 5
0
def read_corpus(path, max_size):
    """Parse the corpus file at ``path`` into a list of trees.

    Each line is stripped and parsed with ``SexpParser``. Reading stops as
    soon as ``max_size`` trees have been collected; a falsy ``max_size``
    (e.g. ``0`` or ``None``) disables the limit.

    Returns:
        list of the parsed trees, in file order.
    """
    trees = []
    # CHAINERIO: the file is opened through chainerio, decoded as UTF-8.
    with chainerio.open(path, mode='r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            trees.append(SexpParser(raw_line.strip()).parse())
            if max_size and len(trees) >= max_size:
                break
    return trees
Ejemplo n.º 6
0
    def test_root_fs_override(self):
        """Check that a full URI overrides the root set by ``set_root``.

        Writes a file to HDFS through pyarrow, sets the chainerio root to
        ``"hdfs"`` and reads the file back by its bare name. It then reads
        this script through an explicit ``file://`` URI and finally confirms
        that bare names are still resolved against HDFS.
        """
        from pyarrow import hdfs

        hdfs_tmpfile = "tmpfile_hdfs"
        hdfs_file_string = "this is a test string for hdfs"

        conn = hdfs.connect()
        # Nested try/finally: the temp file is deleted and the connection
        # closed even when an assertion fails part-way through.
        try:
            with conn.open(hdfs_tmpfile, "wb") as f:
                f.write(hdfs_file_string.encode('utf-8'))

            try:
                # NOTE(review): the root is left set to "hdfs" when this
                # test ends -- confirm other tests do not rely on the
                # default root.
                chainerio.set_root("hdfs")
                with chainerio.open(hdfs_tmpfile, "r") as fp:
                    self.assertEqual(fp.read(), hdfs_file_string)

                # override with full URI
                with open(__file__, "r") as my_script:
                    with chainerio.open("file://" + __file__, "r") as fp:
                        self.assertEqual(fp.read(), my_script.read())

                # The override above must not change the configured root.
                with chainerio.open(hdfs_tmpfile, "r") as fp:
                    self.assertEqual(fp.read(), hdfs_file_string)
            finally:
                conn.delete(hdfs_tmpfile)
        finally:
            conn.close()
Ejemplo n.º 7
0
def load_data(vocabulary, path):
    """Convert every line of the file at ``path`` into an array of word ids.

    Lines are stripped and split on whitespace; each word is looked up in
    ``vocabulary`` and falls back to ``UNK`` when missing. Progress is shown
    with a ``progressbar.ProgressBar`` sized by ``count_lines(path)``.

    Returns:
        list with one ``numpy.int32`` array per line, in file order.
    """
    total_lines = count_lines(path)
    progress = progressbar.ProgressBar()
    print('loading...: %s' % path)

    # CHAINERIO: the file is opened through chainerio instead of builtin open.
    with chainerio.open(path, mode='r') as corpus:
        return [
            numpy.array(
                [vocabulary.get(word, UNK) for word in line.strip().split()],
                numpy.int32)
            for line in progress(corpus, max_value=total_lines)
        ]
Ejemplo n.º 8
0
def count_lines(path):
    """Return the number of lines in the file at ``path``.

    The file is opened through chainerio in text mode and iterated once.
    """
    # CHAINERIO add
    with chainerio.open(path, mode='r') as f:
        # CHAINERIO add end
        # Generator expression: count while streaming instead of building a
        # list with one element per line.
        return sum(1 for _ in f)
Ejemplo n.º 9
0
# CHAINERIO import
import chainerio
# CHAINERIO end

import numpy as np

import matplotlib
matplotlib.use('Agg')

mushroomsfile = 'mushrooms.csv'
# uncomment to use HDFS, remember to put the mushroomsfile to HDFS
# chainerio.set_root("hdfs")

# CHAINERIO read file start
# Open via a context manager so the handle is closed once the CSV has been
# parsed.
with chainerio.open(mushroomsfile, 'r') as mushroomsdata:
    # Every field is read as a string; the first line is skipped as a header.
    data_array = np.genfromtxt(mushroomsdata,
                               delimiter=',',
                               dtype=str,
                               skip_header=1)
# CHAINERIO read file end

# Replace each column's values by their index into that column's sorted
# unique values.
for col in range(data_array.shape[1]):
    data_array[:, col] = np.unique(data_array[:, col], return_inverse=True)[1]

# Column 0 becomes the int32 target as an (n, 1) column; the remaining
# columns become the float32 features.
X = data_array[:, 1:].astype(np.float32)
Y = data_array[:, 0].astype(np.int32)[:, None]
# The split point is 70% of the rows.
# NOTE(review): `datasets` is not imported in the visible part of the file --
# presumably chainer.datasets; confirm.
train, test = datasets.split_dataset_random(datasets.TupleDataset(X, Y),
                                            int(data_array.shape[0] * .7))