Beispiel #1
0
    def load_go_data(
            self,
            data_type='train',
            num_samples=1000):
        """Sample games, pre-process their archives and return the result.

        Parameters
        ----------
        data_type : str
            Tag passed to the sampler and appended to each archive's base
            name to form the processed data file name.
        num_samples : int
            Number of samples requested from the sampler.

        Returns
        -------
        The value returned by ``self.consolidate_games(data_type, data)``.
        """
        KGSIndex(data_directory=self.data_dir).download_files()

        sampler = Sampler(data_dir=self.data_dir)
        drawn = sampler.draw_data(data_type, num_samples)

        # One pass over the (archive name, game index) pairs: remember
        # each archive and bucket the game indices per archive.
        archives = set()
        games_per_archive = {}
        for archive, game_index in drawn:
            archives.add(archive)
            games_per_archive.setdefault(archive, []).append(game_index)

        for archive in archives:
            target = archive.replace('.tar.gz', '') + data_type
            # Skip archives whose processed file already exists on disk.
            if os.path.isfile('/'.join((self.data_dir, target))):
                continue
            self.process_zip(archive, target, games_per_archive[archive])

        return self.consolidate_games(data_type, drawn)
    def load_go_data(self, data_type='train', num_samples=1000):
        """Run the download step, draw samples, process archives, consolidate.

        ``data_type`` is handed to the sampler and used as the suffix of
        every processed data file name; ``num_samples`` is the amount
        requested from the sampler. Returns the result of
        ``self.consolidate_games(data_type, data)``.
        """
        # Run the KGS index download step for the local data directory.
        kgs_index = KGSIndex(data_directory=self.data_dir)
        kgs_index.download_files()

        # Ask the sampler for (archive name, game index) pairs.
        picked = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        # Collect the archive names and group game indices by archive.
        zip_names = set()
        indices_by_zip_name = {}
        for zip_file, game_idx in picked:
            zip_names.add(zip_file)
            bucket = indices_by_zip_name.get(zip_file)
            if bucket is None:
                bucket = indices_by_zip_name[zip_file] = []
            bucket.append(game_idx)

        # Process each archive whose output file is not on disk yet.
        for zip_file in zip_names:
            data_file_name = zip_file.replace('.tar.gz', '') + data_type
            output_path = '/'.join((self.data_dir, data_file_name))
            if os.path.isfile(output_path):
                continue
            self.process_zip(zip_file, data_file_name,
                             indices_by_zip_name[zip_file])

        # Aggregate and return.
        return self.consolidate_games(data_type, picked)
Beispiel #3
0
    def load_go_data(self,
                     data_type='train',
                     num_samples=1000,
                     download=False):
        """
        Load game-record data.

        Parameters
        ----------
        data_type : str
            'train' or 'test'.
        num_samples : int
            Number of games to load.
        download : bool
            Whether to run the KGS index download step first.

        Returns
        -------
        features_and_labels
            Result of ``self.consolidate_games``; per the original
            documentation, [0] is the list of features and [1] is the
            list of labels.
        """
        # Download the game records only when asked to.
        if download:
            KGSIndex(data_directory=self.data_dir).download_files()

        # The sampler provides (archive file name, game index) pairs.
        # NOTE(review): the original author states that Sampler separates
        # test from train games by whether the record predates 2014;
        # that cannot be confirmed from this method.
        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        # Build the set of archive names and the mapping
        # "archive name -> list of game indices".
        archive_names = set()
        games_by_archive = {}
        for archive, game_index in drawn:
            archive_names.add(archive)
            games_by_archive.setdefault(archive, []).append(game_index)

        # Process every needed tar.gz whose output file does not exist.
        for archive in archive_names:
            out_name = archive.replace('.tar.gz', '') + data_type
            if os.path.isfile('/'.join((self.data_dir, out_name))):
                continue
            self.process_zip(archive, out_name, games_by_archive[archive])

        # Gather features and labels for the drawn games.
        return self.consolidate_games(data_type, drawn)
Beispiel #4
0
    def load_go_data(self, data_type='train', num_samples=1000,
                     use_generator=False):
        """Draw samples, hand them to the workers and return the data.

        The sampler is created with ``num_test_games`` set to one tenth
        of ``num_samples`` (truncated by ``int``). With
        ``use_generator`` true a ``DataGenerator`` over the drawn
        samples is returned; otherwise the result of
        ``self.consolidate_games(data_type, data)``.
        """
        KGSIndex(data_directory=self.data_dir).download_files()

        test_game_count = int(num_samples/10)
        sampler = Sampler(data_dir=self.data_dir,
                          num_test_games=test_game_count)
        drawn = sampler.draw_data(data_type, num_samples)

        # Pre-processing is delegated to map_to_workers in both modes.
        self.map_to_workers(data_type, drawn)

        if not use_generator:
            return self.consolidate_games(data_type, drawn)
        return DataGenerator(self.data_dir, drawn)
Beispiel #5
0
    def load_go_data(self, data_type, num_samples,
                     use_generator=False):
        """Draw samples, hand them to the workers and return the data.

        Returns a ``DataGenerator`` over the drawn samples when
        ``use_generator`` is true, otherwise the result of
        ``self.consolidate_games(data_type, data)``.
        """
        KGSIndex(data_directory=self.data_dir).download_files()

        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        self.map_to_workers(data_type, drawn)

        if not use_generator:
            return self.consolidate_games(data_type, drawn)

        # NOTE(review): this fallback is applied only here, after
        # KGSIndex and Sampler have already been constructed with
        # self.data_dir, and it is a machine-specific absolute path.
        # Confirm whether it should be resolved earlier or configured.
        if self.data_dir is None:
            self.data_dir = 'D:\\CODE\\Python\\Go\\code\\dlgo\\data\\tarfiles'
        return DataGenerator(self.data_dir, drawn)
Beispiel #6
0
    def load_go_data(self,
                     data_type='train',
                     num_samples=1000,
                     use_generator=True,
                     download=False):
        """
        Obtain features and labels, delegating file processing to workers.

        Parameters
        ----------
        data_type : str
            'train' or 'test'.
        num_samples : int
            Number of game records to obtain.
        use_generator : bool
            Return a generator object instead of the consolidated data.
        download : bool
            Run the KGS index download step first.

        Returns
        -------
        One of the following.
        generator : DataGenerator
            Built from ``self.data_dir`` and the drawn samples; per the
            original documentation it serves mini-batches.
        features_and_labels
            Result of ``self.consolidate_games(data_type, data)``; per
            the original documentation, features and labels obtained
            all at once.
        """
        if download:
            KGSIndex(data_directory=self.data_dir).download_files()

        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        self.map_to_workers(data_type, drawn)

        if not use_generator:
            return self.consolidate_games(data_type, drawn)

        # Original author's note: the generator does not hold all data
        # in memory, so the consolidation step is skipped in this mode.
        return DataGenerator(self.data_dir, drawn)
Beispiel #7
0
 def load_go_data(self, data_type='train', num_samples=1000):
     """Run the download step, draw samples and process missing archives.

     ``data_type`` is passed to the sampler and appended to each
     archive's base name to form the processed data file name;
     ``num_samples`` is the amount requested from the sampler.

     This variant only runs ``process_zip`` for archives whose
     processed file is absent; it has no consolidation step and
     returns ``None``.
     """
     index = KGSIndex(data_directory=self.data_dir)
     index.download_files()
     sampler = Sampler(data_dir=self.data_dir)
     data = sampler.draw_data(data_type, num_samples)

     # Group the drawn game indices by the archive they come from.
     indices_by_zip_name = {}
     for filename, game_index in data:
         indices_by_zip_name.setdefault(filename, []).append(game_index)

     for zip_name, game_indices in indices_by_zip_name.items():
         data_file_name = zip_name.replace('.tar.gz', '') + data_type
         # Join with a path separator: plain concatenation would glue
         # the file name onto the directory name and test a wrong path.
         data_file_path = os.path.join(self.data_dir, data_file_name)
         if not os.path.isfile(data_file_path):
             self.process_zip(zip_name, data_file_name, game_indices)
Beispiel #8
0
    def load_go_data(self,
                     data_type='train',
                     num_samples=1000,
                     use_generator=False):
        """Draw samples, hand them to the workers and return the data.

        Returns a ``DataGenerator`` over the drawn samples when
        ``use_generator`` is true, otherwise the result of
        ``self.consolidate_games(data_type, data)``.
        """
        KGSIndex(data_directory=self.data_dir).download_files()

        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        # Original author's note: this maps the workload onto the CPUs.
        self.map_to_workers(data_type, drawn)

        if not use_generator:
            # Return the consolidated data.
            return self.consolidate_games(data_type, drawn)

        # Return a Go data generator instead.
        return DataGenerator(self.data_dir, drawn)
Beispiel #9
0
    def draw_training_games(self):
        """Append every indexed game not yet listed to ``self.train_games``.

        Iterates over ``file_info`` of a ``KGSIndex`` for
        ``self.data_dir`` and builds one ``(filename, i)`` sample per
        game, for ``i`` in ``range(num_games)``. No year-based filtering
        is applied in this variant. Prints the resulting number of
        entries in ``self.train_games``.
        """
        index = KGSIndex(data_directory=self.data_dir)

        # Set mirror of train_games, so each duplicate check is O(1)
        # instead of a scan of the whole list.
        # Assumes existing train_games entries are hashable (tuples like
        # the ones appended below) — TODO confirm.
        already_listed = set(self.train_games)

        for fileinfo in index.file_info:
            filename = fileinfo['filename']
            num_games = fileinfo['num_games']
            for i in range(num_games):
                sample = (filename, i)
                if sample not in already_listed:
                    already_listed.add(sample)
                    self.train_games.append(sample)
        print('toal num training samples: ' + str(len(self.train_games)))
Beispiel #10
0
    def load_go_data(self,
                     data_type='train',
                     num_samples=1000,
                     use_generator=False):
        """Draw samples, hand them to the workers and return the data.

        Returns a ``DataGenerator`` over the drawn samples when
        ``use_generator`` is true, otherwise the result of
        ``self.consolidate_games(data_type, data)``.
        """
        # Create the KGS index and run its download step.
        # Original author's note: data that is already present does not
        # need to be downloaded again.
        KGSIndex(data_directory=self.data_dir).download_files()

        # The sampler selects the requested number of games for data_type.
        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        # Original author's note: this maps the workload onto the CPUs.
        self.map_to_workers(data_type, drawn)

        if not use_generator:
            # Return features and labels.
            return self.consolidate_games(data_type, drawn)

        # Return a Go data generator.
        return DataGenerator(self.data_dir, drawn)
Beispiel #11
0
    def load_go_data(
            self,
            data_type='train',
            num_samples=1000):
        """Sample games, pre-process their archives and return the result.

        ``data_type`` is passed to the sampler and used as the suffix of
        every processed data file name; ``num_samples`` is the amount
        requested from the sampler. Returns the result of
        ``self.consolidate_games(data_type, data)``.
        """
        # Run the KGS download step for the local data directory.
        # Original author's note: data that is already available is not
        # downloaded again.
        KGSIndex(data_directory=self.data_dir).download_files()

        # The sampler selects the requested number of games for the
        # chosen data type.
        drawn = Sampler(data_dir=self.data_dir).draw_data(
            data_type, num_samples)

        archive_names = set()
        games_by_archive = {}
        for archive, game_index in drawn:
            # Collect every archive name that occurs in the drawn data
            # and group the SGF file indices by archive name.
            archive_names.add(archive)
            games_by_archive.setdefault(archive, []).append(game_index)

        # Archives are processed one by one, skipping those whose
        # processed file already exists.
        for archive in archive_names:
            out_name = archive.replace('.tar.gz', '') + data_type
            if os.path.isfile('/'.join((self.data_dir, out_name))):
                continue
            self.process_zip(archive, out_name, games_by_archive[archive])

        # Combine the per-archive features and labels and return them.
        return self.consolidate_games(data_type, drawn)
Beispiel #12
0
    def load_go_data(self, data_type='train',
                     num_samples=1000):
        """Sample games, pre-process their archives and return the result.

        Parameters
        ----------
        data_type : str
            Either 'train' or 'test'.
        num_samples : int
            Number of games to load data from.

        Returns
        -------
        The result of ``self.consolidate_games(data_type, data)``.
        """
        # Run the KGS download step for the local data directory.
        # Original author's note: available data is not fetched again.
        kgs_index = KGSIndex(data_directory=self.data_dir)
        kgs_index.download_files()

        # The sampler selects the requested number of games.
        sampler = Sampler(data_dir=self.data_dir)
        drawn = sampler.draw_data(data_type, num_samples)

        # Collect all archive names and group game indices by archive.
        archive_names = set()
        grouped = {}
        for archive, game_index in drawn:
            archive_names.add(archive)
            if archive in grouped:
                grouped[archive].append(game_index)
            else:
                grouped[archive] = [game_index]

        # Process individually every archive without a processed file.
        for archive in archive_names:
            out_name = archive.replace('.tar.gz', '') + data_type
            missing = not os.path.isfile('/'.join((self.data_dir, out_name)))
            if missing:
                self.process_zip(archive, out_name, grouped[archive])

        # Aggregate features and labels and return them.
        return self.consolidate_games(data_type, drawn)
Beispiel #13
0
 def draw_training_games(self):
     """Append all eligible non-test games to ``self.train_games``.

     Walks ``file_info`` of a ``KGSIndex`` for ``self.data_dir``.
     Archives whose year (parsed from the file name) is greater than
     ``self.cap_year`` are skipped. Every remaining ``(filename, i)``
     sample that is not in ``self.test_games`` is appended. Prints the
     resulting length of ``self.train_games``.
     """
     kgs_index = KGSIndex(data_directory=self.data_dir)
     for entry in kgs_index.file_info:
         archive = entry['filename']
         # The year is the text between the first '-' and the next '_'.
         year_token = archive.split('-')[1]
         if int(year_token.split('_')[0]) > self.cap_year:
             continue
         for game_no in range(entry['num_games']):
             candidate = (archive, game_no)
             if candidate in self.test_games:
                 continue
             self.train_games.append(candidate)
     print('total num training games: ' + str(len(self.train_games)))
Beispiel #14
0
 def draw_training_samples(self, num_sample_games):
     """Draw ``num_sample_games`` distinct games at random from the index.

     Every ``(filename, i)`` pair listed by ``file_info`` of a
     ``KGSIndex`` for ``self.data_dir`` is a candidate; no year-based
     filtering is applied in this variant.

     Returns a list of distinct ``(filename, i)`` tuples.

     Raises
     ------
     ValueError
         If more games are requested than distinct games exist. The
         sampling loop below could otherwise never terminate.
     """
     index = KGSIndex(data_directory=self.data_dir)
     available_games = [
         (fileinfo['filename'], i)
         for fileinfo in index.file_info
         for i in range(fileinfo['num_games'])
     ]

     num_distinct = len(set(available_games))
     if num_sample_games > num_distinct:
         raise ValueError(
             'Requested ' + str(num_sample_games) + ' samples but only '
             + str(num_distinct) + ' distinct games are available')

     # Rejection sampling: the set silently drops repeated draws.
     sample_set = set()
     while len(sample_set) < num_sample_games:
         sample_set.add(random.choice(available_games))
     print('Drawn ' + str(num_sample_games) + ' samples')
     return list(sample_set)
Beispiel #15
0
    def draw_all_training(self):
        """Return every game listed in the index as a de-duplicated list.

        Every ``(filename, i)`` pair listed by ``file_info`` of a
        ``KGSIndex`` for ``self.data_dir`` is collected; no year-based
        filtering is applied in this variant. Prints the number of
        collected games and the number of distinct samples.
        """
        index = KGSIndex(data_directory=self.data_dir)

        available_games = [
            (fileinfo['filename'], i)
            for fileinfo in index.file_info
            for i in range(fileinfo['num_games'])
        ]
        print('>>> Total number of games used: ' + str(len(available_games)))

        # A set removes duplicates directly; no manual membership loop.
        sample_set = set(available_games)
        print('Drawn all samples, ie ' + str(len(sample_set)) + ' samples:')
        return list(sample_set)
Beispiel #16
0
    def draw_training_samples(self, num_sample_games):
        '''Draw training games, not overlapping with any of the test games.

        Candidates are the ``(filename, i)`` pairs listed by
        ``file_info`` of a ``KGSIndex`` for ``self.data_dir``, skipping
        archives whose year (parsed from the file name) is greater than
        ``self.cap_year``. Returns a list of ``num_sample_games``
        distinct samples, none of which is in ``self.test_games``.

        Raises ``ValueError`` if more games are requested than distinct
        candidates exist, since the sampling loop could then never
        terminate.
        '''
        index = KGSIndex(data_directory=self.data_dir)
        available_games = []
        for fileinfo in index.file_info:
            filename = fileinfo['filename']
            # The year is the text between the first '-' and the next '_'.
            year = int(filename.split('-')[1].split('_')[0])
            if year > self.cap_year:
                continue
            available_games.extend(
                (filename, i) for i in range(fileinfo['num_games']))
        print('total num games: ' + str(len(available_games)))

        # Cheap necessary condition for termination. Excluding test
        # games can shrink the pool further; that is not checked here.
        num_distinct = len(set(available_games))
        if num_sample_games > num_distinct:
            raise ValueError(
                'Requested ' + str(num_sample_games) + ' samples but only '
                + str(num_distinct) + ' distinct games are available')

        # Rejection sampling: skip test games; the set drops repeats.
        sample_set = set()
        while len(sample_set) < num_sample_games:
            sample = random.choice(available_games)
            if sample not in self.test_games:
                sample_set.add(sample)
        print('Drawn ' + str(num_sample_games) + ' samples:')
        return list(sample_set)
Beispiel #17
0
    def draw_samples(self, num_sample_games):
        '''Draw num_sample_games many training games from index.

        Candidates are the ``(filename, i)`` pairs listed by
        ``file_info`` of a ``KGSIndex`` for ``self.data_dir``, skipping
        archives whose year (parsed from the file name) is greater than
        ``self.cap_year``. Returns a list of distinct samples.

        Raises ``ValueError`` if more games are requested than distinct
        candidates exist, since the sampling loop could then never
        terminate.
        '''
        index = KGSIndex(data_directory=self.data_dir)

        available_games = []
        for fileinfo in index.file_info:
            filename = fileinfo['filename']
            # The year is the text between the first '-' and the next '_'.
            year = int(filename.split('-')[1].split('_')[0])
            if year > self.cap_year:
                continue
            available_games.extend(
                (filename, i) for i in range(fileinfo['num_games']))
        print('>>> Total number of games used: ' + str(len(available_games)))

        num_distinct = len(set(available_games))
        if num_sample_games > num_distinct:
            raise ValueError(
                'Requested ' + str(num_sample_games) + ' samples but only '
                + str(num_distinct) + ' distinct games are available')

        # Rejection sampling: the set silently drops repeated draws.
        sample_set = set()
        while len(sample_set) < num_sample_games:
            sample_set.add(random.choice(available_games))
        print('Drawn ' + str(num_sample_games) + ' samples:')
        return list(sample_set)
Beispiel #18
0
    def draw_all_training(self):
        '''Draw all available training games.

        Collects the ``(filename, i)`` pairs listed by ``file_info`` of
        a ``KGSIndex`` for ``self.data_dir``. Entries whose year (parsed
        from the file name) is greater than ``self.cap_year`` and
        entries without a 'num_games' key are skipped. Returns the
        distinct candidates that are not in ``self.test_games``.
        '''
        kgs_index = KGSIndex(data_directory=self.data_dir)

        candidates = []
        for entry in kgs_index.file_info:
            archive = entry['filename']
            # The year is the text between the first '-' and the next '_'.
            if int(archive.split('-')[1].split('_')[0]) > self.cap_year:
                continue
            if 'num_games' not in entry:
                continue
            candidates.extend(
                (archive, game_no) for game_no in range(entry['num_games']))
        print('total num games: ' + str(len(candidates)))

        kept = {game for game in candidates if game not in self.test_games}
        print('Drawn all samples, ie ' + str(len(kept)) + ' samples:')
        return list(kept)
Beispiel #19
0
from dlgo.data.index_processor import KGSIndex

# Build a KGSIndex with its default arguments and run its download step.
index = KGSIndex()
index.download_files()