コード例 #1
0
def main(args):
    """Split the input sentence pool across worker processes, run
    `make_embedding` in parallel, and pickle the collected embeddings.

    Args:
        args: parsed CLI namespace; must provide EMBED_DATA_DIR,
            output_path, data_path, gpu_num and embedding_type_name.

    Returns:
        True on completion.
    """
    args.embedding_output_path = join(args.EMBED_DATA_DIR, args.output_path)
    print('> START  ')
    print('> parameter  ')
    for k, v in args._get_kwargs():
        print('> {} : {}'.format(k, v))
    print('')
    print('> Action ')
    pool_path = args.data_path
    gpu_num = args.gpu_num
    embedding_type_name = args.embedding_type_name
    output_embedding_data = args.embedding_output_path

    # Close the input file deterministically instead of leaking the handle.
    with open(pool_path, mode='r', encoding='utf-8') as f:
        sym_line_list = [line.strip() for line in f]

    # Guard against an empty pool: previously this crashed with a
    # ZeroDivisionError; now we just write an empty result file.
    if not sym_line_list:
        with open(output_embedding_data, 'wb') as f:
            pickle.dump([], f, pickle.HIGHEST_PROTOCOL)
        return True

    number_of_processes = min(gpu_num, len(sym_line_list))
    chunk_size = len(sym_line_list) // number_of_processes

    # BUGFIX: the original slicing dropped the trailing
    # len % number_of_processes lines; fold the remainder into the last
    # chunk so every line gets embedded.
    tasks = [
        sym_line_list[i * chunk_size:(i + 1) * chunk_size]
        for i in range(number_of_processes - 1)
    ]
    tasks.append(sym_line_list[(number_of_processes - 1) * chunk_size:])

    # Manager queues are picklable across processes, unlike plain Queue
    # objects passed through some start methods.
    tasks_to_accomplish = Manager().Queue()
    for task in tasks:
        tasks_to_accomplish.put(task)
    tasks_finished = Manager().Queue()

    processes = []
    # one worker per chunk; worker index doubles as a GPU id
    for i in range(number_of_processes):
        p = Process(target=make_embedding,
                    args=(tasks_to_accomplish, tasks_finished, i,
                          embedding_type_name,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # Drain results only after every worker has exited.
    store_target = []
    while not tasks_finished.empty():
        store_target.append(tasks_finished.get_nowait())

    with open(output_embedding_data, 'wb') as f:
        pickle.dump(store_target, f, pickle.HIGHEST_PROTOCOL)

    return True
コード例 #2
0
class RolloutDataSet(Dataset):
    """Dataset that accumulates (observation, reward, action, done)
    frames from rollouts and assigns each frame a discounted,
    normalized return.
    """

    def __init__(self, discount_factor):
        super().__init__()
        self.frames = []              # (observation, reward, action, done) tuples
        self.value = []               # discounted return per frame
        self.start = 0                # index of first frame of the current game
        self.discount_factor = discount_factor
        self.game_length = 0          # steps since the last scoring event
        self.gl = []                  # finished-game lengths since last 'done'
        self.collected_rollouts = 0
        self.reward_total = 0
        self.rollouts = Manager().Queue()

    def add_rollout(self, rollout):
        """Threadsafe method to add rollouts to the dataset"""
        self.rollouts.put(rollout)

    def post_process(self):
        """Call to process data after all data collected"""
        while not self.rollouts.empty():
            self.process_rollout(self.rollouts.get())
        self.normalize()

    def process_rollout(self, rollout):
        """Append one rollout's transitions and log per-episode stats
        to the (module-level) tensorboard writer ``tb``."""
        global tb_step
        for observation, action, reward, done, info in rollout:
            self.append(observation, reward, action, done)

            if reward == 0:
                self.game_length += 1
            else:
                self.gl.append(self.game_length)
                self.reward_total += reward
                self.game_length = 0

            if done:
                self.collected_rollouts += 1
                tb.add_scalar('reward', self.reward_total, tb_step)
                # BUGFIX: statistics.mean raises StatisticsError on an
                # empty list; only log the average game length when at
                # least one game actually finished in this episode.
                if self.gl:
                    tb.add_scalar('ave_game_len', statistics.mean(self.gl),
                                  tb_step)
                self.gl = []
                self.reward_total = 0
                tb_step += 1

    def append(self, observation, reward, action, done):
        self.frames.append((observation, reward, action, done))
        # A nonzero reward marks the end of a game — presumably
        # Pong-style scoring; TODO confirm against the environment.
        if reward != 0.0:
            self.end_game()

    def end_game(self):
        """Back-fill discounted returns for the frames of the game that
        just ended (from self.start to the current frame)."""
        values = []
        cum_value = 0.0
        # walk backwards so each step's value discounts its successor's
        for step in reversed(range(self.start, len(self.frames))):
            cum_value = self.frames[step][1] + cum_value * self.discount_factor
            values.append(cum_value)
        self.value.extend(reversed(values))
        self.start = len(self.frames)

    def normalize(self):
        """Standardize stored values to zero mean / unit variance.

        BUGFIX: statistics.stdev raises StatisticsError on fewer than
        two samples, and all-equal values give stdev == 0 which divided
        by zero.  In both degenerate cases only the mean is subtracted.
        """
        if not self.value:
            return
        mean = statistics.mean(self.value)
        stdev = statistics.stdev(self.value) if len(self.value) > 1 else 0.0
        if stdev == 0.0:
            self.value = [vl - mean for vl in self.value]
        else:
            self.value = [(vl - mean) / stdev for vl in self.value]

    def total_reward(self):
        """Sum of raw rewards over all stored frames."""
        return sum(frame[1] for frame in self.frames)

    def __getitem__(self, item):
        observation, reward, action, done = self.frames[item]
        value = self.value[item]
        # add a trailing channel axis before converting to a tensor
        observation_t = to_tensor(np.expand_dims(observation, axis=2))
        return observation_t, reward, action, value, done

    def __len__(self):
        return len(self.frames)
コード例 #3
0
ファイル: main.py プロジェクト: 3to80/simentice_final
def main(args):
    """Encode each target sentence in parallel worker processes, find
    the most similar sentence in a pre-computed source embedding pool,
    and write the results as a tab-separated file.

    The source embedding file holds entries shaped like
    [(vector#1, sentence#1), (vector#2, sentence#2), ...].

    Returns:
        True on completion.
    """
    print('> START  ')
    print('> parameter  ')
    for k, v in args._get_kwargs():
        print('> {} : {}'.format(k, v))
    print('')
    print('> Action ')
    # 0. unpack parameters
    embedding_type_name = args.embedding_type_name
    target_data_path = args.target_data_path
    ground_data_path = args.ground_data_path
    source_embedding_path = args.source_embedding_path
    number_of_processes = args.gpu_num
    head_line = ['<target>', '<inference>', '<ground_truth>']

    # 1. load the source pool: [(vector, sentence), ...]
    source_pool = load_embedding_data(source_embedding_path)
    src_embeddings = [entry[0] for entry in source_pool]
    src_sentences = [entry[1] for entry in source_pool]

    # 2. read targets; close the file handle deterministically
    with open(target_data_path, mode='r', encoding='utf-8') as f:
        target_data_list = [line.strip() for line in f]

    # Guard against an empty target file: previously this crashed with
    # a ZeroDivisionError; now we emit a header-only result file.
    if not target_data_list:
        with open(args.output_data_path, mode='w', encoding='utf-8') as wdesc:
            wdesc.writelines('\t'.join(head_line))
            wdesc.writelines('\n')
        print('> FINISH - result file : {}'.format(args.output_data_path))
        return True

    number_of_processes = min(number_of_processes, len(target_data_list))
    chunk_size = len(target_data_list) // number_of_processes
    # BUGFIX: the original slicing dropped the trailing
    # len % number_of_processes targets; fold the remainder into the
    # last chunk so every target gets inferred.
    tasks = [
        target_data_list[i * chunk_size:(i + 1) * chunk_size]
        for i in range(number_of_processes - 1)
    ]
    tasks.append(target_data_list[(number_of_processes - 1) * chunk_size:])

    # 3. queues for work distribution and result collection
    tasks_to_accomplish = Manager().Queue()
    tasks_finished = Manager().Queue()
    for task in tasks:
        tasks_to_accomplish.put(task)

    processes = []
    # one encoder / nearest-neighbour worker per process
    for i in range(number_of_processes):
        p = Process(target=multi_inference,
                    args=(embedding_type_name, tasks_to_accomplish,
                          tasks_finished, i + 1, src_sentences,
                          src_embeddings,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # collect worker results
    store_target = []
    while not tasks_finished.empty():
        store_target.append(tasks_finished.get_nowait())

    if ground_data_path:
        with open(ground_data_path, mode='r', encoding='utf-8') as f:
            gt_data = [line.strip() for line in f]
        # NOTE(review): store_target is in queue-drain (worker
        # completion) order, which is not guaranteed to match the file
        # order of gt_data — verify the workers preserve ordering
        # before trusting this pairing.
        for idx, pair in enumerate(zip(store_target, gt_data)):
            store_target[idx].append(pair[-1])

    # write results as TSV
    with open(args.output_data_path, mode='w', encoding='utf-8') as wdesc:
        wdesc.writelines('\t'.join(head_line))
        wdesc.writelines('\n')
        for row in store_target:
            wdesc.writelines('\t'.join(row))
            wdesc.writelines('\n')

    print('> FINISH - result file : {}'.format(args.output_data_path))
    return True