def repr_code(args):
    """Encode the whole code corpus into vectors and save them in chunks.

    Loads the checkpoint selected by ``args.reload_from``, runs
    ``model.code_encoding`` over the dataset batch by batch, and writes the
    float32 vectors to ``..._part{chunk_id}.h5`` files of roughly
    ``args.chunk_size`` rows each.

    Args:
        args: parsed CLI namespace; uses gpu_id, model, dataset, reload_from,
            data_path, batch_size and chunk_size.
    """
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()

    ##### Define model ######
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from > 0:
        ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
        # map_location lets a GPU-trained checkpoint load on CPU-only hosts
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model = model.to(device)
    model.eval()

    data_path = args.data_path + args.dataset + '/'
    # NOTE(review): eval() on a config-supplied class name — acceptable for a
    # trusted local config, but getattr() on a known module would be safer.
    use_set = eval(config['dataset_name'])(
        data_path, config['use_names'], config['name_len'],
        config['use_apis'], config['api_len'],
        config['use_tokens'], config['tokens_len'])
    data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=1)

    chunk_id = 0
    vecs, n_processed = [], 0
    for batch in tqdm(data_loader):
        batch_gpu = [tensor.to(device) for tensor in batch]
        with torch.no_grad():
            reprs = model.code_encoding(*batch_gpu).data.cpu().numpy()
        reprs = reprs.astype(np.float32)  # [batch x dim]
        if config['sim_measure'] == 'cos':
            # normalize so cosine similarity reduces to a dot product
            reprs = normalize(reprs)
        vecs.append(reprs)
        n_processed += batch[0].size(0)
        # demoted from a per-batch print: too noisy on large corpora
        logger.debug('n_processed %d (chunk_size %d)', n_processed,
                     args.chunk_size)
        if n_processed >= args.chunk_size:
            output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
            save_vecs(np.vstack(vecs), output_path)
            chunk_id += 1
            vecs, n_processed = [], 0
    # save the last chunk (probably incomplete); skip when the final batch
    # exactly filled the previous chunk — np.vstack([]) would raise ValueError
    if vecs:
        output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
        print("repr_code saved at ", output_path)
        save_vecs(np.vstack(vecs), output_path)
def repr_code(self, model):
    """Encode every code snippet in the corpus with *model*.

    Builds the dataset from ``self.conf``, runs ``model.code_encoding`` over
    fixed-size batches, L2-normalizes the stacked matrix, saves it to
    ``self.path + self.conf['use_codevecs']`` and returns it.
    """
    model.eval()
    dataset = CodeSearchDataset(self.conf['workdir'],
                                self.conf['use_names'], self.conf['name_len'],
                                self.conf['use_apis'], self.conf['api_len'],
                                self.conf['use_tokens'], self.conf['tokens_len'])
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=1000,
                                         shuffle=False,
                                         drop_last=False,
                                         num_workers=1)
    vecs = None
    for names, apis, toks in loader:
        names = names.to(self.device)
        apis = apis.to(self.device)
        toks = toks.to(self.device)
        batch_vecs = model.code_encoding(names, apis, toks).data.cpu().numpy()
        if vecs is None:
            vecs = batch_vecs
        else:
            vecs = np.concatenate((vecs, batch_vecs), 0)
    vecs = normalize(vecs)
    save_vecs(vecs, self.path + self.conf['use_codevecs'])
    return vecs
def repr_code(args):
    """Encode the whole code corpus into a single vector matrix and save it.

    Loads the epoch checkpoint selected by ``args.reload_from``, runs
    ``model.code_encoding`` over the dataset in batches of 1000, normalizes
    the stacked matrix and saves it to the path in ``config['use_codevecs']``.

    Args:
        args: parsed CLI namespace; uses gpu_id, model, dataset, reload_from
            and data_path.
    """
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
    config = getattr(configs, 'config_' + args.model)()

    ##### Define model ######
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  # initialize the model
    if args.reload_from > 0:
        ckpt_path = f'./output/{args.model}/{args.dataset}/models/epo{args.reload_from}.h5'
        # map_location is required so a CUDA-trained checkpoint can be
        # loaded on a CPU-only host (consistent with the step-checkpoint
        # variant of this function)
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model = model.to(device)
    model.eval()

    data_path = args.data_path + args.dataset + '/'
    use_set = eval(config['dataset_name'])(
        data_path, config['use_names'], config['name_len'],
        config['use_apis'], config['api_len'],
        config['use_tokens'], config['tokens_len'])
    data_loader = torch.utils.data.DataLoader(dataset=use_set,
                                              batch_size=1000,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=1)

    vecs = None
    for batch in tqdm(data_loader):
        batch_gpu = [tensor.to(device) for tensor in batch]
        with torch.no_grad():
            reprs = model.code_encoding(*batch_gpu).data.cpu().numpy()
        vecs = reprs if vecs is None else np.concatenate((vecs, reprs), 0)
    vecs = normalize(vecs)
    save_vecs(vecs, data_path + config['use_codevecs'])
def repr_code(args, ast2id, code2id, nl2id, id2nl):
    """Encode the tree/AST training corpus into vectors and save in chunks.

    Loads the checkpoint selected by ``args.reload_from``, runs
    ``model.getcodevec`` over the tree dataset and writes float32 vectors to
    ``..._part{chunk_id}.h5`` files of roughly ``args.chunk_size`` rows.

    ``code2id`` and ``id2nl`` are unused here but kept so the signature stays
    compatible with the other entry points that share this calling convention.
    """
    with torch.no_grad():  # inference only — no gradients anywhere below
        device = torch.device(
            f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
        config = getattr(configs, 'config_' + args.model)()

        ##### Define model ######
        logger.info('Constructing Model..')
        logger.info(os.getcwd())
        model = getattr(models, args.model)(config, ast2id)  # initialize the model
        if args.reload_from > 0:
            ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
            model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model = model.to(device)
        model.eval()

        data_path = args.data_path + args.datasave + '/'
        train_data_set = TreeDataSet(file_name=args.data_path + '/train.json',
                                     ast_path=args.data_path + '/tree/train/',
                                     ast2id=ast2id,
                                     nl2id=nl2id,
                                     max_ast_size=args.code_max_len,
                                     max_simple_name_size=args.max_simple_name_len,
                                     k=args.k,
                                     max_comment_size=args.comment_max_len,
                                     use_code=True,
                                     desc=config['valid_desc'],
                                     desclen=config['desc_len'])
        # shuffle must be False: the saved vectors are consumed positionally,
        # so they have to line up with the dataset order (the other
        # repr_code variants also encode with shuffle=False)
        data_loader = DataLoaderX(dataset=train_data_set,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=2)

        chunk_id = 0
        vecs, n_processed = [], 0
        for batch in tqdm(data_loader):
            torch.cuda.empty_cache()
            batch_gpu = [tensor.to(device).long() for tensor in batch]
            reprs = model.getcodevec(*batch_gpu).data.cpu().numpy()
            reprs = reprs.astype(np.float32)  # [batch x dim]
            if config['sim_measure'] == 'cos':
                # normalize so cosine similarity reduces to a dot product
                reprs = normalize(reprs)
            vecs.append(reprs)
            n_processed += batch[0].size(0)
            if n_processed >= args.chunk_size:
                output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
                save_vecs(np.vstack(vecs), output_path)
                chunk_id += 1
                vecs, n_processed = [], 0
        # save the last chunk (probably incomplete); skip when the final
        # batch exactly filled the previous chunk — np.vstack([]) would raise
        if vecs:
            output_path = f"{data_path}{config['use_codevecs'][:-3]}_part{chunk_id}.h5"
            save_vecs(np.vstack(vecs), output_path)