def __save_param(self, _dir, _loss):
    # When wrapped in DataParallel, save the underlying module's weights.
    if self.multi_dev:
        torch.save(self.net.module.state_dict(), PJOIN(_dir, 'weight.pth'))
    else:
        torch.save(self.net.state_dict(), PJOIN(_dir, 'weight.pth'))
    # Optimizer/scheduler state and epoch bookkeeping go to a separate file.
    torch.save(
        {
            'opt': self.opt.state_dict(),
            'sch': self.sch.state_dict(),
            'epoch': self.epoch,
            'cur_loss': _loss,
        }, PJOIN(_dir, 'others.pth'))
def load_param(self, cfg):
    direct = cfg.system['load_path']
    if direct is None:
        return
    if cfg.mode == 'test':
        print('Test at position: ' + direct)
    elif not cfg.optimizer['no_opt']:
        other = torch.load(PJOIN(direct, 'others.pth'),
                           map_location=lambda storage, loc: storage)
        self.opt.load_state_dict(other['opt'])
        self.sch.load_state_dict(other['sch'])
        self.best = other.get('cur_loss', None)
        self.epoch_start += other.get('epoch', 0)
        self.epoch_end += other.get('epoch', 0)
    weight = torch.load(PJOIN(direct, 'weight.pth'),
                        map_location=lambda storage, loc: storage)
    self.net.load_state_dict(weight)
def save(self, loss_now):
    best_save = PJOIN(self.result_dir, 'ckp', 'best')
    if self.epoch == self.epoch_start:
        os.makedirs(best_save)
        if self.best is None:
            self.best = loss_now
            self.__save_param(best_save, self.best)
            return
    if loss_now < self.best:
        # New best loss: remember it, then overwrite the 'best' checkpoint
        # (the original saved the stale value and never updated self.best).
        self.best = loss_now
        self.__save_param(best_save, self.best)
    if self.epoch != self.epoch_end - 1:
        if self.save_epoch == 0:
            return  # Just save the best
        if self.epoch % self.save_epoch != 0:
            return  # Save every save_epoch
    now_save = PJOIN(self.result_dir, 'ckp', str(self.epoch))
    os.makedirs(now_save)
    self.__save_param(now_save, loss_now)
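# A minimal, self-contained sketch of the checkpoint layout the helpers above
# produce: weights in weight.pth, optimizer/scheduler/epoch state in
# others.pth. The tiny net, hyperparameters, and temp directory here are
# illustrative assumptions, not part of this repo.
import os
import tempfile
from os.path import join as PJOIN

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                                   # placeholder model
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
sch = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[10], gamma=0.1)

ckp_dir = tempfile.mkdtemp()
torch.save(net.state_dict(), PJOIN(ckp_dir, 'weight.pth'))
torch.save({'opt': opt.state_dict(), 'sch': sch.state_dict(),
            'epoch': 5, 'cur_loss': 0.123}, PJOIN(ckp_dir, 'others.pth'))

# Reload on CPU regardless of the device the tensors were saved from,
# mirroring the map_location lambda used in load_param.
weight = torch.load(PJOIN(ckp_dir, 'weight.pth'),
                    map_location=lambda storage, loc: storage)
net.load_state_dict(weight)
other = torch.load(PJOIN(ckp_dir, 'others.pth'),
                   map_location=lambda storage, loc: storage)
opt.load_state_dict(other['opt'])
print(sorted(os.listdir(ckp_dir)))  # ['others.pth', 'weight.pth']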
def log_record(self, dic, board_name):
    log = 'Epoch:{:0>4} '.format(self.epoch)
    for key, val in dic.items():
        log += '{}:{:.5f} '.format(key, val)
    if len(dic) > 4:
        # Too many metrics for one line: print one metric per line.
        print(board_name)
        print(log.replace(' ', '\n\r'))
    else:
        print(board_name, log)
    if self.writer is not None:
        with open(PJOIN(self.result_dir, board_name + '_log.txt'), 'a+') as f:
            f.write(log + '\n')
        self.writer.add_scalars(board_name, dic, self.epoch)
    else:
        with open(PJOIN(self.result_dir, 'FinalTest.txt'), 'a+') as f:
            f.write(log.replace(' ', '\n\r'))
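# A quick sketch of the log line log_record builds; the metrics dict and
# epoch value here are made up for illustration.
dic = {'loss': 0.03124, 'acc': 0.91207}
epoch = 7
log = 'Epoch:{:0>4} '.format(epoch)
for key, val in dic.items():
    log += '{}:{:.5f} '.format(key, val)
print('train', log)  # train Epoch:0007 loss:0.03124 acc:0.91207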
def MergeAll(folder_name):
    global COMMENT_ID
    for file in os.listdir(folder_name):
        if os.path.isdir(PJOIN(folder_name, file)):
            MergeAll(PJOIN(folder_name, file))
            continue
        print("Merging... ", PJOIN(folder_name, file))
        tree = ET.parse(PJOIN(folder_name, file))
        root = tree.getroot()
        for comment in list(root):  # getchildren() was removed in Python 3.9
            # Strip a leading '../' so file_path is relative to the project root.
            file_path = comment.attrib["file_path"]
            if file_path[:2] == '..':
                file_path = file_path[3:]
            comment.set('file_path', file_path)
            comment.set('id', str(COMMENT_ID))
            COMMENT_ID += 1
            ROOT.append(comment)
def MergeAll(folder_name):
    global COMMENT_ID, ROOT
    for file in os.listdir(folder_name):
        abspath = PJOIN(folder_name, file)
        if os.path.isdir(abspath):
            MergeAll(abspath)
            continue
        if not abspath.endswith("_comments.xml"):
            continue
        if DEBUG:
            print("Merging... ", abspath)
        tree = ET.parse(abspath)
        root = tree.getroot()
        for comment in list(root):  # getchildren() was removed in Python 3.9
            comment.set('comment_id', str(COMMENT_ID))
            COMMENT_ID += 1
            ROOT.append(comment)
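# Hedged driver sketch for MergeAll above. The globals it relies on (ROOT,
# COMMENT_ID, DEBUG, PJOIN) are initialised here under assumed names, and the
# input folder and output file are placeholders, not paths from this repo.
import os
import xml.etree.ElementTree as ET
from os.path import join as PJOIN

DEBUG = True
COMMENT_ID = 0
ROOT = ET.Element('comments')  # merged comments accumulate under this root

MergeAll('DATA/COMMENTS')      # walk every *_comments.xml under the folder
ET.ElementTree(ROOT).write('merged_comments.xml')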
def get_all_training_data():
    all_files = []
    if DATA_FILES[0] == 'all':
        # Discover every feature file in DATA_DIR by its 'X_' prefix.
        for file in os.listdir(DATA_DIR):
            if file[:2] == 'X_':
                all_files.append(file[2:])
    else:
        all_files = DATA_FILES
    all_x = []
    all_y = []
    for file in all_files:
        train_x = pd.read_csv(PJOIN(DATA_DIR, "X_" + file), header=None)
        all_x.append(np.array(train_x))
        train_y = pd.read_csv(PJOIN(DATA_DIR, "Y_" + file), header=None)
        all_y.append(np.array(train_y))
    all_x = np.concatenate(all_x)
    all_y = np.concatenate(all_y)
    all_y = all_y.reshape(all_y.shape[0])  # flatten labels to a 1-D vector
    return all_x, all_y
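# Example consumer for get_all_training_data, assuming DATA_DIR holds paired
# X_<name>/Y_<name> CSVs and the DATA_FILES global is configured. The
# RandomForest choice is illustrative only, not the repo's classifier.
from sklearn.ensemble import RandomForestClassifier

X, y = get_all_training_data()
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)
print('train accuracy:', clf.score(X, y))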
def runForFolder(folder_name):
    for file in os.listdir(PJOIN(BASE_DIR, PROJECT_NAME, folder_name)):
        if os.path.isdir(PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file)):
            runForFolder(PJOIN(folder_name, file))
            continue
        split_filename = os.path.splitext(file)
        if split_filename[1] not in [".cpp", ".c"]:
            continue
        xml_file = split_filename[0] + "_clang.xml"
        if not os.path.exists(PJOIN(XML_BASE_DIR, folder_name, xml_file)):
            print("XML File Does not Exist: ",
                  PJOIN(XML_BASE_DIR, folder_name, xml_file))
            continue
        os.system("python2 GenerateCommentsXMLForAFile.py "
                  + PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file) + " "
                  + VOCAB_FILE + " "
                  + PROBLEM_DOMAIN_FILE + " "
                  + PJOIN(XML_BASE_DIR, folder_name, xml_file) + " "
                  + PJOIN(PROJECT_NAME, folder_name, file) + " "
                  + PJOIN(OUTPUT_FOLDER, folder_name,
                          split_filename[0] + "_comments.xml"))
def get_all_training_data():
    all_files = []
    if DATA_FILES[0] == 'all':
        for file in os.listdir(DATA_DIR):
            if file[:2] == 'X_':
                all_files.append(file[2:])
    else:
        all_files = DATA_FILES
    all_x = []
    for file in all_files:
        train_x = pd.read_csv(PJOIN(DATA_DIR, "X_" + file), header=None)
        all_x.append(train_x)
    all_x = pd.concat(all_x)
    return all_x
def runForFolder(folder_name, my_due=None, first=False):
    # Use None instead of a mutable default so repeated top-level calls
    # don't share one accumulator list.
    if my_due is None:
        my_due = []
    for file in os.listdir(folder_name):
        abspath = PJOIN(folder_name, file)
        if os.path.isdir(abspath):
            runForFolder(abspath, my_due)
            continue
        (filename, ext) = os.path.splitext(abspath)
        if ext not in ['.c', '.C', '.cc', '.cpp', '.cxx', '.c++']:
            continue
        outprefix = getOutputLoc(abspath)
        if not os.path.exists(outprefix + '_clang.xml'):
            print('Skipping: No compile instructions: ' + abspath)
            continue
        if REUSE and os.path.exists(outprefix + '_comments.xml'):
            print('Skipping: Already Exists: ', outprefix + '_comments.xml')
            continue
        print('Generating comments:', abspath)
        my_due.append((abspath, outprefix))
    if first:
        # Only the top-level call fans the accumulated jobs out to workers.
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            pool.map(makeComGenCall, my_due)
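# makeComGenCall is referenced above but not shown in this snippet. A minimal
# sketch under the assumption that it mirrors the GenerateCommentsXMLForAFile.py
# invocation used elsewhere in this repo; the exact argument order, and the
# VOCAB_FILE/PROBLEM_DOMAIN_FILE globals, are assumptions.
import subprocess

def makeComGenCall(job):
    src_path, outprefix = job
    subprocess.call(['python2', 'GenerateCommentsXMLForAFile.py',
                     src_path, VOCAB_FILE, PROBLEM_DOMAIN_FILE,
                     outprefix + '_clang.xml', src_path,
                     outprefix + '_comments.xml'])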
def __init__(self, cfg):
    super(Docker, self).__init__()
    # Model setup
    print('Compiling the model ...')
    network_file = 'model.{}'.format(cfg.system['net'][0])
    dataset_file = 'dataset.{}'.format(cfg.dataset['file_name'])
    network_module = importlib.import_module(network_file)
    dataset_module = importlib.import_module(dataset_file)
    self.dev = (torch.device('cuda', cfg.system['gpu'][0])
                if len(cfg.system['gpu']) >= 1 and torch.cuda.is_available()
                else torch.device('cpu'))
    self.multi_dev = len(cfg.system['gpu']) > 1
    self.epoch = 'test'
    ## Load the network and its loss
    self.net = getattr(network_module, cfg.system['net'][1])(
        **cfg.system['net_param']).to(self.dev)
    self.criterion = network_module.loss(**cfg.system['loss_param'])
    ## Build the optimizer and LR scheduler (train mode only)
    if cfg.mode == 'train':
        self.best = None
        self.epoch_start = 1
        self.eval_on_train = cfg.optimizer['eval_on_train']
        self.epoch_end = cfg.optimizer['max_epoch'] + 1
        self.save_epoch = cfg.optimizer['save_epoch']
        self.max_batch = cfg.optimizer['max_batch']
        if cfg.optimizer['type'] == 'adam':
            self.opt = torch.optim.Adam(self.net.parameters(),
                                        lr=cfg.optimizer['learning_rate'],
                                        **cfg.optimizer['adam'])
        elif cfg.optimizer['type'] == 'sgd':
            self.opt = torch.optim.SGD(self.net.parameters(),
                                       lr=cfg.optimizer['learning_rate'],
                                       **cfg.optimizer['sgd'])
        self.sch = torch.optim.lr_scheduler.MultiStepLR(
            self.opt,
            cfg.optimizer['milestones'],
            gamma=cfg.optimizer['decay_rate'],
            last_epoch=-1)
    self.load_param(cfg)
    ## GPU allocation
    if self.multi_dev and torch.cuda.device_count() > 1:
        self.net = nn.DataParallel(self.net, cfg.system['gpu'])
    # Load the dataset
    print('Loading the dataset ...')
    if cfg.mode == 'train':
        self.trainloader = dataset_module.dataloader(
            cfg.dataset[cfg.mode], cfg.mode)
        if self.max_batch is None:
            self.max_batch = len(self.trainloader)
        self.testloader = dataset_module.dataloader(
            cfg.dataset['test'],
            'test') if cfg.optimizer['test_on_train'] else None
    else:
        self.testloader = dataset_module.dataloader(
            cfg.dataset[cfg.mode], cfg.mode)
    # Evaluation
    self.result_dir = cfg.system['result_dir']
    self.evaluate = network_module.evaluate(**cfg.system['evaluate_param'])
    self.evaluate.result_dir = PJOIN(self.result_dir, 'save')
    # Tensorboard
    self.writer = SummaryWriter(PJOIN(
        self.result_dir, 'tensorboard')) if cfg.mode == 'train' else None
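# A hedged sketch of the cfg object Docker.__init__ expects in test mode,
# with field names inferred from the code above. The SimpleNamespace wrapper
# and every concrete value ('resnet', 'ResNet', 'mydata', 'results') are
# placeholders for illustration, not names from this repo.
from types import SimpleNamespace

cfg = SimpleNamespace(
    mode='test',
    system={'net': ('resnet', 'ResNet'), 'net_param': {}, 'gpu': [0],
            'loss_param': {}, 'result_dir': 'results',
            'evaluate_param': {}, 'load_path': None},
    dataset={'file_name': 'mydata', 'test': {}},
    optimizer={'no_opt': True},
)
docker = Docker(cfg)  # imports model.resnet.ResNet and dataset.mydata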
def runForFolder(folder_name):
    for file in os.listdir(PJOIN(BASE_DIR, PROJECT_NAME, folder_name)):
        if os.path.isdir(PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file)):
            if not os.path.exists(PJOIN(OUTPUT_FOLDER, folder_name, file)):
                os.mkdir(PJOIN(OUTPUT_FOLDER, folder_name, file))
            runForFolder(PJOIN(folder_name, file))
            continue
        split_filename = os.path.splitext(file)
        if split_filename[1] not in [".cpp", ".c"]:
            continue
        xml_file = split_filename[0] + "_clang.xml"
        if not os.path.exists(PJOIN(XML_BASE_DIR, folder_name, xml_file)):
            # No clang XML yet: try to generate it, then move it into place.
            try:
                s = ("python parsers/clang_parser.py "
                     + PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file))
                for d in dep[PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file)]:
                    s += " " + d
                os.system(s)
                os.system("mv " + PJOIN(BASE_DIR, folder_name, xml_file) + " "
                          + PJOIN(XML_BASE_DIR, folder_name, xml_file))
            except KeyError:
                # No dependency entry for this file; skip it.
                continue
        if os.path.exists(PJOIN(OUTPUT_FOLDER, folder_name,
                                split_filename[0] + "_comments.xml")):
            print("Skipping: Already Exists: ",
                  PJOIN(OUTPUT_FOLDER, folder_name,
                        split_filename[0] + "_comments.xml"))
            continue
        os.chdir("comments/")
        os.system("python2 GenerateCommentsXMLForAFile.py "
                  + PJOIN(BASE_DIR, PROJECT_NAME, folder_name, file) + " "
                  + VOCAB_FILE + " "
                  + PROBLEM_DOMAIN_FILE + " "
                  + PJOIN(XML_BASE_DIR, folder_name, xml_file) + " "
                  + PJOIN(PROJECT_NAME, folder_name, file) + " "
                  + PJOIN(OUTPUT_FOLDER, folder_name,
                          split_filename[0] + "_comments.xml"))
        os.chdir("../")
X = []
Y = []
for file in os.listdir(FEATURES_DIR):
    if not file.endswith("train.csv"):
        continue
    if PROJECT_NAME not in file:
        continue
    # Recover the source file name encoded in the CSV file name.
    fName = file[file.find(PROJECT_NAME) + 1 + len(PROJECT_NAME):-10]
    fName = fName.replace("_", "/")
    if fName not in found:
        print("LEFT: ", fName)
        continue
    anno_data = annotations_map[CODENAME_TO_COMMENTSFILENAME[fName]]
    features_file = pd.read_csv(PJOIN(FEATURES_DIR, file), header=None,
                                encoding="ISO-8859-1")
    features_np = np.array(features_file)
    features_map = {}
    for feat in features_np:
        if len(feat[2:]) > 12:
            print("ERROR: Length of features greater than 12")
            print(feat)
        features_map[feat[1]] = feat[2:14]
    for comments_data in anno_data:
        if comments_data[0] not in features_map:
            print("Comment NOT FOUND:", comments_data[0])
            continue
        features = features_map[comments_data[0]]
        labels_intuitive = comments_data[-2]
def getOutputLoc(path):
    relpath = path[path.index(PROJECT_NAME):]
    relpath = '/'.join(relpath.split('/')[1:])
    outpath = os.path.join(OUTPUTS_DIR, relpath)
    return PJOIN(outpath, os.path.splitext(outpath.split('/')[-1])[0])
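# Worked example of getOutputLoc's path arithmetic; the globals and the input
# path below are placeholders. Each source file gets a directory of its own,
# and the returned prefix is later extended with '_clang.xml'/'_comments.xml'.
OUTPUTS_DIR = 'OUTPUTS'
PROJECT_NAME = 'myproj'
print(getOutputLoc('repos/myproj/src/util.cpp'))
# -> OUTPUTS/src/util.cpp/util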
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from os.path import join as PJOIN
import sys

if len(sys.argv) != 3:
    print("Give 2 Arguments - 1) Annotations file name, 2) Project Name")
    exit(-1)

FILE_NAME = sys.argv[1]
PROJECT_NAME = sys.argv[2]
FILE_PATH = PJOIN("DATA", "ANNOTATED", FILE_NAME)
OUTPUT_FILE_PATH = PJOIN("DATA", "GENERATED", FILE_NAME)
MAP = {'U': 'U', 'PU': 'P', 'NU': 'N'}
THRESHOLD = 10

# c is a vector of size 31: [comment text, C1, C2, ..., C30]
def get_label(c):
    # IF C18 OR C19 OR C20 OR C21 OR C22 OR C28 OR C29 THEN U
    if c[18] or c[19] or c[20] or c[21] or c[22] or c[28] or c[29]:
        return 'U'
    # IF C9 AND C3 THEN U
    if c[9] and c[3]:
        return 'U'
    # IF C11 AND C3 THEN U
    if c[11] and c[3]:
        return 'U'
    # IF (C25 OR C23 OR C26 OR C27) AND C3 THEN U
    if (c[25] or c[23] or c[26] or c[27]) and c[3]:
        return 'U'