def run_describe_dataset():
    df = get_dataset()
    describe_bq_dataset(df)
    repos = pd.read_csv(join(DATA_PATH, 'repos_full.csv'))
    print("Initial Java repositories", len(repos[repos.language == 'Java']))
    generete_computed_values(df)
def candidate_followup_files():
    repos = pd.read_csv(join(DATA_PATH, 'wellcomming_projects.csv'))
    repos_of_interest = repos.repo_name.unique()
    df = get_dataset()
    relevant_smells = [
        'NPathComplexity', 'FallThrough', 'JavadocParagraph',
        'TrailingComment', 'IllegalImport', 'AvoidStaticImport',
        'IllegalCatch', 'ParameterAssignment', 'UnnecessaryParentheses'
    ]
    df = df[df.repo_name.isin(repos_of_interest)]
    df = df[
        ((df.NPathComplexity > 0) & (df.NPathComplexity < 3))
        | ((df.FallThrough > 0) & (df.FallThrough < 3))
        # | ((df.JavadocParagraph > 0) & (df.JavadocParagraph < 3))
        # | ((df.TrailingComment > 0) & (df.TrailingComment < 3))
        | ((df.IllegalImport > 0) & (df.IllegalImport < 3))
        | ((df.AvoidStaticImport > 0) & (df.AvoidStaticImport < 3))
        | ((df.IllegalCatch > 0) & (df.IllegalCatch < 3))
        | ((df.ParameterAssignment > 0) & (df.ParameterAssignment < 3))
        | ((df.UnnecessaryParentheses > 0) & (df.UnnecessaryParentheses < 3))]
    df['robust_smells_num'] = df[relevant_smells].sum(axis=1)
    df['random_metric'] = np.random.randint(1, 100, df.shape[0])
    df = df[['repo_name', 'robust_smells_num', 'random_metric',
             'full_file_name'] + relevant_smells]
    df = df.sort_values(['repo_name', 'random_metric'],
                        ascending=[False, False])
    df.to_csv(join(DATA_PATH, 'candidate_followup_files.csv'), index=False)
    print("files", len(df))
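# The hand-written smell filter above repeats the same "(> 0) & (< 3)" clause
# per column. A minimal, behavior-equivalent sketch that builds the mask from a
# smell list instead (pure pandas; the commented-out smells are simply left
# out of the list):
import operator
from functools import reduce

def robust_smell_mask(df, smells, low=0, high=3):
    """OR together '(low < df[col] < high)' across the given smell columns."""
    clauses = [(df[col] > low) & (df[col] < high) for col in smells]
    return reduce(operator.or_, clauses)

# usage, mirroring the active clauses above:
# mask = robust_smell_mask(df, ['NPathComplexity', 'FallThrough',
#                               'IllegalImport', 'AvoidStaticImport',
#                               'IllegalCatch', 'ParameterAssignment',
#                               'UnnecessaryParentheses'])
# df = df[mask]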
def clean_dataset(path):
    dataset = data_utils.get_dataset()
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset.to_csv(path, index=False)
def candidate_followup_projects():
    df = get_dataset()
    robust_smells = get_robust_smells()
    file_with_robust_smells = df[
        ((df.NPathComplexity > 0) & (df.NPathComplexity < 3))
        | ((df.FallThrough > 0) & (df.FallThrough < 3))
        # | ((df.JavadocParagraph > 0) & (df.JavadocParagraph < 3))
        # | ((df.TrailingComment > 0) & (df.TrailingComment < 3))
        | ((df.IllegalImport > 0) & (df.IllegalImport < 3))
        | ((df.AvoidStaticImport > 0) & (df.AvoidStaticImport < 3))
        | ((df.IllegalCatch > 0) & (df.IllegalCatch < 3))
        | ((df.ParameterAssignment > 0) & (df.ParameterAssignment < 3))
        | ((df.UnnecessaryParentheses > 0) & (df.UnnecessaryParentheses < 3))]
    agg = file_with_robust_smells.groupby(['repo_name'],
                                          as_index=False).agg({'file': 'count'})
    agg = agg.rename(columns={'file': 'files_with_robust_smells'})
    agg = agg[agg.files_with_robust_smells >= 15]
    agg = agg.sort_values('files_with_robust_smells', ascending=False)
    agg.to_csv(join(DATA_PATH, 'candidate_followup_projects.csv'), index=False)

    repos = pd.read_csv(join(DATA_PATH, 'repo_profile.csv'))
    repos['wellcomming'] = repos.apply(
        lambda x: 1 if x.retention_prob > 0.3 and x.authors > 20
        and x.onboarding_prob > 0.3 else 0,
        axis=1)
    repos = repos[repos['wellcomming'] == 1]
    repos = repos[['repo_name']]
    df = pd.merge(repos, agg, on='repo_name')
    df.to_csv(join(DATA_PATH, 'wellcomming_projects.csv'), index=False)
    print("projects", len(df))
def get_data(self):
    with tf.name_scope('data'):
        self.anchor = np.array([-1, -1, 2, 2])
        train_data, test_data = data_utils.get_dataset(self.batch_size,
                                                       anchor=self.anchor)
        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        (img, self.people_label, self.car_label, self.iou_scores,
         self.bbox_matrix, self.tx_star, self.ty_star, self.tw_star,
         self.th_star, self.label, self.people_mask,
         self.car_mask) = iterator.get_next()
        self.img = img
        self.train_init = iterator.make_initializer(train_data)
        self.test_init = iterator.make_initializer(test_data)
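# The tx_star/ty_star/tw_star/th_star tensors above are box-regression
# targets. A minimal numpy sketch of the usual anchor parameterization
# (Faster R-CNN style); whether data_utils.get_dataset uses exactly this
# convention is an assumption:
import numpy as np

def regression_targets(gt, anchor):
    """gt and anchor are [x1, y1, x2, y2]; returns (tx*, ty*, tw*, th*)."""
    gw, gh = gt[2] - gt[0], gt[3] - gt[1]
    aw, ah = anchor[2] - anchor[0], anchor[3] - anchor[1]
    gx, gy = gt[0] + gw / 2, gt[1] + gh / 2
    ax, ay = anchor[0] + aw / 2, anchor[1] + ah / 2
    return ((gx - ax) / aw, (gy - ay) / ah,
            np.log(gw / aw), np.log(gh / ah))

# e.g. with the anchor defined above:
# tx, ty, tw, th = regression_targets(np.array([0., 0., 1., 1.]),
#                                     np.array([-1., -1., 2., 2.]))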
def train(args):
    train, test = du.get_dataset(args)
    enc_train, dec_train = du.pad(train[0], train[1], args)
    enc_test, dec_test = du.pad(test[0], test[1], args)
    if args.decoder_go_padding:
        args.decoder_time_steps += 1
    if args.encoder_end_padding:
        args.encoder_time_steps += 1
    enc_train_oh = du.one_hot(enc_train, args)
    dec_train_oh = du.one_hot(dec_train, args)
    enc_test_oh = du.one_hot(enc_test, args)
    dec_test_oh = du.one_hot(dec_test, args)
    initializer = tf.random_uniform_initializer(-args.intializations,
                                                args.intializations,
                                                seed=args.seed)
    with tf.Session() as sess:
        model = EncDecModel(args)
        tf.initialize_all_variables().run()
        # Input feed: encoder inputs, decoder inputs, as provided.
        train_feed = model.feed(enc_train_oh, dec_train_oh)
        test_feed = model.feed(enc_test_oh, dec_test_oh)
        encoder_inputs, decoder_inputs = enc_train_oh, dec_train_oh
        for epoch in range(1, args.epochs):  # was xrange (Python 2)
            run_epoch(sess, model, args, encoder_inputs, decoder_inputs)
            loss = sess.run([model.loss], train_feed)[0]
            test_loss = sess.run([model.loss], test_feed)[0]
            print("[%s] Loss : %s" % (epoch, loss),
                  "test loss : %s" % test_loss)
            if epoch % args.decay_epoch == 0:
                lr_value = sess.run([model.learning_rate])[0] * args.lr_decay
                print("New learning rate %s" % lr_value)
                model.set_lr(sess, lr_value)
                args.decay_epoch = args.decay_epoch * 2
            # print one test sample with dropout and training mode disabled
            model.training = False
            model.keep_prob = 1.0
            enc_sample = enc_test_oh[:, 0, :].reshape(
                [-1, 1, args.upper_limit + 1])
            dec_sample = dec_test_oh[:, 0, :].reshape(
                [-1, 1, args.upper_limit + 1])
            sample_feed = model.feed(enc_sample, dec_sample)
            print(enc_test[:, 0], dec_test[:, 0],
                  sess.run([model.predictions],
                           sample_feed)[0][1].reshape([-1]))
            model.training = True
            model.keep_prob = args.keep_prob
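# decoder_go_padding / encoder_end_padding each add one time step above. A
# minimal numpy sketch of what such padding typically does, assuming integer
# token sequences shaped [time, batch] (consistent with enc_test[:, 0] above)
# and hypothetical GO/END token ids; du.pad's real behavior may differ:
import numpy as np

GO, END = 0, 0  # hypothetical special-token ids

def add_go(dec, go=GO):
    """Prepend a GO step so the decoder sees a start symbol at t=0."""
    return np.vstack([np.full((1, dec.shape[1]), go, dec.dtype), dec])

def add_end(enc, end=END):
    """Append an END step so the encoder sees an explicit terminator."""
    return np.vstack([enc, np.full((1, enc.shape[1]), end, enc.dtype)])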
def evaluate_smell_monotonocity():
    df = get_dataset()
    relevant_columns = set(df.columns) - NON_PREDICTIVE_FEATURES
    monotone_df = evaluate_monotonocity(
        df,
        relevant_columns,
        monotone_column='quality_group',
        monotone_order=['reduced_risk', 'other', 'hotspot'])
    return monotone_df
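# A minimal sketch of the monotonicity check this relies on: for each feature,
# verify that its mean is ordered along the quality groups. This is an assumed
# reading of evaluate_monotonocity, not its actual implementation:
def is_monotone(df, feature, monotone_column, monotone_order):
    means = [df.loc[df[monotone_column] == g, feature].mean()
             for g in monotone_order]
    increasing = all(a <= b for a, b in zip(means, means[1:]))
    decreasing = all(a >= b for a, b in zip(means, means[1:]))
    return increasing or decreasing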
def main(args):
    print('dataset =', flags.FLAGS.dataset)
    with tf.Graph().as_default():
        # dataset input, always using CPU for this section
        with tf.device('/cpu:0'):
            # dataset source
            test_dataset = get_dataset(dset=flags.FLAGS.dataset, mode='test')
            # iterator
            iterator = tf.data.Iterator.from_structure(
                test_dataset.output_types, test_dataset.output_shapes)
            # get a new batch from iterator
            get_batch = iterator.get_next()
            # op for initializing the iterator:
            # chooses the dataset for one epoch
            test_init_op = iterator.make_initializer(test_dataset)

        # restore saved model and run testing
        init_op = tf.global_variables_initializer()
        with tf.Session() as sess:
            model_path = flags.FLAGS.ckpt_path
            model_meta = model_path + '.meta'
            saver = tf.train.import_meta_graph(model_meta)
            print(datetime.now(), 'meta graph imported from', model_meta)
            saver.restore(sess, model_path)
            print(datetime.now(), 'model restored')
            # retrieve the saved ops from the graph collections
            accuracy = tf.get_collection('accuracy')[0]
            images = tf.get_collection('images')[0]
            labels = tf.get_collection('labels')[0]
            is_training = tf.get_collection('is_training')[0]
            sess.run(init_op)
            print(datetime.now(), 'model initialized')

            # testing phase
            print('==== testing phase ====')
            # specify dataset for test
            sess.run(test_init_op)
            # get batch for testing
            test_images, test_labels = sess.run(get_batch)
            # run the testing op
            test_acc = sess.run(accuracy,
                                feed_dict={images: test_images,
                                           labels: test_labels,
                                           is_training: False})
            print(datetime.now(),
                  'testing result: acc={:.4f}'.format(test_acc))
def plot_duration_by_length():
    df = get_dataset()
    df['CCP'] = df['corrective_rate'].map(
        lambda x: round(ccp_estimator.estimate_positives(x), 2))
    fig = plot_deciles(df,
                       grouping_column='line_count',
                       metric_column='CCP',
                       title='CCP by Line Count Deciles',
                       xaxis_title='Number of Lines',
                       output_file=None)
    fig.show()
    print("Pearson correlation of corrective rate and line count",
          df.corr()['line_count']['corrective_rate'])
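# estimate_positives above inverts a noisy classifier's hit rate into an
# estimated true-positive rate. A minimal sketch of the standard linear
# ("adjusted count") prevalence correction, assuming known classifier recall
# and false-positive rate; the constants here are placeholders, not the
# estimator's actual parameters:
def estimate_positives_sketch(observed_rate, recall=0.85, fpr=0.05):
    """If observed = p*recall + (1-p)*fpr, solve for the true prevalence p."""
    p = (observed_rate - fpr) / (recall - fpr)
    return min(max(p, 0.0), 1.0)  # clip to a valid probability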
def run_experiment(config, learner):
    dataset = None
    print(config)
    log_params(config)
    if config['experiment_name'] == 'cie10':
        dataset = data_utils.get_dataset_null_empty()
    else:
        dataset = data_utils.get_dataset()
    log_param('experiment_name', config["experiment_name"])
    build_dataset_experiment(config["experiment_name"], dataset)
    build_model(config, learner)
    print(
        '----------------------------------------------------------------------------------------\n'
    )
def file_by_author_twin_analysis():
    df = get_dataset(binary=False)
    single_author_files = df[df.authors == 1]
    keys = ['repo_name', 'Author_email']
    filtering_function = lambda x: x.full_file_name_x == x.full_file_name_y
    comparision_function = lambda first, second: second > first \
        if isinstance(first, numbers.Number) and isinstance(second, numbers.Number) \
        else None
    comparision_columns = SINGLE_SMELL + [CCP, 'full_file_name']
    # TODO - add groups
    comp_df = compare_twin_behaviours(first_behaviour=single_author_files,
                                      second_behaviour=single_author_files,
                                      keys=keys,
                                      comparision_columns=comparision_columns,
                                      comparision_function=comparision_function,
                                      filtering_function=filtering_function)
    comp_df.to_csv(os.path.join(DATA_PATH, 'file_by_author_twin_analysis.csv'))
    # comp_df = pd.read_csv(os.path.join(DATA_PATH, 'file_by_author_twin_analysis.csv'))
    pearson = comp_df.corr()[CCP + COMPARISON_SUFFIX]
    pearson_df = pd.DataFrame(pearson).reset_index()
    pearson_df.columns = ['feature', 'Pearson']
    pearson_df = pearson_df.sort_values('Pearson', ascending=False)
    print(pearson_df)
    pearson_df.to_csv(os.path.join(DATA_PATH, 'file_by_author_twin_corr.csv'),
                      index=False)
    stats = compute_confusion_matrics(df=comp_df,
                                      concept=CCP + COMPARISON_SUFFIX,
                                      columns=[i + COMPARISON_SUFFIX
                                               for i in SINGLE_SMELL],
                                      keys=keys)
    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df = stats_df.reset_index().rename(columns={'index': 'feature'})
    stats_df['feature'] = stats_df['feature'].map(lambda x: x[:-4])
    stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                    ascending=[False, True])
    stats_df.to_csv(os.path.join(DATA_PATH, AUTHOR_TWIN_CM_FILE), index=False)
    return pearson_df
def evaluate_model(weight_name):
    batch_size = 10
    x_train, y_train = get_data(aug=True, name='train')
    x_test, y_test = get_data(aug=False, name='test')
    num_data = len(x_test)
    [x_test] = img_standardization(x_train, x_test)
    x_test = _parse_function(x_test, im_size=224)
    dataset_test = get_dataset(x_test, y_test, batch_size, resize=False)
    # evaluate() computes the loss and the 'accuracy' metric, so the loaded
    # model must be compiled
    model = tf.keras.models.load_model('./weight/' + weight_name, compile=True)
    [loss, acc] = model.evaluate(dataset_test,
                                 steps=math.ceil(num_data / batch_size))
    print('TEST loss: ', loss)
    print('TEST acc: ', acc)
    return
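# A minimal sketch of what get_dataset above might look like for in-memory
# tensors: wrap (x, y) in a tf.data pipeline and batch it (TF2 API). This is
# an assumed implementation; the project's real get_dataset may also shuffle
# or augment:
import tensorflow as tf

def get_dataset_sketch(x, y, batch_size, resize=False):
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    if resize:
        # optional resize step, mirroring the resize flag in the calls above
        ds = ds.map(lambda img, label: (tf.image.resize(img, (224, 224)),
                                        label))
    return ds.batch(batch_size)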
def main():
    args = parse.parse_args()
    dataset = get_dataset(args)
    if args.mode == 'train':
        # concatenate the two sentences and use a sequence classification model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-chinese').to(device)
        train(model, dataset, args)
    else:
        # args = torch.load(
        #     os.path.join(args.output_dir,
        #                  f'checkpoint-{args.best_step}/training_args.bin'))
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, f'checkpoint-{args.best_step}')
        ).to(device)
        pred = predict(model, dataset, args)[0]
        pred = pd.Series(pred.numpy().tolist())
        res_csv = pd.concat([dataset.df['qid'], pred], axis=1)
        res_csv.to_csv(os.path.join(args.output_dir, 'result.csv'),
                       header=False, index=False, sep='\t')
if __name__ == '__main__':
    batch_size = 64
    nb_epoch = 50
    image_size = 224
    nb_classes = 9
    channels = 3

    print("Loading pre-split train/test/validation datasets")
    df_train = pd.read_csv('data/iter0_im_tr_sa.csv',
                           names=['file_name', 'label', 'do_aug'])
    df_test = pd.read_csv('data/iter0_im_te.csv',
                          names=['file_name', 'label', 'do_aug'])
    df_val = pd.read_csv('data/iter0_im_val.csv',
                         names=['file_name', 'label', 'do_aug'])

    print("Reading data with normalization and augmentation")
    x_train, y_train = get_dataset(df_train, image_size, isDicom=True)
    x_valid, y_valid = get_dataset(df_val, image_size, isDicom=True)
    x_test, y_test = get_dataset(df_test, image_size, isDicom=True)

    # x, y = get_dogcat_dataset(img_rows)
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
    # x_valid = x_test
    # y_valid = y_test
    # print("Reshaping Data")
    # print("X_train Shape: ", x_train.shape)
    # x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, channels)
    # x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, channels)
    # x_valid = x_valid.reshape(x_valid.shape[0], img_rows, img_cols, channels)
import numpy as np  # added: used below but not imported
import torch.optim as optim
from sklearn.model_selection import train_test_split  # added: used below but not imported
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

from data_utils import get_dataset, build_vocab, DQDataset, collate_fn
from model import SiameseNet, loss_fn, accuracy_score, run_on_example

DATA_PATH = r'D:/Jupyter work/Duplicate Question Detection/questions.csv'
TRAIN_BATCH_SIZE = 32
VALIDATE_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

dataset = get_dataset(DATA_PATH)
s = dataset[['question1', 'question2', 'is_duplicate']].values

print("Splitting dataset")
train, test_and_val = train_test_split(s, test_size=0.3)
same_idx = np.where(train[:, 2] == 1)[0]
train_set = train[same_idx]

print("Building vocab")
vocab = build_vocab(train_set)

print("Creating DataLoader")
dlt = DQDataset(train_set, vocab)
def length_groups():
    df = get_dataset()
    for i in [0.25, 0.75]:
        print("length quantile", i, df.line_count.quantile(i))
if __name__ == '__main__':
    # Execution start time, used to calculate total script runtime.
    startTime = time()

    # Config
    dropout = 0.20
    lr_rate = 0.001
    loss_patience = 1
    units = [32, 16]
    # Display the first n predicted/expected test results in the terminal.
    # Does not affect training/testing.
    print_results = 10
    # Multi-GPU support: replace the number below with the number of GPUs.
    # Default: gpus=0
    gpus = 0

    # Check that our train/test data is available, then load it.
    train, test = data_utils.get_dataset()

    # Split train data into input (X) and output (Y) variables.
    X_train = train[:, 1:3197]
    y_train = train[:, 0]

    # Split test data into input (X) and output (Y) variables.
    X_test = test[:, 1:3197]
    y_test = test[:, 0]

    # Normalize train and test features.
    X_train, X_test = normalize_data(X_train, X_test)

    # Create model.
    model = build_model(gpus, units, dropout)
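# A minimal sketch of a normalize_data helper consistent with the call above:
# standardize both splits with statistics fitted on the training set only, so
# no test information leaks into training. This is an assumed implementation,
# not necessarily the project's:
def normalize_data(X_train, X_test):
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0) + 1e-8  # avoid division by zero
    return (X_train - mean) / std, (X_test - mean) / std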
# some config
data_dir = 'data/CASIA-WebFace_mtcnn_182/'  # directory containing aligned face patches
validation_set_split_ratio = 0.3
max_nrof_epochs = 50
validate_every_n_epochs = 1
batch_size = 256
image_size = (160, 160)
epoch_size = 1000      # number of batches per epoch
embedding_size = 512   # dimensionality of the embedding
random_crop = True     # augmentation
random_flip = True     # augmentation
random_rotate = True   # augmentation
keep_prob = 0.8
min_nrof_val_images_per_class = 0  # minimum number of images per class

dataset = data_utils.get_dataset(data_dir)
# note: both branches below are currently identical; the split ratio alone
# controls the validation set size
if validation_set_split_ratio > 0:
    train_set, val_set = data_utils.split_dataset(
        dataset, validation_set_split_ratio,
        min_nrof_val_images_per_class, 'SPLIT_IMAGES')
else:
    train_set, val_set = data_utils.split_dataset(
        dataset, validation_set_split_ratio,
        min_nrof_val_images_per_class, 'SPLIT_IMAGES')

# Let's take a look at the dataset
print(type(train_set))
print(len(train_set))
print(train_set[:5])
print(train_set[0])

df_graph = tf.Graph()
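# A minimal sketch of a 'SPLIT_IMAGES' split consistent with the call above:
# move a fraction of each class's images into the validation set, skipping
# classes whose validation share would fall below the minimum. This is an
# assumed implementation of data_utils.split_dataset; ImageClass here is a
# stand-in for whatever per-class record the project uses:
from collections import namedtuple

ImageClass = namedtuple('ImageClass', ['name', 'image_paths'])  # stand-in

def split_dataset_sketch(dataset, split_ratio, min_nrof_val_images_per_class,
                         mode):
    assert mode == 'SPLIT_IMAGES'
    train_set, val_set = [], []
    for cls in dataset:  # each cls has .name and .image_paths
        paths = list(cls.image_paths)
        nrof_val = int(round(len(paths) * split_ratio))
        if nrof_val < min_nrof_val_images_per_class:
            train_set.append(ImageClass(cls.name, paths))
            continue
        val_set.append(ImageClass(cls.name, paths[:nrof_val]))
        train_set.append(ImageClass(cls.name, paths[nrof_val:]))
    return train_set, val_set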
def main(args):
    sleep(random.random())
    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    # facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv))
    dataset = data_utils.get_dataset(args.input_dir)

    print('Creating networks and loading parameters')
    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, log_device_placement=False))
        with sess.as_default():
            pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None)

    minsize = 20                 # minimum size of face
    threshold = [0.6, 0.7, 0.7]  # three steps' thresholds
    factor = 0.709               # scale factor

    # Add a random key to the filename to allow alignment using multiple processes
    random_key = np.random.randint(0, high=99999)
    bounding_boxes_filename = os.path.join(
        output_dir, 'bounding_boxes_%05d.txt' % random_key)

    with open(bounding_boxes_filename, "w") as text_file:
        nrof_images_total = 0
        nrof_successfully_aligned = 0
        if args.random_order:
            random.shuffle(dataset)
        for cls in dataset:
            output_class_dir = os.path.join(output_dir, cls.name)
            if not os.path.exists(output_class_dir):
                os.makedirs(output_class_dir)
                if args.random_order:
                    random.shuffle(cls.image_paths)
            for image_path in cls.image_paths:
                nrof_images_total += 1
                filename = os.path.splitext(os.path.split(image_path)[1])[0]
                output_filename = os.path.join(output_class_dir,
                                               filename + '.png')
                print(image_path)
                if not os.path.exists(output_filename):
                    try:
                        img = misc.imread(image_path)
                    except (IOError, ValueError, IndexError) as e:
                        errorMessage = '{}: {}'.format(image_path, e)
                        print(errorMessage)
                    else:
                        if img.ndim < 2:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))
                            continue
                        if img.ndim == 2:
                            img = data_utils.to_rgb(img)
                        img = img[:, :, 0:3]

                        bounding_boxes, _ = align.detect_face.detect_face(
                            img, minsize, pnet, rnet, onet, threshold, factor)
                        nrof_faces = bounding_boxes.shape[0]
                        if nrof_faces > 0:
                            det = bounding_boxes[:, 0:4]
                            det_arr = []
                            img_size = np.asarray(img.shape)[0:2]
                            if nrof_faces > 1:
                                if args.detect_multiple_faces:
                                    for i in range(nrof_faces):
                                        det_arr.append(np.squeeze(det[i]))
                                else:
                                    bounding_box_size = (
                                        (det[:, 2] - det[:, 0]) *
                                        (det[:, 3] - det[:, 1]))
                                    img_center = img_size / 2
                                    offsets = np.vstack(
                                        [(det[:, 0] + det[:, 2]) / 2 - img_center[1],
                                         (det[:, 1] + det[:, 3]) / 2 - img_center[0]])
                                    offset_dist_squared = np.sum(
                                        np.power(offsets, 2.0), 0)
                                    # some extra weight on the centering
                                    index = np.argmax(
                                        bounding_box_size -
                                        offset_dist_squared * 2.0)
                                    det_arr.append(det[index, :])
                            else:
                                det_arr.append(np.squeeze(det))
                            for i, det in enumerate(det_arr):
                                det = np.squeeze(det)
                                bb = np.zeros(4, dtype=np.int32)
                                bb[0] = np.maximum(det[0] - args.margin / 2, 0)
                                bb[1] = np.maximum(det[1] - args.margin / 2, 0)
                                bb[2] = np.minimum(det[2] + args.margin / 2,
                                                   img_size[1])
                                bb[3] = np.minimum(det[3] + args.margin / 2,
                                                   img_size[0])
                                cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]
                                scaled = misc.imresize(
                                    cropped,
                                    (args.image_size, args.image_size),
                                    interp='bilinear')
                                nrof_successfully_aligned += 1
                                filename_base, file_extension = os.path.splitext(
                                    output_filename)
                                if args.detect_multiple_faces:
                                    output_filename_n = "{}_{}{}".format(
                                        filename_base, i, file_extension)
                                else:
                                    output_filename_n = "{}{}".format(
                                        filename_base, file_extension)
                                misc.imsave(output_filename_n, scaled)
                                text_file.write('%s %d %d %d %d\n' % (
                                    output_filename_n, bb[0], bb[1], bb[2], bb[3]))
                        else:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))

    print('Total number of images: %d' % nrof_images_total)
    print('Number of successfully aligned images: %d' %
          nrof_successfully_aligned)
def train_vgg16(lr=1e-4, epochs=50):
    x_train, y_train = get_data(aug=True, name='train')
    x_val, y_val = get_data(aug=True, name='val')
    x_test, y_test = get_data(aug=False, name='test')
    num_data = x_train.shape[0]
    num_val = x_val.shape[0]
    num_test = x_test.shape[0]
    print('training set before preprocessing: ', x_train.shape)
    print('validation set before preprocessing: ', x_val.shape)
    [x_train, x_val, x_test] = img_standardization(x_train, x_train,
                                                   x_val, x_test)

    # parse numpy arrays into resized tensors
    x_train = _parse_function(x_train, im_size=224)
    x_val = _parse_function(x_val, im_size=224)
    x_test = _parse_function(x_test, im_size=224)

    batch_size = 16
    dataset_train = get_dataset(x_train, y_train, batch_size, resize=False)
    dataset_val = get_dataset(x_val, y_val, batch_size, resize=False)
    dataset_test = get_dataset(x_test, y_test, batch_size, resize=False)

    # build model
    print('building model...')
    model = new_vgg16()

    # compile
    adam = tf.keras.optimizers.Adam(lr=lr)
    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # callbacks
    checkpointer = ModelCheckpoint('./weight/vgg16_ECG200_03.h5',
                                   monitor='val_loss', save_best_only=True)
    # reduce_lr = LearningRateScheduler(lr_scheduler)
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5,
                                  patience=3, min_lr=1e-6)
    tensorboard = TensorBoard(log_dir='./log/ECG200/vgg16/',
                              write_graph=False, batch_size=batch_size)

    print('start training...')
    history = model.fit(dataset_train,
                        steps_per_epoch=math.ceil(num_data / batch_size),
                        epochs=epochs,
                        validation_data=dataset_val,
                        # was num_data; validation steps should cover the
                        # validation set, not the training set
                        validation_steps=math.ceil(num_val / batch_size),
                        callbacks=[checkpointer, reduce_lr, tensorboard],
                        verbose=2)

    # Testing
    # [loss, acc] = model.evaluate(x_test, y_test, batch_size=batch_size)
    [loss, acc] = model.evaluate(dataset_test,
                                 steps=math.ceil(num_test / batch_size))
    print('TEST loss: ', loss)
    print('TEST accuracy: ', acc)
    return history
import pandas as pd
import numpy as np

from data_utils import get_dataset
from preprocessing import remove_object_cols
from models import kfold_lgb, get_logistic
from submission_utils import OptimizedRounder, generate_submission
from evaluation_utils import sklearn_quadratic_kappa

TARGET_COL = 'AdoptionSpeed'

if __name__ == '__main__':
    # step 1 - load and transform data
    # load train and test tabular datasets
    datasets = {
        dataset_type: get_dataset(dataset_type)
        for dataset_type in ('train', 'test')
    }
    # remove all string columns from the dataset
    # todo: investigate whether any int/float categorical cols are left that
    # haven't been one-hot encoded
    cleaned_datasets = {
        dataset_type: remove_object_cols(dataset)
        for dataset_type, dataset in datasets.items()
    }
    # extract training labels
    y_train = cleaned_datasets['train'][TARGET_COL]
    print(cleaned_datasets)

    # step 2 - train a model and get its outputs
    # get outputs from k-fold CV LGBM training
def load_data(audio_config, data_config):
    return get_dataset(data_config, audio_config)
parser.add_argument("--dev-data-path", default="data/test_set.npz")
parser.add_argument("--evaluate-every", default=500, type=int)
parser.add_argument("--model-dir", default="./model_dir")
# type=int added so values passed on the command line aren't left as strings
parser.add_argument("--n-epochs", default=30, type=int)
parser.add_argument("--batch-size", default=100, type=int)
args = parser.parse_args()

train_data = np.load(args.train_data_path)
dev_data = np.load(args.dev_data_path)
X_train = train_data["features"]
y_train = train_data["labels"].reshape(-1, 1)
X_dev = dev_data["features"]
y_dev = dev_data["labels"].reshape(-1, 1)

train_set = get_dataset(X_train, y_train,
                        n_epochs=args.n_epochs,
                        batch_size=args.batch_size)
dev_set = get_dataset(X_dev, y_dev, shuffle=False)
data_iter = tf.contrib.data.Iterator.from_structure(
    train_set.output_types, train_set.output_shapes)
train_init = data_iter.make_initializer(train_set)
dev_init = data_iter.make_initializer(dev_set)

# Initialize model path
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(args.model_dir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
import tensorflow as tf

import data_utils
import params

batch_size = 64
lr = 0.001
epoch_num = 100
display_step = 10

graph = tf.Graph()
with graph.as_default():
    # get dataset
    d = data_utils.get_files('./data/spoken_numbers_pcm', 1)
    arr_x, arr_y = data_utils.get_dataset(d)

    # training set (all but the first 128 examples)
    # note: shuffle() after batch() shuffles whole batches, not examples
    data_x_train = tf.data.Dataset.from_tensor_slices(arr_x[128:])
    data_y_train = tf.data.Dataset.from_tensor_slices(arr_y[128:])
    training_set = tf.data.Dataset.zip(
        (data_x_train, data_y_train)).batch(batch_size).shuffle(512)
    iterator_train = training_set.make_initializable_iterator()
    ne_train = iterator_train.get_next()

    # validation set (the first 128 examples)
    data_x_val = tf.data.Dataset.from_tensor_slices(arr_x[:128])
    data_y_val = tf.data.Dataset.from_tensor_slices(arr_y[:128])
    validation_set = tf.data.Dataset.zip((data_x_val, data_y_val)).batch(128)
    iterator_validation = validation_set.make_initializable_iterator()
    ne_validation = iterator_validation.get_next()

    # define the placeholders
    x = tf.placeholder(tf.float32, [None, 168, 13, 1])
    y = tf.placeholder(tf.float32, [None, 10])
def bgu_etl():
    df = pd.read_csv(join(DATA_PATH, 'bgu_dataset.csv'))
    print("records", len(df))
    print("projects", df.Project.unique())
    print("Version", df.Version.unique())
    # print("File", df.File.unique())
    metrics = ['file_ccp', 'worse_10_hs', 'reduced_risk']
    keys = ['repo_name', 'full_file_name']
    project_versions = df.groupby(['Project'],
                                  as_index=False).agg({'Version': max})
    df = pd.merge(df, project_versions,
                  left_on=['Version', 'Project'],
                  right_on=['Version', 'Project'],
                  how='inner')
    smells_df = get_dataset()
    smells_df['project'] = smells_df.repo_name.map(
        lambda x: x[x.find('/') + 1:])
    smells_repos = smells_df['project'].unique()
    # note: the original list was missing the commas between items, so
    # Python's implicit string concatenation silently collapsed it into a
    # single string
    bug_repos = [
        'camel', 'hadoop', 'flink', 'kafka', 'openmeetings', 'karaf',
        'hbase', 'uima-ruta', 'lucene-solr', 'deltaspike', 'jackrabbit-oak',
        'pulsar', 'ofbiz', 'cayenne', 'commons-codec', 'parquet-mr', 'kylin',
        'hive', 'commons-validator', 'maven-surefire', 'syncope',
        'commons-math', 'tomcat', 'atlas', 'struts', 'tika',
        'servicecomb-java-chassis', 'ranger', 'cassandra', 'cxf', 'avro',
        'nifi', 'bookkeeper', 'clerezza', 'systemml', 'asterixdb', 'maven',
        'zeppelin', 'commons-collections', 'jena', 'calcite', 'tez',
        'commons-lang', 'activemq', 'curator', 'phoenix', 'samza', 'nutch',
        'qpid-jms', 'directory-kerby', 'juneau', 'myfaces-tobago', 'isis',
        'wicket', 'santuario-java', 'helix', 'storm', 'airavata', 'myfaces',
        'commons-dbcp', 'commons-vfs', 'opennlp', 'tomee', 'tinkerpop',
        'directory-server', 'commons-compress', 'accumulo', 'giraph',
        'johnzon', 'jclouds', 'manifoldcf', 'shiro', 'knox', 'drill',
        'crunch', 'commons-io', 'commons-cli', 'jackrabbit', 'openwebbeans',
        'xmlgraphics-fop', 'tajo', 'commons-email', 'directory-studio',
        'tapestry-5', 'archiva', 'olingo-odata4', 'openjpa', 'commons-jexl',
        'roller', 'reef', 'activemq-artemis', 'beam', 'metron', 'plc4x',
        'cocoon', 'carbondata', 'commons-csv', 'commons-beanutils',
        'commons-net', 'continuum'
    ]
    joint_df = pd.merge(df, smells_df,
                        left_on=['File', 'Project'],
                        right_on=['full_file_name', 'project'],
                        how='inner')
    joint_df.to_csv(join(DATA_PATH, BGU_DATASET), index=False)
    # (tail of set_seed, which receives `seed` as its argument)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    torch.cuda.set_device(args.cuda)
    print_args(args)
    set_seed(args.seed)

    # get dataset:
    train_data, test_data = get_dataset(args.dataset, args.val)

    # get data loaders for training and testing:
    # envs 0 and 1 are used for training,
    # env 2 for validation, and env 3 for testing

    # initialize model and optimizer based on the dataset and the method
    if args.dataset[:4] == 'beer' or args.dataset == 'pubmed':
        model, opt = get_model(args, train_data.vocab)
    else:
        model, opt = get_model(args)

    # start training
    print("{}, Start training {} on train env for {}".format(
        datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
def main(args):
    print('dataset =', flags.FLAGS.dataset)
    TRAIN_SIZE = dataset_size[flags.FLAGS.dataset]['train']
    TRAIN_STEPS_PER_EPOCH = int(TRAIN_SIZE // flags.FLAGS.batch_size)

    # Constants describing the training process.
    NUM_EPOCHS_PER_DECAY = 100.0      # Epochs after which learning rate decays.
    LEARNING_RATE_DECAY_FACTOR = 0.5  # Learning rate decay factor.

    train_log_base = flags.FLAGS.log_directory
    train_case = flags.FLAGS.dataset
    train_case += '_bs_' + str(flags.FLAGS.batch_size)
    train_case += '_lr_' + str(flags.FLAGS.init_lr)
    train_case += '_l2s_' + str(flags.FLAGS.l2_scale)
    train_log_dir = os.path.join(train_log_base, train_case)
    if not tf.gfile.Exists(train_log_base):
        tf.gfile.MakeDirs(train_log_base)
    if not tf.gfile.Exists(train_log_dir):
        tf.gfile.MakeDirs(train_log_dir)

    with tf.Graph().as_default():
        # create global step
        global_step = tf.train.get_or_create_global_step()

        with tf.name_scope('input_pipe'):
            # use epoch count to pick fold index for cross validation
            epoch_count = tf.floordiv(global_step, TRAIN_STEPS_PER_EPOCH)
            fold_index = tf.floormod(epoch_count, 10)  # 10-fold dataset
            # dataset input, always using CPU for this section
            with tf.device('/cpu:0'):
                # dataset sources
                trn_dataset = get_dataset(
                    dset=flags.FLAGS.dataset, mode='train',
                    batch_size=flags.FLAGS.batch_size, fold_index=fold_index)
                vld_dataset = get_dataset(
                    dset=flags.FLAGS.dataset, mode='valid',
                    fold_index=fold_index)
                # iterator
                iterator = tf.data.Iterator.from_structure(
                    trn_dataset.output_types, trn_dataset.output_shapes)
                # get a new batch from iterator
                get_batch = iterator.get_next()
                # ops for initializing the iterators:
                # choose the dataset for one epoch
                trn_init_op = iterator.make_initializer(trn_dataset)
                vld_init_op = iterator.make_initializer(vld_dataset)

        # placeholders for images and labels
        images, labels = create_placeholder_for_input(dset=flags.FLAGS.dataset)
        is_training = tf.placeholder(tf.bool, name='is_training')
        tf.add_to_collection('is_training', is_training)
        tf.summary.image('images', images)

        # neural network model
        if flags.FLAGS.dataset == 'mnist':
            model_network = lenet
        elif flags.FLAGS.dataset == 'cifar10':
            model_network = cifarnet
        else:
            # was `raise(ValueError, 'Invalid dataset')`, which raises a
            # TypeError in Python 3
            raise ValueError('Invalid dataset')
        logits, end_points = model_network(images,
                                           is_training=is_training,
                                           l2_scale=flags.FLAGS.l2_scale)

        # print name and shape of each tensor
        print("layers:")
        for k_, v_ in end_points.items():
            print('name =', v_.name, ', shape =', v_.get_shape())

        # print the total size of trainable variables
        n_params = 0
        for var_ in tf.trainable_variables():
            var_shape = var_.get_shape()
            n_params_var = 1
            for dim_ in var_shape:
                n_params_var *= dim_.value
            n_params += n_params_var
        print("model parameter size:", n_params)

        # prediction of this batch
        with tf.name_scope('prediction'):
            pred = tf.argmax(tf.nn.softmax(logits), axis=1)
            match_count = tf.reduce_sum(
                tf.cast(tf.equal(pred, labels), tf.float32))
            # note: the running batch size can change in testing mode,
            # so we cannot reuse the batch size from flags
            running_batch_size = tf.cast(tf.size(pred), tf.float32)
            accuracy = match_count / running_batch_size
            tf.add_to_collection('accuracy', accuracy)
            tf.summary.scalar('accuracy', accuracy)

        # loss function
        with tf.name_scope('losses'):
            raw_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits),
                name='cross_entropy')
            tf.summary.scalar('raw_loss', raw_loss)
            regu_loss = tf.add_n(tf.losses.get_regularization_losses())
            tf.summary.scalar('regu_loss', regu_loss)
            total_loss = raw_loss + regu_loss
            tf.summary.scalar('total_loss', total_loss)

        # specify learning rate
        decay_steps = int(TRAIN_STEPS_PER_EPOCH * NUM_EPOCHS_PER_DECAY)
        lr = tf.train.exponential_decay(flags.FLAGS.init_lr,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)

        # add histograms for trainable variables
        for var_ in tf.trainable_variables():
            tf.summary.histogram(var_.op.name, var_)

        # specify optimizer
        opt = tf.train.GradientDescentOptimizer(lr)

        # compute gradients and apply
        # note: with batch norm layers we have to use update_ops to get the
        # hidden variables that also need to be updated during training
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            grads = opt.compute_gradients(total_loss)

        # add histograms for gradients
        for grad_, var_ in grads:
            if grad_ is not None:
                tf.summary.histogram(var_.op.name + '/gradients', grad_)

        # train op
        train_op = opt.apply_gradients(grads, global_step=global_step)

        # summarize all
        summary = tf.summary.merge_all()
        # summary writer
        summary_writer = tf.summary.FileWriter(train_log_dir)
        # checkpoint saver
        saver = tf.train.Saver()

        # session part
        init_op = tf.global_variables_initializer()
        with tf.Session() as sess:
            # initialization
            sess.run(init_op)
            summary_writer.add_graph(sess.graph)
            # epoch loop
            for epoch in range(flags.FLAGS.num_epochs):
                print(datetime.now(), 'epoch:', epoch + 1, '/',
                      flags.FLAGS.num_epochs)
                # training phase
                print('==== training phase ====')
                # specify dataset for training
                sess.run(trn_init_op)
                # training loop
                for step in range(TRAIN_STEPS_PER_EPOCH):
                    # get batch for training
                    trn_images, trn_labels = sess.run(get_batch)
                    # run training op
                    _, l_, acc_, sum_ = sess.run(
                        [train_op, total_loss, accuracy, summary],
                        feed_dict={images: trn_images,
                                   labels: trn_labels,
                                   is_training: True})
                    if (step + 1) % flags.FLAGS.log_freq == 0:
                        print(datetime.now(),
                              'training step:', step + 1, '/',
                              TRAIN_STEPS_PER_EPOCH,
                              'loss={:.5f}'.format(l_),
                              'acc={:.4f}'.format(acc_))
                        summary_writer.add_summary(
                            sum_, epoch * TRAIN_STEPS_PER_EPOCH + step)

                # validation phase
                print('==== validation phase ====')
                # specify dataset for validation
                sess.run(vld_init_op)
                # get batch for validation
                vld_images, vld_labels = sess.run(get_batch)
                # run validation op
                vld_acc, vld_loss = sess.run(
                    [accuracy, total_loss],
                    feed_dict={images: vld_images,
                               labels: vld_labels,
                               is_training: False})
                print(datetime.now(),
                      'validation result: loss={:.5f}'.format(vld_loss),
                      'acc={:.4f}'.format(vld_acc))

                # checkpoint saving
                print(datetime.now(), 'saving checkpoint of model ...')
                ckpt_name = os.path.join(
                    train_log_dir, 'model_epoch' + str(epoch + 1) + '.ckpt')
                saver.save(sess, ckpt_name)
                print(datetime.now(), ckpt_name, 'saved')
                # epoch end
                print(datetime.now(), 'epoch:', epoch + 1, 'done')
            print('training done')
def run_manual_models():
    df = get_dataset()
    manual_model_reduced_risk(df)
    manual_model_hotspots(df)
def main():
    # Get command-line args and set seed
    args = get_train_test_args()
    util_adversarial.set_seed(args.seed)

    # Load model
    model = AdversarialModel(args)
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        # Make save directory
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util_adversarial.get_save_dir(args.save_dir,
                                                      args.run_name)

        # Get logger
        log = util_adversarial.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')

        # Set the device to cuda if a GPU is available
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Load training data
        log.info("Preparing Training Data...")
        train_dataset, train_dict = data_utils.get_dataset(
            args, args.train_datasets, args.train_dir, tokenizer, 'train')
        train_loader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,  # batches the examples into groups of 16
            sampler=RandomSampler(train_dataset))

        # Load validation data
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = data_utils.get_dataset(
            args, args.train_datasets, args.val_dir, tokenizer, 'val')
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))

        # Train!!!
        trainer = Trainer(args, log)
        trainer.train(model, train_loader, val_loader, val_dict)

        if args.continue_to_eval:
            args.device = torch.device(
                'cuda') if torch.cuda.is_available() else torch.device('cpu')
            split_name = 'test' if 'test' in args.eval_dir else 'validation'
            log = util_adversarial.get_logger(args.save_dir,
                                              f'log_{split_name}')

            # Load model
            model.to(args.device)

            # Load eval data
            eval_dataset, eval_dict = data_utils.get_dataset(
                args, args.eval_datasets, args.eval_dir, tokenizer,
                split_name)
            eval_loader = DataLoader(eval_dataset,
                                     batch_size=args.batch_size,
                                     sampler=SequentialSampler(eval_dataset))

            # Evaluate!!!
            trainer = Trainer(args, log)
            eval_preds, eval_scores = trainer.evaluate(
                model,
                data_loader=eval_loader,
                data_dict=eval_dict,
                return_preds=True,
                split=split_name)
            results_str = ', '.join(f'{k}: {v:05.2f}'
                                    for k, v in eval_scores.items())
            log.info(f'Eval in continue_to_eval {results_str}')

            # Write submission file
            sub_path = os.path.join(args.save_dir,
                                    split_name + '_' + args.sub_file)
            log.info(f'Writing submission file to {sub_path}...')
            with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
                csv_writer = csv.writer(csv_fh, delimiter=',')
                csv_writer.writerow(['Id', 'Predicted'])
                for uuid in sorted(eval_preds):
                    csv_writer.writerow([uuid, eval_preds[uuid]])

    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        checkpoint_path_qa_output = os.path.join(args.save_dir,
                                                 'qa_output_state')

        # Load model
        model = AdversarialModel(args, load_path=args.saved_model_filename)
        # model.load(checkpoint_path)
        # model.load_qa_output_model(checkpoint_path_qa_output)
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(
            args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])