Example no. 1
0
def run_describe_dataset():
    df = get_dataset()
    describe_bq_dataset(df)

    repos = pd.read_csv(join(DATA_PATH, 'repos_full.csv'))
    print("Initial Java repositories", len(repos[repos.language == 'Java']))
    generete_computed_values(df)
Example no. 2
0
def candidate_followup_files():

    repos = pd.read_csv(join(DATA_PATH, 'wellcomming_projects.csv'))
    repos_of_interest = repos.repo_name.unique()

    df = get_dataset()
    relevant_smells = [
        'NPathComplexity', 'FallThrough', 'JavadocParagraph',
        'TrailingComment', 'IllegalImport', 'AvoidStaticImport',
        'IllegalCatch', 'ParameterAssignment', 'UnnecessaryParentheses'
    ]

    df = df[df.repo_name.isin(repos_of_interest)]
    df = df[
        ((df.NPathComplexity > 0) & (df.NPathComplexity < 3))
        | ((df.FallThrough > 0) & (df.FallThrough < 3))
        # | ((df.JavadocParagraph > 0 ) & ( df.JavadocParagraph < 3))
        # | ((df.TrailingComment > 0 ) & ( df.TrailingComment < 3))
        | ((df.IllegalImport > 0) & (df.IllegalImport < 3))
        | ((df.AvoidStaticImport > 0) & (df.AvoidStaticImport < 3))
        | ((df.IllegalCatch > 0) & (df.IllegalCatch < 3))
        | ((df.ParameterAssignment > 0) & (df.ParameterAssignment < 3))
        | ((df.UnnecessaryParentheses > 0) & (df.UnnecessaryParentheses < 3))]
    df['robust_smells_num'] = df[relevant_smells].sum(axis=1)

    df['random_metric'] = np.random.randint(1, 100, df.shape[0])
    df = df[
        ['repo_name', 'robust_smells_num', 'random_metric', 'full_file_name'] +
        relevant_smells]
    df = df.sort_values(['repo_name', 'random_metric'],
                        ascending=[False, False])

    df.to_csv(join(DATA_PATH, 'candidate_followup_files.csv'), index=False)

    print("files", len(df))
Example no. 3
0
def clean_dataset(path):
    dataset = data_utils.get_dataset()
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset.to_csv(path, index=False)
Example no. 4
0
def candidate_followup_projects():
    df = get_dataset()
    robust_smells = get_robust_smells()

    file_with_robust_smells = df[
        ((df.NPathComplexity > 0) & (df.NPathComplexity < 3))
        | ((df.FallThrough > 0) & (df.FallThrough < 3))
        #| ((df.JavadocParagraph > 0 ) & ( df.JavadocParagraph < 3))
        #| ((df.TrailingComment > 0 ) & ( df.TrailingComment < 3))
        | ((df.IllegalImport > 0) & (df.IllegalImport < 3))
        | ((df.AvoidStaticImport > 0) & (df.AvoidStaticImport < 3))
        | ((df.IllegalCatch > 0) & (df.IllegalCatch < 3))
        | ((df.ParameterAssignment > 0) & (df.ParameterAssignment < 3))
        | ((df.UnnecessaryParentheses > 0) & (df.UnnecessaryParentheses < 3))]

    agg = file_with_robust_smells.groupby(['repo_name'], as_index=False).agg(
        {'file': 'count'})
    agg = agg.rename(columns={'file': 'files_with_robust_smells'})
    agg = agg[agg.files_with_robust_smells >= 15]
    agg = agg.sort_values('files_with_robust_smells', ascending=False)
    agg.to_csv(join(DATA_PATH, 'candidate_followup_projects.csv'), index=False)

    repos = pd.read_csv(join(DATA_PATH, 'repo_profile.csv'))
    repos['wellcomming'] = repos.apply(lambda x: 1
                                       if x.retention_prob > 0.3 and x.authors
                                       > 20 and x.onboarding_prob > 0.3 else 0,
                                       axis=1)
    repos = repos[repos['wellcomming'] == 1]
    repos = repos[['repo_name']]
    df = pd.merge(repos, agg, on='repo_name')
    df.to_csv(join(DATA_PATH, 'wellcomming_projects.csv'), index=False)
    print("projects", len(df))
Example no. 5
0
def get_data(self):
    with tf.name_scope('data'):
        self.anchor = np.array([-1, -1, 2, 2])
        train_data, test_data = data_utils.get_dataset(self.batch_size,
                                                       anchor=self.anchor)

        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        (img, self.people_label, self.car_label, self.iou_scores,
         self.bbox_matrix, self.tx_star, self.ty_star, self.tw_star,
         self.th_star, self.label, self.people_mask,
         self.car_mask) = iterator.get_next()

        self.img = img
        self.train_init = iterator.make_initializer(train_data)
        self.test_init = iterator.make_initializer(test_data)
Example no. 6
0
def train(args):
    train, test = du.get_dataset(args)

    enc_train, dec_train = du.pad(train[0], train[1], args)
    enc_test, dec_test = du.pad(test[0], test[1], args)

    if args.decoder_go_padding:
        args.decoder_time_steps += 1
    if args.encoder_end_padding:
        args.encoder_time_steps += 1

    enc_train_oh = du.one_hot(enc_train, args)
    dec_train_oh = du.one_hot(dec_train, args)
    enc_test_oh = du.one_hot(enc_test, args)
    dec_test_oh = du.one_hot(dec_test, args)

    initializer = tf.random_uniform_initializer(-args.intializations,
                                                args.intializations,
                                                seed=args.seed)

    with tf.Session() as sess:
        model = EncDecModel(args)
        tf.global_variables_initializer().run()

        # Input feed: encoder inputs, decoder inputs, as provided.
        train_feed = model.feed(enc_train_oh, dec_train_oh)
        test_feed = model.feed(enc_test_oh, dec_test_oh)
        encoder_inputs, decoder_inputs = enc_train_oh, dec_train_oh

        for epoch in range(1, args.epochs):
            run_epoch(sess, model, args, encoder_inputs, decoder_inputs)
            loss = sess.run([model.loss], train_feed)[0]
            test_loss = sess.run([model.loss], test_feed)[0]
            print("[%s] Loss : %s" % (epoch, loss),
                  "test loss : %s" % test_loss)
            if epoch % args.decay_epoch == 0:
                lr_value = sess.run([model.learning_rate])[0] * args.lr_decay
                print("New learning rate %s" % lr_value)
                model.set_lr(sess, lr_value)
                args.decay_epoch = args.decay_epoch * 2

            model.training = False
            model.keep_prob = 1.0
            enc_sample = enc_test_oh[:, 0, :].reshape(
                [-1, 1, args.upper_limit + 1])
            dec_sample = dec_test_oh[:, 0, :].reshape(
                [-1, 1, args.upper_limit + 1])
            sample_feed = model.feed(enc_sample, dec_sample)
            print(
                enc_test[:, 0], dec_test[:, 0],
                sess.run([model.predictions], sample_feed)[0][1].reshape([-1]))
            model.training = True
            model.keep_prob = args.keep_prob
Example no. 7
0
def evaluate_smell_monotonocity():

    df = get_dataset()

    relevant_columns = set(df.columns) - NON_PREDICTIVE_FEATURES

    monotone_df = evaluate_monotonocity(
        df,
        relevant_columns,
        monotone_column='quality_group',
        monotone_order=['reduced_risk', 'other', 'hotspot'])
    return monotone_df
Example no. 8
0
def main(args):
    print('dataset =', flags.FLAGS.dataset)

    with tf.Graph().as_default():
        # dataset input, always using CPU for this section
        with tf.device('/cpu:0'):
            # dataset source
            test_dataset = get_dataset(dset=flags.FLAGS.dataset, mode='test')
            # iterator
            iterator = tf.data.Iterator.from_structure(
                test_dataset.output_types, test_dataset.output_shapes)
            # get a new batch from iterator
            get_batch = iterator.get_next()
            # ops for initializing the iterators
            # for choosing dataset for one epoch
        test_init_op = iterator.make_initializer(test_dataset)

        # restore saved model and run testing
        init_op = tf.global_variables_initializer()
        with tf.Session() as sess:
            model_path = flags.FLAGS.ckpt_path
            model_meta = model_path + '.meta'
            saver = tf.train.import_meta_graph(model_meta)
            print(datetime.now(), 'meta graph imported from', model_meta)
            saver.restore(sess, model_path)
            print(datetime.now(), 'model restored')

            # import operators for reference
            accuracy = tf.get_collection('accuracy')[0]
            images = tf.get_collection('images')[0]
            labels = tf.get_collection('labels')[0]
            is_training = tf.get_collection('is_training')[0]

            sess.run(init_op)
            print(datetime.now(), 'model initialized')

            # testing phase
            print('==== testing phase ====')
            # specify dataset for test
            sess.run(test_init_op)
            # get batch for testing
            test_images, test_labels = sess.run(get_batch)
            # run testing op
            test_acc = sess.run(accuracy,
                                feed_dict={
                                    images: test_images,
                                    labels: test_labels,
                                    is_training: False
                                })
            print(datetime.now(),
                  'testing result: acc={:.4f}'.format(test_acc))
Example no. 9
0
def plot_duration_by_length():
    df = get_dataset()
    df['CCP'] = df['corrective_rate'].map(
        lambda x: round(ccp_estimator.estimate_positives(x), 2))

    fig = plot_deciles(df,
                       grouping_column='line_count',
                       metric_column='CCP',
                       title='CCP by Line Count Deciles',
                       xaxis_title='Number of Lines',
                       output_file=None)
    fig.show()

    print("Perason corrective rate and line count",
          df.corr()['line_count']['corrective_rate'])
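
Since only one coefficient is reported, a sketch (not in the original) that computes just that pair instead of the full df.corr() matrix:

# Pearson correlation of the two columns of interest only
pearson = df['line_count'].corr(df['corrective_rate'])
print("Pearson corrective rate and line count", pearson)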
Example no. 10
0
def run_experiment(config, learner):
    dataset = None
    print(config)
    log_params(config)

    if config['experiment_name'] == 'cie10':
        dataset = data_utils.get_dataset_null_empty()
    else:
        dataset = data_utils.get_dataset()

    log_param('experiment_name', config["experiment_name"])

    build_dataset_experiment(config["experiment_name"], dataset)
    build_model(config, learner)
    print(
        '----------------------------------------------------------------------------------------\n'
    )
Example no. 11
0
def file_by_author_twin_analysis():
    df = get_dataset(binary=False)
    single_author_files = df[df.authors == 1]

    keys = ['repo_name', 'Author_email']
    filtering_function = lambda x: x.full_file_name_x == x.full_file_name_y
    comparision_function = lambda first, second: second > first \
        if isinstance(first, numbers.Number) and isinstance(second, numbers.Number) \
        else None

    comparision_columns = SINGLE_SMELL + [CCP, 'full_file_name'] # TODO - ADD groups

    comp_df = compare_twin_behaviours(first_behaviour=single_author_files,
                                      second_behaviour=single_author_files,
                                      keys=keys,
                                      comparision_columns=comparision_columns,
                                      comparision_function=comparision_function,
                                      filtering_function=filtering_function)

    comp_df.to_csv(os.path.join(DATA_PATH, 'file_by_author_twin_analysis.csv'))

    #comp_df = pd.read_csv(os.path.join(DATA_PATH, 'file_by_author_twin_analysis.csv'))

    Pearson = comp_df.corr()[CCP + COMPARISON_SUFFIX]
    Pearson_df = pd.DataFrame(Pearson).reset_index()
    Pearson_df.columns = ['feature', 'Pearson']
    Pearson_df = Pearson_df.sort_values('Pearson', ascending=False)

    print(Pearson_df)
    Pearson_df.to_csv(os.path.join(DATA_PATH, 'file_by_author_twin_corr.csv'),
                      index=False)

    stats = compute_confusion_matrics(df=comp_df,
                                      concept=CCP + COMPARISON_SUFFIX,
                                      columns=[i + COMPARISON_SUFFIX for i in SINGLE_SMELL],
                                      keys=keys)
    stats_df = pd.DataFrame.from_dict(stats, orient='index')
    stats_df = (stats_df.reset_index()).rename(columns={'index': 'feature'})
    stats_df['feature'] = stats_df['feature'].map(lambda x: x[:-4])
    stats_df = stats_df.sort_values(['precision_lift', 'feature'],
                                    ascending=[False, True])
    stats_df.to_csv(os.path.join(DATA_PATH, AUTHOR_TWIN_CM_FILE), index=False)


    return Pearson_df
Example no. 12
0
def evaluate_model(weight_name):
    batch_size = 10
    x_train, y_train = get_data(aug=True, name='train')
    x_test, y_test = get_data(aug=False, name='test')
    num_data = len(x_test)
    [x_test] = img_standardization(x_train, x_test)
    x_test = _parse_function(x_test, im_size=224)
    dataset_test = get_dataset(x_test, y_test, batch_size, resize=False)

    model = tf.keras.models.load_model('./weight/' + weight_name, compile=True)
    # evaluate() computes the loss and the 'accuracy' metric, so the loaded
    # model must be recompiled (hence compile=True above)

    [loss, acc] = model.evaluate(dataset_test,
                                 steps=math.ceil(num_data / batch_size))

    print('TEST loss: ', loss)
    print('TEST acc: ', acc)

    return
Example no. 13
0
def main():
    args = parse.parse_args()
    dataset = get_dataset(args)

    if args.mode == 'train':
        # concatenate the two sentences and feed them to a sequence classification model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-chinese').to(device)
        train(model, dataset, args)
    else:
        # args = torch.load(
        #     os.path.join(args.output_dir,
        #                  f'checkpoint-{args.best_step}/training_args.bin'))
        model = BertForSequenceClassification.from_pretrained(
            os.path.join(args.output_dir, f'checkpoint-{args.best_step}')
        ).to(device)
        pred = predict(model, dataset, args)[0]
        pred = pd.Series(pred.numpy().tolist())
        res_csv = pd.concat([dataset.df['qid'], pred], axis=1)
        res_csv.to_csv(os.path.join(args.output_dir, 'result.csv'),
                       header=False, index=False, sep='\t')
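
As the comment in the train branch notes, the two sentences are packed into one sequence for BertForSequenceClassification. A minimal sketch of that encoding step (the question strings and max_length are illustrative assumptions, not taken from the original code):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
q1, q2 = "什么是机器学习？", "机器学习是什么？"  # hypothetical question pair
# builds a single [CLS] q1 [SEP] q2 [SEP] input, with token_type_ids marking the two segments
encoded = tokenizer(q1, q2, truncation=True, padding='max_length',
                    max_length=128, return_tensors='pt')
# encoded['input_ids'], encoded['attention_mask'] and encoded['token_type_ids']
# are what the sequence classification model consumes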
Example no. 14
0

if __name__ == '__main__':
    batch_size = 64
    nb_epoch = 50
    image_size = 224
    nb_classes = 9
    channels = 3

    print("Splitting data into test/ train datasets")
    df_train = pd.read_csv('data/iter0_im_tr_sa.csv', names=['file_name', 'label', 'do_aug'])
    df_test = pd.read_csv('data/iter0_im_te.csv', names=['file_name', 'label', 'do_aug'])
    df_val = pd.read_csv('data/iter0_im_val.csv', names=['file_name', 'label', 'do_aug'])

    print("Read data with normalization and augmentation")
    x_train, y_train = get_dataset(df_train, image_size, isDicom=True)
    x_valid, y_valid = get_dataset(df_val, image_size, isDicom=True)
    x_test, y_test = get_dataset(df_test, image_size, isDicom=True)

    # x, y = get_dogcat_dataset(img_rows)
    #
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
    # x_valid = x_test
    # y_valid = y_test

    # print("Reshaping Data")
    # print("X_train Shape: ", x_train.shape)
    # x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, channels)
    # x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, channels)
    # x_valid = x_valid.reshape(x_valid.shape[0], img_rows, img_cols, channels)
Example no. 15
0
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

from data_utils import get_dataset, build_vocab, DQDataset, collate_fn
from model import SiameseNet, loss_fn, accuracy_score, run_on_example

DATA_PATH = r'D:/Jupyter work/Duplicate Question Detection/questions.csv'

TRAIN_BATCH_SIZE = 32
VALIDATE_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64

dataset = get_dataset(DATA_PATH)

s = dataset[['question1', 'question2', 'is_duplicate']].values

print("Spliting dataset")
train, test_and_val = train_test_split(s, test_size=0.3)

same_idx = np.where(train[:, 2] == 1)[0]

train_set = train[same_idx]

print("Building vocab")
vocab = build_vocab(train_set)

print("Creating Dataloader")
dlt = DQDataset(train_set, vocab)
Example no. 16
0
def length_groups():
    df = get_dataset()
    for i in [0.25, 0.75]:
        print("length quantile", i, df.line_count.quantile(i))
Example no. 17
0
if __name__ == '__main__':
    # Execution start time, used to calculate total script runtime.
    startTime = time()

    # Config
    dropout = 0.20
    lr_rate = 0.001
    loss_patience = 1
    units = [32, 16]
    # Displays first n test predicted/expected results in the terminal window. Does not affect training/testing.
    print_results = 10
    # Multi gpu support. Replace the below number with # of gpus. Default: gpus=0
    gpus = 0

    # Check that our train/test data is available, then load it.
    train, test = data_utils.get_dataset()

    # Split train data into input (X) and output (Y) variables.
    X_train = train[:, 1:3197]
    y_train = train[:, 0]

    # Split test data into input (X) and output (Y) variables.
    X_test = test[:, 1:3197]
    y_test = test[:, 0]

    # Normalize train and test features
    X_train, X_test = normalize_data(X_train, X_test)

    # Create model.
    model = build_model(gpus, units, dropout)
Example no. 18
0
# some config
data_dir = 'data/CASIA-WebFace_mtcnn_182/'  # data directory containing aligned face patches
validation_set_split_ratio = 0.3
max_nrof_epochs = 50
validate_every_n_epochs = 1
batch_size = 256
image_size = (160, 160)
epoch_size = 1000  # number of batches per epoch
embedding_size = 512  # Dimensionality of the embedding
random_crop = True  # augmentation
random_flip = True  # augmentation
random_rotate = True  # augmentation
keep_prob = 0.8
min_nrof_val_images_per_class = 0  # minimum number of images per class
#
dataset = data_utils.get_dataset(data_dir)
if validation_set_split_ratio > 0:
    train_set, val_set = data_utils.split_dataset(
        dataset, validation_set_split_ratio, min_nrof_val_images_per_class,
        'SPLIT_IMAGES')
else:
    # no validation split requested: keep the whole dataset for training
    train_set, val_set = dataset, []
# Let's take a look at the dataset
print(type(train_set))
print(len(train_set))
print(train_set[:5])
print(train_set[0])

df_graph = tf.Graph()
Example no. 19
0
def main(args):
    sleep(random.random())
    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    # facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv))
    dataset = data_utils.get_dataset(args.input_dir)

    print('Creating networks and loading parameters')

    with tf.Graph().as_default():
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, log_device_placement=False))
        with sess.as_default():
            pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None)

    minsize = 20  # minimum size of face
    threshold = [0.6, 0.7, 0.7]  # thresholds for the three detection stages
    factor = 0.709  # scale factor

    # Add a random key to the filename to allow alignment using multiple processes
    random_key = np.random.randint(0, high=99999)
    bounding_boxes_filename = os.path.join(
        output_dir, 'bounding_boxes_%05d.txt' % random_key)

    with open(bounding_boxes_filename, "w") as text_file:
        nrof_images_total = 0
        nrof_successfully_aligned = 0
        if args.random_order:
            random.shuffle(dataset)
        for cls in dataset:
            output_class_dir = os.path.join(output_dir, cls.name)
            if not os.path.exists(output_class_dir):
                os.makedirs(output_class_dir)
                if args.random_order:
                    random.shuffle(cls.image_paths)
            for image_path in cls.image_paths:
                nrof_images_total += 1
                filename = os.path.splitext(os.path.split(image_path)[1])[0]
                output_filename = os.path.join(
                    output_class_dir, filename+'.png')
                print(image_path)
                if not os.path.exists(output_filename):
                    try:
                        img = misc.imread(image_path)
                    except (IOError, ValueError, IndexError) as e:
                        errorMessage = '{}: {}'.format(image_path, e)
                        print(errorMessage)
                    else:
                        if img.ndim < 2:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))
                            continue
                        if img.ndim == 2:
                            img = data_utils.to_rgb(img)
                        img = img[:, :, 0:3]

                        bounding_boxes, _ = align.detect_face.detect_face(
                            img, minsize, pnet, rnet, onet, threshold, factor)
                        nrof_faces = bounding_boxes.shape[0]
                        if nrof_faces > 0:
                            det = bounding_boxes[:, 0:4]
                            det_arr = []
                            img_size = np.asarray(img.shape)[0:2]
                            if nrof_faces > 1:
                                if args.detect_multiple_faces:
                                    for i in range(nrof_faces):
                                        det_arr.append(np.squeeze(det[i]))
                                else:
                                    bounding_box_size = (
                                        det[:, 2]-det[:, 0])*(det[:, 3]-det[:, 1])
                                    img_center = img_size / 2
                                    offsets = np.vstack(
                                        [(det[:, 0]+det[:, 2])/2-img_center[1], (det[:, 1]+det[:, 3])/2-img_center[0]])
                                    offset_dist_squared = np.sum(
                                        np.power(offsets, 2.0), 0)
                                    # some extra weight on the centering
                                    index = np.argmax(
                                        bounding_box_size-offset_dist_squared*2.0)
                                    det_arr.append(det[index, :])
                            else:
                                det_arr.append(np.squeeze(det))

                            for i, det in enumerate(det_arr):
                                det = np.squeeze(det)
                                bb = np.zeros(4, dtype=np.int32)
                                bb[0] = np.maximum(det[0]-args.margin/2, 0)
                                bb[1] = np.maximum(det[1]-args.margin/2, 0)
                                bb[2] = np.minimum(
                                    det[2]+args.margin/2, img_size[1])
                                bb[3] = np.minimum(
                                    det[3]+args.margin/2, img_size[0])
                                cropped = img[bb[1]:bb[3], bb[0]:bb[2], :]
                                scaled = misc.imresize(
                                    cropped, (args.image_size, args.image_size), interp='bilinear')
                                nrof_successfully_aligned += 1
                                filename_base, file_extension = os.path.splitext(
                                    output_filename)
                                if args.detect_multiple_faces:
                                    output_filename_n = "{}_{}{}".format(
                                        filename_base, i, file_extension)
                                else:
                                    output_filename_n = "{}{}".format(
                                        filename_base, file_extension)
                                misc.imsave(output_filename_n, scaled)
                                text_file.write('%s %d %d %d %d\n' % (
                                    output_filename_n, bb[0], bb[1], bb[2], bb[3]))
                        else:
                            print('Unable to align "%s"' % image_path)
                            text_file.write('%s\n' % (output_filename))

    print('Total number of images: %d' % nrof_images_total)
    print('Number of successfully aligned images: %d' %
          nrof_successfully_aligned)
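
When several faces are detected, the code above keeps the one maximizing box area minus twice the squared offset of the box centre from the image centre. A toy sketch (illustrative values, not from the original) showing that a well-centred face can beat a larger off-centre one:

import numpy as np

img_size = np.array([200, 200])             # (height, width)
det = np.array([[10., 10., 60., 60.],       # larger face in a corner
                [80., 80., 120., 120.]])    # smaller face near the centre
bounding_box_size = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
img_center = img_size / 2
offsets = np.vstack([(det[:, 0] + det[:, 2]) / 2 - img_center[1],
                     (det[:, 1] + det[:, 3]) / 2 - img_center[0]])
offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
print(np.argmax(bounding_box_size - offset_dist_squared * 2.0))  # -> 1, the centred face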
Example no. 20
0
def train_vgg16(lr=1e-4, epochs=50):
    x_train, y_train = get_data(aug=True, name='train')
    x_val, y_val = get_data(aug=True, name='val')
    x_test, y_test = get_data(aug=False, name='test')

    num_data = x_train.shape[0]
    num_val = x_val.shape[0]
    num_test = x_test.shape[0]
    print('training set before preprocessing: ', x_train.shape)
    print('validation set before preprocessing: ', x_val.shape)

    [x_train, x_val, x_test] = img_standardization(x_train, x_train, x_val,
                                                   x_test)

    # parse numpy arrays into resized tensors
    x_train = _parse_function(x_train, im_size=224)
    x_val = _parse_function(x_val, im_size=224)
    x_test = _parse_function(x_test, im_size=224)

    batch_size = 16
    dataset_train = get_dataset(x_train, y_train, batch_size, resize=False)
    dataset_val = get_dataset(x_val, y_val, batch_size, resize=False)
    dataset_test = get_dataset(x_test, y_test, batch_size, resize=False)

    # build model
    print('building model...')
    model = new_vgg16()

    # compile
    adam = tf.keras.optimizers.Adam(lr=lr)
    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # callbacks
    checkpointer = ModelCheckpoint('./weight/vgg16_ECG200_03.h5',
                                   monitor='val_loss',
                                   save_best_only=True)
    # reduce_lr = LearningRateScheduler(lr_scheduler)
    reduce_lr = ReduceLROnPlateau(monitor='loss',
                                  factor=0.5,
                                  patience=3,
                                  min_lr=1e-6)
    tensorboard = TensorBoard(log_dir='./log/ECG200/vgg16/',
                              write_graph=False,
                              batch_size=batch_size)
    print('start training...')
    histoty = model.fit(dataset_train,
                        steps_per_epoch=math.ceil(num_data / batch_size),
                        epochs=epochs,
                        validation_data=dataset_val,
                        validation_steps=math.ceil(num_val / batch_size),
                        callbacks=[checkpointer, reduce_lr, tensorboard],
                        verbose=2)
    # Testing
    # [loss, acc] = model.evaluate(x_test, y_test, batch_size=batch_size)
    [loss, acc] = model.evaluate(dataset_test,
                                 steps=math.ceil(num_test / batch_size))
    print('TEST loss: ', loss)
    print('TEST accuracy: ', acc)
    return histoty
Example no. 21
0
import pandas as pd
import numpy as np

from data_utils import get_dataset
from preprocessing import remove_object_cols
from models import kfold_lgb, get_logistic
from submission_utils import OptimizedRounder, generate_submission
from evaluation_utils import sklearn_quadratic_kappa

TARGET_COL = 'AdoptionSpeed'

if __name__ == '__main__':
    # step 1 - load and transform data
    # load train and test tabular datasets
    datasets = {
        dataset_type: get_dataset(dataset_type)
        for dataset_type in ('train', 'test')
    }
    # remove all string columns from dataset
    # todo: investigate whether any int/float categorical cols are left that haven't been one-hot encoded
    cleaned_datasets = {
        dataset_type: remove_object_cols(dataset)
        for dataset_type, dataset in datasets.items()
    }
    # extract training labels
    y_train = cleaned_datasets['train'][TARGET_COL]

    print(cleaned_datasets)

    # step 2 - train a model and get its outputs
    # get outputs from k-fold CV LGBM training
Example no. 22
0
def load_data(audio_config, data_config):
    return get_dataset(data_config, audio_config)
Example no. 23
0
    parser.add_argument("--dev-data-path", default="data/test_set.npz")
    parser.add_argument("--evaluate-every", default=500)
    parser.add_argument("--model-dir", default="./model_dir")
    parser.add_argument("--n-epochs", default=30)
    parser.add_argument("--batch-size", default=100)
    args = parser.parse_args()

    train_data = np.load(args.train_data_path)
    dev_data = np.load(args.dev_data_path)
    X_train = train_data["features"]
    y_train = train_data["labels"].reshape(-1, 1)
    X_dev = dev_data["features"]
    y_dev = dev_data["labels"].reshape(-1, 1)

    train_set = get_dataset(X_train,
                            y_train,
                            n_epochs=args.n_epochs,
                            batch_size=args.batch_size)
    dev_set = get_dataset(X_dev, y_dev, shuffle=False)

    data_iter = tf.contrib.data.Iterator.from_structure(
        train_set.output_types, train_set.output_shapes)
    train_init = data_iter.make_initializer(train_set)
    dev_init = data_iter.make_initializer(dev_set)

    # Initialize model path
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(args.model_dir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
Example no. 24
0
import tensorflow as tf
import data_utils
import params

batch_size = 64
lr = 0.001
epoch_num = 100
display_step = 10

graph = tf.Graph()
with graph.as_default():
    # get dataset
    d = data_utils.get_files('./data/spoken_numbers_pcm', 1)
    arr_x, arr_y = data_utils.get_dataset(d)
    # training set
    data_x_train = tf.data.Dataset.from_tensor_slices(arr_x[128:])
    data_y_train = tf.data.Dataset.from_tensor_slices(arr_y[128:])
    training_set = tf.data.Dataset.zip((data_x_train, data_y_train)).batch(batch_size).shuffle(512)
    iterator_train = training_set.make_initializable_iterator()
    ne_train = iterator_train.get_next()
    # validation set
    data_x_val = tf.data.Dataset.from_tensor_slices(arr_x[:128])
    data_y_val = tf.data.Dataset.from_tensor_slices(arr_y[:128])
    validation_set = tf.data.Dataset.zip((data_x_val, data_y_val)).batch(128)
    iterator_validation = validation_set.make_initializable_iterator()
    ne_validation = iterator_validation.get_next()

    # define the placeholder
    x = tf.placeholder(tf.float32, [None, 168, 13, 1])
    y = tf.placeholder(tf.float32, [None, 10])
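
The snippet is cut off before the session; a hedged standalone sketch (train_op and the loop structure are assumptions, not taken from the original) of how such initializable iterators are usually driven, one epoch at a time:

with graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    for epoch in range(epoch_num):
        sess.run(iterator_train.initializer)
        while True:
            try:
                batch_x, batch_y = sess.run(ne_train)
                # feed the batch into the placeholders defined above, e.g.
                # sess.run(train_op, feed_dict={x: batch_x, y: batch_y})
            except tf.errors.OutOfRangeError:
                break  # the training iterator is exhausted for this epoch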
Example no. 25
0
def bgu_etl():

    df = pd.read_csv(join(DATA_PATH, 'bgu_dataset.csv'))
    print("records", len(df))
    print("projects", df.Project.unique())
    print("Version", df.Version.unique())
    #print("File", df.File.unique())
    metrics = ['file_ccp', 'worse_10_hs', 'reduced_risk']
    keys = ['repo_name', 'full_file_name']

    project_versions = df.groupby(['Project'],
                                  as_index=False).agg({'Version': max})

    df = pd.merge(df,
                  project_versions,
                  left_on=['Version', 'Project'],
                  right_on=['Version', 'Project'],
                  how='inner')

    smells_df = get_dataset()
    smells_df['project'] = smells_df.repo_name.map(
        lambda x: x[x.find('/') + 1:])
    smells_repos = smells_df['project'].unique()
    bug_repos = [
        'camel', 'hadoop', 'flink', 'kafka', 'openmeetings',
        'karaf', 'hbase', 'uima-ruta', 'lucene-solr', 'deltaspike',
        'jackrabbit-oak', 'pulsar', 'ofbiz', 'cayenne', 'commons-codec',
        'parquet-mr', 'kylin', 'hive', 'commons-validator', 'maven-surefire',
        'syncope', 'commons-math', 'tomcat', 'atlas', 'struts',
        'tika', 'servicecomb-java-chassis', 'ranger', 'cassandra', 'cxf',
        'avro', 'nifi', 'bookkeeper', 'clerezza', 'systemml',
        'asterixdb', 'maven', 'zeppelin', 'commons-collections', 'jena',
        'calcite', 'tez', 'commons-lang', 'activemq', 'curator',
        'phoenix', 'samza', 'nutch', 'qpid-jms', 'directory-kerby',
        'juneau', 'myfaces-tobago', 'isis', 'wicket', 'santuario-java',
        'helix', 'storm', 'airavata', 'myfaces', 'commons-dbcp',
        'commons-vfs', 'opennlp', 'tomee', 'tinkerpop', 'directory-server',
        'commons-compress', 'accumulo', 'giraph', 'johnzon', 'jclouds',
        'manifoldcf', 'shiro', 'knox', 'drill', 'crunch',
        'commons-io', 'commons-cli', 'jackrabbit', 'openwebbeans', 'xmlgraphics-fop',
        'tajo', 'commons-email', 'directory-studio', 'tapestry-5', 'archiva',
        'olingo-odata4', 'openjpa', 'commons-jexl', 'roller', 'reef',
        'activemq-artemis', 'beam', 'metron', 'plc4x', 'cocoon',
        'carbondata', 'commons-csv', 'commons-beanutils', 'commons-net', 'continuum',
    ]

    joint_df = pd.merge(df,
                        smells_df,
                        left_on=['File', 'Project'],
                        right_on=['full_file_name', 'project'],
                        how='inner')
    joint_df.to_csv(join(DATA_PATH, BGU_DATASET), index=False)
Example no. 26
0
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()

    torch.cuda.set_device(args.cuda)

    print_args(args)

    set_seed(args.seed)

    # get dataset:
    train_data, test_data = get_dataset(args.dataset, args.val)

    # get data loaders for training and testing:
    # env 0 and 1 are used for training
    # env 2 is used for validation
    # env 3 is used for testing

    # initialize model and optimizer based on the dataset and the method
    if args.dataset[:4] == 'beer' or args.dataset == 'pubmed':
        model, opt = get_model(args, train_data.vocab)
    else:
        model, opt = get_model(args)

    # start training
    print("{}, Start training {} on train env for {}".format(
        datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
Example no. 27
0
def main(args):
  print('dataset =', flags.FLAGS.dataset)
  TRAIN_SIZE = dataset_size[flags.FLAGS.dataset]['train']
  TRAIN_STEPS_PER_EPOCH = int(TRAIN_SIZE // flags.FLAGS.batch_size)
  # Constants describing the training process.
  NUM_EPOCHS_PER_DECAY = 100.0      # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.5  # Learning rate decay factor.
  
  train_log_base = flags.FLAGS.log_directory
  train_case = flags.FLAGS.dataset 
  train_case += '_bs_' + str(flags.FLAGS.batch_size)
  train_case += '_lr_' + str(flags.FLAGS.init_lr)
  train_case += '_l2s_' + str(flags.FLAGS.l2_scale)
  train_log_dir = os.path.join(train_log_base, train_case) 
  if not tf.gfile.Exists(train_log_base):
    tf.gfile.MakeDirs(train_log_base)
  if not tf.gfile.Exists(train_log_dir):
    tf.gfile.MakeDirs(train_log_dir)
  
  with tf.Graph().as_default(): 
    # create global step
    global_step = tf.train.get_or_create_global_step()
    
    with tf.name_scope('input_pipe'):
      # use epoch count to pick fold index for cross validation
      epoch_count = tf.floordiv(global_step, TRAIN_STEPS_PER_EPOCH)
      fold_index = tf.floormod(epoch_count, 10) # 10-fold dataset
      
      # dataset input, always using CPU for this section
      with tf.device('/cpu:0'):
        # dataset source
        trn_dataset = get_dataset(
            dset=flags.FLAGS.dataset, mode='train',
            batch_size=flags.FLAGS.batch_size,
            fold_index=fold_index)
        vld_dataset = get_dataset(
            dset=flags.FLAGS.dataset, mode='valid',
            fold_index=fold_index)
        # iterator 
        iterator = tf.data.Iterator.from_structure(
            trn_dataset.output_types,
            trn_dataset.output_shapes)
        # get a new batch from iterator
        get_batch = iterator.get_next()
        # ops for initializing the iterators
        # for choosing dataset for one epoch
        trn_init_op = iterator.make_initializer(trn_dataset)
        vld_init_op = iterator.make_initializer(vld_dataset)
 
    # placeholder for images and labels
    images, labels = create_placeholder_for_input(
        dset=flags.FLAGS.dataset)
    is_training = tf.placeholder(tf.bool, name='is_training')
    tf.add_to_collection('is_training', is_training)
    tf.summary.image('images', images)

    # neural network model
    if flags.FLAGS.dataset == 'mnist':
      model_network = lenet
    elif flags.FLAGS.dataset == 'cifar10':
      model_network = cifarnet
    else:
      raise ValueError('Invalid dataset')
    logits, end_points = model_network(images, is_training=is_training,
          l2_scale=flags.FLAGS.l2_scale)
    
    # print name and shape of each tensor
    print("layers:")
    for k_, v_ in end_points.items():
      print('name =', v_.name, ', shape =', v_.get_shape())
    # print the total size of trainable variables
    n_params = 0
    for var_ in tf.trainable_variables():
      var_shape = var_.get_shape()
      n_params_var = 1
      for dim_ in var_shape:
        n_params_var *= dim_.value
      n_params += n_params_var
    print("model parameter size:", n_params)
    
    # prediction of this batch
    with tf.name_scope('prediction'):
      pred = tf.argmax(tf.nn.softmax(logits), axis=1)
      match_count = tf.reduce_sum(tf.cast(tf.equal(pred, labels), tf.float32))
      # note: here the running batch size can be changed in testing mode,
      #       so we cannot reuse the batch size from flags
      running_batch_size = tf.cast(tf.size(pred), tf.float32)
      accuracy = match_count / running_batch_size
      tf.add_to_collection('accuracy', accuracy)
      tf.summary.scalar('accuracy', accuracy)
 
    # loss function
    with tf.name_scope('losses'):
      raw_loss = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits),
          name = 'cross_entropy')
      tf.summary.scalar('raw_loss', raw_loss)
      regu_loss = tf.add_n(tf.losses.get_regularization_losses())
      tf.summary.scalar('regu_loss', regu_loss)
      total_loss = raw_loss + regu_loss    
      tf.summary.scalar('total_loss', total_loss)

    # specify learning rate
    decay_steps = int(TRAIN_STEPS_PER_EPOCH * NUM_EPOCHS_PER_DECAY)
    lr = tf.train.exponential_decay(
        flags.FLAGS.init_lr,
        global_step,
        decay_steps,
        LEARNING_RATE_DECAY_FACTOR,
        staircase=True)
    tf.summary.scalar('learning_rate', lr)
    
    # add histograms for trainable variables
    for var_ in tf.trainable_variables():
      tf.summary.histogram(var_.op.name, var_)

    # specify optimizer
    opt = tf.train.GradientDescentOptimizer(lr)
    
    # compute gradients and apply
    # note: with batch norm layers we have to use update_ops
    #       to get hidden variables into the list needed
    #       to be trained
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops): 
      grads = opt.compute_gradients(total_loss)
    
      # add histograms for gradients
      for grad_, var_ in grads:
        if grad_ is not None:
          tf.summary.histogram(var_.op.name + '/gradients', grad_)
    
      # train op
      train_op = opt.apply_gradients(grads, global_step=global_step)
    
    # summarize all
    summary = tf.summary.merge_all()
    # summary writer
    summary_writer = tf.summary.FileWriter(train_log_dir)
    # checkpoint saver
    saver = tf.train.Saver()
    
    # session part
    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
      # initialization
      sess.run(init_op)
      summary_writer.add_graph(sess.graph)
      
      # epoch loop
      for epoch in range(flags.FLAGS.num_epochs):
        print(datetime.now(), 'epoch:', epoch+1, '/', flags.FLAGS.num_epochs)
        
        # training phase
        print('==== training phase ====')
        # specify dataset for training
        sess.run(trn_init_op)
        # training loop
        for step in range(TRAIN_STEPS_PER_EPOCH):
          # get batch for training
          trn_images, trn_labels = sess.run(get_batch)
          # run training op
          _, l_, acc_, sum_ = sess.run(
              [train_op, total_loss, accuracy, summary],
              feed_dict={
                  images: trn_images,
                  labels: trn_labels,
                  is_training: True})
          if (step+1) % flags.FLAGS.log_freq == 0:
            print(
                datetime.now(),
                'training step:', step+1, '/', TRAIN_STEPS_PER_EPOCH,
                'loss={:.5f}'.format(l_),
                'acc={:.4f}'.format(acc_))
            summary_writer.add_summary(sum_, epoch*TRAIN_STEPS_PER_EPOCH + step)
        
        # validation phase
        print('==== validation phase ====')
        # specify dataset for validation
        sess.run(vld_init_op)
        # get batch for validation
        vld_images, vld_labels = sess.run(get_batch)
        # run validation ops
        vld_acc, vld_loss = sess.run([accuracy, total_loss], feed_dict={
            images: vld_images,
            labels: vld_labels,
            is_training: False})
        print(
            datetime.now(),
            'validation result: loss={:.5f}'.format(vld_loss),
            'acc={:.4f}'.format(vld_acc))
        
        # checkpoint saving
        print(datetime.now(), 'saving checkpoint of model ...')
        ckpt_name = os.path.join(train_log_dir,'model_epoch'+str(epoch+1)+'.ckpt')
        saver.save(sess, ckpt_name)
        print(datetime.now(), ckpt_name, 'saved')
        
        # epoch end
        print(datetime.now(), 'epoch:', epoch+1, 'done')
  
  print('training done')
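
The input pipe above picks the cross-validation fold with floordiv/floormod on the global step; a small sketch (illustrative numbers, not from the original) of how the fold index cycles through the 10 folds as epochs advance:

steps_per_epoch = 100                      # illustrative value
for global_step in (0, 99, 100, 950, 1000):
    epoch_count = global_step // steps_per_epoch
    fold_index = epoch_count % 10          # 10-fold dataset
    print(global_step, epoch_count, fold_index)
# prints: 0 0 0 / 99 0 0 / 100 1 1 / 950 9 9 / 1000 10 0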
Example no. 28
0
def run_manual_models():
    df = get_dataset()

    manual_model_reduced_risk(df)
    manual_model_hotspots(df)
Example no. 29
0
def main():
    # Get command-line args and set seed
    args = get_train_test_args()
    util_adversarial.set_seed(args.seed)

    # Load model
    model = AdversarialModel(args)
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        'distilbert-base-uncased')

    if args.do_train:
        # Make /save directory
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util_adversarial.get_save_dir(args.save_dir,
                                                      args.run_name)

        # Get logger
        log = util_adversarial.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')

        # Set the device to cuda if GPU available
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Load training data
        log.info("Preparing Training Data...")
        train_dataset, train_dict = data_utils.get_dataset(
            args, args.train_datasets, args.train_dir, tokenizer, 'train')
        train_loader = DataLoader(
            train_dataset,
            batch_size=args.batch_size,  # batches the examples into groups of 16
            sampler=RandomSampler(train_dataset))
        # Load validation data
        log.info("Preparing Validation Data...")
        val_dataset, val_dict = data_utils.get_dataset(args,
                                                       args.train_datasets,
                                                       args.val_dir, tokenizer,
                                                       'val')
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))

        # Train!!!
        trainer = Trainer(args, log)
        trainer.train(model, train_loader, val_loader, val_dict)

    if args.continue_to_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')

        # Load model
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(
            args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval in continue_to_eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])

    if args.do_eval:
        args.device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util_adversarial.get_logger(args.save_dir, f'log_{split_name}')
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        checkpoint_path_qa_output = os.path.join(args.save_dir,
                                                 'qa_output_state')

        # Load model
        model = AdversarialModel(args, load_path=args.saved_model_filename)
        # model.load(checkpoint_path)
        # model.load_qa_output_model(checkpoint_path_qa_output)
        model.to(args.device)

        # Load eval data
        eval_dataset, eval_dict = data_utils.get_dataset(
            args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))

        # Evaluate!!!
        trainer = Trainer(args, log)
        eval_preds, eval_scores = trainer.evaluate(model,
                                                   data_loader=eval_loader,
                                                   data_dict=eval_dict,
                                                   return_preds=True,
                                                   split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}'
                                for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir,
                                split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
Example no. 30
0
def load_data(audio_config, data_config):
    return get_dataset(data_config, audio_config)