def __init__(self): print("load law school") data_dir = 'dataset' data_file = path.join(data_dir, 'lawschs1_1.dta') if not path.exists(data_file): request.urlretrieve( 'http://www.seaphe.org/databases/FOIA/lawschs1_1.dta', data_file) dataset = pd.read_stata(data_file) dataset.drop([ 'enroll', 'asian', 'black', 'hispanic', 'white', 'missingrace', 'urm' ], axis=1, inplace=True) dataset.dropna(axis=0, inplace=True, subset=['admit']) dataset.replace(to_replace='', value=np.nan, inplace=True) dataset.dropna(axis=0, inplace=True) dataset = dataset[dataset['race'] != 'Asian'] for col in dataset.columns: if dataset[col].isnull().sum() > 0: dataset.drop(col, axis=1, inplace=True) self.con_vars = ['lsat', 'gpa'] self.cat_vars = [ col for col in dataset.columns if col not in self.con_vars ] self.columns_name = self.con_vars + self.cat_vars self.data = dataset[self.columns_name] self.data_info = get_data_info(self.data, self.cat_vars) self.con_loc = [dataset.columns.get_loc(var) for var in self.con_vars]
def main(_):
    start_time = time.time()
    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_file_name, FLAGS.test_file_name, FLAGS.data_info,
        FLAGS.pre_processed)
    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_file_name, word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_file_name, word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)
    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)
    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
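# For the `main` above to run, every FLAGS attribute it reads must be defined
# beforehand, in the same style as the flag block at the end of this section.
# A sketch of the required definitions; only the flag names are taken from
# the call sites above, the default values here are placeholders.
tf.app.flags.DEFINE_string('train_file_name', './data/train.txt', 'training file name')
tf.app.flags.DEFINE_string('test_file_name', './data/test.txt', 'testing file name')
tf.app.flags.DEFINE_string('data_info', './data/data_info.txt', 'the file saving data information')
tf.app.flags.DEFINE_string('train_data', './data/train_data.txt', 'the file saving training data')
tf.app.flags.DEFINE_string('test_data', './data/test_data.txt', 'the file saving testing data')
tf.app.flags.DEFINE_string('embedding_file_name', './data/glove.txt', 'embedding file name')
tf.app.flags.DEFINE_integer('embedding_dim', 300, 'dimension of word embeddings')
tf.app.flags.DEFINE_integer('pre_processed', 0, 'whether the data is pre-processed')
FLAGS = tf.app.flags.FLAGS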
def main(use_cpu=1, batch_size=16, num_of_classes=8, first_n_byte=2000000):
    model = malConv.MalConv(num_of_classes)
    if torch.cuda.is_available() and torch.cuda.device_count() > 0:
        print("Using GPU")
        device = torch.device("cuda:0")
        model = nn.Sequential(model)
        model = nn.DataParallel(model)
        model.to(device)
    else:
        device = torch.device("cpu")
    # Extract data into data loaders.
    data_folder, classes_list = ut.get_data_info('classes.txt')
    train_set_list, train_labels, test_set_list, test_labels = \
        ut.gen_train_and_dev_data_sets(data_folder, classes_list)
    train_loader = DataLoader(ExeDataset(train_set_list, train_labels,
                                         first_n_byte),
                              batch_size=batch_size, shuffle=True,
                              num_workers=use_cpu)
    test_loader = DataLoader(ExeDataset(test_set_list, test_labels,
                                        first_n_byte),
                             batch_size=batch_size, shuffle=False,
                             num_workers=use_cpu)
    train.train_on(model, train_loader, test_loader, len(test_labels),
                   len(train_labels), device, batch_size)
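# `ExeDataset` is constructed above but defined elsewhere. A minimal sketch
# following the usual MalConv input convention: read the first `first_n_byte`
# raw bytes of each executable, shift byte values by +1 so that 0 is free to
# act as the padding token, and zero-pad short files. The class body below is
# an assumption; only its constructor arguments appear in the source.
import torch
from torch.utils.data import Dataset

class ExeDataset(Dataset):
    def __init__(self, file_list, labels, first_n_byte):
        self.file_list = file_list
        self.labels = labels
        self.first_n_byte = first_n_byte

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with open(self.file_list[idx], 'rb') as f:
            raw = [b + 1 for b in f.read(self.first_n_byte)]
        raw += [0] * (self.first_n_byte - len(raw))  # zero-pad short files
        return torch.tensor(raw, dtype=torch.long), self.labels[idx]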
def main(_):
    start_time = time.time()
    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.dataset, FLAGS.pre_processed)
    print('Loading training data and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, FLAGS.dataset + 'train',
                           FLAGS.pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len,
                          FLAGS.max_context_len, FLAGS.dataset + 'test',
                          FLAGS.pre_processed)
    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(FLAGS.embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)
    # Cap per-process GPU memory so other jobs can share the device.
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    with tf.Session(config=config) as sess:
        model = IAN(FLAGS, sess)
        model.build_model(train_data, test_data)
        model.run()
    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
def main(_):
    start_time = time.time()
    print('Loading data info ...')
    word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        dataset, pre_processed)
    print('Loading training, validation and testing data ...')
    train_data = read_data(word2id, FLAGS.max_aspect_len,
                           FLAGS.max_context_len, dataset + 'train',
                           pre_processed)
    val_data = read_data(word2id, FLAGS.max_aspect_len,
                         FLAGS.max_context_len, dataset + 'val',
                         pre_processed)
    test_data = read_data(word2id, FLAGS.max_aspect_len,
                          FLAGS.max_context_len, dataset + 'test',
                          pre_processed)
    print('Loading pre-trained word vectors ...')
    FLAGS.embedding_matrix = load_word_embeddings(embedding_file_name,
                                                  FLAGS.embedding_dim, word2id)
    model = IAN(FLAGS)
    run(model, train_data, val_data, test_data)
    end_time = time.time()
    print('Time Costing: %s' % (end_time - start_time))
def __init__(self, train, con_vars):
    print("load Synthetic Adult...")
    self.con_vars = con_vars
    self.cat_vars = [col for col in train.columns if col not in self.con_vars]
    self.columns_name = self.con_vars + self.cat_vars
    self.train = train[self.columns_name]
    # Get data info.
    self.data_info = get_data_info(self.train, self.cat_vars)
    print("Data info:", self.data_info)
    self.con_loc = [self.train.columns.get_loc(var) for var in self.con_vars]
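# Hypothetical usage of the wrapper above. The class name `SyntheticAdult`,
# the CSV path, and the column choices are illustrative; none of them appear
# in the source.
train_df = pd.read_csv('./GenerateData/Adult/adult_syn.csv')
ds = SyntheticAdult(train_df, con_vars=['age', 'hours_per_week'])
print(ds.con_loc)    # positions of the continuous columns after reordering
print(ds.data_info)  # per-column summary from get_data_info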
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_sentence_len, FLAGS.max_aspect_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)
    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_sentence_len, FLAGS.max_aspect_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_sentence_len, FLAGS.max_aspect_len,
                          FLAGS.test_data, FLAGS.pre_processed)
    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)
    with tf.Session() as sess:
        model = RAM(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
def __init__(self, i=1):
    print("load synthetic compas")
    df_compas = pd.read_csv(
        "./GenerateData/Compas/Compas_syn600_bs100_seed1_times_10.csv")
    # Take the i-th slice of 3694 rows.
    df_compas = df_compas[3694 * i:3694 * (i + 1)]
    # Clip negative jail durations to zero.
    df_compas.loc[df_compas["diff_jail"] < 0, "diff_jail"] = 0
    self.con_vars = ['age', 'diff_custody', 'diff_jail', 'priors_count']
    self.cat_vars = [
        col for col in df_compas.columns if col not in self.con_vars
    ]
    self.columns_name = self.con_vars + self.cat_vars
    self.data = df_compas[self.columns_name]
    self.con_loc = [
        self.data.columns.get_loc(var) for var in self.con_vars
    ]
    # Get data info.
    self.data_info = get_data_info(self.data, self.cat_vars)
    print("Data info:", self.data_info)
def __init__(self): print("load Synthetic law school") df_lawsch = pd.read_csv( "./GenerateData/lawschool/lawschool_syn_300_bs500_seed0_times_10.csv" ) df_lawsch = df_lawsch[:43011] df_lawsch['lsat'] = df_lawsch['lsat'].astype('int') df_lawsch['gpa'] = df_lawsch['gpa'].round(decimals=2) self.con_vars = ['lsat', 'gpa'] self.cat_vars = [ col for col in df_lawsch.columns if col not in self.con_vars ] self.columns_name = self.con_vars + self.cat_vars self.data = df_lawsch[self.columns_name] self.data_info = get_data_info(self.data, self.cat_vars) self.con_loc = [ self.data.columns.get_loc(var) for var in self.con_vars ]
def transform(self):
    self.columns_name = self.data.columns
    self.output_info = get_data_info(self.data, self.categorical_columns)
    # One-hot encode categorical columns; new columns are named 'col=value'.
    self.data = pd.get_dummies(self.data,
                               columns=self.categorical_columns,
                               prefix_sep='=')
    # Scale continuous columns to [0, 1].
    self.scaler = MinMaxScaler()
    self.data[self.c_vars] = self.scaler.fit_transform(self.data[self.c_vars])
    print('Attributes', self.columns_name)
    print('Data info:', self.output_info)
    # Convert to a numpy array.
    data_np = self.data.values
    # Optionally rescale to [-1, 1]:
    # data_np = (data_np - 0.5) * 2
    return data_np
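# `transform` maps the table into a purely numeric array, so recovering rows
# in the original schema requires an inverse. A minimal sketch under the same
# attribute names as `transform` above; no `inverse_transform` appears in
# these snippets, so the round trip below is an assumption.
def inverse_transform(self, data_np):
    df = pd.DataFrame(data_np, columns=self.data.columns)
    # Undo the [0, 1] min-max scaling on continuous columns.
    df[self.c_vars] = self.scaler.inverse_transform(df[self.c_vars])
    # Collapse each one-hot group 'col=value' back into a single column.
    for col in self.categorical_columns:
        onehot = [c for c in df.columns if c.startswith(col + '=')]
        df[col] = df[onehot].idxmax(axis=1).str.split('=', n=1).str[1]
        df.drop(onehot, axis=1, inplace=True)
    return df[self.columns_name]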
def main(_):
    print('Loading data info ...')
    print('Step 1: gather basic information from the train and test data ...')
    word2id, max_sentence_len, max_aspect_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)
    # Register the measured lengths as flags so the model can read them.
    tf.app.flags.DEFINE_integer('max_sentence_len', max_sentence_len,
                                'max sentence len')
    tf.app.flags.DEFINE_integer('max_aspect_len', max_aspect_len,
                                'max aspect len')
    print('Step 2: loading training data and testing data ...')
    print('Step 2.1: reading training data ...')
    train_data = read_data(FLAGS.train_fname, word2id, max_sentence_len,
                           max_aspect_len, FLAGS.train_data,
                           FLAGS.pre_processed, FLAGS.sentiment_data)
    print('Step 2.2: reading testing data ...')
    test_data = read_data(FLAGS.test_fname, word2id, max_sentence_len,
                          max_aspect_len, FLAGS.test_data,
                          FLAGS.pre_processed, FLAGS.sentiment_data)
    print('Loading pre-trained word vectors ...')
    word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                    FLAGS.embedding_dim, word2id)
    with tf.Session() as sess:
        model = RAM(FLAGS, word2id, word2vec, sess)
        print('Build model ...')
        model.build_model()
        print('Run model ...')
        model.run(train_data, test_data)
def main(_):
    print('Loading data info ...')
    FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
        FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info,
        FLAGS.pre_processed)
    print('Loading training data and testing data ...')
    train_data = read_data(FLAGS.train_fname, FLAGS.word2id,
                           FLAGS.max_aspect_len, FLAGS.max_context_len,
                           FLAGS.train_data, FLAGS.pre_processed)
    test_data = read_data(FLAGS.test_fname, FLAGS.word2id,
                          FLAGS.max_aspect_len, FLAGS.max_context_len,
                          FLAGS.test_data, FLAGS.pre_processed)
    print('Loading pre-trained word vectors ...')
    FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                          FLAGS.embedding_dim, FLAGS.word2id)
    with tf.Session() as sess:
        model = IAN(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
def __init__(self): print("load compas") data_dir = 'dataset' data_file = path.join(data_dir, 'compas-scores-two-years.csv') df = pd.read_csv(data_file) print(df.shape) df = df[df['days_b_screening_arrest'] >= -30] df = df[df['days_b_screening_arrest'] <= 30] df = df[df['is_recid'] != -1] df = df[df['c_charge_degree'] != '0'] df = df[df['score_text'] != 'N/A'] df['in_custody'] = pd.to_datetime(df['in_custody']) df['out_custody'] = pd.to_datetime(df['out_custody']) df['diff_custody'] = (df['out_custody'] - df['in_custody']).dt.days df['c_jail_in'] = pd.to_datetime(df['c_jail_in']) df['c_jail_out'] = pd.to_datetime(df['c_jail_out']) df['diff_jail'] = (df['c_jail_out'] - df['c_jail_in']).dt.days df.drop([ 'id', 'name', 'first', 'last', 'v_screening_date', 'compas_screening_date', 'dob', 'c_case_number', 'screening_date', 'in_custody', 'out_custody', 'c_jail_in', 'c_jail_out' ], axis=1, inplace=True) df = df[df['race'].isin(['African-American', 'Caucasian'])] features = df.drop([ 'is_recid', 'is_violent_recid', 'violent_recid', 'two_year_recid' ], axis=1) labels = 1 - df['two_year_recid'] features = features[[ 'age', 'sex', 'race', 'diff_custody', 'diff_jail', 'priors_count', 'juv_fel_count', 'c_charge_degree', 'v_score_text' ]] self.data = pd.concat([features, labels], axis=1) self.data[['juv_fel_count', 'two_year_recid' ]] = self.data[['juv_fel_count', 'two_year_recid']].astype('object') #self.data = self.data.drop(['diff_jail'],axis=1) # discretize diff_custody #diff_custody(self.data) self.con_vars = [ i for i in self.data.columns if self.data[i].dtype == 'int64' or self.data[i].dtype == 'float64' ] self.cat_vars = [ i for i in self.data.columns if i not in self.con_vars ] self.columns_name = self.con_vars + self.cat_vars self.data = self.data[self.columns_name] self.con_loc = [ self.data.columns.get_loc(var) for var in self.con_vars ] #get data info self.data_info = get_data_info(self.data, self.cat_vars) print("Data info:", self.data_info)
# NOTE: the first flag definition is truncated in the source; only its help
# text, 'embedding file name', survives. Judging from the later use of
# FLAGS.embedding_fname, it defines the embedding file path.
tf.app.flags.DEFINE_string('embedding', 'glove', 'oov')
tf.app.flags.DEFINE_string('train_fname', './data/laptop/train.txt',
                           'training file name')
tf.app.flags.DEFINE_string('test_fname', './data/laptop/test.txt',
                           'testing file name')
tf.app.flags.DEFINE_string('data_info', './data/data_info.txt',
                           'the file saving data information')
tf.app.flags.DEFINE_string('train_data', './data/train_data.txt',
                           'the file saving training data')
tf.app.flags.DEFINE_string('test_data', './data/test_data.txt',
                           'the file saving testing data')

print('Loading data info ...')
FLAGS.word2id, FLAGS.max_aspect_len, FLAGS.max_context_len = get_data_info(
    FLAGS.train_fname, FLAGS.test_fname, FLAGS.data_info, FLAGS.pre_processed)
print('Loading training data and testing data ...')
train_data = read_data(FLAGS.train_fname, FLAGS.word2id, FLAGS.max_aspect_len,
                       FLAGS.max_context_len, FLAGS.train_data,
                       FLAGS.pre_processed)
test_data = read_data(FLAGS.test_fname, FLAGS.word2id, FLAGS.max_aspect_len,
                      FLAGS.max_context_len, FLAGS.test_data,
                      FLAGS.pre_processed)
print('Loading pre-trained word vectors ...')
FLAGS.word2vec = load_word_embeddings(FLAGS.embedding_fname,
                                      FLAGS.embedding_dim, FLAGS.word2id)
with tf.Session() as sess:
    # The snippet breaks off here; following the identical IAN snippet above,
    # the session presumably builds and runs the model:
    model = IAN(FLAGS, sess)
    model.build_model()
    model.run(train_data, test_data)