def main():
    global model_path, train_dataset_path, test_dataset_path
    model = {}
    while True:
        print("======= MENU =======")
        print("\t1. Train Model")
        print("\t2. Save Model")
        print("\t3. Load Model")
        print("\t4. Test Model")
        print("\t0. Exit")
        opt = input()
        break_lines()
        if opt == '1':
            train_dataset = utils.parse_dataset(train_dataset_path)
            model = train(train_dataset)
        elif opt == '2':
            if not model:
                print("> You need a trained model for that!")
            else:
                save(model, model_path)
        elif opt == '3':
            model = load(model_path)
        elif opt == '4':
            if not model:
                print("> You need a trained model for that!")
            else:
                test_dataset = utils.parse_dataset(test_dataset_path)
                test_main(model, test_dataset)
        elif opt == '0':
            break
        else:
            print("- INVALID OPTION -")
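# A minimal sketch of the persistence helpers the menu above assumes; `save`
# and `load` here are hypothetical stand-ins (the originals are not shown),
# assuming the model is a plain pickleable dict.
import pickle

def save(model, path):
    # Serialize the trained model dict to disk.
    with open(path, 'wb') as f:
        pickle.dump(model, f)

def load(path):
    # Restore a previously saved model dict.
    with open(path, 'rb') as f:
        return pickle.load(f)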
def get_data(config):
    df, sparse_cols, num_train = utils.get_dataset_df(config.data_dir,
                                                      debug=config.is_debug)
    print('cols:', ', '.join(df.columns))
    print('Sparse_cols:', sparse_cols)
    ignore_cols = ['target']
    dense_cols = [col for col in df.columns
                  if col not in sparse_cols + ignore_cols]
    sparse_cols.sort()
    dense_cols.sort()

    # print('Get embeddings ....')
    # # embedding = {col: utils.binary_embedding(df[col].max() + 1) for col in sparse_cols}
    # embedding = utils.load_embedding(df, sparse_cols, config.data_dir,
    #                                  method=config.embedding_method)
    # print('Processing embeddings ....')
    # x = utils.process_embedding(df, dense_cols, embedding)

    feat_dict, feat_dim = utils.gen_feat_dict(df, sparse_cols, ignore_cols)
    Xi, Xv = utils.parse_dataset(df, feat_dict, sparse_cols, ignore_cols)

    idx0 = df[df['target'] == 0].index.values
    idx1 = df[df['target'] == 1].index.values
    del df
    gc.collect()

    x_neg_train_i = Xi[idx0, :]
    x_pos_train_i = Xi[idx1, :]
    x_neg_train_v = Xv[idx0, :]
    x_pos_train_v = Xv[idx1, :]
    x_test_i = Xi[num_train:]
    x_test_v = Xv[num_train:]

    x_neg_train = (x_neg_train_i, x_neg_train_v)
    x_pos_train = (x_pos_train_i, x_pos_train_v)
    x_test = (x_test_i, x_test_v)
    return x_neg_train, x_pos_train, x_test, feat_dim
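# An illustrative sketch of the (Xi, Xv) contract the snippet above assumes,
# in the DeepFM style: Xi holds global feature indices, Xv holds feature
# values (1.0 for one-hot sparse fields, the raw value for dense fields).
# These `gen_feat_dict`/`parse_dataset` are stand-ins, not the real utils.
import numpy as np

def gen_feat_dict(df, sparse_cols, ignore_cols):
    feat_dict, idx = {}, 0
    for col in df.columns:
        if col in ignore_cols:
            continue
        if col in sparse_cols:
            # one global index per category level
            levels = df[col].unique()
            feat_dict[col] = dict(zip(levels, range(idx, idx + len(levels))))
            idx += len(levels)
        else:
            # dense columns take a single index
            feat_dict[col] = idx
            idx += 1
    return feat_dict, idx

def parse_dataset(df, feat_dict, sparse_cols, ignore_cols):
    cols = [c for c in df.columns if c not in ignore_cols]
    Xi = np.zeros((len(df), len(cols)), dtype=np.int64)
    Xv = np.zeros((len(df), len(cols)), dtype=np.float64)
    for j, col in enumerate(cols):
        if col in sparse_cols:
            Xi[:, j] = df[col].map(feat_dict[col]).values
            Xv[:, j] = 1.0
        else:
            Xi[:, j] = feat_dict[col]
            Xv[:, j] = df[col].values
    return Xi, Xv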
def __init__(self, audio_conf, manifest_filepath, metadata_file_path, labels,
             normalize=False, speed_volume_perturb=False, spec_augment=False):
    """
    Dataset that loads tensors via a csv containing file paths to audio files
    and transcripts separated by a comma. Each new line is a different sample.
    Example below:

    /path/to/audio.wav,/path/to/audio.txt
    ...

    :param audio_conf: Dictionary containing the sample rate, window and the window length/stride in seconds
    :param manifest_filepath: Path to the manifest csv listing the samples
    :param metadata_file_path: Path to the metadata csv described above
    :param labels: String containing all the possible characters to map to
    :param normalize: Apply mean and standard deviation normalization to the audio tensor
    :param speed_volume_perturb: Apply random tempo and gain perturbations (default: False)
    :param spec_augment: Apply simple spectral augmentation to mel spectrograms (default: False)
    """
    ids = parse_dataset(metadata_file_path, manifest_filepath)
    self.ids = ids
    self.size = len(ids)
    self.labels_map = {labels[i]: i for i in range(len(labels))}
    super(SpectrogramDataset, self).__init__(audio_conf, normalize,
                                             speed_volume_perturb, spec_augment)
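# A hedged sketch of what `parse_dataset` plausibly returns here: the
# "<wav_path>,<txt_path>" rows of the manifest as id pairs, matching the
# docstring's example. This is an assumption, not the project's real helper
# (how `metadata_file_path` filters or annotates the rows is not shown).
import csv

def parse_dataset(metadata_file_path, manifest_filepath):
    with open(manifest_filepath, newline='') as f:
        return [tuple(row[:2]) for row in csv.reader(f) if row]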
import json
import logging

from sklearn.metrics import average_precision_score

from baseline import run as run_baseline
from naive_bayes import run as run_nb
from random_forest import run as run_rf
from SVM_classification import run as run_svm
from fully_connected import run as run_nnet
from utils import parse_dataset
from utils import get_training_set
from utils import run_with_cv

if __name__ == '__main__':
    logging.basicConfig(level='INFO')

    with open('positive_set.json', 'r') as f:
        positive_set, positive_meta = parse_dataset(json.load(f))
    with open('negative_set.json', 'r') as f:
        negative_set, negative_meta = parse_dataset(json.load(f))

    seed = 1
    methods = [
        ('baseline', run_baseline, 'navy'),
        ('naive bayes', run_nb, 'turquoise'),
        ('random forest', run_rf, 'darkorange'),
        ('svm', run_svm, 'cornflowerblue'),
        ('neural net', run_nnet, 'teal'),
    ]

    X, y, meta = get_training_set(positive_set, positive_meta,
                                  negative_set, negative_meta, seed)
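# The imported `average_precision_score` summarizes the precision-recall
# curve of a ranker; a self-contained toy example with made-up scores:
import numpy as np
from sklearn.metrics import average_precision_score

y_true = np.array([0, 1, 1, 0, 1])
y_score = np.array([0.1, 0.8, 0.65, 0.3, 0.9])
print(average_precision_score(y_true, y_score))  # 1.0: positives ranked first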
def main(config=None):
    results = {}
    if config is None:
        # create instance of config
        config = Config()

    eng_path = os.path.join(
        'parsed_data_lowercased',
        'eng_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    ger_path = os.path.join(
        'parsed_data_lowercased',
        'ger_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    ned_path = os.path.join(
        'parsed_data_lowercased',
        'ned_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))
    spa_path = os.path.join(
        'parsed_data_lowercased',
        'esp_test_bio_bpe{}.txt'.format('1' if config.pos_target else ''))

    data_filepaths = [
        eng_path,
        ned_path,
        # spa_path,
        ger_path,
    ]

    for data_filepath in data_filepaths:
        # get dataset
        encoding = 'utf-8'
        data_laser, pad_len = parse_dataset_laser(data_filepath,
                                                  config.label_to_idx,
                                                  config.word_to_idx,
                                                  pos_target=config.pos_target,
                                                  encoding=encoding)
        data, pad_len = parse_dataset(data_filepath,
                                      config.label_to_idx,
                                      config.word_to_idx,
                                      pos_target=config.pos_target,
                                      encoding=encoding)

        #####################################################################
        # SETUP
        #####################################################################
        subfolder = 'POS' if config.pos_target else 'NER'
        langfolder = config.langfolder
        base_path = os.path.join('saves_lc', langfolder, subfolder,
                                 'LASEREmbedderBase.pt')
        base_gru_path = os.path.join('saves_lc', langfolder, subfolder,
                                     'LASEREmbedderBaseGRU.pt')
        i_path = os.path.join('saves_lc', langfolder, subfolder,
                              'LASEREmbedderI.pt')
        iii_path = os.path.join('saves_lc', langfolder, subfolder,
                                'LASEREmbedderIII.pt')
        elmo_path = os.path.join('saves_lc', langfolder, subfolder,
                                 'LASEREmbedderIIIELMo.pt')

        paths = [
            # base_path,
            # base_gru_path,
            i_path,
            # iii_path,
            # elmo_path
        ]
        embedders = [
            # LASEREmbedderBase,  # (config.model_path, pad_len)
            # LASEREmbedderBaseGRU,  # (config.model_path, pad_len)
            LASEREmbedderI,  # (config.model_path)
            # LASEREmbedderIII,  # (config.model_path)
            # LASEREmbedderIIIELMo,  # (config.model_path)
        ]
        use_laser = [
            # False,
            # False,
            True,
            # True,
            # True
        ]

        lang_results = {}
        for embedder, path, d in zip(embedders, paths, use_laser):
            emb = embedder(config.model_path,
                           bpe_pad_len=pad_len,
                           static_lstm=config.static_lstm,
                           drop_before=config.drop_before_laser,
                           drop_after=config.drop_after_laser,
                           drop_within=config.drop_within_lstm)
            print(path)
            dset = data_laser if d else data
            f1 = eval_model_dataset(config, emb, dset, pad_len, path, d)
            lang_results[emb.__class__.__name__] = f1
            del emb
            empty_cache()
        results[data_filepath] = lang_results

    print(results)
    return results, config
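# Hedged usage sketch: `main` returns one {embedder_class_name: f1} dict per
# test file, keyed by file path.
if __name__ == '__main__':
    results, config = main()
    for filepath, lang_results in results.items():
        print(filepath, lang_results)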
import os

import pandas as pd

import utils

KAGGLE_DATA_ROOT = os.path.abspath('/Users/travisclarke/kaggle-data')
ANNOTATIONS_FILE = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_labels.csv')
TRAIN_DIR = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_images')
TRAIN_ANNOT_DIR = os.path.join(KAGGLE_DATA_ROOT, 'stage_1_train_annots')

# load and parse images and annotations
raw_annotations = pd.read_csv(ANNOTATIONS_FILE)
images = utils.parse_dataset(raw_annotations)
utils.cvt_annots_to_xml(images, TRAIN_ANNOT_DIR, TRAIN_DIR)
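# A hedged sketch of what `utils.parse_dataset` plausibly does with the RSNA
# stage-1 label rows (columns patientId, x, y, width, height, Target): group
# the Target==1 boxes per image. The grouping itself is an assumption, not
# the project's real implementation.
def parse_dataset(raw_annotations):
    images = {}
    for row in raw_annotations.itertuples(index=False):
        boxes = images.setdefault(row.patientId, [])
        if row.Target == 1:
            boxes.append((row.x, row.y, row.width, row.height))
    return images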
def main(config=None, embedders_to_train=None):
    # create instance of config
    if config is None:
        config = Config()
    if embedders_to_train is None:
        embedders_to_train = [
            # 'LASEREmbedderBase',
            # 'LASEREmbedderBaseGRU',
            'LASEREmbedderI',
            # 'LASEREmbedderIII',
            # 'LASEREmbedderIIIELMo',
        ]

    encoding = 'utf-8'
    static_lstm = False

    # parse datasets
    train_laser, tr_pad_len = parse_dataset_laser(config.filename_train,
                                                  config.label_to_idx,
                                                  config.word_to_idx,
                                                  pos_target=config.pos_target,
                                                  encoding=encoding)
    dev_laser, dev_pad_len = parse_dataset_laser(config.filename_dev,
                                                 config.label_to_idx,
                                                 config.word_to_idx,
                                                 pos_target=config.pos_target,
                                                 encoding=encoding)
    train_base, tr_pad_len = parse_dataset(config.filename_train,
                                           config.label_to_idx,
                                           config.word_to_idx,
                                           pos_target=config.pos_target,
                                           encoding=encoding)
    dev_base, dev_pad_len = parse_dataset(config.filename_dev,
                                          config.label_to_idx,
                                          config.word_to_idx,
                                          pos_target=config.pos_target,
                                          encoding=encoding)

    # build model
    embedder_base = LASEREmbedderBase  # (config.model_path, tr_pad_len)
    embedder_base_gru = LASEREmbedderBaseGRU  # (config.model_path, tr_pad_len)
    embedderI = LASEREmbedderI  # (config.model_path, static_lstm=False)
    embedderIII = LASEREmbedderIII  # (config.model_path, static_lstm=False)
    embedderIIIElmo = LASEREmbedderIIIELMo

    embedders = {
        'LASEREmbedderBase': embedder_base,
        'LASEREmbedderBaseGRU': embedder_base_gru,
        'LASEREmbedderI': embedderI,
        'LASEREmbedderIII': embedderIII,
        'LASEREmbedderIIIELMo': embedderIIIElmo
    }
    # model_name = {
    #     embedder_base: 'LASEREmbedderBase',
    #     embedder_base_gru: 'LASEREmbedderBaseGRU',
    #     embedderI: 'LASEREmbedderI',
    #     embedderIII: 'LASEREmbedderIII',
    #     embedderIIIElmo: 'LASEREmbedderIIIELMo',
    # }
    use_laser = {
        'LASEREmbedderBase': False,
        'LASEREmbedderBaseGRU': False,
        'LASEREmbedderI': True,
        'LASEREmbedderIII': True,
        'LASEREmbedderIIIELMo': True
    }

    for embedder in embedders_to_train:
        # set output filename
        laser = use_laser[embedder]
        config.set_model_name(embedder)
        config.use_laser = laser
        # config.set_params(laser)

        train = train_laser if laser else train_base
        dev = dev_laser if laser else dev_base

        model = embedders[embedder](config.model_path,
                                    bpe_pad_len=tr_pad_len,
                                    static_lstm=static_lstm,
                                    drop_before=config.drop_before_laser,
                                    drop_after=config.drop_after_laser,
                                    drop_within=config.drop_in_laser)
        fit(config, model, tr_pad_len, dev_pad_len, train, dev)
        del model
        empty_cache()
        time.sleep(60)  # free up CUDA memory
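# Hedged usage sketch: train a single embedder variant; names must match the
# keys of the `embedders` dict above.
if __name__ == '__main__':
    main(embedders_to_train=['LASEREmbedderI'])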
import argparse
import logging
import sys

import utils

parser = argparse.ArgumentParser()
# --dataset and --dumps are used below; they are assumed to be defined
# alongside the two arguments shown in the original snippet.
parser.add_argument('--dataset', help='Path to the raw dataset')
parser.add_argument('--dumps', help='Directory for pickle dumps')
parser.add_argument('--graph_path',
                    help='Path to pickle file of citation graph')
parser.add_argument('--year_dict_path',
                    help='Path to pickle file of publication year dictionary')
args = parser.parse_args()

logging.StreamHandler(sys.stdout)
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

if __name__ == '__main__':
    global_citation_graph = None
    if args.graph_path is None:
        logging.info('Parsing Dataset')
        global_citation_graph = utils.parse_dataset(args.dataset)
        utils.dump_file(args.dumps, 'global_citation_graph_full',
                        global_citation_graph)
    else:
        logging.info('Getting Pickle Graph')
        global_citation_graph = utils.get_pickle_dump(args.graph_path,
                                                      'global_citation_graph')

    paper_year_dict = None
    if args.year_dict_path is None:
        logging.info('Parsing Dates from dataset')
        paper_year_dict = utils.parse_dates(args.dataset)
        logging.info('Serialising Paper-Year Dictionary')
        utils.dump_file(args.dumps, 'paper_year_dict', paper_year_dict)
    else:
        logging.info('Unpickling the Date Dictionary')
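# Hypothetical invocation (script name and paths are illustrative only):
#   python build_citation_graph.py --dataset dblp.json --dumps ./dumps
# On a second run, pass --graph_path/--year_dict_path to reuse the pickles.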