def __init__(self, opt):
    """Configure CoQA preprocessing and create missing preprocessed files.

    Stores ``opt`` and the paths derived from it on the instance. If the key
    ``'OFFICIAL'`` is present in ``opt``, the vocabulary is loaded through
    ``load_glove_vocab``, the result of ``self.load_data()`` is unpacked into
    the ``train_*`` attributes, ``test_file`` is set, and nothing else is
    done. Otherwise the feature folder is checked for
    ``coqa-train-preprocessed.json`` and ``coqa-dev-preprocessed.json``;
    when either is missing, ``self.preprocess`` is run for both labels.

    Args:
        opt: Configuration mapping. Keys read: ``FEATURE_FOLDER``,
            ``datadir``, ``CoQA_TRAIN_FILE``, ``CoQA_DEV_FILE``,
            ``INIT_WORD_EMBEDDING_FILE`` and, in official mode,
            ``OFFICIAL_TEST_FILE``.
    """
    print('CoQA Preprocessing')
    self.opt = opt
    self.spacyDir = opt['FEATURE_FOLDER']

    data_root = opt['datadir']
    self.train_file = os.path.join(data_root, opt['CoQA_TRAIN_FILE'])
    self.dev_file = os.path.join(data_root, opt['CoQA_DEV_FILE'])
    self.glove_file = os.path.join(data_root, opt['INIT_WORD_EMBEDDING_FILE'])
    self.glove_dim = 300
    self.official = 'OFFICIAL' in opt
    self.data_prefix = 'coqa-'

    if self.official:
        # Official prediction: only the vocabularies are needed, the
        # preprocessed-file cache is neither checked nor rebuilt.
        self.glove_vocab = load_glove_vocab(
            self.glove_file, self.glove_dim, to_lower=False)
        print('Official prediction initializes...')
        print('Loading training vocab and vocab char...')
        loaded = self.load_data()
        self.train_vocab, self.train_char_vocab, self.train_embedding = loaded
        self.test_file = self.opt['OFFICIAL_TEST_FILE']
        return

    splits = ['train', 'dev']
    # Every expected file is probed; any miss triggers a rebuild of all splits.
    missing = [
        split for split in splits
        if not os.path.exists(os.path.join(
            self.spacyDir, self.data_prefix + split + '-preprocessed.json'))
    ]
    if not missing:
        return

    print('Previously result not found, creating preprocessed files now...')
    self.glove_vocab = load_glove_vocab(
        self.glove_file, self.glove_dim, to_lower=False)

    if not os.path.isdir(self.spacyDir):
        os.makedirs(self.spacyDir)
        print('Directory created: ' + self.spacyDir)

    for split in splits:
        self.preprocess(split)
def __init__(self, opt):
    """Configure preprocessing for the requested tasks and build missing files.

    The comma-separated ``opt['Task']`` value is split into dataset labels,
    with ``'train'`` moved to the front when present. If every
    ``<label>-preprocessed.msgpack`` file already exists in the feature
    folder the constructor returns. Otherwise, when ``'train'`` is not among
    the labels, the result of ``self.load_data()`` is unpacked into the
    ``train_*`` attributes; optional embedding resources are set up and
    ``self.preprocess`` is run per label.

    Args:
        opt: Configuration mapping. Keys read: ``FEATURE_FOLDER``,
            ``datadir``, ``INIT_WORD_EMBEDDING_FILE``, ``n_gram``, ``Task``
            and, when ``'FastText'`` is present, ``fasttext_model``. The
            presence of ``'BuildTestVocabulary'``, ``'FastText'`` and
            ``'GLOVE'`` is used as a set of flags.
    """
    print('CoQA Preprocessing')
    self.opt = opt
    self.spacyDir = opt['FEATURE_FOLDER']
    self.glove_file = os.path.join(opt['datadir'],
                                   opt['INIT_WORD_EMBEDDING_FILE'])
    self.glove_dim = 300
    self.BuildTestVocabulary = 'BuildTestVocabulary' in opt
    self.n_gram = opt['n_gram']

    labels = self.opt['Task'].split(',')
    self.dataset_labels = labels
    if 'train' in labels:
        # Move the first 'train' entry to the head of the list, in place.
        labels.remove('train')
        labels.insert(0, 'train')
        assert labels[0] == 'train'

    # Every expected file is probed; any miss triggers the build below.
    missing = [
        label for label in labels
        if not os.path.exists(
            os.path.join(self.spacyDir, label + '-preprocessed.msgpack'))
    ]
    if not missing:
        return

    if 'train' not in labels:
        loaded = self.load_data()
        self.train_vocab, self.train_char_vocab, self.train_embedding = loaded

    print('Previously result not found, creating preprocessed files now...')

    if 'FastText' in self.opt:
        self.fasttext_model = os.path.join(opt['datadir'],
                                           opt['fasttext_model'])

    if 'GLOVE' in self.opt:
        # NOTE(review): this repeats the message printed above - confirm
        # whether a GloVe-specific message was intended.
        print('Previously result not found, creating preprocessed files now...')
        self.glove_vocab = load_glove_vocab(
            self.glove_file, self.glove_dim, to_lower=False)

    if not os.path.isdir(self.spacyDir):
        os.makedirs(self.spacyDir)
        print('Directory created: ' + self.spacyDir)

    for label in labels:
        # With BuildTestVocabulary set, only the 'train' label is preprocessed.
        if self.BuildTestVocabulary and label != 'train':
            continue
        self.preprocess(label)
def __init__(self, opt):
    """Configure CoQA preprocessing and create missing preprocessed files.

    Stores ``opt`` and the paths derived from it on the instance and prints
    the dev-file and feature-folder paths. The feature folder is then
    checked for ``coqa-train-preprocessed.json`` and
    ``coqa-dev-preprocessed.json``; when either is missing, the vocabulary
    is loaded through ``load_glove_vocab``, the folder is created if
    necessary, and ``self.preprocess`` is run for both labels.

    Args:
        opt: Configuration mapping. Keys read: ``FEATURE_FOLDER``,
            ``datadir``, ``CoQA_TRAIN_FILE``, ``CoQA_DEV_FILE`` and
            ``INIT_WORD_EMBEDDING_FILE``.
    """
    print('CoQA Preprocessing')
    self.opt = opt
    self.spacyDir = opt['FEATURE_FOLDER']

    data_root = opt['datadir']
    self.train_file = os.path.join(data_root, opt['CoQA_TRAIN_FILE'])
    self.dev_file = os.path.join(data_root, opt['CoQA_DEV_FILE'])
    self.glove_file = os.path.join(data_root, opt['INIT_WORD_EMBEDDING_FILE'])
    self.glove_dim = 300

    print("The path of dev file: ", self.dev_file)
    print("The path of FEATURE_FOLDER: ", self.spacyDir)

    self.data_prefix = 'coqa-'
    splits = ['train', 'dev']

    def cache_path(split):
        # Location of the preprocessed file for one split.
        return os.path.join(
            self.spacyDir, self.data_prefix + split + '-preprocessed.json')

    # Every expected file is probed; any miss triggers a rebuild of all splits.
    missing = [split for split in splits
               if not os.path.exists(cache_path(split))]
    if not missing:
        return

    print('Previously result not found, creating preprocessed files now...')
    self.glove_vocab = load_glove_vocab(
        self.glove_file, self.glove_dim, to_lower=False)

    if not os.path.isdir(self.spacyDir):
        os.makedirs(self.spacyDir)
        print('Directory created: ' + self.spacyDir)

    for split in splits:
        self.preprocess(split)