Exemple #1
0
    def __init__(self, opt):
        """Configure CoQA preprocessing and build feature files when absent.

        ``opt`` is a mapping of options. Keys read here: ``FEATURE_FOLDER``,
        ``datadir``, ``CoQA_TRAIN_FILE``, ``CoQA_DEV_FILE`` and
        ``INIT_WORD_EMBEDDING_FILE``. If the key ``OFFICIAL`` is present (its
        value is not inspected), ``OFFICIAL_TEST_FILE`` is read as well.

        Official mode: calls ``load_glove_vocab``, unpacks the three values
        returned by ``self.load_data()`` into ``train_vocab``,
        ``train_char_vocab`` and ``train_embedding``, stores the test file
        path, and returns without preprocessing anything.

        Normal mode: if both ``coqa-train-preprocessed.json`` and
        ``coqa-dev-preprocessed.json`` already exist in the feature folder the
        constructor returns immediately (``self.glove_vocab`` is then never
        set). Otherwise the vocabulary is loaded, the feature folder is
        created if needed, and ``self.preprocess`` runs for ``train`` then
        ``dev``.
        """
        print('CoQA Preprocessing')
        self.opt = opt
        self.spacyDir = opt['FEATURE_FOLDER']

        data_root = opt['datadir']
        self.train_file = os.path.join(data_root, opt['CoQA_TRAIN_FILE'])
        self.dev_file = os.path.join(data_root, opt['CoQA_DEV_FILE'])
        self.glove_file = os.path.join(data_root,
                                       opt['INIT_WORD_EMBEDDING_FILE'])
        self.glove_dim = 300

        # Only the presence of the key matters, not its value.
        self.official = 'OFFICIAL' in opt
        self.data_prefix = 'coqa-'

        if self.official:
            self.glove_vocab = load_glove_vocab(
                self.glove_file, self.glove_dim, to_lower=False)
            print('Official prediction initializes...')
            print('Loading training vocab and vocab char...')
            loaded = self.load_data()
            (self.train_vocab,
             self.train_char_vocab,
             self.train_embedding) = loaded
            self.test_file = self.opt['OFFICIAL_TEST_FILE']
            return

        splits = ['train', 'dev']
        cached_paths = [
            os.path.join(self.spacyDir,
                         self.data_prefix + split + '-preprocessed.json')
            for split in splits
        ]
        # Nothing to do when every split has already been preprocessed.
        if all(os.path.exists(path) for path in cached_paths):
            return

        print(
            'Previously result not found, creating preprocessed files now...')
        self.glove_vocab = load_glove_vocab(
            self.glove_file, self.glove_dim, to_lower=False)

        if not os.path.isdir(self.spacyDir):
            os.makedirs(self.spacyDir)
            print('Directory created: ' + self.spacyDir)

        for split in splits:
            self.preprocess(split)
    def __init__(self, opt):
        """Configure preprocessing for the tasks listed in ``opt['Task']``.

        ``opt`` is a mapping of options. Keys always read: ``FEATURE_FOLDER``,
        ``datadir``, ``INIT_WORD_EMBEDDING_FILE``, ``n_gram`` and ``Task``
        (a comma-separated list of dataset labels). Optional keys, checked by
        presence only: ``BuildTestVocabulary``, ``FastText`` (then
        ``fasttext_model`` is read) and ``GLOVE``.

        The labels are stored in ``self.dataset_labels`` with ``'train'``
        moved to the front when it is present, so that it is preprocessed
        before any other label.

        If ``<label>-preprocessed.msgpack`` exists in the feature folder for
        every label, the constructor returns right after the check; in that
        case ``fasttext_model``, ``glove_vocab`` and the ``train_*``
        attributes are not set. Otherwise:

        * when ``'train'`` is not among the labels, the three values returned
          by ``self.load_data()`` are stored as ``train_vocab``,
          ``train_char_vocab`` and ``train_embedding``;
        * the optional FastText path / GloVe vocabulary are set up;
        * the feature folder is created if it does not exist;
        * ``self.preprocess`` is called for each label, or only for
          ``'train'`` when ``BuildTestVocabulary`` is set.
        """
        print('CoQA Preprocessing')
        self.opt = opt
        self.spacyDir = opt['FEATURE_FOLDER']
        self.glove_file = os.path.join(opt['datadir'],
                                       opt['INIT_WORD_EMBEDDING_FILE'])
        self.glove_dim = 300
        # Only the presence of the key matters, not its value.
        self.BuildTestVocabulary = 'BuildTestVocabulary' in opt
        self.n_gram = opt['n_gram']

        dataset_labels = self.opt['Task'].split(',')
        if 'train' in dataset_labels:
            # Move (the first) 'train' to the front so it is processed first.
            dataset_labels.remove('train')
            dataset_labels.insert(0, 'train')
        self.dataset_labels = dataset_labels

        all_exist = all(
            os.path.exists(
                os.path.join(self.spacyDir,
                             dataset_label + '-preprocessed.msgpack'))
            for dataset_label in dataset_labels)
        if all_exist:
            return

        if 'train' not in dataset_labels:
            (self.train_vocab,
             self.train_char_vocab,
             self.train_embedding) = self.load_data()

        # Announce the rebuild once (the original repeated this line inside
        # the GLOVE branch).
        print(
            'Previously result not found, creating preprocessed files now...')
        if 'FastText' in self.opt:
            self.fasttext_model = os.path.join(opt['datadir'],
                                               opt['fasttext_model'])
        if 'GLOVE' in self.opt:
            self.glove_vocab = load_glove_vocab(self.glove_file,
                                                self.glove_dim,
                                                to_lower=False)

        if not os.path.isdir(self.spacyDir):
            os.makedirs(self.spacyDir)
            print('Directory created: ' + self.spacyDir)

        for dataset_label in dataset_labels:
            # With BuildTestVocabulary set, only the training split is
            # preprocessed here.
            if dataset_label == 'train' or not self.BuildTestVocabulary:
                self.preprocess(dataset_label)
Exemple #3
0
    def __init__(self, opt):
        """Prepare CoQA train/dev preprocessing, running it only when needed.

        Keys read from ``opt``: ``FEATURE_FOLDER``, ``datadir``,
        ``CoQA_TRAIN_FILE``, ``CoQA_DEV_FILE`` and
        ``INIT_WORD_EMBEDDING_FILE``.

        When both ``coqa-train-preprocessed.json`` and
        ``coqa-dev-preprocessed.json`` are already present in the feature
        folder, the constructor stops after the check and
        ``self.glove_vocab`` is never set. Otherwise it calls
        ``load_glove_vocab``, creates the feature folder if it is missing,
        and calls ``self.preprocess`` for ``train`` and then ``dev``.
        """
        print('CoQA Preprocessing')
        self.opt = opt
        self.spacyDir = opt['FEATURE_FOLDER']

        base_dir = opt['datadir']
        self.train_file = os.path.join(base_dir, opt['CoQA_TRAIN_FILE'])
        self.dev_file = os.path.join(base_dir, opt['CoQA_DEV_FILE'])
        self.glove_file = os.path.join(base_dir,
                                       opt['INIT_WORD_EMBEDDING_FILE'])
        self.glove_dim = 300
        print("The path of dev file: ", self.dev_file)
        print("The path of FEATURE_FOLDER: ", self.spacyDir)
        self.data_prefix = 'coqa-'

        labels = ['train', 'dev']

        def _output_path(label):
            # Location of the preprocessed file for one dataset label.
            file_name = self.data_prefix + label + '-preprocessed.json'
            return os.path.join(self.spacyDir, file_name)

        something_missing = any(
            not os.path.exists(_output_path(label)) for label in labels)
        if not something_missing:
            return

        print(
            'Previously result not found, creating preprocessed files now...')
        self.glove_vocab = load_glove_vocab(
            self.glove_file, self.glove_dim, to_lower=False)

        if not os.path.isdir(self.spacyDir):
            os.makedirs(self.spacyDir)
            print('Directory created: ' + self.spacyDir)

        for label in labels:
            self.preprocess(label)