def __init__(self, path, load_ext_feats=False):
    """
    Build a Castor sentence-pair dataset from the files found under ``path``.

    The directory is expected to contain ``a.toks`` and ``b.toks`` (one
    space-separated sentence per line), ``id.txt`` (pair ids) and ``sim.txt``
    (labels). Trailing ``.`` and newline characters are stripped from every
    line that is read.

    :param path: directory holding the dataset files listed above.
    :param load_ext_feats: when true, the external features are loaded from
        ``overlap_feats.txt`` in ``path`` with ``np.loadtxt``; otherwise they
        are computed with ``get_pairwise_overlap_features``.
    """
    def _read_tokens(handle):
        # One token list per line; trailing '.' / newline characters are dropped first.
        return [row.rstrip('.\n').split(' ') for row in handle]

    fields = [
        ('id', self.ID_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
        ('label', self.LABEL_FIELD),
        ('aid', self.AID_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
    ]

    with open(os.path.join(path, 'a.toks'), 'r') as toks_a, open(os.path.join(path, 'b.toks'), 'r') as toks_b:
        sent_list_1 = _read_tokens(toks_a)
        sent_list_2 = _read_tokens(toks_b)

    # The document-frequency table is kept on the instance as well as used below.
    self.word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)

    if load_ext_feats:
        overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))
    else:
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, self.word_to_doc_cnt)

    with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
        rows = zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)
        # 'aid' is the 1-based position of the pair in the files.
        examples = [
            Example.fromlist(
                [
                    raw_id.rstrip('.\n'),
                    tokens_1,
                    tokens_2,
                    feats,
                    raw_label.rstrip('.\n'),
                    position,
                    ' '.join(tokens_1),
                    ' '.join(tokens_2),
                ],
                fields,
            )
            for position, (raw_id, tokens_1, tokens_2, feats, raw_label) in enumerate(rows, start=1)
        ]

    super(CastorPairDataset, self).__init__(examples, fields)
def __init__(self, path):
    """
    Create a Semeval pair dataset from a JSON-lines file.

    Every line of ``path`` is decoded with ``json.loads`` and must provide
    the keys ``question``, ``qaquestion``, ``qid`` and ``qarel``. The two
    sentence values are joined with a single space to build the raw-text
    fields (presumably they are token lists; confirm against the data).

    The word-to-document-frequency table returned by
    ``get_pairwise_word_to_doc_freq`` is stored on ``self.word_to_doc_cnt``.

    :param path: path of the JSON-lines file to load.
    :raises KeyError: if a line lacks one of the required keys.
    """
    fields = [
        ('id', self.ID_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
    ]

    ids, labels, sent_list_1, sent_list_2 = [], [], [], []
    # Read and decode the file once; every column is collected in the same
    # pass instead of re-opening and re-parsing the file for ids and labels.
    with open(path) as f:
        for line in f:
            content = json.loads(line)
            sent_list_1.append(content['question'])
            sent_list_2.append(content['qaquestion'])
            ids.append(content['qid'])
            labels.append(content['qarel'])

    # The overlap features are computed over the whole corpus, so they
    # can only be produced after all sentences have been read.
    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
    self.word_to_doc_cnt = word_to_doc_cnt

    examples = [
        Example.fromlist(
            [pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)],
            fields,
        )
        for pair_id, l1, l2, ext_feats, label in zip(ids, sent_list_1, sent_list_2, overlap_feats, labels)
    ]

    super(SemevalDataset, self).__init__(examples, fields)
def __init__(self, path, **kwargs):
    """
    Create a Semeval dataset instance

    Every line of ``path`` is decoded with ``json.loads`` and must provide
    the keys ``question``, ``qaquestion``, ``qid``, ``qaid`` and ``qarel``.
    The two sentence values are joined with a single space to build the
    raw-text fields (presumably they are token lists; confirm against the
    data).

    The ``ext_feats`` value of every example is an empty list.

    :param path: path of the JSON-lines file to load.
    :param kwargs: forwarded unchanged to the parent class constructor.
    :raises KeyError: if a line lacks one of the required keys.
    """
    fields = [
        ('qid', self.QID_FIELD),
        ('qaid', self.QID_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
    ]

    examples = []
    with open(path) as infile:
        for line in infile:
            content = json.loads(line)
            question = content['question']
            qa_question = content['qaquestion']
            # NOTE: this constructor used to compute per-line word/document
            # frequencies and overlap features here and then immediately
            # replace the result with an empty list. The discarded
            # computation has been removed; ext_feats remains [] as before.
            values = [
                content['qid'],
                content['qaid'],
                content['qarel'],
                question,
                qa_question,
                ' '.join(question),
                ' '.join(qa_question),
                [],
            ]
            examples.append(Example.fromlist(values, fields))

    super(Semeval, self).__init__(examples, fields, **kwargs)