def __init__(self, fields, path, extension='.txt', **kwargs):
    """Build a dataset of sliding windows of consecutive lines.

    Scans every ``*<extension>`` file under *path* and, for each run of
    ``len(fields)`` consecutive lines, creates one Example whose i-th
    attribute is the i-th line of the window.

    Args:
        fields: list of (name, Field) pairs; its length fixes the window size.
        path: directory containing the data files.
        extension: file suffix to glob for (default '.txt').
        **kwargs: forwarded to the parent Dataset constructor.
    """
    examples = []
    num_sequences = len(fields)
    data_files = glob.glob(os.path.join(path, '*' + extension))
    for data_file in data_files:
        # Read the file line by line, and create examples from series
        # of num_sequences consecutive lines.
        with io.open(os.path.expanduser(data_file), encoding="utf8") as f:
            line_buffer = []
            for line in f:
                if len(line_buffer) == num_sequences:
                    # Buffer is full: emit a window, then slide it by one.
                    example = Example.fromlist(line_buffer, fields)
                    examples.append(example)
                    line_buffer.pop(0)
                line_buffer.append(line)
            # NOTE(review): the final full buffer at EOF is never emitted as
            # an example — presumably intentional, but worth confirming.
    print('Found %d examples' % (len(examples)))
    super(StoryDataset, self).__init__(examples, fields, **kwargs)

    def foo(x):
        # Sort key: the per-field sentence lengths, in field order.
        # BUG FIX: the original used Python-2-only `xrange`, which raises
        # NameError under Python 3.
        return [len(getattr(x, name)) for name, _ in fields]

    self.sort_key = foo  # lambda x: len(x.field_0)
def __init__(self, path, load_ext_feats=False):
    """Create a Castor dataset involving pairs of texts.

    Reads tokenized sentence pairs from ``a.toks``/``b.toks`` under *path*,
    pairs them with ids (``id.txt``) and labels (``sim.txt``), and attaches
    either freshly computed overlap features or precomputed ones from
    ``overlap_feats.txt`` when *load_ext_feats* is true.
    """
    fields = [('id', self.ID_FIELD),
              ('sentence_1', self.TEXT_FIELD),
              ('sentence_2', self.TEXT_FIELD),
              ('ext_feats', self.EXT_FEATS_FIELD),
              ('label', self.LABEL_FIELD),
              ('aid', self.AID_FIELD),
              ('sentence_1_raw', self.RAW_TEXT_FIELD),
              ('sentence_2_raw', self.RAW_TEXT_FIELD)]

    # Tokenized sentences, one per line; strip trailing dots/newline.
    with open(os.path.join(path, 'a.toks'), 'r') as fa:
        sent_list_1 = [line.rstrip('.\n').split(' ') for line in fa]
    with open(os.path.join(path, 'b.toks'), 'r') as fb:
        sent_list_2 = [line.rstrip('.\n').split(' ') for line in fb]

    self.word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)

    if load_ext_feats:
        overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))
    else:
        overlap_feats = get_pairwise_overlap_features(
            sent_list_1, sent_list_2, self.word_to_doc_cnt)

    examples = []
    with open(os.path.join(path, 'id.txt'), 'r') as id_file, \
            open(os.path.join(path, 'sim.txt'), 'r') as label_file:
        rows = zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)
        # `aid` is a 1-based running index over the pairs.
        for aid, (raw_id, l1, l2, ext_feats, raw_label) in enumerate(rows, start=1):
            example_list = [raw_id.rstrip('.\n'), l1, l2, ext_feats,
                            raw_label.rstrip('.\n'), aid,
                            ' '.join(l1), ' '.join(l2)]
            examples.append(Example.fromlist(example_list, fields))

    super(CastorPairDataset, self).__init__(examples, fields)
def __init__(self, path):
    """ Create a MSRP dataset instance

    Reads tokenized sentence pairs from a.toks/b.toks under *path*, ids
    from id.txt and labels from sim.txt, and builds Examples with
    hand-crafted external features per pair:
    numbers-agreement flags, length difference, and word-overlap ratios.
    """
    fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]
    examples = []
    # Tokenized sentences, one per line; trailing dots/newline stripped.
    with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
        sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
        sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]
    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    self.word_to_doc_cnt = word_to_doc_cnt
    with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
        for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file):
            pair_id = pair_id.rstrip('.\n')
            label = label.rstrip('.\n')
            ext_feats = []
            # Number features: collect regex-captured number tokens from
            # each sentence (NUMBER_PATTERN is declared on the class).
            sent1_nums, sent2_nums = [], []
            match = self.NUMBER_PATTERN.search(' '.join(l1))
            if match:
                for g in match.groups():
                    if g is not None:
                        sent1_nums.append(g)
            match = self.NUMBER_PATTERN.search(' '.join(l2))
            if match:
                for g in match.groups():
                    if g is not None:
                        sent2_nums.append(g)
            sent1_nums = set(sent1_nums)
            sent2_nums = set(sent2_nums)
            # exact: both sentences mention exactly the same numbers;
            # superset: one side's numbers contain the other's.
            exact = int(sent1_nums == sent2_nums)
            superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums))
            # Feature 1: numbers agree, or neither sentence has any.
            ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0)
            ext_feats.append(exact)
            ext_feats.append(superset)
            # Length difference
            ext_feats.append(len(l2) - len(l1))
            # Overlap: shared-token count normalized by each sentence's
            # length (split(' ') never returns an empty list, so no /0).
            overlap = len(set(l1) & set(l2))
            ext_feats.append(overlap / len(l1))
            ext_feats.append(overlap / len(l2))
            example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields)
            examples.append(example)
    super(MSRP, self).__init__(examples, fields)
def __init__(self, path: str, text_field: Field, label_field: Field, **kwargs) -> None:
    """Load a dataset where each line is '<text> <label-char>'.

    The last character of each stripped line is the label and everything
    before the final two characters (label plus its separator) is the text.

    Args:
        path: path to the data file.
        text_field: Field used for the text column.
        label_field: Field used for the label column.
        **kwargs: forwarded to the parent Dataset constructor.
    """
    fields = [('text', text_field), ('label', label_field)]
    examples = []
    with open(path) as f:
        # Iterate the file lazily instead of readlines() — same order,
        # no need to materialize the whole file in memory.
        for line in f:
            line = line.strip()
            if not line:
                # Guard: a trailing blank line would make line[-1] raise.
                continue
            label = line[-1]
            text = line[:-2]
            examples.append(Example.fromlist([text, label], fields))
    super().__init__(examples, fields, **kwargs)
def seg(self, sentences):
    """Segment raw sentences with the trained model.

    Builds unigram/forward-bigram/backward-bigram columns for each
    sentence (padded with <BOS>/<EOS>), decodes BMES tags with
    self.model, and converts the tags back into segmented text.

    Args:
        sentences: iterable of raw (unsegmented) sentence strings.

    Returns:
        The segmented sentences produced by self.BMSE2seg.
    """
    examples = []
    fields = [('unigram', self.unigram_field),
              ('fwd_bigram', self.bigram_field),
              ('back_bigram', self.bigram_field)]
    for sent in sentences:
        columns = [[], [], []]
        chars = ['<BOS>'] + list(sent) + ['<EOS>']
        # For each real character, pair it with its (prev, cur) and
        # (cur, next) bigrams via lockstep zips over shifted views.
        for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:])):
            fwd_bi = ''.join(f_bi)
            back_bi = ''.join(b_bi)
            columns[0].append(c)
            columns[1].append(fwd_bi)
            columns[2].append(back_bi)
        examples.append(Example.fromlist(columns, fields))
    dataset = data.Dataset(examples, fields)
    # Renamed from `iter`, which shadowed the builtin of the same name.
    # Keep input order: no shuffling/sorting so decoded output aligns
    # with `sentences`.
    batch_iter = data.BucketIterator(dataset, batch_size=64, train=False,
                                     shuffle=False, sort=False, device=device)
    decoded = self.model.decode(batch_iter)
    segmented_sentence = self.BMSE2seg(sentences, decoded)
    return segmented_sentence
def __init__(self, path):
    """ Create a Castor dataset involving pairs of texts

    Reads one JSON object per line from *path* (keys: 'qid', 'question',
    'qaquestion', 'qarel'), computes pairwise overlap features, and
    builds one Example per question pair.
    """
    fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD),
              ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD),
              ('label', self.LABEL_FIELD),
              ('sentence_1_raw', self.RAW_TEXT_FIELD),
              ('sentence_2_raw', self.RAW_TEXT_FIELD)]
    examples = []
    ids, labels, sent_list_1, sent_list_2 = [], [], [], []
    # Single pass over the file: the original opened and JSON-parsed it
    # twice to collect ids/labels separately from the sentences.
    with open(path) as f:
        for line in f:
            content = json.loads(line)
            ids.append(content['qid'])
            labels.append(content['qarel'])
            sent_list_1.append(content['question'])
            sent_list_2.append(content['qaquestion'])
    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2,
                                                  word_to_doc_cnt)
    self.word_to_doc_cnt = word_to_doc_cnt
    for pair_id, l1, l2, ext_feats, label in zip(ids, sent_list_1, sent_list_2,
                                                 overlap_feats, labels):
        example = Example.fromlist([
            pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)
        ], fields)
        examples.append(example)
    super(SemevalDataset, self).__init__(examples, fields)
'ENC_EMB_DIM': 256, 'DEC_EMB_DIM': 256, 'ENC_HID_DIM': 512, 'DEC_HID_DIM': 512, 'ENC_DROPOUT': 0.5, 'DEC_DROPOUT': 0.5 } network = create_seq2seq(network_params, device) network.load_state_dict(torch.load('weights/tut1-model.pt')) # sentence = input('Enter sentence in german: ') sentence = 'Ein Hund rennt im Schnee.' while sentence is not 'exit': # Convert custom sentence to tensor example = Example.fromlist([sentence], [('de', src_field)]) batch = [example.de] idx_input = src_field.process(batch).to(device) # Translate this tensor output_probs = network(idx_input, None, 0) idx_output = output_probs.squeeze(1).argmax(axis=1) # TODO is actually probs, not idx # Convert back output_sentence = ' '.join([trg_field.vocab.itos[idx] for idx in idx_output]) print(output_sentence) sentence = input('Enter sentence in german: ')
def fromTSV(data, fields):
    """Build an Example from one tab-separated line using *fields*."""
    columns = data.split('\t')
    return Example.fromlist(columns, fields)