Example #1
    def __init__(self, fields, path, extension='.txt', **kwargs):
        examples = []

        num_sequences = len(fields)

        data_files = glob.glob(os.path.join(path, '*' + extension))
        for data_file in data_files:
            # Read the file line by line, and create examples from series
            # of num_sequences consecutive lines
            with io.open(os.path.expanduser(data_file), encoding="utf8") as f:
                line_buffer = []
                for line in f:
                    line_buffer.append(line)
                    # Emit an example for every window of num_sequences
                    # consecutive lines, then slide the window by one sentence
                    if len(line_buffer) == num_sequences:
                        examples.append(Example.fromlist(line_buffer, fields))
                        line_buffer.pop(0)

        print('Found %d examples' % (len(examples)))
        super(StoryDataset, self).__init__(examples, fields, **kwargs)

        def sort_key_fn(x):
            # Sort by the length of each constituent sequence
            return [len(getattr(x, name)) for name, _ in fields]

        self.sort_key = sort_key_fn  # lambda x: len(x.field_0)
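
A minimal usage sketch (not from the original source), assuming the legacy
torchtext API (torchtext.data, versions <= 0.8) and examples built from
windows of three consecutive lines; the field names are illustrative:

    from torchtext.data import Field

    text_field = Field(tokenize=str.split)
    fields = [('field_0', text_field), ('field_1', text_field), ('field_2', text_field)]
    dataset = StoryDataset(fields, path='stories/', extension='.txt')
    text_field.build_vocab(dataset)
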
Example #2
    def __init__(self, path, load_ext_feats=False):
        """
        Create a Castor dataset involving pairs of texts
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD),
                  ('aid', self.AID_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        if not load_ext_feats:
            overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
        else:
            overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for i, (pair_id, l1, l2, ext_feats, label) in enumerate(zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                example_list = [pair_id, l1, l2, ext_feats, label, i + 1, ' '.join(l1), ' '.join(l2)]
                example = Example.fromlist(example_list, fields)
                examples.append(example)

        super(CastorPairDataset, self).__init__(examples, fields)
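
For reference, the directory layout this constructor expects, inferred from
the open() calls above (files are line-aligned, one record per line):

    path/
        a.toks             tokenized sentence_1
        b.toks             tokenized sentence_2
        id.txt             pair ids
        sim.txt            labels
        overlap_feats.txt  read only when load_ext_feats=True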
Example #3
    def __init__(self, path):
        """
        Create a MSRP dataset instance
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD),
                ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []
        with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
            sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
            sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
            for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file):
                pair_id = pair_id.rstrip('.\n')
                label = label.rstrip('.\n')
                ext_feats = []

                # Number features
                sent1_nums, sent2_nums = [], []
                match = self.NUMBER_PATTERN.search(' '.join(l1))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent1_nums.append(g)

                match = self.NUMBER_PATTERN.search(' '.join(l2))
                if match:
                    for g in match.groups():
                        if g is not None:
                            sent2_nums.append(g)

                sent1_nums = set(sent1_nums)
                sent2_nums = set(sent2_nums)
                exact = int(sent1_nums == sent2_nums)
                superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums))
                ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0)
                ext_feats.append(exact)
                ext_feats.append(superset)

                # Length difference
                ext_feats.append(len(l2) - len(l1))

                # Overlap
                overlap = len(set(l1) & set(l2))
                ext_feats.append(overlap / len(l1))
                ext_feats.append(overlap / len(l2))

                example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields)
                examples.append(example)

        super(MSRP, self).__init__(examples, fields)
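
The hand-crafted features can be checked in isolation. A minimal sketch, with
an assumed NUMBER_PATTERN (the original is a class attribute not shown here)
and findall in place of the search()/groups() loop:

    import re

    NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')  # assumed pattern

    l1 = 'the deal was worth 3 million'.split()
    l2 = 'the deal was worth 3 million , or 5 percent'.split()

    sent1_nums = set(NUMBER_PATTERN.findall(' '.join(l1)))  # {'3'}
    sent2_nums = set(NUMBER_PATTERN.findall(' '.join(l2)))  # {'3', '5'}
    exact = int(sent1_nums == sent2_nums)                                 # 0
    superset = int(sent1_nums >= sent2_nums or sent2_nums >= sent1_nums)  # 1

    overlap = len(set(l1) & set(l2))             # 6 shared token types
    print(overlap / len(l1), overlap / len(l2))  # 1.0 0.6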
Example #4
    def __init__(self, path: str, text_field: Field, label_field: Field, **kwargs) -> None:
        fields = [('text', text_field), ('label', label_field)]
        examples = []

        with open(path) as f:
            for line in f:
                line = line.strip()
                # Last character is the label; drop it and the separator before it
                label = line[-1]
                text = line[:-2]
                examples.append(Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)
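
The slicing assumes every line ends in a one-character label preceded by a
one-character separator; a minimal illustration (the tab is an assumption):

    line = 'the movie was great\t1'
    label = line[-1]   # '1'
    text = line[:-2]   # 'the movie was great'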
Example #5
    def seg(self, sentences):
        examples = []
        fields = [('unigram', self.unigram_field),
                  ('fwd_bigram', self.bigram_field),
                  ('back_bigram', self.bigram_field)]
        for sent in sentences:
            columns = [[], [], []]
            chars = ['<BOS>'] + list(sent) + ['<EOS>']
            # For each character, also record its forward and backward bigrams
            for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:])):
                columns[0].append(c)
                columns[1].append(''.join(f_bi))
                columns[2].append(''.join(b_bi))
            examples.append(Example.fromlist(columns, fields))

        dataset = data.Dataset(examples, fields)
        batch_iter = data.BucketIterator(dataset, batch_size=64, train=False,
                                         shuffle=False, sort=False, device=device)

        decoded = self.model.decode(batch_iter)
        segmented_sentence = self.BMSE2seg(sentences, decoded)
        return segmented_sentence
Example #6
    def __init__(self, path):
        """
        Create a Castor dataset involving pairs of texts
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD),
                  ('label', self.LABEL_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        examples = []

        ids, labels, sent_list_1, sent_list_2 = [], [], [], []
        with open(path) as f:
            for line in f:
                content = json.loads(line)
                sent_list_1.append(content['question'])
                sent_list_2.append(content['qaquestion'])

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(
            sent_list_1, sent_list_2)
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2,
                                                      word_to_doc_cnt)
        self.word_to_doc_cnt = word_to_doc_cnt

        with open(path) as f:
            for line in f:
                content = json.loads(line)
                ids.append(content['qid'])
                labels.append(content['qarel'])

        for pair_id, l1, l2, ext_feats, label in zip(ids, sent_list_1,
                                                     sent_list_2,
                                                     overlap_feats, labels):
            example = Example.fromlist([
                pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)
            ], fields)
            examples.append(example)

        super(SemevalDataset, self).__init__(examples, fields)
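
The input file is parsed twice only to keep the sentence lists and the
id/label lists separate; a single pass over the same JSON keys (a sketch,
not the original code) collects everything at once:

    import json

    ids, labels, sent_list_1, sent_list_2 = [], [], [], []
    with open(path) as f:
        for line in f:
            content = json.loads(line)
            ids.append(content['qid'])
            labels.append(content['qarel'])
            sent_list_1.append(content['question'])
            sent_list_2.append(content['qaquestion'])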
Example #7
    network_params = {
        'ENC_EMB_DIM': 256,
        'DEC_EMB_DIM': 256,
        'ENC_HID_DIM': 512,
        'DEC_HID_DIM': 512,
        'ENC_DROPOUT': 0.5,
        'DEC_DROPOUT': 0.5
    }
    network = create_seq2seq(network_params, device)
    network.load_state_dict(torch.load('weights/tut1-model.pt'))

    # sentence = input('Enter sentence in german: ')
    sentence = 'Ein Hund rennt im Schnee.'
    while sentence != 'exit':
        # Convert custom sentence to tensor

        example = Example.fromlist([sentence], [('de', src_field)])
        batch = [example.de]
        idx_input = src_field.process(batch).to(device)

        # Translate this tensor
        output_probs = network(idx_input, None, 0)
        idx_output = output_probs.squeeze(1).argmax(axis=1)
        # TODO is actually probs, not idx

        # Convert back
        output_sentence = ' '.join([trg_field.vocab.itos[idx] for idx in idx_output])

        print(output_sentence)
        sentence = input('Enter sentence in german: ')

def fromTSV(data, fields):
    return Example.fromlist(data.split('\t'), fields)
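
A minimal call of this helper (the fields are illustrative, using the legacy
torchtext API, versions <= 0.8):

    from torchtext.data import Field

    text_field = Field(tokenize=str.split)
    label_field = Field(sequential=False)
    fields = [('text', text_field), ('label', label_field)]
    ex = fromTSV('ein Hund rennt\tpositive', fields)
    print(ex.text, ex.label)  # ['ein', 'Hund', 'rennt'] positive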