Example #1
def test_char_tokenizer(input_path, output_path, longest_token=None):
    tokenizer = Tokenizer(
        nb_words=None, bos_str='-NULL-', eos_str=None, mode='chars', longest_token=longest_token,
    )
    tokenizer.fit_one(smart_ropen(input_path))
    seqs = tokenizer.to_sequences(smart_ropen(input_path), dtype='int32')
    with open(output_path, 'w') as fo:
        tokenizer.write_as_text(seqs, output_stream=fo)
    return tokenizer
Example #2
def test():
    path1 = '/home/wferrei1/github/dgm4nlp/data/en-fr/trial.en-fr.en'
    path2 = '/home/wferrei1/github/dgm4nlp/data/en-fr/trial.en-fr.fr'

    print('Fitting tokenizer')
    tok1 = nlputils.Tokenizer(nb_words=None,
                              bos_str='-NULL-',
                              eos_str=None,
                              mode='chars',
                              longest_token=10)
    tok1.fit_one(smart_ropen(path1))
    print(tok1.vocab_size())
    tok2 = nlputils.Tokenizer(nb_words=None,
                              bos_str=None,
                              eos_str=None,
                              mode='chars',
                              longest_token=12)
    tok2.fit_one(smart_ropen(path2))
    print(tok2.vocab_size())

    print('Memory mapping data')
    text = Multitext3D(
        [path1, path2],
        [tok1, tok2],
        shortest=[2, 2],
        longest=[20, 20],
        trim=[True, True],
        batch_dtype='int32',
        mask_dtype='bool',
    )

    print(text.nb_samples(), text.longest_sequence(0),
          text.deepest_sequence(0))
    print(text.nb_samples(), text.longest_sequence(1),
          text.deepest_sequence(1))
    import sys
    for i, (xm, ym) in enumerate(
            text.batch_iterator(1,
                                dynamic_sequence_length=True,
                                dynamic_sequence_depth=True)):
        x, m1 = xm
        x, m1 = x[0], m1[0]
        y, m2 = ym
        y, m2 = y[0], m2[0]
        print(x)
        print(m1)
        print(y)
        print(m2)
        print()

        tok1.write_as_text([x], sys.stdout)
        tok2.write_as_text([y], sys.stdout)
        if i == 1:
            break
Example #3
def construct_mmap(input_path, output_path, tokenizer, selection, nb_tokens, dtype):
    """
    Construct memory map for selected sentences in a corpus.

    :param input_path: path to text
    :param output_path: path to memory map file
    :param tokenizer: tokenizer for text
    :param selection: array of binary selectors
    :param nb_tokens: total number of tokens in the selected corpus
    :param dtype: data type for memmap
    :return: np.array with shape (nb_samples,) where array[i] is the length of the ith sequence
    """

    # construct memory mapped array
    mmap = np.memmap(output_path, dtype=dtype, mode='w+', shape=nb_tokens)

    # prepare for populating memmap
    offset = 0
    sample_length = []

    # populate memory map
    for sid, seq in enumerate(tokenizer.to_sequences_iterator(smart_ropen(input_path))):
        if not selection[sid]:  # skip sentences that do not comply with length constraints
            continue
        # here we have a valid sequence, thus we memory map it
        mmap[offset:offset + seq.shape[0]] = seq
        offset += seq.shape[0]
        sample_length.append(seq.shape[0])

    del mmap

    return np.array(sample_length, dtype='int64')
Example #4
def bound_length(input_paths, tokenizers, shortest, longest):
    """
    Return an np.array flagging, for each parallel segment, whether all of its parts comply with the
    length constraints, and count the number of tokens in each stream (considering valid sequences only).

    :param input_paths: paths (list/tuple) to each part of the parallel collection
    :param tokenizers: list/tuple of tokenizers
    :param shortest: shortest valid sequence for each part of the parallel collection
    :param longest: longest valid sequence for each part of the parallel collection
    :return: selection (nb_samples,) and counts (nb_streams,)
    """

    # get an iterator for each stream
    nb_streams = len(input_paths)
    iterators = [tokenizers[i].to_sequences_iterator(smart_ropen(input_paths[i])) for i in range(nb_streams)]

    # length checks
    selection = []
    nb_tokens = [0] * nb_streams
    for seqs in zip(*iterators):  # get a sequence from each iterator
        # check if every sequence complies with its respective length bounds
        if not all(lower <= seq.shape[0] <= upper for lower, upper, seq in zip(shortest, longest, seqs)):
            selection.append(False)  # excluded
        else:
            selection.append(True)  # included
            # increase token count
            for i, seq in enumerate(seqs):
                nb_tokens[i] += seq.shape[0]

    return np.array(selection, dtype=bool), np.array(nb_tokens, dtype='int64')
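The two helpers above are meant to be used together: the selection flags and per-stream token counts produced by bound_length size and populate the memory maps built by construct_mmap. Below is a minimal sketch of that flow; the file names, length bounds and the import path are illustrative assumptions, not part of the original examples.

from dgm4nlp.nlputils import Tokenizer, smart_ropen  # assumed module path

# fit one tokenizer per stream (paths and settings are hypothetical)
tok_x = Tokenizer(nb_words=None, bos_str='-NULL-', eos_str=None)
tok_y = Tokenizer(nb_words=None, bos_str=None, eos_str=None)
tok_x.fit_one(smart_ropen('trial.en'))
tok_y.fit_one(smart_ropen('trial.fr'))

# flag the parallel segments that satisfy the length bounds and count tokens per stream
selection, nb_tokens = bound_length(['trial.en', 'trial.fr'],
                                    [tok_x, tok_y],
                                    shortest=[1, 1],
                                    longest=[50, 50])

# memory-map only the selected sentences of each stream
lengths_x = construct_mmap('trial.en', 'trial.en.mmap', tok_x,
                           selection, nb_tokens[0], dtype='int32')
lengths_y = construct_mmap('trial.fr', 'trial.fr.mmap', tok_y,
                           selection, nb_tokens[1], dtype='int32')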
Example #5
def _load_cand(cand_file):
    candidates = dict()
    with smart_ropen(cand_file) as cf:
        for line in cf:
            (key, cand) = line.strip().split('::')
            #(word, _) = key.split('.')
            cols = cand.split(';')
            candidates[key] = cols
    return candidates
Example #6
def _load_test(test_file):
    keys = []
    sids = []
    wids = []
    test_sent = dict()

    with smart_ropen(test_file) as testf:
        for line in testf:
            (key, sid, wid, sent) = line.strip().split('\t')
            keys.append(key)
            sids.append(int(sid))
            wids.append(int(wid))
            test_sent[int(sid)] = sent.split(' ')
    return keys, sids, wids, test_sent
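For reference, the loaders above expect plain-text inputs of the following shape: _load_cand reads one 'key::candidate1;candidate2;...' entry per line and returns a dict mapping each key to its candidate list, while _load_test reads tab-separated 'key, sentence id, word id, tokenized sentence' records. The lines below are made up purely to illustrate the format implied by the parsing code.

# Hypothetical input lines matching the parsing code above.
cand_line = 'bright.a::intelligent;clever;luminous'    # -> candidates['bright.a'] == ['intelligent', 'clever', 'luminous']
test_line = 'bright.a\t1\t3\tthe sky was bright today'  # -> key='bright.a', sid=1, wid=3, test_sent[1] == ['the', 'sky', 'was', 'bright', 'today']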
Example #7
def read_naacl_distributions(naacl_path, x_lengths, y_lengths):
    """
    Read NAACL-formatted alignment files.

    :param path: path to file
    :return: a list of pairs [sure set, possible set]
        each entry in the set maps an input position to an output position
        sentences start from 1 and a NULL token is indicated with position 0
    """

    with smart_ropen(naacl_path) as fi:
        data = []
        current = None
        for i, line in enumerate(fi.readlines()):
            fields = line.split()
            if not fields:
                continue
            prob = 1.0  # by default we assume prob 1.0
            if len(fields) < 3:
                raise ValueError('Missing required fields in line %d: %s' % (i, line.strip()))
            snt_id, x, y = int(fields[0]), int(fields[1]), int(fields[2])
            if len(fields) == 5:
                prob = float(fields[4])
            if len(fields) == 4:
                if fields[3] not in {'S', 'P'}:
                    prob = float(fields[3])
            if current is None or snt_id != current:
                data.append(defaultdict(list))
                current = snt_id
            # make y 0-based
            # x is already 0-based (where 0 points to NULL)
            data[-1][y - 1].append((x, prob))

        distributions = []
        for ainfo, x_len, y_len in zip(data, x_lengths, y_lengths):
            dist = np.zeros((y_len, x_len), dtype=float)
            for y in range(y_len):
                if y not in ainfo:
                    dist[y, 0] = 1.  # align it to NULL
                else:
                    for x, prob in ainfo[y]:
                        dist[y, x] += prob
            normalisers = dist.sum(-1)
            #normalisers = np.where(np.not_equal(normalisers, 0), normalisers, np.ones(y_len))
            dist /= np.expand_dims(normalisers, 1)
            distributions.append(dist)
            #print(y_len, x_len, dist)

    return distributions
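This reader (and read_naacl_alignments further below) expects whitespace-separated NAACL lines of the form 'sentence-id x-position y-position [S|P] [probability]'. The following worked illustration, with hypothetical sentence lengths, traces a few such lines through read_naacl_distributions.

naacl_lines = [
    '1 1 1 S',      # sentence 1: x=1 aligned to y=1, Sure link, prob defaults to 1.0
    '1 2 2 P 0.5',  # sentence 1: x=2 aligned to y=2 with prob 0.5
    '1 0 2 P 0.5',  # sentence 1: y=2 aligned to NULL (x=0) with prob 0.5
]
# With x_lengths=[3] (NULL plus two tokens) and y_lengths=[2], read_naacl_distributions
# would produce a single (2, 3) array:
#   [[0. , 1. , 0. ],
#    [0.5, 0. , 0.5]]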
Example #8
def prepare_training(
        x_path,
        # data pre-processing
        nb_words=None,
        shortest_sequence=None,
        longest_sequence=None,
        # padding
        bos_str=None,
        eos_str=None,
        # normalisation
        lowercase=False,
        name='training') -> [Tokenizer, Text]:
    """
    Construct vocabularies/tokenizers and memory-map the training data.

    :param x_path:
    :param y_path:
    :param nb_words:
    :param shortest_sequence:
    :param longest_sequence:
    :param bos_str:
    :param eos_str:
    :param name:
    :return:
    """
    # Prepare vocabularies
    logging.info('Fitting vocabulary')
    tk = Tokenizer(nb_words=nb_words,
                   bos_str=bos_str,
                   eos_str=eos_str,
                   lowercase=lowercase)
    tk.fit_one(smart_ropen(x_path))
    logging.info('  vocab-size=%d', tk.vocab_size())

    # Prepare training corpus
    logging.info('Memory mapping training data')
    training = Text(x_path,
                    tokenizer=tk,
                    shortest=shortest_sequence,
                    longest=longest_sequence,
                    trim=True,
                    mask_dtype='float32',
                    name=name)
    # in case the longest sequence was shorter than we thought
    longest_sequence = training.longest_sequence()
    logging.info(' training-samples=%d longest=%s tokens=%s',
                 training.nb_samples(), longest_sequence, training.nb_tokens())

    return tk, training
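A minimal usage sketch of prepare_training follows; the path, vocabulary size and length bounds are illustrative, and the batch loop mirrors the Text.batch_iterator call shown in Example #9.

tk, training = prepare_training('trial.en',
                                nb_words=30000,
                                shortest_sequence=1,
                                longest_sequence=50,
                                bos_str='-NULL-',
                                eos_str=None,
                                lowercase=True,
                                name='trial')
# iterate over memory-mapped batches of token ids and their masks
for batch, mask in training.batch_iterator(100, shorter_batch='trim'):
    pass  # e.g. feed (batch, mask) to a model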
Example #9
def test_text(input_path, output_path):
    """
    Test the reconstruction of a corpus by passing it through the Tokenizer/Text pipeline.
        Example:
            text.test_text('data/en-fr/test.en-fr.en', 'data/en-fr/test.en-fr.en-mono')

    :param input_path: a text file
    :param output_path: where to save its reconstruction
    """
    tk = Tokenizer()
    tk.fit_one(smart_ropen(input_path))
    text = Text(input_path, tk)
    with open(output_path, 'w') as fo:
        for b, m in text.batch_iterator(100, shorter_batch='trim'):
            tk.write_as_text(b, fo)
    return text
Example #10
def read_naacl_alignments(path, reverse=False):
    """
    Read NAACL-formatted alignment files.

    :param path: path to file
    :param reverse: reverse links (that is, if the input is x-y, the output becomes y-x)
    :return: a list of pairs [sure set, possible set]
        each entry in the set maps an input position to an output position
        sentences start from 1 and a NULL token is indicated with position 0
    """
    with smart_ropen(path) as fi:
        ainfo = {}
        for i, line in enumerate(fi.readlines()):
            fields = line.split()
            if not fields:
                continue
            sure = True  # by default we assume Sure links
            prob = 1.0  # by default we assume prob 1.0
            if len(fields) < 3:
                raise ValueError('Missing required fields in line %d: %s' % (i, line.strip()))
            snt_id, x, y = int(fields[0]), int(fields[1]), int(fields[2])
            if x == 0 or y == 0:  # we ignore NULL links
                continue
            if reverse:
                x, y = y, x
            if len(fields) == 5:
                sure = fields[3] == 'S'
                prob = float(fields[4])
            if len(fields) == 4:
                if fields[3] in {'S', 'P'}:
                    sure = fields[3] == 'S'
                else:
                    prob = float(fields[3])
            snt_info = ainfo.get(snt_id, None)
            if snt_info is None:
                snt_info = [set(), set()]  # S and P sets
                ainfo[snt_id] = snt_info
            if sure:  # Note that S links are also P links: http://dl.acm.org/citation.cfm?id=992810
                snt_info[0].add((x, y))
                snt_info[1].add((x, y))
            else:
                snt_info[1].add((x, y))
    return tuple(v for k, v in sorted(ainfo.items(), key=lambda pair: pair[0]))
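Following the parsing logic above, a couple of hypothetical lines illustrate the returned structure (note that Sure links are also added to the Possible set):

# '1 1 1 S'  -> sentence 1, Sure link (1, 1)
# '1 2 3 P'  -> sentence 1, Possible link (2, 3)
# read_naacl_alignments would then return one [sure, possible] pair:
expected = ([{(1, 1)}, {(1, 1), (2, 3)}],)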
Example #11
def prepare_training3d(
        x_path,
        y_path,
        # data pre-processing
        nb_chars=[None, None],
        longest_word=[None, None],
        shortest_sequence=[None, None],
        longest_sequence=[None, None],
        # padding
        bos_str=[None, None],
        eos_str=[None, None],
        # normalisation
        lowercase=False,
        batch_dtype='int32',
        mask_dtype='bool',
        name='training') -> [list, Multitext3D]:
    """
    Construct vocabularies/tokenizers and memory-map the training data.

    :param x_path:
    :param y_path:
    :param nb_words:
    :param shortest_sequence:
    :param longest_sequence:
    :param bos_str:
    :param eos_str:
    :param name:
    :return:
    """
    training_paths = [x_path, y_path]
    # Prepare vocabularies
    logging.info('Fitting (char) vocabularies')
    tks = []
    for i, (path, vs, bos, eos, longword) in enumerate(
            zip(training_paths, nb_chars, bos_str, eos_str, longest_word)):
        logging.info(' stream=%d', i)
        # tokenizer with a bounded vocabulary
        tks.append(
            Tokenizer(nb_words=vs,
                      bos_str=bos,
                      eos_str=eos,
                      lowercase=lowercase,
                      mode='chars',
                      longest_token=longword))
        tks[-1].fit_one(smart_ropen(path))
        logging.info('  (char) vocab-size=%d', tks[-1].vocab_size())

    # Prepare training corpus
    logging.info('Memory mapping (char) training data')
    training = Multitext3D(training_paths,
                           tokenizers=tks,
                           shortest=shortest_sequence,
                           longest=longest_sequence,
                           trim=[True, True],
                           batch_dtype=batch_dtype,
                           mask_dtype=mask_dtype,
                           name=name)
    # in case the longest sequence was shorter than we thought
    longest_sequence = [
        training.longest_sequence(0),
        training.longest_sequence(1)
    ]
    deepest_sequence = [
        training.deepest_sequence(0),
        training.deepest_sequence(1)
    ]
    logging.info(
        ' training-samples=%d longest=%s deepest=%s tokens=%s',
        training.nb_samples(), longest_sequence, deepest_sequence,
        [training.nb_tokens(0), training.nb_tokens(1)])

    return tks, training
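Finally, a minimal usage sketch of prepare_training3d, mirroring the settings of the test in Example #2; the paths and bounds are illustrative, and the batch loop follows the Multitext3D.batch_iterator call shown there.

tks, training = prepare_training3d('trial.en', 'trial.fr',
                                   nb_chars=[None, None],
                                   longest_word=[10, 12],
                                   shortest_sequence=[2, 2],
                                   longest_sequence=[20, 20],
                                   bos_str=['-NULL-', None],
                                   eos_str=[None, None],
                                   name='trial')
# character-level batches come as one (sequence, mask) pair per stream
for (x, mx), (y, my) in training.batch_iterator(32,
                                                dynamic_sequence_length=True,
                                                dynamic_sequence_depth=True):
    pass  # e.g. feed the padded char-id tensors and their masks to a model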