def streaming_create_tsv_reader(func,
                                line,
                                polymath,
                                seqs,
                                num_workers,
                                is_test=False,
                                misc=None):
    """Build the input map of ``func`` for a single TSV record.

    ``line`` is parsed with ``tsv2ctf.tsv_iter`` and the result is packed as
    a batch holding exactly one sequence.

    Args:
        func: Object whose arguments are resolved with ``argument_by_name``
            under the names 'cgw', 'qgw', 'cnw', 'qnw', 'cc', 'qc', 'ab'
            and 'ae'.
        line: One TSV record, passed unchanged to ``tsv2ctf.tsv_iter``.
        polymath: Supplies ``vocab``, ``chars``, ``wg_dim``, ``wn_dim`` and
            ``word_size``.
        seqs: Not used by this function.
        num_workers: Not used by this function.
        is_test: Forwarded to ``tsv2ctf.tsv_iter``.
        misc: Forwarded to ``tsv2ctf.tsv_iter``.

    Returns:
        A dict mapping each resolved argument of ``func`` to its data for
        this one record.
    """
    # tsv_iter returns nine values; the first three are not needed here.
    _, _, _, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
        line, polymath.vocab, polymath.chars, polymath.wg_dim, is_test, misc)

    wg_dim = polymath.wg_dim
    skip = C.Value.ONE_HOT_SKIP

    def one_hot_pair(word_ids):
        # Ids below wg_dim are one-hot encoded over wg_dim classes; the
        # remaining ids are shifted down by wg_dim and encoded over wn_dim
        # classes. Each encoding skips the ids owned by the other one.
        below = C.Value.one_hot(
            [[skip if i >= wg_dim else i for i in word_ids]], wg_dim)
        shifted = C.Value.one_hot(
            [[skip if i < wg_dim else i - wg_dim for i in word_ids]],
            polymath.wn_dim)
        return below, shifted

    def padded_chars(words):
        # Right-pad every word's character ids with zeros up to word_size
        # (longer words are left as they are) and wrap each padded word in
        # a singleton list before converting to float32.
        word_size = polymath.word_size
        return [
            np.asarray(
                [[cc + [0] * max(0, word_size - len(cc))] for cc in words],
                dtype=np.float32)
        ]

    context_g_words, context_ng_words = one_hot_pair(cwids)
    query_g_words, query_ng_words = one_hot_pair(qwids)
    context_chars = padded_chars(ccids)
    query_chars = padded_chars(qcids)
    answer_begin = [np.asarray(baidx, dtype=np.float32)]
    answer_end = [np.asarray(eaidx, dtype=np.float32)]

    return {
        argument_by_name(func, 'cgw'): context_g_words,
        argument_by_name(func, 'qgw'): query_g_words,
        argument_by_name(func, 'cnw'): context_ng_words,
        argument_by_name(func, 'qnw'): query_ng_words,
        argument_by_name(func, 'cc'): context_chars,
        argument_by_name(func, 'qc'): query_chars,
        argument_by_name(func, 'ab'): answer_begin,
        argument_by_name(func, 'ae'): answer_end
    }
def create_tsv_reader(func,
                      tsv_file,
                      polymath,
                      seqs,
                      num_workers,
                      is_test=False,
                      misc=None):
    """Yield minibatch input maps for ``func`` read from a TSV file.

    The file is read line by line; every line is parsed with
    ``tsv2ctf.tsv_iter`` and up to ``seqs`` parsed lines are grouped into one
    batch. After the end of the file is reached, empty dicts are yielded
    until the total number of yielded batches is a multiple of
    ``num_workers`` (needed for distributed training).

    Args:
        func: Object whose arguments are resolved with ``argument_by_name``
            under the names 'cgw', 'qgw', 'cnw', 'qnw', 'cc', 'qc', 'ab'
            and 'ae'.
        tsv_file: Path of the UTF-8 encoded TSV file to read.
        polymath: Supplies ``vocab``, ``chars``, ``wg_dim``, ``wn_dim`` and
            ``word_size``.
        seqs: Maximum number of lines per batch. With ``seqs <= 0`` no line
            is ever read, so the generator yields empty dicts forever.
        num_workers: Batch-count modulus; must be non-zero.
        is_test: Forwarded to ``tsv2ctf.tsv_iter``.
        misc: Optional dict. When not None, the text before the first tab of
            every line is appended to ``misc['uid']``; it is also forwarded
            to ``tsv2ctf.tsv_iter``.

    Yields:
        A dict mapping each resolved argument of ``func`` to the batch data,
        or an empty dict when no line was available for the batch.
    """

    def one_hot_pair(id_seqs):
        # Ids below wg_dim are one-hot encoded over wg_dim classes; the
        # remaining ids are shifted down by wg_dim and encoded over wn_dim
        # classes. Each encoding skips the ids owned by the other one.
        wg_dim = polymath.wg_dim
        skip = C.Value.ONE_HOT_SKIP
        below = C.Value.one_hot(
            [[skip if i >= wg_dim else i for i in ids] for ids in id_seqs],
            wg_dim)
        shifted = C.Value.one_hot(
            [[skip if i < wg_dim else i - wg_dim for i in ids]
             for ids in id_seqs], polymath.wn_dim)
        return below, shifted

    def padded_chars(char_seqs):
        # Right-pad every word's character ids with zeros up to word_size
        # (longer words are left as they are) and wrap each padded word in
        # a singleton list before converting to float32.
        word_size = polymath.word_size
        return [
            np.asarray(
                [[cc + [0] * max(0, word_size - len(cc))] for cc in words],
                dtype=np.float32) for words in char_seqs
        ]

    with open(tsv_file, 'r', encoding='utf-8') as f:
        eof = False
        batch_count = 0
        # Keep going past EOF until the batch count divides evenly among
        # the workers.
        while not (eof and batch_count % num_workers == 0):
            batch_count += 1
            samples = []

            while not eof and len(samples) < seqs:
                line = f.readline()
                if not line:
                    eof = True
                    break

                if misc is not None:
                    # The uid is everything before the first tab.
                    misc['uid'].append(line.partition('\t')[0])

                # The first three values returned by tsv_iter are not
                # needed here.
                _, _, _, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
                    line, polymath.vocab, polymath.chars, polymath.wg_dim,
                    is_test, misc)
                samples.append((cwids, qwids, baidx, eaidx, ccids, qcids))

            if not samples:
                # need to generate empty batch for distributed training
                yield {}
                continue

            # Transpose the per-line tuples into per-field sequences.
            all_cwids, all_qwids, all_baidx, all_eaidx, all_ccids, all_qcids = zip(
                *samples)

            context_g_words, context_ng_words = one_hot_pair(all_cwids)
            query_g_words, query_ng_words = one_hot_pair(all_qwids)
            context_chars = padded_chars(all_ccids)
            query_chars = padded_chars(all_qcids)
            answer_begin = [
                np.asarray(ab, dtype=np.float32) for ab in all_baidx
            ]
            answer_end = [
                np.asarray(ae, dtype=np.float32) for ae in all_eaidx
            ]

            yield {
                argument_by_name(func, 'cgw'): context_g_words,
                argument_by_name(func, 'qgw'): query_g_words,
                argument_by_name(func, 'cnw'): context_ng_words,
                argument_by_name(func, 'qnw'): query_ng_words,
                argument_by_name(func, 'cc'): context_chars,
                argument_by_name(func, 'qc'): query_chars,
                argument_by_name(func, 'ab'): answer_begin,
                argument_by_name(func, 'ae'): answer_end
            }
# Example no. 3
# 0
def create_tsv_reader(func, tsv_file, polymath, seqs, num_workers, is_test=False, misc=None):
    """Yield minibatch input maps for ``func`` read from a TSV file.

    The file is read line by line; every line is parsed with
    ``tsv2ctf.tsv_iter`` and up to ``seqs`` parsed lines are grouped into one
    batch. After the end of the file is reached, empty dicts are yielded
    until the total number of yielded batches is a multiple of
    ``num_workers`` (needed for distributed training).

    Args:
        func: Object whose arguments are resolved with ``argument_by_name``
            under the names 'cgw', 'qgw', 'cnw', 'qnw', 'cc', 'qc', 'ab'
            and 'ae'.
        tsv_file: Path of the UTF-8 encoded TSV file to read.
        polymath: Supplies ``vocab``, ``chars``, ``wg_dim``, ``wn_dim`` and
            ``word_size``.
        seqs: Maximum number of lines per batch. With ``seqs <= 0`` no line
            is ever read, so the generator yields empty dicts forever.
        num_workers: Batch-count modulus; must be non-zero.
        is_test: Forwarded to ``tsv2ctf.tsv_iter``.
        misc: Optional dict. When not None, the text before the first tab of
            every line is appended to ``misc['uid']``; it is also forwarded
            to ``tsv2ctf.tsv_iter``.

    Yields:
        A dict mapping each resolved argument of ``func`` to the batch data,
        or an empty dict when no line was available for the batch.
    """

    def one_hot_pair(id_seqs):
        # Ids below wg_dim are one-hot encoded over wg_dim classes; the
        # remaining ids are shifted down by wg_dim and encoded over wn_dim
        # classes. Each encoding skips the ids owned by the other one.
        wg_dim = polymath.wg_dim
        skip = C.Value.ONE_HOT_SKIP
        below = C.Value.one_hot(
            [[skip if i >= wg_dim else i for i in ids] for ids in id_seqs],
            wg_dim)
        shifted = C.Value.one_hot(
            [[skip if i < wg_dim else i - wg_dim for i in ids] for ids in id_seqs],
            polymath.wn_dim)
        return below, shifted

    def padded_chars(char_seqs):
        # Right-pad every word's character ids with zeros up to word_size
        # (longer words are left as they are) and wrap each padded word in
        # a singleton list before converting to float32.
        word_size = polymath.word_size
        return [
            np.asarray([[cc + [0] * max(0, word_size - len(cc))] for cc in words],
                       dtype=np.float32)
            for words in char_seqs
        ]

    with open(tsv_file, 'r', encoding='utf-8') as f:
        eof = False
        batch_count = 0
        # Keep going past EOF until the batch count divides evenly among
        # the workers.
        while not (eof and batch_count % num_workers == 0):
            batch_count += 1
            samples = []

            while not eof and len(samples) < seqs:
                line = f.readline()
                if not line:
                    eof = True
                    break

                if misc is not None:
                    # The uid is everything before the first tab.
                    misc['uid'].append(line.partition('\t')[0])

                # The first three values returned by tsv_iter are not
                # needed here.
                _, _, _, cwids, qwids, baidx, eaidx, ccids, qcids = tsv2ctf.tsv_iter(
                    line, polymath.vocab, polymath.chars, is_test, misc)
                samples.append((cwids, qwids, baidx, eaidx, ccids, qcids))

            if not samples:
                yield {} # need to generate empty batch for distributed training
                continue

            # Transpose the per-line tuples into per-field sequences.
            all_cwids, all_qwids, all_baidx, all_eaidx, all_ccids, all_qcids = zip(*samples)

            context_g_words, context_ng_words = one_hot_pair(all_cwids)
            query_g_words, query_ng_words = one_hot_pair(all_qwids)
            context_chars = padded_chars(all_ccids)
            query_chars = padded_chars(all_qcids)
            answer_begin = [np.asarray(ab, dtype=np.float32) for ab in all_baidx]
            answer_end = [np.asarray(ae, dtype=np.float32) for ae in all_eaidx]

            yield { argument_by_name(func, 'cgw'): context_g_words,
                    argument_by_name(func, 'qgw'): query_g_words,
                    argument_by_name(func, 'cnw'): context_ng_words,
                    argument_by_name(func, 'qnw'): query_ng_words,
                    argument_by_name(func, 'cc' ): context_chars,
                    argument_by_name(func, 'qc' ): query_chars,
                    argument_by_name(func, 'ab' ): answer_begin,
                    argument_by_name(func, 'ae' ): answer_end }