Example #1
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration file and returns a
    tokenizer instance.
    # Arguments
        json_string: JSON string encoding a tokenizer configuration.
    # Returns
        A Keras Tokenizer instance
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
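
A quick way to exercise this function is to round-trip a tokenizer through Tokenizer.to_json(), which produces the kind of JSON string consumed here. A minimal sketch, assuming a keras_preprocessing version that provides Tokenizer.to_json() and using a toy corpus:

import json
from keras_preprocessing.text import Tokenizer

tok = Tokenizer(num_words=100)
tok.fit_on_texts(["the cat sat on the mat", "the dog ate my homework"])

# to_json() emits the configuration string that tokenizer_from_json() expects
restored = tokenizer_from_json(tok.to_json())
assert restored.texts_to_sequences(["the cat"]) == tok.texts_to_sequences(["the cat"])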
Example #2
def load_tokenizer_from_file(filename):

    tokenizer = Tokenizer()

    with open(filename, 'r') as infile:
        tokenizer_data = json.load(infile)

    tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts'])
    tokenizer.word_docs = tokenizer_data['word_docs']
    tokenizer.word_index = tokenizer_data['word_index']
    tokenizer.document_count = tokenizer_data['document_count']
    tokenizer.index_docs = tokenizer_data['index_docs']

    return tokenizer
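
The loader above expects a JSON file whose top-level keys mirror the Tokenizer attributes it restores. The matching writer is not shown; a hypothetical save_tokenizer_to_file counterpart might look like the sketch below (the function name and key layout are assumptions inferred from the loader):

import json

def save_tokenizer_to_file(tokenizer, filename):
    # Hypothetical counterpart to load_tokenizer_from_file: dump the same
    # attributes under the key names the loader reads back.
    tokenizer_data = {
        'word_counts': list(tokenizer.word_counts.items()),
        'word_docs': dict(tokenizer.word_docs),
        'word_index': tokenizer.word_index,
        'document_count': tokenizer.document_count,
        'index_docs': dict(tokenizer.index_docs),
    }
    with open(filename, 'w') as outfile:
        json.dump(tokenizer_data, outfile)

Note that json.dump turns the integer keys of index_docs into strings, and the loader above does not convert them back; Example #1 shows the extra {int(k): v ...} pass needed if downstream code indexes index_docs by integer.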
Example #3
def tokenizer_from_json(json_string):
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
def datagen(max_posts,
            max_length,
            stype='training',
            batch_size=32,
            force_full=False,
            randposts=False,
            mintf=1,
            mindf=2,
            noempty=False,
            prep=None,
            returntok=False,
            balbatch=True):
    assert stype in ['training', 'validation', 'testing']
    looponce = force_full or stype != 'training'
    fn = 'rsdd_posts/%s.gz' % stype

    print("loading %s posts" % stype)
    f = gzip.open(fn, 'rt')
    labels = {}
    allposts = {}
    for i, line in enumerate(f):
        user = str(i)
        d = json.loads(line)[0]
        if d['label'] == 'control':
            labels[user] = np.array([1, 0], dtype=np.float32)
        elif d['label'] == 'depression':
            labels[user] = np.array([0, 1], dtype=np.float32)
        elif d['label'] is None:
            continue
        else:
            raise RuntimeError("unknown label: %s" % d['label'])
        allposts[user] = [post for dt, post in d['posts']]
    f.close()

    tokfn = "tok_tf%s_df%s.p" % (mintf, mindf)
    load_tokenizer = looponce or os.path.exists(tokfn)

    if load_tokenizer:
        print("loading tokenizer")
        tok = pickle.load(open(tokfn, 'rb'))
    else:
        assert stype == 'training', "cannot fit tokenizer on validation or testing data"
        print("tokenizing %s users" % len(allposts))
        tok = Tokenizer(nb_words=None)
        tok.fit_on_texts(post for uposts in allposts.values()
                         for post in uposts)

        # remove all tokens with a low DF or TF
        removed = 0
        for term in list(tok.word_index.keys()):
            if tok.word_docs[term] < mindf or tok.word_counts[term] < mintf:
                removed += 1
                del tok.word_docs[term]
                del tok.word_counts[term]
                del tok.word_index[term]
        tok.index_docs = None
        idxs = {}
        nexti = 1
        for term, oldi in sorted(tok.word_index.items()):
            idxs[term] = nexti
            nexti += 1
        assert len(tok.word_index) == len(idxs)
        tok.word_index = idxs

        print("terms removed: %s; remaining: %s" %
              (removed, len(tok.word_index)))
        pickle.dump(tok, open(tokfn, 'wb'), protocol=-1)

    nb_words = len(tok.word_index) + 1

    # remove empty posts
    if noempty:
        noempty_cache = "noempty_tf%s_df%s_%s_mp%s_ml%s.p" % (
            mintf, mindf, max_posts, max_length, stype)
        if os.path.exists(noempty_cache):
            print("loading cached noempty posts")
            allposts, before, after = pickle.load(open(noempty_cache, 'rb'))
        else:
            print("removing empty posts")
            before, after = [], []
            for user in list(allposts.keys()):
                before.append(len(allposts[user]))
                kept = []
                for upost in allposts[user]:
                    skip = True
                    for term in text_to_word_sequence(upost):
                        if term in tok.word_index:
                            skip = False
                            break

                    if not skip:
                        kept.append(upost)

                if len(kept) > 0:
                    allposts[user] = kept
                    after.append(len(allposts[user]))
                else:
                    del allposts[user]

            import scipy.stats
            print("posts before noempty:", scipy.stats.describe(before))
            print("posts after  noempty:", scipy.stats.describe(after))
            print("#users before vs. after: %s vs. %s" %
                  (len(before), len(after)))
            pickle.dump((allposts, before, after),
                        open(noempty_cache, 'wb'),
                        protocol=-1)

    print("found %s words; generator ready" % nb_words)

    def vecify(uposts):
        assert prep is None or not randposts, "incompatible"
        if randposts or prep == 'bran':
            idxs = np.random.permutation(min(max_posts, len(uposts)))
            chosen = [uposts[idx] for idx in idxs]
        elif prep == 'dist':
            if max_posts >= len(uposts):
                chosen = uposts[:max_posts]
            else:
                idxs = np.linspace(0,
                                   len(uposts) - 1,
                                   num=max_posts,
                                   dtype=np.int)
                chosen = [uposts[idx] for idx in idxs]
        elif prep == 'rev':
            chosen = uposts[-max_posts:]
        else:
            chosen = uposts[:max_posts]

        seqs = pad_sequences(tok.texts_to_sequences(chosen), maxlen=max_length)
        if len(seqs) < max_posts:
            seqs = np.pad(seqs, ((0, max_posts - len(seqs)), (0, 0)),
                          mode='constant')
        return seqs

    if looponce:

        def gen(meta=False):
            X, y = [], []
            extra = []
            while True:
                for user, uposts in allposts.items():
                    X.append(vecify(uposts))
                    y.append(labels[user])
                    if meta:
                        extra.append((user, len(uposts)))

                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        print("...shouldn't happen")
                        yield (X, y)
                        X, y = [], []

                if looponce and len(X) > 0:
                    X, y = np.array(X), np.array(y)
                    if meta:
                        yield (X, y, extra)
                        X, y, extra = [], [], []
                    else:
                        yield (X, y)
                        X, y = [], []

                if looponce:
                    break
    else:

        def gen_nbb():
            bylabel = {}
            for user, uposts in allposts.items():
                label = np.argmax(labels[user])
                bylabel.setdefault(label, []).append(uposts)
            print([(k, len(v)) for k, v in bylabel.items()])

            X, y = [], []
            neglabel = np.array([1, 0], dtype=np.float32)
            poslabel = np.array([0, 1], dtype=np.float32)
            poscount = len(bylabel[1])
            while True:
                idxs = ([(1, i) for i in np.random.permutation(poscount)] + [
                    (0, i)
                    for i in np.random.permutation(len(bylabel[0]))[:poscount]
                ])
                idxs = [idxs[i] for i in np.random.permutation(len(idxs))]

                for label, idx in idxs:
                    X.append(vecify(bylabel[label][idx]))
                    if label == 0:
                        y.append(neglabel)
                    elif label == 1:
                        y.append(poslabel)
                    else:
                        raise RuntimeError("invalid label: %s" % label)

                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        yield (X, y)
                        X, y = [], []

        def gen_bal():
            bylabel = {}
            for user, uposts in allposts.items():
                label = np.argmax(labels[user])
                bylabel.setdefault(label, []).append(uposts)
            print([(k, len(v)) for k, v in bylabel.items()])

            assert batch_size % len(bylabel) == 0
            idxs = {}
            for label in bylabel:
                idxs[label] = list(range(len(bylabel[label])))

            X, y = [], []
            neglabel = np.array([1, 0], dtype=np.float32)
            poslabel = np.array([0, 1], dtype=np.float32)
            while True:
                for label in bylabel:
                    random.shuffle(idxs[label])

                for posidx, negidx in zip(idxs[1], idxs[0]):
                    X.append(vecify(bylabel[1][posidx]))
                    y.append(poslabel)

                    X.append(vecify(bylabel[0][negidx]))
                    y.append(neglabel)

                    if len(X) == batch_size:
                        X, y = np.array(X), np.array(y)
                        yield (X, y)
                        X, y = [], []

        if balbatch:
            gen = gen_bal
        else:
            gen = gen_nbb

    if returntok:
        return nb_words, gen, tok
    else:
        return nb_words, gen
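
To use the factory above, call it for the split you need and iterate the generator it returns; each yielded X has shape (batch_size, max_posts, max_length) and y is a one-hot pair over the control/depression classes. A minimal sketch, assuming the rsdd_posts/*.gz files are in place (the sizes are arbitrary):

nb_words, gen = datagen(max_posts=50, max_length=100, stype='training',
                        batch_size=32, balbatch=True)

batch_X, batch_y = next(gen())
print(batch_X.shape)  # (32, 50, 100)
print(batch_y.shape)  # (32, 2)

nb_words is the vocabulary size plus one, i.e. the value you would pass as input_dim to an Embedding layer built on top of these sequences.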
def create_tf_example_row(input_row):
    # convert to string
    password = str(input_row[0])