Example 1
import jieba.posseg  # a part-of-speech tagging module that has to be loaded separately

import pandas as pd
# from nltk.tokenize import word_tokenize

P2P = 'total'
# ********************************************************************* #
datapath = '../../data/' + P2P + '/' + P2P + '.csv'
outpath = '../../data/' + P2P + '/data(%s).json'
dictpath = '../../data/' + P2P + '/mydict(%s).json'
debug_flag = False
stop = False
# ********************************************************************* #

mydict = Dictionary()  # Dictionary is assumed to come from a project-local module (import not shown in this snippet)
mydict.add_word('<pad>')
# mydict.add_word('<unk>')
with open('../../data/stopping_word', 'r', encoding='utf-8') as f:
    stopping_word = [line.strip() for line in f]

reviews = pd.read_csv(datapath, index_col=0, header=0, encoding='utf-8')
labels = list(reviews['reviewEvaluation'])
reviews = list(reviews['reviewContent'])
# reviews = open(datapath).readlines()
n_reviews = len(reviews)
print('%d reviews will be loaded...' % n_reviews)

if debug_flag:
    size = '5'
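
# A minimal sketch, not part of the original script: the helper below (its name is
# an assumption) shows how the loaded reviews could be tokenized with jieba.posseg
# while dropping entries from the stop-word list.
def tokenize_review(text, stop_words):
    # jieba.posseg.cut yields (word, part-of-speech flag) pairs
    return [pair.word for pair in jieba.posseg.cut(text)
            if pair.word not in stop_words]

# e.g. tokens = tokenize_review(reviews[0], stopping_word)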
Example 2
    def __init__(self,
                 corpus_dir,
                 w2v,
                 dictionary=None,
                 w2v_lbound=16,
                 w2v_ubound=2**16,
                 corpus_lbound=2,
                 ctx_len=12,
                 pad=0,
                 is_wikitext=False,
                 is_chimera=False,
                 is_jnlpba=False):
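        # Build fixed-length (left context, right context) examples for target
        # words drawn from one of three corpora (WikiText, Chimera or JNLPBA),
        # selected by the corresponding flag. numpy (as np), string,
        # collections.defaultdict and the project-local Dictionary / pad_sequences
        # helpers are assumed to be imported at module level (not shown here).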
        if dictionary is None:
            dictionary = Dictionary(w2v.vector_size)

        if is_wikitext:
            corpus = [
                fi.lower().split()
                for fi in (corpus_dir / 'wiki.train.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir / 'wiki.valid.tokens').open().readlines()
            ]
            corpus += [
                fi.lower().split()
                for fi in (corpus_dir / 'wiki.test.tokens').open().readlines()
            ]
            corpus = np.array(corpus)
        elif is_chimera:
            corpus = []
            with (corpus_dir / 'dataset.txt').open(encoding='latin1') as f:
                lines = f.readlines()[1:]
                for i in range(0, len(lines), 2):
                    fields = lines[i].rstrip('\n').split('\t')
                    nonce = fields[1].lower()
                    sents = fields[3].lower().split('@@')
                    pivot_comp = lines[i + 1].split('\t')[5].lower().split('_')
                    corpus += [
                        sent.replace(nonce,
                                     pivot_comp[0 if i % 2 == 0 else 1]).split()
                        for i, sent in enumerate(sents)
                    ]
            corpus = np.unique(corpus)
        elif is_jnlpba:
            ps = ['train/Genia4ERtask2.iob2', 'test/Genia4EReval2.iob2']
            corpus = []
            sent = []
            for p in ps:
                for w in (corpus_dir / p).open().readlines():
                    if w.startswith("###MEDLINE:"):
                        if sent:
                            corpus += [sent]
                        sent = []
                        continue

                    w = w.strip().lower()
                    if w != '':
                        w = w.split()
                        w = w[0]
                        sent += [w]
                corpus += [sent]
            corpus = np.array(corpus)
        print(f"Corpus shape: {corpus.shape}")

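        # Count word frequencies and add words to the dictionary; words that do not
        # make it into the dictionary (and are not bare punctuation) are collected
        # as out-of-vocabulary (OOV) targets.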
        word_count = defaultdict(int)
        oov_words = []
        oov_dataset = {}
        for sent in corpus:
            for w in sent:
                word_count[w] += 1
                dictionary.add_word(w, w2v)
                if w not in oov_dataset and w not in dictionary.word2idx:
                    if w in string.punctuation:
                        continue
                    oov_words.append(w)
                    oov_dataset[w] = [[], []]

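        # Keep dictionary words (other than '<unk>') whose w2v count lies strictly
        # between w2v_lbound and w2v_ubound and that appear more than corpus_lbound
        # times in this corpus.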
        words = []
        for w in dictionary.word2idx:
            if (w != '<unk>'
                    and w2v_lbound < w2v.wv.vocab[w].count < w2v_ubound
                    and word_count[w] > corpus_lbound):
                words.append(w)
        print(f"Number of valid words: {len(words)}")

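        # Randomly assign ~90% of the kept words to the train split and ~10% to the
        # validation split; each word maps to a pair (left contexts, right contexts).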
        train_dataset = {}
        valid_dataset = {}
        for w, prob in zip(words, np.random.random(len(words))):
            if prob < 0.9:
                train_dataset[w] = [[], []]
            else:
                valid_dataset[w] = [[], []]

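        # For every occurrence of a valid/train/OOV target word in a sentence,
        # collect its left and right context windows of ctx_len word ids each,
        # provided the surrounding window holds more than ctx_len non-zero ids.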
        for sent in corpus:
            words_valid = []
            words_train = []
            words_oov = []

            for idx, w in enumerate(sent):
                if w in valid_dataset:
                    words_valid += [[w, idx]]
                elif w in train_dataset:
                    words_train += [[w, idx]]
                elif w in oov_dataset:
                    words_oov += [[w, idx]]

            if words_valid or words_train or words_oov:
                sent_word_ids = dictionary.sent2idx(sent)

                for w, idx in words_valid:
                    window = sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]
                    if np.count_nonzero(window) > ctx_len:
                        valid_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        valid_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]

                for w, idx in words_train:
                    window = sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]
                    if np.count_nonzero(window) > ctx_len:
                        train_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        train_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]

                for w, idx in words_oov:
                    window = sent_word_ids[idx - ctx_len:idx + 1 + ctx_len]
                    if np.count_nonzero(window) > ctx_len:
                        oov_dataset[w][0] += [sent_word_ids[idx - ctx_len:idx]]
                        oov_dataset[w][1] += [sent_word_ids[idx + 1:idx + 1 + ctx_len]]

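        # Left contexts are padded on the left and right contexts on the right,
        # then each pair is concatenated into a single row of 2 * ctx_len ids.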
        for w in valid_dataset:
            lefts = pad_sequences(valid_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(valid_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            valid_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in train_dataset:
            lefts = pad_sequences(train_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(train_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            train_dataset[w] = np.concatenate((lefts, rights), axis=1)

        for w in oov_dataset:
            lefts = pad_sequences(oov_dataset[w][0],
                                  max_len=ctx_len,
                                  pad=pad,
                                  pre=True)
            rights = pad_sequences(oov_dataset[w][1],
                                   max_len=ctx_len,
                                   pad=pad,
                                   pre=False)
            oov_dataset[w] = np.concatenate((lefts, rights), axis=1)

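        # Report how many target words ended up in each split and how many of them
        # received at least one context window.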
        print(f"Train size: {len(train_dataset.keys())}")
        print(f"Valid size: {len(valid_dataset.keys())}")
        print(f"OOV size: {len(oov_words)}")

        print(
            f"Train >0 ctxts size: {len([w for w in train_dataset.keys() if len(train_dataset[w]) > 0])}"
        )
        print(
            f"Valid >0 ctxts size: {len([w for w in valid_dataset.keys() if len(valid_dataset[w]) > 0])}"
        )
        print(
            f"OOV >0 ctxts size: {len([w for w in oov_words if len(oov_dataset[w]) > 0])}"
        )

        train_ctxt_lens = [len(train_dataset[w]) for w in train_dataset.keys()]
        valid_ctxt_lens = [len(valid_dataset[w]) for w in valid_dataset.keys()]
        oov_ctxt_lens = [len(oov_dataset[w]) for w in oov_words]

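        # Histogram of stored-context counts: entry i is the number of words with
        # exactly i contexts, for i = 0..9.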
        print(
            f"Number of Train with ctxts size = index {[train_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of Valid with ctxts size = index {[valid_ctxt_lens.count(i) for i in range(10)]}"
        )
        print(
            f"Number of OOV with ctxts size = index {[oov_ctxt_lens.count(i) for i in range(10)]}"
        )

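        # Corpus-count histogram for the OOV words (how many OOV words occur exactly
        # i times, i = 0..9) and the ten most frequent OOV words.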
        oov_word_counts = [word_count[w] for w in oov_words]
        inds = np.argsort(-np.array(oov_word_counts))[:10]

        print(
            f"Number of OOV words with count = index {[oov_word_counts.count(i) for i in range(10)]}"
        )
        print(f"Most frequent OOV words: {[oov_words[i] for i in inds]} "
              f"frequencies: {[oov_word_counts[i] for i in inds]}")

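        # Expose the built datasets, the dictionary and related bookkeeping on the
        # instance.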
        self.dictionary = dictionary
        self.train_dataset = train_dataset
        self.train_words = list(train_dataset.keys())
        self.valid_dataset = valid_dataset
        self.valid_words = list(valid_dataset.keys())
        self.oov_dataset = oov_dataset
        self.oov_words = list(oov_dataset.keys())
        self.w2v = w2v
        self.ctx_len = ctx_len
        self.train_k2words = {}
        self.valid_k2words = {}
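
# A hedged sketch, not from the original repository: pad_sequences above is a
# project-local helper whose implementation is not shown; judging from its call
# sites (max_len, pad, pre) it plausibly behaves roughly like this.
import numpy as np

def pad_sequences(seqs, max_len, pad=0, pre=True):
    """Pad or truncate each sequence to max_len ids; pre=True pads on the left."""
    out = np.full((len(seqs), max_len), pad, dtype=np.int64)
    for i, seq in enumerate(seqs):
        seq = list(seq)[-max_len:] if pre else list(seq)[:max_len]
        if pre:
            out[i, max_len - len(seq):] = seq   # right-align: padding goes in front
        else:
            out[i, :len(seq)] = seq             # left-align: padding goes at the end
    return out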