import torch
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import jieba
import logging

# Silence jieba's verbose initialization logging.
logging.getLogger("jieba").setLevel(logging.WARNING)

# WordPiece tokenizer with a newline special token, truncated and padded to 512 tokens.
tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)

# Collect the .txt corpus files from the GCS bucket and sample five of them.
client = storage.Client()
blobs = []
size = 0
for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'):
    if blob.name.endswith('.txt'):
        blobs.append(blob)
sub_blobs = random.sample(blobs, 5)


def iterator_gen(generator, handler=None, parallel=False):
    try:
        import gc
        import multiprocessing as multiprocessing
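# --- Hypothetical usage sketch (not part of the original script) ---
# Shows how the sampled corpus blobs and the tokenizer configured above could be
# combined: download each blob's text and encode it line by line. `download_as_text`
# is the google-cloud-storage Blob method; the function name and the iteration scheme
# are illustrative assumptions only.
def encode_sample_blobs(sample_blobs, wp_tokenizer):
    """Yield (ids, attention_mask) pairs for each non-empty line in the blobs."""
    for blob in sample_blobs:
        text = blob.download_as_text()
        for line in text.splitlines():
            if not line.strip():
                continue
            enc = wp_tokenizer.encode(line)
            yield enc.ids, enc.attention_mask

# Example:
# for ids, mask in encode_sample_blobs(sub_blobs, tokenizer):
#     batch = torch.tensor([ids])
#     break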
# Imports needed by the Tweets dataset; TRAIN_PATH, TEST_PATH and augment_n are
# assumed to be defined elsewhere in the project.
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.data import load


class Tweets(Dataset):
    def __init__(self, device='cpu', pad=150, test=False, N=4):
        self.samples = []
        self.pad = pad
        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)
        self.tokenizer.enable_padding(max_length=pad - 1)  # -1 for sentiment token
        self.tokenizer.add_special_tokens(['[POS]'])
        self.tokenizer.add_special_tokens(['[NEG]'])
        self.tokenizer.add_special_tokens(['[NEU]'])
        self.vocab = self.tokenizer.get_vocab()
        # Map each sentiment label to the id of its special token.
        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }
        # Index every Penn Treebank POS tag, with 0 reserved for unknown.
        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()
        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1
        self.tweet_tokenizer = TweetTokenizer()

        data = None
        if test is True:
            data = pd.read_csv(TEST_PATH).values
            for row in data:
                tid, tweet, sentiment = tuple(row)
                # Character-level POS membership: tag each character with the POS index
                # of the tweet token that covers it.
                pos_membership = [0] * len(tweet)
                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0
                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets
                token_pos = [0] * len(word_to_index)
                # Propagate POS info from characters to wordpiece tokens
                # (ids 0/101/102 are [PAD]/[CLS]/[SEP]).
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[i] == 101 or word_to_index[i] == 102:
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                # Prepend the sentiment token to the sequence.
                token_pos = [0] + token_pos
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                offsets = [(0, 0)] + offsets
                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_pos = np.array(token_pos)
                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'offsets': offsets,
                    'raw_tweet': tweet,
                    'pos': token_pos
                })
        else:
            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)
            for row in data:
                tid, tweet, selection, sentiment = tuple(row)
                # Character-level membership of the selected (target) span.
                char_membership = [0] * len(tweet)
                pos_membership = [0] * len(tweet)
                si = tweet.find(selection)
                if si < 0:
                    # Selection not found verbatim; fall back to marking the whole tweet.
                    char_membership[0:] = [1] * len(char_membership)
                else:
                    char_membership[si:si + len(selection)] = [1] * len(selection)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0
                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets
                token_membership = [0] * len(word_to_index)
                token_pos = [0] * len(word_to_index)

                # Inclusive token indices of the selected span, shifted by +1 for the
                # sentiment token prepended below.
                start = None
                end = None
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[i] == 101 or word_to_index[i] == 102:
                        token_membership[i] = -1
                    elif sum(char_membership[s:e]) > 0:
                        token_membership[i] = 1
                        if start is None:
                            start = i + 1
                        end = i + 1

                # Propagate POS info from characters to wordpiece tokens.
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] == 0 or word_to_index[i] == 101 or word_to_index[i] == 102:
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                if start is None:
                    print("Data Point Error")
                    print(tweet)
                    print(selection)
                    continue

                # token_membership = torch.LongTensor(token_membership).to(device)
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                token_membership = [-1] + token_membership
                offsets = [(0, 0)] + offsets
                token_pos = [0] + token_pos
                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_membership = np.array(token_membership).astype('float')
                token_pos = np.array(token_pos)

                # Sanity check: none of the fields going into the sample may be None.
                for field in (tid, sentiment, word_to_index, token_membership,
                              selection, tweet, start, end, offsets):
                    if field is None:
                        raise Exception('None field detected')

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'selection': token_membership,
                    'raw_selection': selection,
                    'raw_tweet': tweet,
                    'start': start,
                    'end': end,
                    'offsets': offsets,
                    'pos': token_pos
                })

    def get_splits(self, val_size=.3):
        # Random train/validation split over sample indices.
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        train_indices = indices[0:split]
        valid_indices = indices[split:]
        return train_indices, valid_indices

    def k_folds(self, k=5):
        # Shuffle once and split the indices into k folds.
        N = len(self.samples)
        indices = np.random.permutation(N)
        return np.array_split(indices, k)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # Support both a single integer index and an iterable of indices.
        try:
            return self.samples[idx]
        except TypeError:
            pass
        return [self.samples[i] for i in idx]
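# --- Hypothetical usage sketch (not part of the original file) ---
# Shows one way the Tweets dataset could be wrapped in DataLoaders: take the index
# split from get_splits() and feed it to SubsetRandomSampler. The function name,
# batch size and val_size default are illustrative assumptions; TRAIN_PATH/TEST_PATH
# and augment_n must already be defined for Tweets() to construct.
from torch.utils.data.sampler import SubsetRandomSampler


def build_loaders(batch_size=32, val_size=0.3):
    dataset = Tweets(test=False)
    train_idx, valid_idx = dataset.get_splits(val_size=val_size)
    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              sampler=SubsetRandomSampler(train_idx))
    valid_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              sampler=SubsetRandomSampler(valid_idx))
    return train_loader, valid_loader

# Example:
# train_loader, valid_loader = build_loaders()
# batch = next(iter(train_loader))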