Example no. 1
import torch
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import jieba
import logging
logging.getLogger("jieba").setLevel(logging.WARNING)

tokenizer = BertWordPieceTokenizer(vocab_file='../tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)
client = storage.Client()
blobs = []
size = 0
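# Collect every .txt corpus file under the given GCS prefix.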
for blob in client.list_blobs('tfrc-tfrc', prefix='public_model/corpus/'):
    if blob.name.endswith('.txt'):
        blobs.append(blob)

sub_blobs = random.sample(blobs, 5)


def iterator_gen(generator, handler=None, parallel=False):
    try:
        import gc
        import multiprocessing as multiprocessing
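
A minimal, hypothetical sketch of how the tokenizer and the sampled corpus blobs configured above could be used together; blob.download_as_text() and tokenizer.encode_batch() are real google-cloud-storage / tokenizers calls, but this usage is an assumption, not part of the original example.

for blob in sub_blobs:
    # Download the corpus file as a string and keep its non-empty lines.
    text = blob.download_as_text()
    lines = [line for line in text.splitlines() if line.strip()]
    # Encode a small batch of lines; truncation and padding to 512 were enabled above.
    encodings = tokenizer.encode_batch(lines[:8])
    for enc in encodings:
        print(len(enc.ids), enc.tokens[:10])
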
Example no. 2
import numpy as np
import pandas as pd
import nltk
from nltk.data import load
from nltk.tokenize import TweetTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset

# TRAIN_PATH, TEST_PATH and augment_n are assumed to be defined elsewhere in the project.


class Tweets(Dataset):
    def __init__(self, device='cpu', pad=150, test=False, N=4):
        self.samples = []
        self.pad = pad

        self.tokenizer = BertWordPieceTokenizer(
            "./data/bert-base-uncased-vocab.txt",
            lowercase=True,
            clean_text=True)

        self.tokenizer.enable_padding(max_length=pad - 1)  # -1 for sentiment token

        self.tokenizer.add_special_tokens(['[POS]'])
        self.tokenizer.add_special_tokens(['[NEG]'])
        self.tokenizer.add_special_tokens(['[NEU]'])
        self.vocab = self.tokenizer.get_vocab()

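        # Map each sentiment label to the id of the corresponding special token added above.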
        self.sent_t = {
            'positive': self.tokenizer.token_to_id('[POS]'),
            'negative': self.tokenizer.token_to_id('[NEG]'),
            'neutral': self.tokenizer.token_to_id('[NEU]')
        }

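        # Map Penn Treebank POS tags to integer ids; 0 is reserved for unknown tags.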
        self.pos_set = {'UNK': 0}
        all_pos = load('help/tagsets/upenn_tagset.pickle').keys()

        for i, p in enumerate(all_pos):
            self.pos_set[p] = i + 1

        self.tweet_tokenizer = TweetTokenizer()

        data = None
        if test is True:
            data = pd.read_csv(TEST_PATH).values
            for row in data:
                tid, tweet, sentiment = tuple(row)

                pos_membership = [0] * len(tweet)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

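                # WordPiece-encode the tweet; offsets map each token back to a character span.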
                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

                token_pos = [0] * len(word_to_index)
                # get pos info
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] in (0, 101, 102):  # [PAD], [CLS], [SEP]
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                token_pos = [0] + token_pos
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                offsets = [(0, 0)] + offsets
                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_pos = np.array(token_pos)

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'offsets': offsets,
                    'raw_tweet': tweet,
                    'pos': token_pos
                })

        else:

            data = pd.read_csv(TRAIN_PATH).values
            if N > 0:
                data = augment_n(data, N=N)

            for row in data:
                tid, tweet, selection, sentiment = tuple(row)

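                # Character-level labels: 1 where the selected span lies (or the whole tweet
                # if the selection is not found), plus a per-character POS id.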
                char_membership = [0] * len(tweet)
                pos_membership = [0] * len(tweet)
                si = tweet.find(selection)
                if si < 0:
                    char_membership[0:] = [1] * len(char_membership)
                else:
                    char_membership[si:si + len(selection)] = [1] * len(selection)

                pos_tokens = self.tweet_tokenizer.tokenize(tweet)
                pos = nltk.pos_tag(pos_tokens)
                offset = 0

                for i, token in enumerate(pos_tokens):
                    start = tweet.find(token, offset)
                    end = start + len(token)
                    if pos[i][1] in self.pos_set:
                        pos_membership[start:end] = [self.pos_set[pos[i][1]]] * len(token)
                    offset += len(token)

                tokens = self.tokenizer.encode(tweet)
                word_to_index = tokens.ids
                offsets = tokens.offsets

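                # Token-level labels: selection membership and the majority POS id per word piece.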
                token_membership = [0] * len(word_to_index)
                token_pos = [0] * len(word_to_index)

                # Inclusive indices
                start = None
                end = None
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] in (0, 101, 102):  # [PAD], [CLS], [SEP]
                        token_membership[i] = -1
                    elif sum(char_membership[s:e]) > 0:
                        token_membership[i] = 1
                        if start is None:
                            start = i + 1
                        end = i + 1

                # get pos info
                for i, (s, e) in enumerate(offsets):
                    if word_to_index[i] in (0, 101, 102):  # [PAD], [CLS], [SEP]
                        pass
                    elif s != e:
                        sub = pos_membership[s:e]
                        token_pos[i] = max(set(sub), key=sub.count)

                if start is None:
                    print("Data Point Error")
                    print(tweet)
                    print(selection)
                    continue
                # token_membership = torch.LongTensor(token_membership).to(device)
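                # Prepend the sentiment special token (and matching dummy entries) so that
                # position 0 encodes the tweet's sentiment.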
                word_to_index = [self.sent_t[sentiment]] + word_to_index
                token_membership = [-1] + token_membership
                offsets = [(0, 0)] + offsets
                token_pos = [0] + token_pos

                offsets = np.array([[off[0], off[1]] for off in offsets])
                word_to_index = np.array(word_to_index)
                token_membership = np.array(token_membership).astype('float')
                token_pos = np.array(token_pos)

                # Sanity check: none of the fields stored in the sample may be None.
                for field in (tid, sentiment, word_to_index, token_membership,
                              selection, tweet, start, end, offsets):
                    if field is None:
                        raise Exception('None field detected')

                self.samples.append({
                    'tid': tid,
                    'sentiment': sentiment,
                    'tweet': word_to_index,
                    'selection': token_membership,
                    'raw_selection': selection,
                    'raw_tweet': tweet,
                    'start': start,
                    'end': end,
                    'offsets': offsets,
                    'pos': token_pos
                })

    def get_splits(self, val_size=.3):
        N = len(self.samples)
        indices = np.random.permutation(N)
        split = int(N * (1 - val_size))
        train_indices = indices[0:split]
        valid_indices = indices[split:]
        return train_indices, valid_indices

    def k_folds(self, k=5):
        N = len(self.samples)
        indices = np.random.permutation(N)
        return np.array_split(indices, k)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        try:
            return self.samples[idx]
        except TypeError:
            pass
        return [self.samples[i] for i in idx]
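
A minimal usage sketch for the dataset above, assuming TRAIN_PATH points at the expected training CSV and the NLTK resources it relies on (the Penn tagset pickle and a POS tagger) are installed; the DataLoader settings here are illustrative assumptions, not part of the original example.

if __name__ == '__main__':
    from torch.utils.data import DataLoader, SubsetRandomSampler

    # N=0 skips augment_n, which is not shown in this snippet.
    dataset = Tweets(test=False, N=0)
    train_idx, valid_idx = dataset.get_splits(val_size=0.3)

    # Every numeric field is padded to a fixed length, so the default collate can stack them.
    train_loader = DataLoader(dataset,
                              batch_size=4,
                              sampler=SubsetRandomSampler(train_idx))
    for batch in train_loader:
        print(batch['sentiment'], batch['tweet'].shape)
        break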