def prepare_data_and_model(Model, args, using_gpu=True):
    """Build train/test iterators and instantiate the model for the selected dataset."""
    if args.test:
        # narvi
        # train_path = "/home/zhouy/thesis/data/text_classification_data/train_try.csv"
        # test_path = "/home/zhouy/thesis/data/text_classification_data/test_try.csv"

        # tut thinkstation
        # train_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/train_try.csv"
        # test_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/test_try.csv"

        # tripadvisor dataset, xps
        test_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\test_try.csv"
        train_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\train_try.csv"
    else:
        # original dataset

        # narvi
        # train_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_train_dataset.csv"
        # test_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_test_dataset.csv"

        # tut thinkstation
        # train_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # xps
        # train_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # tripadvisor dataset, xps
        train_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_train_dataset.csv"
        test_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_test_dataset.csv"

    def tokenize(text):
        # Map punctuation to spaces, tokenize with spaCy, then normalize
        # contractions and auxiliary verbs.
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        trans_map = str.maketrans(filters, " " * len(filters))
        text = text.translate(trans_map)
        text = [tok.text for tok in spacy_en.tokenizer(text) if tok.text != ' ']

        tokenized_text = []
        auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
        for token in text:
            if token == "n't":
                tmp = 'not'
            elif token == "'ll":
                tmp = 'will'
            elif token in auxiliary_verbs:
                tmp = 'be'
            else:
                tmp = token
            tokenized_text.append(tmp)
        return tokenized_text

    if args.dataset == 'tripadvisor':
        TEXT = data.Field(tokenize=tokenize, lower=True, batch_first=True, truncate_first=True)
        LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

        test = CustomDataset(test_path, text_field=TEXT, label_field=LABEL, test=True)
        train = CustomDataset(train_path, text_field=TEXT, label_field=LABEL)
        # The train and test datasets above should be cached instead of being rebuilt every run.
        if args.wordembedding == "glove-6b":
            vectors = GloVe(name='6B', dim=args.embed_dim)
        elif args.wordembedding == "FastText":
            vectors = FastText(language='en')
        else:
            raise NotImplementedError

        # # FastText
        # vectors = FastText(name='6B', dim=args.embed_dim)

        vectors.unk_init = init.xavier_uniform

        # The line below raised an error:
        # TEXT.build_vocab(train, vectors=vectors, max_size=30000)
        TEXT.build_vocab(train, vectors=vectors, max_size=10000, min_freq=10)
        LABEL.build_vocab(train)

        print('train.fields', train.fields)
        print('train.name', getattr(train, 'text'))
        print('len(train)', len(train))
        print('vars(train[0])', vars(train[0]))

        # using the training corpus to create the vocabulary
        train_iter = data.Iterator(dataset=train, batch_size=args.batch_size, train=True,
                                   repeat=False, device=0 if using_gpu else -1)
        test_iter = data.Iterator(dataset=test, batch_size=args.batch_size, train=False,
                                  sort=False, device=0 if using_gpu else -1)

        # the number of unique words
        num_tokens = len(TEXT.vocab.itos)
        args.num_tokens = num_tokens

        dev_iter = test_iter

    elif args.dataset == 'SST':
        text_field = data.Field(batch_first=True, lower=True, tokenize=tokenize)
        label_field = data.Field(sequential=False, batch_first=True)
        train_data, dev_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)

        vectors = GloVe(name='6B', dim=args.embed_dim)
        text_field.build_vocab(train_data, vectors=vectors, min_freq=1)
        label_field.build_vocab(train_data)

        train_iter = data.Iterator(train_data, batch_size=args.batch_size, device=0 if using_gpu else -1,
                                   train=True, repeat=False, sort=False, shuffle=True)
        dev_iter = data.Iterator(dev_data, batch_size=args.batch_size, device=0 if using_gpu else -1,
                                 train=False, repeat=False, sort=False, shuffle=False)
        test_iter = data.Iterator(test_data, batch_size=args.batch_size, device=0 if using_gpu else -1,
                                  train=False, repeat=False, sort=False, shuffle=False)

        # train_iter, dev_iter, test_iter = sst(text_field, label_field)
        # train_iter, dev_iter, test_iter = SST.iters(batch_size=16, device=0 if using_gpu else -1, vectors="glove.6B.300d")
        # config.target_class = train_iter.dataset.NUM_CLASSES

        args.num_tokens = len(text_field.vocab)
        args.num_classes = len(label_field.vocab) - 1
        print("num_classes: ", args.num_classes)

    if args.model == "VDCNN":
        net = Model(depth=29, vocabulary_size=args.num_tokens, embed_size=16,
                    n_classes=args.num_classes, k=2, optional_shortcut=True)
    else:
        net = Model(args)

    # # copy pretrained glove word embedding into the model
    # net.embedding.weight.data.copy_(TEXT.vocab.vectors)

    if using_gpu:
        net.cuda()
    return train_iter, test_iter, net
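
# --- Usage sketch (not part of the original code) -----------------------------
# A minimal illustration of how prepare_data_and_model() can be driven. The
# Namespace attributes mirror the ones the function reads (args.test,
# args.dataset, args.wordembedding, args.embed_dim, args.batch_size,
# args.model); _StubModel is a hypothetical stand-in for any model class whose
# constructor accepts `args`, and the batch attribute names .text/.label are
# assumed from the CustomDataset fields, not confirmed by the original code.
if __name__ == "__main__":
    from argparse import Namespace

    import torch.nn as nn

    class _StubModel(nn.Module):
        """Hypothetical placeholder model; any class accepting `args` works here."""

        def __init__(self, args):
            super().__init__()
            self.embedding = nn.Embedding(args.num_tokens, args.embed_dim)
            self.fc = nn.Linear(args.embed_dim, 5)

        def forward(self, x):
            # x: (batch, seq_len) token indices -> (batch, 5) class scores
            return self.fc(self.embedding(x).mean(dim=1))

    example_args = Namespace(test=True, dataset='tripadvisor', wordembedding='glove-6b',
                             embed_dim=300, batch_size=64, model='StubModel')
    train_iter, test_iter, net = prepare_data_and_model(_StubModel, example_args, using_gpu=False)
    for batch in train_iter:
        logits = net(batch.text)   # assumed field name; labels would be batch.label
        break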
def __init__(self, batch_size=30, device=-1):
    """Load the SQuAD contexts, questions and answer spans, build word/char
    vocabularies with GloVe vectors, and create train/dev/val iterators."""
    self.batch_size = batch_size
    self.device = device

    # Define fields: word-level text, char-level text (nested), span label, example index.
    TEXT = data.Field(lower=True, include_lengths=False, batch_first=True)
    CHAR = data.Field(lower=True, include_lengths=False, batch_first=True, tokenize=list)
    TEXT_C = data.NestedField(CHAR)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)
    ID = data.RawField()

    fields = [("context", TEXT), ("query", TEXT), ("label", LABEL),
              ("context_c", TEXT_C), ("query_c", TEXT_C), ("index", INDEX)]

    train_data = []
    val_data = []
    dev_data = []

    # Generate examples; each split is rebuilt from the raw SQuAD files if its
    # pickled cache is missing or unreadable.
    print("Loading datasets...")

    print("Loading training set...")
    try:
        with open("./data/processed/train_set.data", 'rb') as f:
            train_data = pickle.load(f)
        print("Loaded training set from file.")
    except:
        print("Failed to load training set from file. Processing training data...")
        with open("./data/squad/train.context") as f:
            context = list(f)
        with open("./data/squad/train.question") as f:
            query = list(f)
        with open("./data/squad/train.span") as f:
            label = list(f)
        for i in range(len(label)):
            label[i] = list(map(int, label[i].split()))
        for i in tqdm(range(len(context)), ascii=True):
            list_content = [context[i], query[i], label[i], context[i], query[i], i]
            train_ex = data.Example.fromlist(list_content, fields)
            train_data.append(train_ex)
        with open("./data/processed/train_set.data", 'wb') as f:
            pickle.dump(train_data, f)
    train_set = data.Dataset(train_data, fields)

    print("Loading dev set...")
    try:
        with open("./data/processed/dev_set.data", 'rb') as f:
            dev_data = pickle.load(f)
        print("Loaded dev set from file.")
    except:
        print("Failed to load dev set from file. Processing dev data...")
        with open("./data/squad/dev.context") as f:
            context = list(f)
        with open("./data/squad/dev.question") as f:
            query = list(f)
        with open("./data/squad/dev.span") as f:
            label = list(f)
        for i in range(len(label)):
            label[i] = list(map(int, label[i].split()))
        for i in tqdm(range(len(context)), ascii=True):
            list_content = [context[i], query[i], label[i], context[i], query[i], i]
            dev_ex = data.Example.fromlist(list_content, fields)
            dev_data.append(dev_ex)
        with open("./data/processed/dev_set.data", 'wb') as f:
            pickle.dump(dev_data, f)
    dev_set = data.Dataset(dev_data, fields)

    print("Loading validation set...")
    try:
        with open("./data/processed/val_set.data", 'rb') as f:
            val_data = pickle.load(f)
        print("Loaded validation set from file.")
    except:
        print("Failed to load validation set from file. Processing validation data...")
        with open("./data/squad/val.context") as f:
            context = list(f)
        with open("./data/squad/val.question") as f:
            query = list(f)
        with open("./data/squad/val.span") as f:
            label = list(f)
        for i in range(len(label)):
            label[i] = list(map(int, label[i].split()))
        for i in tqdm(range(len(context)), ascii=True):
            list_content = [context[i], query[i], label[i], context[i], query[i], i]
            val_ex = data.Example.fromlist(list_content, fields)
            val_data.append(val_ex)
        with open("./data/processed/val_set.data", 'wb') as f:
            pickle.dump(val_data, f)
    val_set = data.Dataset(val_data, fields)

    print("Loading word embeddings...")
    glove_vecs = GloVe(name='6B', dim=100)
    glove_vecs.unk_init = nn.init.xavier_uniform_

    print("Building vocabulary...")
    TEXT.build_vocab(train_set, vectors=glove_vecs)
    TEXT_C.build_vocab(train_set, min_freq=20)
    self.vocab_vec = TEXT.vocab.vectors
    print(len(self.vocab_vec), " words in word vocabulary.")
    self.char_size = len(TEXT_C.vocab)
    print(len(TEXT_C.vocab), " tokens in char vocabulary.")

    print("Generating iterator...")
    self.train_iter = iter(data.Iterator(train_set, batch_size=self.batch_size, device=self.device,
                                         sort_key=lambda x: len(x.context), repeat=True, sort=True))
    self.dev_iter = iter(data.Iterator(dev_set, batch_size=self.batch_size, device=self.device,
                                       sort_key=lambda x: len(x.context), repeat=False, sort=True))
    self.val_iter = iter(data.Iterator(val_set, batch_size=self.batch_size, device=self.device,
                                       sort_key=lambda x: len(x.context), repeat=True, sort=True))
    print("DataLoader initiated.")
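
# --- Usage sketch (not part of the original file) -----------------------------
# A rough illustration of how this loader is consumed, assuming the __init__
# above belongs to a class named DataLoader (inferred from the final log
# message). Each batch exposes the field names registered above: .context,
# .query, .label, .context_c, .query_c and .index.
#
#   loader = DataLoader(batch_size=32, device=-1)    # -1 selects the CPU in legacy torchtext
#   embeddings = loader.vocab_vec                    # pretrained GloVe matrix, e.g. for nn.Embedding
#   batch = next(loader.train_iter)                  # train_iter repeats indefinitely
#   context_words, query_words = batch.context, batch.query
#   context_chars, query_chars = batch.context_c, batch.query_c
#   answer_span = batch.label                        # [start, end] token indices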