def get_data(config):
    train, dev, test = smt_dataset(
        directory="../data/",
        train=True,
        dev=True,
        test=True,
        fine_grained=True,
    )

    def filter_neutrals(data, labels):
        logger.info("Filtering neutral labels for binary task")
        new_data, new_labels = [], []
        for d, l in zip(data, labels):
            # l is "positive" or "very positive"
            if "positive" in l:
                new_data.append(d)
                new_labels.append("positive")
            # l is "negative" or "very negative"
            elif "negative" in l:
                new_data.append(d)
                new_labels.append("negative")
        return new_data, new_labels

    raw_train = [d["text"] for d in train]
    labels_train = [d["label"] for d in train]
    raw_dev = [d["text"] for d in dev]
    labels_dev = [d["label"] for d in dev]
    raw_test = [d["text"] for d in test]
    labels_test = [d["label"] for d in test]

    num_labels = 5
    if config.binary:
        raw_train, labels_train = filter_neutrals(raw_train, labels_train)
        raw_dev, labels_dev = filter_neutrals(raw_dev, labels_dev)
        raw_test, labels_test = filter_neutrals(raw_test, labels_test)
        num_labels = 2

    return (
        raw_train,
        labels_train,
        raw_dev,
        labels_dev,
        raw_test,
        labels_test,
        num_labels,
    )
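
# Illustrative call of get_data (assumption: `config` only needs a boolean
# `binary` attribute here, so types.SimpleNamespace stands in for the real
# config object):
from types import SimpleNamespace

(raw_train, labels_train, raw_dev, labels_dev,
 raw_test, labels_test, num_labels) = get_data(SimpleNamespace(binary=True))
assert num_labels == 2
assert set(labels_train) == {"positive", "negative"}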
def test_smt_dataset_row(mock_urlretrieve):
    # `mock_urlretrieve` is injected by the test setup; `urlretrieve_side_effect`
    # and `directory` are defined elsewhere in the test module.
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that rows are parsed correctly
    train, dev, test = smt_dataset(directory=directory, test=True, dev=True, train=True)
    assert len(train) > 0
    assert len(dev) > 0
    assert len(test) > 0
    assert train[5] == {
        'text': "Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' " +
                "and `` the self , '' Derrida is an undeniably fascinating and playful fellow .",
        'label': 'positive'
    }

    train = smt_dataset(directory=directory, train=True, subtrees=True)
    assert train[3] == {'text': 'Rock', 'label': 'neutral'}

    train = smt_dataset(directory=directory, train=True, subtrees=True, fine_grained=True)
    assert train[4] == {
        'text': "is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a" +
                " splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven" +
                " Segal .",
        'label': 'very positive'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'trees'))
def __init__(self, mode='train', subtrees=False, embedder=None,
             tokenizer=None, granularity=2, threshold=3):
    fine_grained = granularity == 5
    if tokenizer:
        self.tokenizer = Tokenizer(tokenizer)
    self.subtrees = subtrees

    if mode == 'train':
        self.data = list(
            smt_dataset('sst/', train=True,
                        fine_grained=fine_grained, subtrees=self.subtrees))
    elif mode == 'val':
        self.data = list(
            smt_dataset('sst/', train=False, dev=True,
                        fine_grained=fine_grained, subtrees=self.subtrees))
    elif mode == 'test':
        self.data = list(
            smt_dataset('sst/', train=False, test=True,
                        fine_grained=fine_grained, subtrees=self.subtrees))

    if fine_grained:
        label_to_id = {
            'very negative': 0,
            'negative': 1,
            'neutral': 2,
            'positive': 3,
            'very positive': 4,
        }
    else:
        # Binary setup: collapse the polar labels and give neutral a
        # sentinel id (2) so it can be dropped below.
        label_to_id = {
            'very negative': 0,
            'negative': 0,
            'neutral': 2,
            'positive': 1,
            'very positive': 1,
        }

    if not self.subtrees:
        for item in self.data:
            item['label'] = label_to_id[item['label']]
            if tokenizer:
                item['text'] = self.tokenizer.tokenize(item['text'])
    else:
        # Keep only subtrees that are at least `threshold` tokens long.
        data_list = []
        for item in self.data:
            if len(item['text'].split()) >= threshold:
                text = (self.tokenizer.tokenize(item['text'])
                        if tokenizer else item['text'])
                data_list.append({'text': text,
                                  'label': label_to_id[item['label']]})
        self.data = data_list

    if not fine_grained:
        # Drop neutral examples (sentinel label 2) for the binary task.
        self.data = [d for d in self.data if d['label'] != 2]
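
# Hypothetical usage: the class name is not shown in the excerpt above, so
# `SSTDataset` is assumed here purely for illustration.
train_set = SSTDataset(mode='train', granularity=2)  # binary: labels 0/1, neutrals dropped
val_set = SSTDataset(mode='val', granularity=5)      # fine-grained: labels 0-4
print(len(train_set.data), train_set.data[0])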
from torchnlp.datasets import smt_dataset

# PLDataModuleFromCorpus is assumed to live in slp.plbind.dm, matching the
# slp.plbind.* imports below; the original excerpt omits its import.
from slp.plbind.dm import PLDataModuleFromCorpus
from slp.plbind.module import RnnPLModule
from slp.util.log import configure_logging
from slp.data.collators import SequenceClassificationCollator
from slp.modules.classifier import Classifier
from slp.modules.rnn import WordRNN
from slp.plbind.trainer import make_trainer, watch_model
from slp.plbind.helpers import FromLogits

collate_fn = SequenceClassificationCollator(device="cpu")

if __name__ == "__main__":
    EXPERIMENT_NAME = "smt-words-sentiment-classification"
    configure_logging(f"logs/{EXPERIMENT_NAME}")

    train, dev = smt_dataset(directory="../data/", train=True, dev=True)

    raw_train = [d["text"] for d in train]
    labels_train = [d["label"] for d in train]
    raw_dev = [d["text"] for d in dev]
    labels_dev = [d["label"] for d in dev]

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        val=raw_dev,
        val_labels=labels_dev,
        batch_size=8,
        batch_size_eval=32,
        collate_fn=collate_fn,
# Assumes tokenizer, word2idx, embeddings, DEVICE and collate_fn are defined
# earlier in the original script.
to_token_ids = ToTokenIds(word2idx)
to_tensor = ToTensor(device='cpu')


def create_dataloader(d):
    d = DatasetWrapper(d).map(tokenizer).map(to_token_ids).map(to_tensor)
    return DataLoader(
        d,
        batch_size=8,
        num_workers=1,
        pin_memory=True,
        shuffle=True,
        collate_fn=collate_fn)


# smt_dataset returns (train, dev) here, so map() builds one loader per split.
train_loader, dev_loader = map(
    create_dataloader,
    smt_dataset(directory='../data/', train=True, dev=True))

model = Classifier(
    WordRNN(256, embeddings,
            bidirectional=True, merge_bi='cat',
            packed_sequence=True, attention=False,
            device=DEVICE),
    512, 3)

optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)
criterion = nn.CrossEntropyLoss()
metrics = {
    'accuracy': Accuracy(),
    'loss': Loss(criterion)
}

trainer = SequentialTrainer(
    model,
    optimizer,
    checkpoint_dir='../checkpoints',
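
# Why the classifier above has 3 output classes (illustrative check): with
# fine_grained left at its default, smt_dataset collapses the five SST labels
# into three strings.
from torchnlp.datasets import smt_dataset

train = smt_dataset(directory='../data/', train=True)
print(sorted({row['label'] for row in train}))  # ['negative', 'neutral', 'positive']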
import numpy as np
from torchnlp.datasets import smt_dataset


def create_SMT_labels(data, length):
    # The original excerpt begins mid-function; this prologue is reconstructed
    # from the index lists used below. `labels_tensor` is assumed to be a
    # NumPy array so the fancy-index assignments and np.save calls line up.
    labels = data['label']
    labels_tensor = np.zeros(length, dtype=np.int64)
    very_pos_indices = [
        i for i, x in enumerate(labels) if x == "very positive"
    ]
    pos_indices = [i for i, x in enumerate(labels) if x == "positive"]
    neut_indices = [i for i, x in enumerate(labels) if x == "neutral"]
    neg_indices = [i for i, x in enumerate(labels) if x == "negative"]
    very_neg_indices = [
        i for i, x in enumerate(labels) if x == "very negative"
    ]
    labels_tensor[very_pos_indices] = 0
    labels_tensor[pos_indices] = 1
    labels_tensor[neut_indices] = 2
    labels_tensor[neg_indices] = 3
    labels_tensor[very_neg_indices] = 4
    return labels_tensor


train = smt_dataset(train=True, fine_grained=True)
valid = smt_dataset(dev=True, fine_grained=True)
test = smt_dataset(test=True, fine_grained=True)

train_labels = create_SMT_labels(train, len(train))
train_text = np.array(train.__getitem__('text'))
valid_labels = create_SMT_labels(valid, len(valid))
valid_text = np.array(valid.__getitem__('text'))
test_labels = create_SMT_labels(test, len(test))
test_text = np.array(test.__getitem__('text'))

np.save('sst_train_text', train_text)
np.save('sst_train_labels', train_labels)
np.save('sst_valid_text', valid_text)
np.save('sst_valid_labels', valid_labels)
np.save('sst_test_text', test_text)
np.save('sst_test_labels', test_labels)
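
# Round-trip sanity check (illustrative): np.save appends the .npy extension,
# so the arrays written above load back like this.
reloaded_text = np.load('sst_train_text.npy')
reloaded_labels = np.load('sst_train_labels.npy')
print(reloaded_text[0], int(reloaded_labels[0]))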