def predict(sentence):
    # Load the label encoders that were saved during training.
    meta_data = joblib.load("meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

    # Dummy POS/tag labels: only the tokens matter at inference time.
    test_dataset = dataset.EntityDataset(
        texts=[sentence],
        pos=[[0] * len(sentence)],
        tags=[[0] * len(sentence)]
    )

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH), strict=False)
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        # Map predicted class indices back to label strings and trim the
        # padded sequence to the tokenized length.
        geo_tag_sentence = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        pos_sentence = enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        # print(geo_tag_sentence)
        print(pos_sentence)

    return tokenized_sentence, pos_sentence, geo_tag_sentence
def predict(sentence):
    # Assumes config, enc_tag, num_tag and num_pos are already defined at module
    # level (e.g. left over from training or restored from meta.bin).
    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    # print(sentence)
    # print(tokenized_sentence)

    # Dummy POS/tag labels: only the tokens matter at inference time.
    test_dataset = EntityDataset(
        texts=[sentence],
        pos=[[0] * len(sentence)],
        tags=[[0] * len(sentence)]
    )

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)
        tags = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]

    # Collect consecutive "B-per" predictions and decode them back into names.
    i = 0
    names = []
    while i < len(tags):
        if tags[i] == "B-per":
            indices = []
            # Walk forward over the whole run of person tokens, without
            # running past the end of the tag sequence.
            while i < len(tags) and tags[i] == "B-per":
                indices.append(i)
                i += 1
            tokenized_name = tokenized_sentence[indices[0]:indices[-1] + 1]
            name = config.TOKENIZER.decode(tokenized_name)
            names.append(name)
        i += 1
    return names
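# Hypothetical usage of the name-extraction predict() above. The sentence is
# made up, and this assumes config, enc_tag, num_tag, num_pos and the trained
# checkpoint at config.MODEL_PATH are already available in this module.
if __name__ == "__main__":
    names = predict("Narendra Modi met Donald Trump in Washington")
    print(names)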
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
)

valid_dataset = dataset.EntityDataset(
    texts=test_sentences, pos=test_pos, tags=test_tag
)
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
)

# device = torch.device("cuda")
device = torch.device("cpu")
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.to(device)

# Apply weight decay to every parameter except biases and LayerNorm parameters.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
def run():
    sentences, pos, tag, enc_pos, enc_tag = utils.process_data(config.DATA_FILE)

    # Persist the label encoders so predict() can map indices back to labels.
    meta_data = {
        "enc_pos": enc_pos,
        "enc_tag": enc_tag
    }
    joblib.dump(meta_data, "meta.bin")

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        test_sentences,
        train_pos,
        test_pos,
        train_tag,
        test_tag
    ) = model_selection.train_test_split(
        sentences, pos, tag, random_state=42, test_size=0.1
    )

    train_dataset = dataset.EntityDataset(
        texts=train_sentences, pos=train_pos, tags=train_tag
    )
    test_dataset = dataset.EntityDataset(
        texts=test_sentences, pos=test_pos, tags=test_tag
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_param = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(
        len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS
    )
    optimizer = AdamW(optimizer_param, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    # Keep the checkpoint with the lowest validation loss.
    best_loss = np.inf
    for epoch in range(config.EPOCHS):
        train_loss = engine.train_fn(train_data_loader, model, optimizer, device, scheduler)
        test_loss = engine.eval_fn(test_data_loader, model, device)
        print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
        if test_loss < best_loss:
            torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
            best_loss = test_loss
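# A minimal sketch of the config module that run() and predict() assume.
# Only the attribute names come from the code above; the tokenizer name, paths
# and hyperparameter values below are placeholder assumptions, not the
# project's actual settings.
import transformers

DATA_FILE = "../input/ner_dataset.csv"   # assumed location of the training CSV
MODEL_PATH = "model.bin"
MODEL_SAVE_PATH = "model.bin"
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    "bert-base-uncased",                  # assumed base model
    do_lower_case=True
)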
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=config.params["TRAIN_BATCH_SIZE"], num_workers=4
)

valid_dataset = dataset.EntityDataset(
    texts=test_sentences,
    tags=test_tag,
    classification_tags=test_classify_tag,
    O_tag_id=enc_tag.transform(["O"])[0]
)
valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=config.params["VALID_BATCH_SIZE"], num_workers=1
)

model = EntityModel(num_tag=num_tag, num_classify_tag=num_classify_tag)
device = torch.device("cuda" if config.params["CUDA"] else "cpu")
model.to(device)  # BioBERT is taking a lot of space

# No weight decay for biases and LayerNorm parameters.
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
num_tag = len(list(enc_tag.classes_))

sentence = """
President Trump has addressed the nation on US supremacy over the world.
"""
tokenized_sentence = config.params["TOKENIZER"].encode(sentence)
sentence = sentence.split()
print(sentence)
print(tokenized_sentence)
print(config.params["TOKENIZER"].convert_ids_to_tokens(tokenized_sentence))

# Dummy tags: only the tokens matter at inference time.
test_dataset = dataset.EntityDataset(
    texts=[sentence],
    tags=[[0] * len(sentence)],
    O_tag_id=enc_tag.transform(["O"])[0]
)

# Fall back to CPU when CUDA is unavailable so the checkpoint, the model and
# the input tensors all end up on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EntityModel(num_tag=num_tag)
model.load_state_dict(torch.load(config.params["MODEL_PATH"], map_location=device))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, _ = model(**data)
    print(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
def predict(sentence):
    meta_data = joblib.load(config.METADATA_PATH)
    enc_tag = meta_data["enc_tag"]
    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    # config.tok is a fast tokenizer whose encoding exposes the word-piece strings.
    ofToken = config.tok
    offSets = ofToken.encode(sentence)
    sentence = sentence.split()
    text = offSets.tokens
    print(sentence)
    print(tokenized_sentence)
    print(text)

    # Dummy tags: only the tokens matter at inference time.
    test_dataset = dataset.EntityDataset(texts=[sentence], tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
        decodedText = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        print(decodedText)

    # Drop the [CLS]/[SEP] positions, then merge word pieces ("##...") back into
    # whole words while remembering which token positions were merged.
    text = text[1:-1]
    decodedText = decodedText[1:-1]
    finalText = ''
    listNum = []
    for i, tex in enumerate(text):
        if tex.startswith('##'):
            finalText = finalText + tex[2:]
            listNum.append([(i - 1), i])
        else:
            finalText = finalText + ' ' + tex

    if len(listNum) > 0:
        listNum.append([0, 0])  # sentinel so the chaining loop can look one ahead
        print(f'finalText {finalText}')
        print(f'listNum {listNum}')

        # Chain adjacent word-piece pairs into one group per original word.
        finalNum = []
        for eachListNum in range(len(listNum) - 1):
            if listNum[eachListNum][1] == listNum[eachListNum + 1][0]:
                tempList = listNum[eachListNum]
                tempList.extend(listNum[eachListNum + 1])
                finalNum.append(tempList)
            else:
                finalNum.append(listNum[eachListNum])
        # Deduplicate each group and keep its indices in order.
        finalNum = [sorted(set(i)) for i in finalNum]
        finalNumList = [j for i in finalNum for j in i]
        print(f'finalNum {finalNum}')

        # Chaining can leave the last group duplicated inside the previous one.
        if len(finalNum) > 1:
            for i in finalNum[-2]:
                if i in finalNum[-1]:
                    finalNum = finalNum[:-1]
                    break

        # One label per original word: untouched positions keep their label,
        # merged positions collapse to a single (non-"O" if possible) label.
        finalIntent = []
        for i in range(len(decodedText)):
            if i not in finalNumList:
                finalIntent.append(decodedText[i])
            else:
                index = []
                for enu, eachList in enumerate(finalNum):
                    if i == eachList[0]:
                        index = eachList
                if index:
                    tempToken = decodedText[index[0]:(index[-1] + 1)]
                    print(f'temp token {tempToken}')
                    tempToken = list(set(tempToken))
                    if len(tempToken) > 1:
                        if 'O' in tempToken:
                            tempToken = ' '.join(tempToken)
                            tempToken = tempToken.replace("O", '').strip().split()
                    tempToken = tempToken[-1]
                    finalIntent.append(tempToken)
    else:
        finalText = ' '.join(text)
        finalIntent = decodedText

    # Map every non-"O" label to the word it was predicted for.
    intentDict = {}
    for i, inte in enumerate(finalIntent):
        if not inte == 'O':
            intentDict[finalText.strip().split(' ')[i]] = inte

    withOutZeroList = ' '.join(finalIntent)
    withOutZeroList = withOutZeroList.replace('O', '').strip().split()
    return withOutZeroList, intentDict
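# Hypothetical call to the predict() above. It assumes the metadata file and
# checkpoint referenced by config exist; the example sentence is taken from the
# earlier inference snippet.
entities, entity_dict = predict("President Trump has addressed the nation on US supremacy over the world.")
print(entities)      # predicted non-"O" labels, in order
print(entity_dict)   # word -> predicted label for every tagged word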