import joblib
import torch

# config, dataset and EntityModel are assumed to come from the project's own
# config.py, dataset.py and model.py modules.
import config
import dataset
from model import EntityModel


def predict(sentence):
    meta_data = joblib.load("meta.bin")
    enc_pos = meta_data["enc_pos"]
    enc_tag = meta_data["enc_tag"]
    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    print(sentence)
    print(tokenized_sentence)

    # Dummy POS/tag labels: the dataset only needs them to build input tensors.
    test_dataset = dataset.EntityDataset(
        texts=[sentence],
        pos=[[0] * len(sentence)],
        tags=[[0] * len(sentence)],
    )

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH), strict=False)
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)

        geo_tag_sentence = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        pos_sentence = enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        # print(geo_tag_sentence)
        print(pos_sentence)

    return tokenized_sentence, pos_sentence, geo_tag_sentence
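# For context: a minimal, hypothetical sketch of how a "meta.bin" like the one
# loaded above could be produced on the training side. The toy column values
# and the use of sklearn's LabelEncoder are assumptions; only the dict keys
# ("enc_pos", "enc_tag") match the predict() code above.
import joblib
from sklearn import preprocessing

pos_values = ["NNP", "NNP", "VBZ", "DT", "NN"]   # placeholder POS labels
tag_values = ["B-per", "I-per", "O", "O", "O"]   # placeholder NER tags
enc_pos = preprocessing.LabelEncoder().fit(pos_values)
enc_tag = preprocessing.LabelEncoder().fit(tag_values)
joblib.dump({"enc_pos": enc_pos, "enc_tag": enc_tag}, "meta.bin")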
def predict(sentence):
    # num_tag, num_pos and enc_tag are expected to be module-level globals,
    # e.g. loaded from meta.bin as in the variant above.
    tokenized_sentence = config.TOKENIZER.encode(sentence)
    sentence = sentence.split()
    # print(sentence)
    # print(tokenized_sentence)

    test_dataset = EntityDataset(
        texts=[sentence],
        pos=[[0] * len(sentence)],
        tags=[[0] * len(sentence)],
    )

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)
        tags = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]

    # Collect consecutive "B-per" spans and decode them back to person names.
    i = 0
    names = []
    while i < len(tags):
        if tags[i] == "B-per":
            indices = []
            # Bounds check added so a span ending at the last token does not
            # index past the end of the tag list.
            while i < len(tags) and tags[i] == "B-per":
                indices.append(i)
                i += 1
            tokenized_name = tokenized_sentence[indices[0]:indices[-1] + 1]
            name = config.TOKENIZER.decode(tokenized_name)
            names.append(name)
        i += 1
    return names
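# Hypothetical usage of the variant above; it assumes config.MODEL_PATH points
# at a trained checkpoint and that num_tag, num_pos and enc_tag are loaded at
# module level. The actual output depends entirely on the trained model.
names = predict("Barack Obama met Angela Merkel in Berlin")
print(names)  # decoded spans tagged "B-per", possibly an empty list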
sentence = sentence.split()  # This is currently the starting point for my dataset
print(sentence)
print(len(sentence))
print(tokenized_sentence)
print(len(tokenized_sentence))
print(config.TOKENIZER.decode(tokenized_sentence))
print(len(config.TOKENIZER.decode(tokenized_sentence).split()))

test_dataset = dataset.EntityDataset(texts=[sentence], tags=[[0] * len(sentence)])

device = torch.device("cuda")
model = EntityModel(num_tag)
model.load_state_dict(torch.load(config.MODEL_PATH))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, _ = model(**data)
    print(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
    print(
        len(
            enc_tag.inverse_transform(
                tag.argmax(2).cpu().numpy().reshape(-1)
            )[:len(tokenized_sentence)]
        )
    )
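# For context: a small, self-contained illustration of why the wordpiece count
# printed above differs from the whitespace word count. It assumes the
# bert-base-uncased tokenizer from the transformers library, which may differ
# from config.TOKENIZER.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
text = "Angela Merkel visited Washington"
ids = tok.encode(text)  # adds [CLS]/[SEP] and splits rare words into pieces
print(len(text.split()), len(ids))
print(tok.convert_ids_to_tokens(ids))  # sub-word pieces carry a '##' prefix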
sentence = sentence.split()
print(sentence)
print(tokenized_sentence)
print(config.TOKENIZER.decode(tokenized_sentence))
# print(config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence))
bert_tokens = config.TOKENIZER.convert_ids_to_tokens(tokenized_sentence)

test_dataset = dataset.EntityDataset(
    texts=[sentence],
    pos=[[0] * len(sentence)],
    tags=[[0] * len(sentence)],
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.load_state_dict(torch.load(config.MODEL_PATH, map_location=torch.device(device)))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, pos, _ = model(**data)
    resultTags = list(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
    # print([i[0] for i in groupby(resultTags)])
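# For reference: what the commented-out groupby line above would do. The toy
# tag list is made up; itertools.groupby collapses consecutive repeats.
from itertools import groupby

demo_tags = ["O", "O", "B-per", "B-per", "O", "B-geo"]
print([k for k, _ in groupby(demo_tags)])  # ['O', 'B-per', 'O', 'B-geo']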
        # Continuation of a predict() body like the second variant above:
        # tags, tokenized_sentence, names, i and item come from the enclosing
        # loop that is not shown here.
        indices = []
        if item == "B-per":
            while item == "B-per":
                indices.append(i)
                i += 1
                # Guard against running past the end of the tag list.
                item = tags[i] if i < len(tags) else "O"
            tokenized_name = tokenized_sentence[indices[0]:indices[-1] + 1]
            name = config.TOKENIZER.decode(tokenized_name)
            names.append(name)
            indices = []
        i += 1
    resp = ','.join(names)
    return [resp]


if __name__ == "__main__":
    device = torch.device("cpu")
    model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location='cpu'))
    model.to(device)

    # 2) `pack` the BentoService with the required artifacts
    bento_svc = PyTorchModel()
    bento_svc.pack('ner', model)

    # 3) save the BentoService
    saved_path = bento_svc.save()
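# Hypothetical follow-up, assuming the legacy (pre-1.0) BentoML API used above:
# the saved BentoService can be reloaded from the returned path, or served with
# the old CLI ("bentoml serve <saved_path>").
import bentoml

svc = bentoml.load(saved_path)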
num_tag = len(list(enc_tag.classes_))

sentence = """
President Trump has addressed the nation on US supremacy over the world.
"""
tokenized_sentence = config.params["TOKENIZER"].encode(sentence)
sentence = sentence.split()
print(sentence)
print(tokenized_sentence)
print(config.params["TOKENIZER"].convert_ids_to_tokens(tokenized_sentence))

test_dataset = dataset.EntityDataset(
    texts=[sentence],
    tags=[[0] * len(sentence)],
    O_tag_id=enc_tag.transform(["O"])[0],
)

# Fall back to CPU when CUDA is unavailable so the model and the input tensors
# always end up on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EntityModel(num_tag=num_tag)
model.load_state_dict(torch.load(config.params["MODEL_PATH"], map_location=device))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, _ = model(**data)
    print(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
def predic(sentence):
    meta_data = joblib.load(config.METADATA_PATH)
    enc_tag = meta_data["enc_tag"]
    num_tag = len(list(enc_tag.classes_))

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    ofToken = config.tok
    offSets = ofToken.encode(sentence)
    sentence = sentence.split()
    text = offSets.tokens
    print(sentence)
    print(tokenized_sentence)
    print(text)

    test_dataset = dataset.EntityDataset(texts=[sentence], tags=[[0] * len(sentence)])

    device = torch.device("cuda")
    model = EntityModel(num_tag=num_tag)
    model.load_state_dict(torch.load(config.MODEL_PATH))
    model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, _ = model(**data)
        decodedText = enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
        print(decodedText)
        # print(config.TOKENIZER.encode())

    # Drop the [CLS]/[SEP] positions, then merge '##' word pieces back into
    # words while recording which token indices were merged.
    text = text[1:-1]
    decodedText = decodedText[1:-1]
    finalText = ''
    listNum = []
    for i, tex in enumerate(text):
        if tex.startswith('##'):
            finalText = finalText + tex[2:]
            listNum.append([(i - 1), i])
        else:
            finalText = finalText + ' ' + tex

    if len(listNum) > 0:
        listNum.append([0, 0])
        print(f'finalText {finalText}')
        print(f'listNum {listNum}')

        # Chain overlapping index pairs so a word split into several pieces is
        # represented by a single group of indices.
        finalNum = []
        for eachListNum in range(len(listNum) - 1):
            if listNum[eachListNum][1] == listNum[eachListNum + 1][0]:
                tempList = listNum[eachListNum]
                tempList.extend(listNum[eachListNum + 1])
                finalNum.append(tempList)
            else:
                finalNum.append(listNum[eachListNum])
        finalNum = [list(set(i)) for i in finalNum]
        finalNumList = [j for i in finalNum for j in i]
        print(f'finalNum {finalNum}')

        # Drop the last group if it duplicates indices from the one before it.
        if len(finalNum) > 1:
            for i in finalNum[-2]:
                if i in finalNum[-1]:
                    finalNum = finalNum[:-1]
                    break

        # Reduce the per-piece tags to one tag per merged word.
        finalIntent = []
        for i in range(len(decodedText)):
            if i not in finalNumList:
                finalIntent.append(decodedText[i])
            else:
                # index = (eachList if i == eachList[0] else False for enu, eachList in enumerate(finalNum))
                index = []
                for enu, eachList in enumerate(finalNum):
                    if i == eachList[0]:
                        index = eachList
                if index:
                    tempToken = decodedText[index[0]:(index[-1] + 1)]
                    print(f'temp token {tempToken}')
                    tempToken = list(set(tempToken))
                    if len(tempToken) > 1 and 'O' in tempToken:
                        tempToken = ' '.join(tempToken)
                        tempToken = tempToken.replace("O", '').strip().split()
                        tempToken = tempToken[-1]
                    else:
                        # Keep a plain string so the ' '.join below does not
                        # fail when the merged pieces share a single tag.
                        tempToken = tempToken[-1]
                    finalIntent.append(tempToken)
    else:
        finalText = ' '.join(text)
        finalIntent = decodedText

    # Map each surface word to its predicted tag and also return the non-"O"
    # tags on their own.
    intentDict = {}
    for i, inte in enumerate(finalIntent):
        if inte != 'O':
            intentDict[finalText.strip().split(' ')[i]] = inte
    withOutZeroList = ' '.join(finalIntent)
    withOutZeroList = withOutZeroList.replace('O', '').strip().split()
    return withOutZeroList, intentDict
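# Hypothetical usage; config.METADATA_PATH and config.MODEL_PATH must point at
# trained artifacts, and the exact tags depend on the model and label set.
tags_only, tag_by_word = predic("President Trump addressed the nation in Washington")
print(tags_only)    # non-"O" tags only, e.g. something like ['B-per', 'B-geo']
print(tag_by_word)  # mapping of surface word -> predicted tag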