def predict(sentence, model):
    sen_list = [[[i, 'O\n'] for i in sentence.split()]]
    # sen_list = [[['SOCCER', 'O\n'], ['-', 'O\n'], ['JAPAN', 'O\n'], ['GET', 'O\n'],
    #              ['LUCKY', 'O\n'], ['WIN', 'O\n'], [',', 'O\n'], ['CHINA', 'O\n'],
    #              ['IN', 'O\n'], ['SURPRISE', 'O\n'], ['DEFEAT', 'O\n'], ['.', 'O\n']]]
    test = addCharInformatioin(sen_list)
    predLabels = []
    test_set = padding(createMatrices(test, word2Idx, label2Idx, case2Idx, char2Idx))
    test_batch, test_batch_len = createBatches(test_set)
    for i, data in enumerate(test_batch):
        tokens, casing, char, labels = data
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  # Predict the classes
        predLabels.append(pred)
    entity_labels = []
    j = 0
    words_list = sentence.split()
    for i in predLabels[-1]:
        entity_labels.append((words_list[j], idx2Label[int(i)]))
        j += 1
    print("predLabels", entity_labels)
    return entity_labels
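# A minimal usage sketch (assumptions: the trained network was saved with Keras'
# model.save, the word2Idx / label2Idx / case2Idx / char2Idx / idx2Label mappings
# built at training time are in scope, and the file name "ner_model.h5" is
# hypothetical).
if __name__ == "__main__":
    from keras.models import load_model

    ner_model = load_model("ner_model.h5")
    predict("SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .", ner_model)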
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  # Predict the classes
        correctLabels.append(labels)
        predLabels.append(pred)
        b.update(i)
    b.update(i + 1)
    return predLabels, correctLabels


trainSentences = readfile("data/train.txt")
devSentences = readfile("data/valid.txt")
testSentences = readfile("data/test.txt")

trainSentences = addCharInformatioin(trainSentences)
devSentences = addCharInformatioin(devSentences)
testSentences = addCharInformatioin(testSentences)

labelSet = set()
words = {}

for dataset in [trainSentences, devSentences, testSentences]:
    for sentence in dataset:
        for token, char, label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

# :: Create a mapping for the labels ::
label2Idx = {}
for label in labelSet:
    label2Idx[label] = len(label2Idx)
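# predict() above also needs the inverse mapping from indices back to label
# strings; a one-line sketch of how it is typically built:
idx2Label = {v: k for k, v in label2Idx.items()}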
from extract_all_words import extract_words
from candidate_retriever import generate_training_data

epochs = 100
training_data_path = "../data/ner_training_data.txt"
all_words_path = "../data/words.txt"
word_embedding_path = "../data/glove.6B.100d.txt"

if not os.path.isfile(all_words_path):
    extract_words()

if not os.path.isfile(training_data_path):
    generate_training_data()

trainSentences = readfile(training_data_path)
trainSentences = addCharInformatioin(trainSentences)

## LOAD all words from train, test and dev
words = {}
with open(all_words_path, encoding="utf-8") as f:
    content = f.readlines()
for w in content:
    # iterate over the lines directly (the original looped over enumerate(content),
    # which keyed the dict with (index, line) tuples) and strip the trailing newline
    words[w.strip()] = True

# :: Create a mapping for the labels ::
label2Idx = {}
label2Idx["I"] = 1
label2Idx["O"] = 0

# :: Read in word embeddings ::
word2Idx = {}
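# Sketch of the embedding loader that typically follows (an assumption, not the
# verbatim original; assumes numpy is imported as np, as in the other scripts).
# glove.6B.100d.txt stores one token per line followed by its 100 float
# components; PADDING/UNKNOWN rows are added before the real vocabulary.
wordEmbeddings = []
with open(word_embedding_path, encoding="utf-8") as fEmbeddings:
    for line in fEmbeddings:
        split = line.strip().split(" ")
        word = split[0]
        if len(word2Idx) == 0:  # add padding + unknown vectors first
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            wordEmbeddings.append(np.zeros(len(split) - 1))
            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            wordEmbeddings.append(np.random.uniform(-0.25, 0.25, len(split) - 1))
        if word.lower() in words:  # keep only vectors for words we actually use
            word2Idx[word] = len(word2Idx)
            wordEmbeddings.append(np.array([float(num) for num in split[1:]]))
wordEmbeddings = np.array(wordEmbeddings)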
    # the first 10% of the sentences are kept for training; the remaining 90% form the learning pool
    learnSentences = trainSentences[int(len(trainSentences) / 10):]
    trainSentences = trainSentences[:int(len(trainSentences) / 10)]
    testSentences = readfile("twitter/TwitterTestBIO.tsv")
elif datasetName == "Medline":
    trainSentences = readfileTwitter("twitter/MedlineBIO.tsv")
    learnSentences = []
    testSentences = []
elif datasetName == "Cadec":
    trainSentences = readfileTwitter("twitter/CadecBIO.tsv")
    learnSentences = []
    testSentences = []

trainSentences = addCharInformatioin(trainSentences)
learnSentences = addCharInformatioin(learnSentences)
testSentences = addCharInformatioin(testSentences)

labelSet = set()
words = {}

for dataset in [trainSentences, learnSentences, testSentences]:
    for sentence in dataset:
        for token, char, label in sentence:
            labelSet.add(label)
            words[token.lower()] = True

# :: Create a mapping for the labels ::
label2Idx = {}
def make_dataset(file_name):
    sentences = readfile(file_name)
    sentences = addCharInformatioin(sentences)
    return sentences
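# Usage sketch: the helper collapses the readfile + addCharInformatioin pair
# used in the scripts above, e.g.
#   trainSentences = make_dataset("data/train.txt")
#   devSentences = make_dataset("data/valid.txt")
#   testSentences = make_dataset("data/test.txt")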