def predict(sentence, model):
    """Tag a raw sentence with the trained NER model.

    Relies on module-level helpers and lookups defined elsewhere in this file:
    addCharInformatioin, createMatrices, padding, createBatches,
    word2Idx, label2Idx, case2Idx, char2Idx, idx2Label.
    """
    # Wrap each token with a dummy 'O' label so the training-pipeline helpers can be reused.
    sen_list = [[[i, 'O\n'] for i in sentence.split()]]
    # e.g. [[['SOCCER', 'O\n'], ['-', 'O\n'], ['JAPAN', 'O\n'], ['GET', 'O\n'],
    #        ['LUCKY', 'O\n'], ['WIN', 'O\n'], [',', 'O\n'], ['CHINA', 'O\n'],
    #        ['IN', 'O\n'], ['SURPRISE', 'O\n'], ['DEFEAT', 'O\n'], ['.', 'O\n']]]
    test = addCharInformatioin(sen_list)
    predLabels = []
    test_set = padding(
        createMatrices(test, word2Idx, label2Idx, case2Idx, char2Idx))
    test_batch, test_batch_len = createBatches(test_set)
    for i, data in enumerate(test_batch):
        tokens, casing, char, labels = data
        tokens = np.asarray([tokens])
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = model.predict([tokens, casing, char], verbose=False)[0]
        pred = pred.argmax(axis=-1)  # pick the most likely class per token
        predLabels.append(pred)
    # Pair each input word with its predicted label.
    entity_labels = []
    words_list = sentence.split()
    for j, label_idx in enumerate(predLabels[-1]):
        entity_labels.append((words_list[j], idx2Label[int(label_idx)]))
    print("predLabels", entity_labels)
    return entity_labels
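# --- usage sketch for predict(): the lookup tables saved during training are
# --- reloaded here; the model path "models/model.h5" is an assumption, and the
# --- module-level lookups (label2Idx, case2Idx, char2Idx) must be in scope ---
import numpy as np
from keras.models import load_model

model = load_model("models/model.h5")  # assumed file name
idx2Label = np.load("models/idx2Label.npy", allow_pickle=True).item()
word2Idx = np.load("models/word2Idx.npy", allow_pickle=True).item()

predict("JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .", model)
# e.g. [('JAPAN', 'B-LOC'), ('GET', 'O'), ...]  (labels depend on the trained model)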
# --- continuation of the embedding-loading loop; the enclosing "for" and the
# --- PADDING_TOKEN branch are reconstructed from the class-based embed() below ---
for line in fEmbeddings:
    split = line.strip().split(" ")

    if len(word2Idx) == 0:  # first line: register the special tokens
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        wordEmbeddings.append(np.zeros(len(split) - 1))  # zero vector for padding

        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
        wordEmbeddings.append(vector)

    if split[0].lower() in words:  # keep only embeddings for words seen in the data
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[split[0]] = len(word2Idx)

wordEmbeddings = np.array(wordEmbeddings)

char2Idx = {"PADDING": 0, "UNKNOWN": 1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)

train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
dev_set = padding(
    createMatrices(devSentences, word2Idx, label2Idx, case2Idx, char2Idx))
test_set = padding(
    createMatrices(testSentences, word2Idx, label2Idx, case2Idx, char2Idx))

idx2Label = {v: k for k, v in label2Idx.items()}
np.save("models/idx2Label.npy", idx2Label)
np.save("models/word2Idx.npy", word2Idx)

train_batch, train_batch_len = createBatches(train_set)
dev_batch, dev_batch_len = createBatches(dev_set)
test_batch, test_batch_len = createBatches(test_set)

words_input = Input(shape=(None,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],
                  trainable=False)(words_input)  # call completed from the identical definition in the next snippet
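# --- for context, a minimal sketch of what createMatrices is assumed to return
# --- (the real helper is defined elsewhere in the repo; getCasing is sketched
# --- after the class-based embed() further below) ---
def createMatrices_sketch(sentences, word2Idx, label2Idx, case2Idx, char2Idx):
    dataset = []
    for sentence in sentences:
        wordIndices, caseIndices, charIndices, labelIndices = [], [], [], []
        for word, chars, label in sentence:
            wordIndices.append(word2Idx.get(word.lower(), word2Idx["UNKNOWN_TOKEN"]))
            caseIndices.append(getCasing(word, case2Idx))
            charIndices.append([char2Idx.get(c, char2Idx["UNKNOWN"]) for c in chars])
            labelIndices.append(label2Idx[label])
        dataset.append([wordIndices, caseIndices, charIndices, labelIndices])
    return dataset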
char2Idx = {"PADDING": 0, "UNKNOWN": 1}  # loop head reconstructed from the previous snippet
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)

# :: Hard-coded case lookup ::
case2Idx = {
    'numeric': 0,
    'allLower': 1,
    'allUpper': 2,
    'initialUpper': 3,
    'other': 4,
    'mainly_numeric': 5,
    'contains_digit': 6,
    'PADDING_TOKEN': 7
}
caseEmbeddings = np.identity(len(case2Idx), dtype='float32')  # one-hot casing vectors

train_set = padding(
    createMatrices(trainSentences, word2Idx, label2Idx, case2Idx, char2Idx))
train_batch, train_batch_len = createBatches(train_set)

words_input = Input(shape=(None,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0],
                  output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings],
                  trainable=False)(words_input)
casing_input = Input(shape=(None,), dtype='int32', name='casing_input')
casing = Embedding(output_dim=caseEmbeddings.shape[1],
                   input_dim=caseEmbeddings.shape[0],
                   weights=[caseEmbeddings],
                   trainable=False)(casing_input)
# The excerpt breaks off mid-definition of the character input:
# character_input = Input(shape=(None, ...
# (a completed sketch follows below)
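# --- hedged sketch of how such a char-CNN branch is typically completed: the
# --- padded word length of 52, filter sizes, and LSTM width are assumptions,
# --- not taken from this code ---
from keras.layers import (LSTM, Bidirectional, Conv1D, Dense, Dropout,
                          Embedding, Flatten, Input, MaxPooling1D,
                          TimeDistributed, concatenate)
from keras.models import Model

character_input = Input(shape=(None, 52,), name='char_input')  # 52 = assumed padded word length
embed_char_out = TimeDistributed(
    Embedding(len(char2Idx), 30, embeddings_initializer='random_uniform'),
    name='char_embedding')(character_input)
dropout = Dropout(0.5)(embed_char_out)
conv1d_out = TimeDistributed(
    Conv1D(kernel_size=3, filters=30, padding='same', activation='tanh'))(dropout)
maxpool_out = TimeDistributed(MaxPooling1D(52))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)

# concatenate the word, casing, and character representations per token
output = concatenate([words, casing, char])
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.5))(output)
output = TimeDistributed(Dense(len(label2Idx), activation='softmax'))(output)

model = Model(inputs=[words_input, casing_input, character_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')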
def embed(self):
    """Create word- and character-level embeddings."""
    labelSet = set()
    words = {}

    # collect the unique words and labels in the data
    for dataset in [
            self.trainSentences, self.devSentences, self.testSentences
    ]:
        for sentence in dataset:
            for token, char, label in sentence:
                # token ... token, char ... list of chars, label ... BIO label
                labelSet.add(label)
                words[token.lower()] = True

    # mapping for labels
    self.label2Idx = {}
    for label in labelSet:
        self.label2Idx[label] = len(self.label2Idx)

    # mapping for token cases
    case2Idx = {
        'numeric': 0,
        'allLower': 1,
        'allUpper': 2,
        'initialUpper': 3,
        'other': 4,
        'mainly_numeric': 5,
        'contains_digit': 6,
        'PADDING_TOKEN': 7
    }
    self.caseEmbeddings = np.identity(
        len(case2Idx), dtype='float32')  # identity matrix: one-hot casing vectors

    # read GloVe word embeddings
    word2Idx = {}
    self.wordEmbeddings = []
    fEmbeddings = open("embeddings/glove.6B.50d.txt", encoding="utf-8")

    # loop through each word in the embedding file
    for line in fEmbeddings:
        split = line.strip().split(" ")
        word = split[0]  # the embedded word itself

        if len(word2Idx) == 0:  # first line: add padding + unknown entries
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            vector = np.zeros(len(split) - 1)  # zero vector for 'PADDING' word
            self.wordEmbeddings.append(vector)

            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
            self.wordEmbeddings.append(vector)

        if word.lower() in words:
            vector = np.array([float(num) for num in split[1:]])
            self.wordEmbeddings.append(vector)  # word embedding vector
            word2Idx[word] = len(word2Idx)  # index for this word
    fEmbeddings.close()

    self.wordEmbeddings = np.array(self.wordEmbeddings)

    # dictionary of all possible characters
    self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
        self.char2Idx[c] = len(self.char2Idx)

    # format: [[word indices], [case indices], [padded char indices], [label indices]]
    self.train_set = padding(
        createMatrices(self.trainSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))
    self.dev_set = padding(
        createMatrices(self.devSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))
    self.test_set = padding(
        createMatrices(self.testSentences, word2Idx, self.label2Idx,
                       case2Idx, self.char2Idx))

    self.idx2Label = {v: k for k, v in self.label2Idx.items()}
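# --- case2Idx above is only the lookup table; the helper that maps a token to
# --- one of these categories is not shown here. A minimal sketch of such a
# --- casing function (the name getCasing and the 0.5 threshold are assumptions) ---
def getCasing(word, case2Idx):
    num_digits = sum(char.isdigit() for char in word)
    digit_fraction = num_digits / float(len(word))
    if word.isdigit():
        casing = 'numeric'
    elif digit_fraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():
        casing = 'allLower'
    elif word.isupper():
        casing = 'allUpper'
    elif word[0].isupper():
        casing = 'initialUpper'
    elif num_digits > 0:
        casing = 'contains_digit'
    else:
        casing = 'other'
    return case2Idx[casing]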
def embed(self):
    """Create word- and character-level embeddings, with syntax concatenation."""
    s = SemEvalData()
    k = Komn(s.make_normal_vocabulary(), s.make_syntactical_vocabulary())
    syntax_x, _, syntax_test_x, _ = s.get_data_syntax_concatenation(k)
    # s.make_syntactical_vocabulary() also yields the unique syntactic words

    labelSet, words = self.get_unique_labels_and_words()
    self.map_labels_to_indexes(labelSet)

    # mapping for token cases
    case2Idx = {
        'numeric': 0,
        'allLower': 1,
        'allUpper': 2,
        'initialUpper': 3,
        'other': 4,
        'mainly_numeric': 5,
        'contains_digit': 6,
        'PADDING_TOKEN': 7
    }
    self.caseEmbeddings = np.identity(
        len(case2Idx), dtype='float32')  # identity matrix: one-hot casing vectors

    # build the word-embedding matrix
    word2Idx = {}
    self.wordEmbeddings = []
    for word, vector in k.word_to_emb.items():
        if len(word2Idx) == 0:  # first entry: add padding + unknown
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            self.wordEmbeddings.append(
                np.zeros(len(vector)))  # zero vector for 'PADDING' word

            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
            self.wordEmbeddings.append(
                np.random.uniform(-0.25, 0.25, len(vector)))
            # note: appending directly instead of reassigning `vector`, which in
            # the original clobbered the first real word's embedding

        if word.lower() in words:
            self.wordEmbeddings.append(np.array(vector))  # word embedding vector
            word2Idx[word] = len(word2Idx)  # index for this word

    self.wordEmbeddings = np.array(self.wordEmbeddings)

    # dictionary of all possible characters (extended with characters seen in the data)
    self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>–™Ã©˜¦":
        self.char2Idx[c] = len(self.char2Idx)

    # format: [[word indices], [case indices], [padded char indices], [label indices]]
    self.train_set = padding(
        createMatrices_syntax(self.trainSentences, syntax_x, word2Idx,
                              self.label2Idx, case2Idx, self.char2Idx))
    # self.dev_set = padding(createMatrices(self.devSentences, word2Idx,
    #                                       self.label2Idx, case2Idx, self.char2Idx))
    self.test_set = padding(
        createMatrices_syntax(self.testSentences, syntax_test_x, word2Idx,
                              self.label2Idx, case2Idx, self.char2Idx))
    # non-syntax variants, kept for reference:
    # self.train_set = padding(createMatrices(self.trainSentences, word2Idx,
    #                                         self.label2Idx, case2Idx, self.char2Idx))
    # self.test_set = padding(createMatrices(self.testSentences, word2Idx,
    #                                        self.label2Idx, case2Idx, self.char2Idx))

    self.idx2Label = {v: k for k, v in self.label2Idx.items()}
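# --- hypothetical illustration of the "syntax concatenation" assumed for
# --- get_data_syntax_concatenation(): each word vector is extended with
# --- syntactic features; the tag inventory and helper below are invented ---
import numpy as np

DEP_RELS = ['nsubj', 'dobj', 'amod', 'det', 'root']  # invented inventory

def concat_syntax(word_vec, dep_rel):
    """Append a one-hot dependency-relation vector to a word embedding (sketch)."""
    one_hot = np.zeros(len(DEP_RELS))
    if dep_rel in DEP_RELS:
        one_hot[DEP_RELS.index(dep_rel)] = 1.0
    return np.concatenate([word_vec, one_hot])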
import io

import nltk
import numpy as np
from keras.models import load_model
# `padding` comes from this repo's preprocessing helpers


def main(tweets):
    model = load_model("NER.h5")

    # NLTK chunking pass over every tweet
    tweetsList = []
    ne_chunked_sents_list = []
    for tweet in tweets:
        tokenized_doc = nltk.word_tokenize(tweet)
        tagged_sentences = nltk.pos_tag(tokenized_doc)
        ne_chunked_sents = nltk.ne_chunk(tagged_sentences)
        tweetsList.append(tokenized_doc)
        ne_chunked_sents_list.append(ne_chunked_sents)

    # build the word index from GloVe (the vectors themselves are not needed here)
    word2Idx = {}
    f = io.open("embeddings/glove.6B.100d.txt", encoding="utf-8")
    for line in f:
        split = line.strip().split(" ")
        if len(word2Idx) == 0:  # add padding + unknown
            word2Idx["PADDING_TOKEN"] = len(word2Idx)
            word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        word2Idx[split[0].lower()] = len(word2Idx)
    f.close()

    char2Idx = {"PADDING": 0, "UNKNOWN": 1}
    for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
        char2Idx[c] = len(char2Idx)

    # index every tweet (the original indexed only the leftover `tweet` from the
    # chunking loop; looping over all tweets is the apparent intent)
    res = []
    for tweet in tweets:
        wordInd = []
        charInd = []
        for word in tweet.split():
            wordIdx = word2Idx.get(word.lower(), word2Idx["UNKNOWN_TOKEN"])
            # fall back to UNKNOWN for characters outside char2Idx
            temp_char = [char2Idx.get(char, char2Idx["UNKNOWN"]) for char in word]
            charInd.append(temp_char)
            wordInd.append(wordIdx)
        res.append([wordInd, charInd])
    res = padding(res)

    # collect the NLTK-recognised entity names per tweet
    named_entities = []
    for i, ne_chunked_sents in enumerate(ne_chunked_sents_list):
        named_entities.append([])
        for element in ne_chunked_sents:
            if hasattr(element, 'label'):
                entity_name = ' '.join(c[0] for c in element.leaves())
                # entity_type = element.label()  # NE category
                named_entities[i].append(entity_name)

    # coarse class per token from the Keras model
    # (collected in `ans` but not returned by this function)
    ans = []
    for i in res:
        for j in range(len(i[0])):
            tokens = np.asarray([i[0][j]])
            char = np.asarray([i[1][j]])
            pred = model.predict([[tokens], [char]], verbose=False)[0]
            pred = pred.argmax(axis=-1)  # predict the class
            if pred == 0 or pred == 1:
                ans.append("ORG")
            elif pred == 3 or pred == 8:
                ans.append("LOC")
            elif pred == 4 or pred == 7:
                ans.append("PER")
            else:
                ans.append("O")
    return named_entities
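# --- usage sketch (illustrative only: the tweet texts are made up, and "NER.h5"
# --- plus the GloVe file must exist at the paths hard-coded above) ---
if __name__ == "__main__":
    sample_tweets = [
        "Apple opens a new store in Berlin .",
        "Messi scores twice against Real Madrid .",
    ]
    print(main(sample_tweets))
    # e.g. [['Apple', 'Berlin'], ['Messi', 'Real Madrid']] from the NLTK chunker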