def __init__(self, word=None, embedding=None, left=None, right=None):
    if word is not None and embedding is not None:
        self.data = WordEmbedding(word, embedding)
    else:
        self.data = None
    self.left = left
    self.right = right
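# The tree and hash-table snippets in this file assume a WordEmbedding record
# with a .word field (the PyTorch WordEmbedding used in entityRelation below
# is an unrelated module). A minimal sketch; the .emb field name and the
# float conversion are assumptions, not the original definition:
class WordEmbedding:
    def __init__(self, word, embedding=None):
        self.word = word
        # Embedding values arrive as strings from the file; store floats.
        self.emb = [float(x) for x in embedding] if embedding else []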
def Search(T, k):
    # Expects k to be a string; wrap it in a WordEmbedding so it can be
    # compared against the items stored in the tree.
    s = WordEmbedding(k)
    # Returns the item in the tree whose word equals k, or None if k is
    # not in the tree.
    for item in T.data:
        if s.word == item.word:
            return item
    if T.isLeaf:
        return None
    return Search(T.child[FindChild(T, s)], k)
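# Search relies on a FindChild helper that is defined elsewhere; a minimal
# sketch under the usual B-tree convention (T.data sorted by .word) might
# look like this:
def FindChild(T, s):
    # Return the index of the child subtree whose key range contains s.word.
    for i, item in enumerate(T.data):
        if s.word < item.word:
            return i
    return len(T.data)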
def buildBTree(max_keys, filename):
    # Build a B-tree from a GloVe-style embeddings file; max_keys is the
    # maximum number of items per node (renamed from "max" to avoid
    # shadowing the builtin).
    try:
        T = btree.BTree([], max_data=max_keys)
        # The file uses utf8 encoding; "with" closes it when done.
        with open(filename, "r", encoding="utf8") as f:
            for line in f:
                # Tokenize the line: the word, then its embedding values.
                tokens = line.split(" ")
                # Only insert entries that begin with an alphabetical letter.
                if tokens[0].isalpha():
                    btree.Insert(T, WordEmbedding(tokens[0], tokens[1:]))
        return T
    except IOError:
        print("File", filename, "not found!\n")
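# Hypothetical usage, assuming a GloVe-style file where each line is
# "word v1 v2 ... v50":
# T = buildBTree(3, "glove.6B.50d.txt")
# if T is not None:
#     hit = Search(T, "dog")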
from time import time

def buildHTProbing(algor=1, wordLimit=-1):
    # Insert GloVe embeddings into a linear-probing hash table, timing the
    # inserts; algor selects one of the table's insert1..insert6 hash
    # functions, and wordLimit caps the number of lines read (-1 = no cap).
    try:
        runTime = 0
        # The GloVe file holds 400,000 words; 400,009 is the next largest
        # prime number, so use it as the table size.
        HTP = HashTableLP(400009)
        insert = getattr(HTP, "insert{}".format(algor))
        lineCount = 0
        # "with" closes the file even on an early return.
        with open("glove.6B.50d.txt", encoding="utf8") as file:
            for line in file:
                # First element is the word, the rest is the float array.
                dataList = line.split(" ")
                # Only insert if the word starts with an alpha character.
                if dataList[0].isalpha():
                    startTime = time()
                    insert(WordEmbedding(dataList[0], dataList[1:]))
                    runTime += time() - startTime
                lineCount += 1
                if lineCount == wordLimit:
                    return HTP, runTime
                print(lineCount, end="\r")
        return HTP, runTime
    except Exception as e:
        print(e)
        HTP.print_table()
        raise
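# Hypothetical usage of buildHTProbing, timing hash function 3 on the full
# vocabulary or on the first 10,000 lines:
# HTP, runTime = buildHTProbing(algor=3)
# HTP, runTime = buildHTProbing(algor=3, wordLimit=10000)
# print("insert3 took {:.2f}s".format(runTime))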
import torch.nn as nn

# CRF, RelationAttention, WordEmbedding and WordHiddenRep are project-local
# modules imported elsewhere in the repository.


class entityRelation(nn.Module):
    def __init__(self, args, model_params):
        super(entityRelation, self).__init__()
        print("build network...")
        self.gpu = args.ifgpu
        self.label_size = model_params.label_alphabet.size()
        self.bert_encoder_dim = args.encoder_dim
        self.targetHiddenDim = args.targetHiddenDim
        self.relationHiddenDim = args.relationHiddenDim
        self.relation_num = args.relationNum
        self.drop = args.dropout

        # building model
        # encoding layer
        self.Embedding = WordEmbedding(args, model_params)
        self.encoder = WordHiddenRep(args, model_params)
        # module linear
        self.u_input_Linear = nn.Linear(self.bert_encoder_dim, self.targetHiddenDim)
        self.r_input_Linear = nn.Linear(self.bert_encoder_dim, self.relationHiddenDim)
        # tag linear
        self.targetHidden2Tag = nn.Linear(self.targetHiddenDim, self.label_size + 2)
        # CRF
        self.crf = CRF(self.label_size, self.gpu)
        # relation
        self.relationAttention = RelationAttention(args)
        # dropout
        self.dropout = nn.Dropout(self.drop)

        if self.gpu:
            self.Embedding = self.Embedding.cuda()
            self.encoder = self.encoder.cuda()
            self.u_input_Linear = self.u_input_Linear.cuda()
            self.r_input_Linear = self.r_input_Linear.cuda()
            self.targetHidden2Tag = self.targetHidden2Tag.cuda()
            self.crf = self.crf.cuda()
            self.relationAttention = self.relationAttention.cuda()
            self.dropout = self.dropout.cuda()

    def neg_log_likelihood_loss(self, all_input_ids, input_length, all_input_mask,
                                all_char_ids, char_length, char_recover,
                                all_relations, all_labels):
        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)
        targetPredictScore, R_tensor = self.mainStructure(
            all_input_ids, input_length, all_input_mask, all_char_ids,
            char_length, char_recover)
        # CRF loss for entity tagging, averaged over the batch.
        target_loss = self.crf.neg_log_likelihood_loss(
            targetPredictScore, all_input_mask.byte(), all_labels) / batch_size
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore,
                                                   all_input_mask.byte())
        # Binary cross-entropy over the relation score tensor.
        relationScale = all_relations.transpose(1, 3).contiguous().view(
            -1, self.relation_num)
        # size_average=False is deprecated; reduction="sum" is equivalent.
        relation_loss_function = nn.BCELoss(reduction="sum")
        relationScoreLoss = R_tensor.transpose(1, 3).contiguous().view(
            -1, self.relation_num)
        relation_loss = relation_loss_function(
            relationScoreLoss, relationScale.float()) / (batch_size * seq_len)
        return target_loss, relation_loss, tag_seq, R_tensor

    def forward(self, all_input_ids, input_length, all_input_mask,
                all_char_ids, char_length, char_recover):
        targetPredictScore, R_tensor = self.mainStructure(
            all_input_ids, input_length, all_input_mask, all_char_ids,
            char_length, char_recover)
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore,
                                                   all_input_mask.byte())
        return tag_seq, R_tensor

    def mainStructure(self, all_input_ids, input_length, all_input_mask,
                      all_char_ids, char_length, char_recover):
        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)
        # encoding layer
        wordEmbedding = self.Embedding(all_input_ids, all_char_ids,
                                       char_length, char_recover)
        maskEmb = all_input_mask.view(batch_size, seq_len, 1).repeat(
            1, 1, wordEmbedding.size(2))
        wordEmbedding = wordEmbedding * maskEmb.float()
        sequence_output = self.encoder(wordEmbedding, input_length)
        # module linear
        h_t = self.u_input_Linear(sequence_output)
        h_r = self.r_input_Linear(sequence_output)
        # entity extraction module
        targetPredictInput = self.targetHidden2Tag(self.dropout(h_t))
        # relation detection module
        relationScore = self.relationAttention(self.dropout(h_r))
        return targetPredictInput, relationScore
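# Hypothetical training-step usage; the tensors come from the project's data
# loader and follow the method signatures above:
# model = entityRelation(args, model_params)
# target_loss, relation_loss, tag_seq, R_tensor = model.neg_log_likelihood_loss(
#     all_input_ids, input_length, all_input_mask,
#     all_char_ids, char_length, char_recover,
#     all_relations, all_labels)
# loss = target_loss + relation_loss
# loss.backward()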
import time

def buildHashTableProbing(file_name, hashFunction, max_lines):
    # Insert GloVe embeddings into a linear-probing hash table, timing the
    # inserts; max_lines caps how many lines are read (renamed from "max" to
    # avoid shadowing the builtin). hashFunction selects one of:
    #   1: the length of the string % n
    #   2: the ascii value (ord(c)) of the first character in the string % n
    #   3: the product of the ascii values of the first and last characters % n
    #   4: the sum of the ascii values of the characters in the string % n
    #   5: the recursive formulation h("", n) = 1;
    #      h(s, n) = (ord(s[0]) + 255 * h(s[1:], n)) % n
    #   6: (the length of the string // 2) % n
    # Catch the file-not-found exception.
    try:
        totaltime = 0
        HTLP = HashTableLP(400000)
        insert = getattr(HTLP, "insert{}".format(hashFunction))
        lines = 0
        # The file uses utf8 encoding; "with" closes it even when the line
        # limit triggers an early return (the original closed it only on the
        # fall-through path).
        with open(file_name, "r", encoding="utf8") as f:
            for line in f:
                tokens = line.split(" ")
                # Store only entries that begin with an alphabetic letter
                # (A-Z, lowercase or uppercase).
                if tokens[0].isalpha():
                    start = time.time()
                    insert(WordEmbedding(tokens[0], tokens[1:]))
                    totaltime += time.time() - start
                lines = lines + 1
                # Stop once the maximum amount of lines has been read.
                if lines == max_lines:
                    return totaltime, HTLP
        return totaltime, HTLP
    except IOError:
        print("File {} was not found!".format(file_name))
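# The hash-table snippets above rely on a HashTableLP class defined
# elsewhere; a minimal linear-probing sketch of one variant (insert4: sum of
# character codes mod table size) could look like this. The class layout and
# field names are assumptions, not the original implementation:
class HashTableLP:
    def __init__(self, size):
        self.item = [None] * size  # open-addressing slots

    def insert4(self, record):
        # Hash: sum of the character codes of the word, mod table size.
        n = len(self.item)
        pos = sum(ord(c) for c in record.word) % n
        # Linear probing: scan forward until an empty slot is found.
        for i in range(n):
            slot = (pos + i) % n
            if self.item[slot] is None:
                self.item[slot] = record
                return slot
        return -1  # table is full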