Example 1
def __init__(self, word=None, embedding=None, left=None, right=None):
    # binary-tree node constructor (enclosing class not shown):
    # wrap the (word, vector) pair in a WordEmbedding, or store None
    if word is not None and embedding is not None:
        self.data = WordEmbedding(word, embedding)
    else:
        self.data = None
    self.left = left
    self.right = right
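
None of these snippets define the WordEmbedding class they construct. As a point of reference, here is a minimal sketch of what Examples 1, 2, 3, 5, 6 and 8 appear to assume; only the attribute names word and embedding are inferred from usage, and the float conversion is an assumption. (Examples 4 and 7 use an unrelated nn.Module that happens to share the name.)

# Hypothetical sketch of the data-holder class the snippets assume.
class WordEmbedding:
    def __init__(self, word, embedding=None):
        self.word = word
        # GloVe lines carry the vector as strings; converting is an assumption
        self.embedding = [float(x) for x in embedding] if embedding else None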
Example 2
def Search(T, k):
    """Return the WordEmbedding whose word matches k, or None if k
    is not in the tree."""
    s = WordEmbedding(k)  # wrap the key string so it compares by .word
    for item in T.data:
        if s.word == item.word:
            return item
    if T.isLeaf:
        return None
    return Search(T.child[FindChild(T, s)], k)
Example 3
def Search(T, k):
    """Return the WordEmbedding whose word matches k, or None if k
    is not in the tree."""
    j = WordEmbedding(k)  # k is a string; wrap it so it compares by .word

    for item in T.data:
        if j.word == item.word:
            return item

    if T.isLeaf:
        return None
    return Search(T.child[FindChild(T, j)], k)
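
Both search variants call a FindChild helper that neither snippet defines. A hedged sketch of what it presumably does for a B-tree node, assuming T.data is kept sorted by word as the Search loops suggest:

# Hypothetical sketch: index of the child subtree that could contain item.
def FindChild(T, item):
    for i, stored in enumerate(T.data):
        if item.word < stored.word:
            return i  # descend left of the first larger key
    return len(T.data)  # larger than every key here: rightmost child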
Example 4
    def __init__(self, args, model_params):
        super(entityRelation, self).__init__()
        print("build network...")
        print("bbb")
        self.gpu = args.ifgpu
        self.label_size = model_params.label_alphabet.size()
        self.bert_encoder_dim = args.encoder_dim
        self.targetHiddenDim = args.targetHiddenDim
        self.relationHiddenDim = args.relationHiddenDim
        self.relation_num = args.relationNum
        self.drop = args.dropout
        # building the model
        # encoding layer
        self.Embedding = WordEmbedding(args, model_params)
        self.encoder = WordHiddenRep(args, model_params)
        # module linear
        self.u_input_Linear = nn.Linear(self.bert_encoder_dim,
                                        self.targetHiddenDim)
        self.r_input_Linear = nn.Linear(self.bert_encoder_dim,
                                        self.relationHiddenDim)
        # Tag Linear
        self.targetHidden2Tag = nn.Linear(self.targetHiddenDim,
                                          self.label_size + 2)  # +2 for CRF start/stop tags
        # CRF
        self.crf = CRF(self.label_size, self.gpu)
        # Relation
        self.relationAttention = RelationAttention(args)
        # Dropout
        self.dropout = nn.Dropout(self.drop)

        if self.gpu:
            self.Embedding = self.Embedding.cuda()
            self.encoder = self.encoder.cuda()
            self.u_input_Linear = self.u_input_Linear.cuda()
            self.r_input_Linear = self.r_input_Linear.cuda()
            self.targetHidden2Tag = self.targetHidden2Tag.cuda()
            self.crf = self.crf.cuda()
            self.relationAttention = self.relationAttention.cuda()
            self.dropout = self.dropout.cuda()
Example 5
def buildBTree(max_data, filename):
    """Build a B-tree of WordEmbedding objects from a GloVe-style file."""
    try:
        # cap the number of items per node at the value passed as max_data
        T = btree.BTree([], max_data=max_data)
        # the GloVe file uses utf8 encoding; the with-block closes it even on
        # an early exception
        with open(filename, "r", encoding="utf8") as f:
            for line in f:
                # tokenize: the first token is the word, the rest is its vector
                tokens = line.split(" ")
                # only insert entries whose key is entirely alphabetic
                if tokens[0].isalpha():
                    btree.Insert(T, WordEmbedding(tokens[0], tokens[1:]))
        return T  # the finished B-tree
    except IOError:
        print("File", filename, "not found!\n")
        return None
Example 6
from time import time

def buildHTProbing(algor=1, wordLimit=-1):
    """Build a linear-probing hash table from GloVe vectors, timing the inserts."""
    try:
        runTime = 0
        # the GloVe 6B file holds 400,000 words; 400,009 is the smallest prime
        # above 400,000, giving a prime-sized table slightly larger than the data
        HTP = HashTableLP(400009)
        # dispatch once to the insert method (hash function) selected by algor (1-6)
        insert = getattr(HTP, "insert{}".format(algor))
        lineCount = 0
        with open("glove.6B.50d.txt", encoding="utf8") as file:
            for line in file:
                # first element is the word, the rest is the float array
                dataList = line.split(" ")
                # only insert words made up entirely of alphabetic characters
                if dataList[0].isalpha():
                    startTime = time()
                    insert(WordEmbedding(dataList[0], dataList[1:]))
                    runTime += time() - startTime
                lineCount += 1
                if lineCount == wordLimit:
                    return HTP, runTime
                print(lineCount, end="\r")
        return HTP, runTime

    except Exception as e:
        print(e)
        HTP.print_table()
        raise
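
A plausible way to drive this function and compare the six hash functions; the wordLimit value is illustrative only:

# Illustrative usage only: time each hash function on the first 10,000 lines.
for algor in range(1, 7):
    HTP, runTime = buildHTProbing(algor, wordLimit=10000)
    print("hash function", algor, "insert time:", runTime, "seconds")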
Example 7
class entityRelation(nn.Module):
    def __init__(self, args, model_params):
        super(entityRelation, self).__init__()
        print("build network...")
        print("bbb")
        self.gpu = args.ifgpu
        self.label_size = model_params.label_alphabet.size()
        self.bert_encoder_dim = args.encoder_dim
        self.targetHiddenDim = args.targetHiddenDim
        self.relationHiddenDim = args.relationHiddenDim
        self.relation_num = args.relationNum
        self.drop = args.dropout
        # building the model
        # encoding layer
        self.Embedding = WordEmbedding(args, model_params)
        self.encoder = WordHiddenRep(args, model_params)
        # module linear
        self.u_input_Linear = nn.Linear(self.bert_encoder_dim,
                                        self.targetHiddenDim)
        self.r_input_Linear = nn.Linear(self.bert_encoder_dim,
                                        self.relationHiddenDim)
        # Tag Linear
        self.targetHidden2Tag = nn.Linear(self.targetHiddenDim,
                                          self.label_size + 2)  # +2 for CRF start/stop tags
        # CRF
        self.crf = CRF(self.label_size, self.gpu)
        # Relation
        self.relationAttention = RelationAttention(args)
        # Dropout
        self.dropout = nn.Dropout(self.drop)

        if self.gpu:
            self.Embedding = self.Embedding.cuda()
            self.encoder = self.encoder.cuda()
            self.u_input_Linear = self.u_input_Linear.cuda()
            self.r_input_Linear = self.r_input_Linear.cuda()
            self.targetHidden2Tag = self.targetHidden2Tag.cuda()
            self.crf = self.crf.cuda()
            self.relationAttention = self.relationAttention.cuda()
            self.dropout = self.dropout.cuda()

    def neg_log_likelihood_loss(self, all_input_ids, input_length,
                                all_input_mask, all_char_ids, char_length,
                                char_recover, all_relations, all_labels):

        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)

        targetPredictScore, R_tensor = self.mainStructure(
            all_input_ids, input_length, all_input_mask, all_char_ids,
            char_length, char_recover)

        target_loss = self.crf.neg_log_likelihood_loss(
            targetPredictScore, all_input_mask.byte(),
            all_labels) / (batch_size)
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore,
                                                   all_input_mask.byte())

        # flatten gold relations and predicted scores to (N, relation_num)
        relationTarget = all_relations.transpose(1, 3).contiguous().view(
            -1, self.relation_num)
        relationPredict = R_tensor.transpose(1, 3).contiguous().view(
            -1, self.relation_num)
        # size_average=False is deprecated; reduction="sum" is its equivalent
        relation_loss_function = nn.BCELoss(reduction="sum")
        relation_loss = relation_loss_function(
            relationPredict, relationTarget.float()) / (batch_size * seq_len)

        return target_loss, relation_loss, tag_seq, R_tensor

    def forward(self, all_input_ids, input_length, all_input_mask,
                all_char_ids, char_length, char_recover):

        targetPredictScore, R_tensor = self.mainStructure(
            all_input_ids, input_length, all_input_mask, all_char_ids,
            char_length, char_recover)
        scores, tag_seq = self.crf._viterbi_decode(targetPredictScore,
                                                   all_input_mask.byte())

        return tag_seq, R_tensor

    def mainStructure(self, all_input_ids, input_length, all_input_mask,
                      all_char_ids, char_length, char_recover):
        batch_size = all_input_ids.size(0)
        seq_len = all_input_ids.size(1)

        # encoding layer
        wordEmbedding = self.Embedding(all_input_ids, all_char_ids,
                                       char_length, char_recover)
        maskEmb = all_input_mask.view(batch_size, seq_len,
                                      1).repeat(1, 1, wordEmbedding.size(2))
        wordEmbedding = wordEmbedding * (maskEmb.float())
        sequence_output = self.encoder(wordEmbedding, input_length)

        # module linear
        h_t = self.u_input_Linear(sequence_output)
        h_r = self.r_input_Linear(sequence_output)

        # entity extraction module
        targetPredictInput = self.targetHidden2Tag(self.dropout(h_t))

        # relation detection module
        relationScore = self.relationAttention(self.dropout(h_r))

        return targetPredictInput, relationScore
Example 8
import time

def buildHashTableProbing(file_name, hashFunction, max_lines):
    """Build a linear-probing hash table from a GloVe file, timing the inserts.

    hashFunction selects one of six hash functions (insert1..insert6):
      1. the length of the string % n
      2. the ascii value (ord(c)) of the first character in the string % n
      3. the product of the ascii values of the first and last characters
         in the string % n
      4. the sum of the ascii values of the characters in the string % n
      5. the recursive formulation h("", n) = 1;
         h(s, n) = (ord(s[0]) + 255 * h(s[1:], n)) % n
      6. (the length of the string // 2) % n
    """
    # catch file-not-found errors from open()
    try:
        totaltime = 0
        HTLP = HashTableLP(400000)
        # dispatch once to the insert method matching the chosen hash function
        insert = getattr(HTLP, "insert{}".format(hashFunction))
        lines = 0  # compared against the max_lines limit
        # the GloVe file uses utf8 encoding; the with-block closes it even on
        # the early return below
        with open(file_name, "r", encoding="utf8") as f:
            for line in f:
                tokens = line.split(" ")
                # store only entries whose key is entirely alphabetic
                if tokens[0].isalpha():
                    start = time.time()
                    insert(WordEmbedding(tokens[0], tokens[1:]))
                    totaltime += time.time() - start
                lines += 1
                # stop early once the line limit is reached
                if lines == max_lines:
                    return totaltime, HTLP
        return totaltime, HTLP

    except IOError:
        print("File {} was not found!".format(file_name))
        return None