Code example #1
    def parse(self, corpus, tokenise=False):
        '''Parses each sentence in `corpus` (lists of CoNLL-U token rows), predicting POS tags and heads, and prints the result in CoNLL-U format to stdout.'''
        # Assume untokenised corpus has \n between sentences and ' ' between words
        #s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        #w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()

        reading = True
        for sentence in corpus:
            # print(c, n, '|||', sentence);
            # print(n, end='', file=sys.stderr)
            print("sentence =============", file=sys.stderr)
            print(sentence, file=sys.stderr)
            
            # skip tokens whose id contains '.' (CoNLL-U empty nodes); keep the rest
            trimed_sentence = []
            V = [0]
            for token in sentence:
                if "." in token[0]:
                    continue
                V.append(int(token[0]))
                trimed_sentence.append(token)
            
            E = []
            F = {}
            context = self.START + [self._normalise(w[1]) for w in trimed_sentence] + self.END
            print("context ============\n", context, file=sys.stderr)
            tags = [w[3] for w in sentence]
            guessed_tags = {}
            POS_feats = {}
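            # Score every candidate arc: the outer loop takes each head i (0 = ROOT),
            # the first inner loop covers dependents to its right (j > i), and the
            # extra block below it covers dependents to its left (j < i). POS tags are
            # taken from tagdict when the word is unambiguous, otherwise predicted on
            # the fly the first time a dependent is scored.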
            for i in range(0, len(V)):
                for j in range(i + 1, len(V)):
                    print((i,j), file=sys.stderr)
                    dep = j
                    head = i
                    token = trimed_sentence[j-1]
                    depWord = token[1]
                    ########## depPOS = token[3]
                    
                    # head info
                    if i == 0:
                        headWord = "ROOT"
                        headPOS = "ROOT"
                    else:
                        h_token = trimed_sentence[i-1]
                        headWord = h_token[1]
                        headPOS = guessed_tags[head]

                    # prev retrieval
                    if j == 1:
                        prev = "ROOT"
                        prev2 = "START"
                    elif j == 2:
                        prev = guessed_tags[j-1]
                        prev2 = "ROOT"
                    else:
                        prev = guessed_tags[j-1]
                        prev2 = guessed_tags[j-2]

                    ########### depPOS
                    depPOS = self.tagdict.get(depWord)
                    # print("depPOS and word ==============\n", depPOS, depWord, file=sys.stderr)
                    if not depPOS:
                        depPOS = guessed_tags.setdefault(j, "current")
                        # print("TAGdict =========Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                        if depPOS == "current":
                            fPOS = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                            print("fPOS =================================\n", fPOS, file=sys.stderr)
                            depPOS = self.model.POS_predict(fPOS)
                            # print("Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                            guessed_tags[dep] = depPOS
                            POS_feats[dep] = fPOS
                    else:
                        guessed_tags[dep] = depPOS
                    # print("guessed_tags =============\n", guessed_tags, file=sys.stderr)
                    # break
                    # get features
                    feats = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                    # print(feats)
                    guess = self.model.predict(feats, depPOS)

                    e_tmp =[head, dep, guess]
                    E.append(e_tmp)
                    F[(head,dep)] = feats
                if i >= 2:
                    for j in range(1,i):
                        print((i,j), file=sys.stderr)
                        dep = j
                        head = i
                        token = trimed_sentence[j-1]
                        depWord = token[1]
                        ########## depPOS = token[3]
                        
                        # head info
                        if i == 0:
                            headWord = "ROOT"
                            headPOS = "ROOT"
                        else:
                            h_token = trimed_sentence[i-1]
                            headWord = h_token[1]
                            headPOS = guessed_tags[head]

                        # prev retrieval
                        if j == 1:
                            prev = "ROOT"
                            prev2 = "START"
                        elif j == 2:
                            prev = guessed_tags[j-1]
                            prev2 = "ROOT"
                        else:
                            prev = guessed_tags[j-1]
                            prev2 = guessed_tags[j-2]

                        ########### depPOS
                        depPOS = self.tagdict.get(depWord)
                        # print("depPOS and word ==============\n", depPOS, depWord, file=sys.stderr)
                        if not depPOS:
                            depPOS = guessed_tags.setdefault(j, "current")
                            # print("TAGdict =========Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                            if depPOS == "current":
                                fPOS = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                                # print("fPOS =================================\n", fPOS, file=sys.stderr)
                                depPOS = self.model.POS_predict(fPOS)
                                # print("Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                                guessed_tags[dep] = depPOS
                                POS_feats[dep] = fPOS
                        else:
                            guessed_tags[dep] = depPOS
                        # print("guessed_tags =============\n", guessed_tags, file=sys.stderr)
                        # break
                        # get features
                        feats = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                        # print(feats)
                        guess = self.model.predict(feats, depPOS)

                        e_tmp =[head, dep, guess]
                        E.append(e_tmp)
                        F[(head,dep)] = feats
                        
            # print("Gold V =================", file=sys.stderr)
            # print(G_V, file=sys.stderr)
            # print("Gold E =================", file=sys.stderr)
            # print(G_E, file=sys.stderr)
            # print("V ======================", file=sys.stderr)
            # print(V, file=sys.stderr)
            # print("E ======================", file=sys.stderr)
            # print(E, file=sys.stderr)
            # print("F =====================", file=sys.stderr)
            # print(F, file=sys.stderr)
            # print("==============================================================", file=sys.stderr)
            print("Maxspan", file=sys.stderr)
            M = maxspan(V,E)
            print(M, file=sys.stderr)
            
            if M:
                print("#\n#", file=sys.stdout)
            for token in sentence:
                if "." in token[0]:
                    p_str = ""
                    i = 0
                    while i < len(token):
                        if i == 6:
                            tmp = str(token[i])
                        else:
                            tmp = token[i]
                        p_str += tmp + "\t"
                        i += 1
                    p_str = p_str[0:-1]
                    print(p_str, file=sys.stdout)
                elif token ==[]:
                    print("\n", file=sys.stdout)
                else:
                    dep = int(token[0])
                    for m in M:
                        if m[1] == dep:
                            # token[]
                            p_str = ""
                            i = 0
                            while i < len(token):
                                if i == 3:
                                    tmp = guessed_tags[dep]
                                elif i == 6:
                                    tmp = str(m[0])
                                    # print(tmp, file=sys.stderr)
                                else:
                                    tmp = token[i]
                                p_str += tmp + "\t"
                                i += 1
                            p_str = p_str[0:-1]
                            print(p_str, file=sys.stdout)
            if M:
                print("", file=sys.stdout)

        return 
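
The parse and train methods in this listing iterate over `corpus`/`sentences` as a sequence of sentences, each of which is a list of 10-field CoNLL-U token rows (id, form, lemma, upos, ..., head, ...). A minimal loader that produces that structure might look like the sketch below; it is not part of the listed file, and the name `read_conllu` is hypothetical.

def read_conllu(path):
    '''Yield each sentence as a list of 10-field CoNLL-U token rows,
    the input format assumed by parse() and train() above.'''
    sentence = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:                 # blank line ends a sentence
                if sentence:
                    yield sentence
                    sentence = []
            elif line.startswith('#'):   # comment line, skip
                continue
            else:
                sentence.append(line.split('\t'))
    if sentence:                         # file without a trailing blank line
        yield sentence

# usage sketch (parser object name is assumed):
# parser.parse(list(read_conllu('dev.conllu')))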
Code example #2
    def train(self, sentences, tag_loc=None, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of sentences, each a list of 10-field CoNLL-U token rows
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        # self._make_tagdict(sentences)
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        # print("classes =============\n", self.classes)
        # print("tagdict======================\n",self.tagdict)
        # self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            # for words,tags in sentences:
            for sentence in sentences:
                # print(c, n, '|||', sentence);
                print(n, end='', file=sys.stderr)
                print("sentence =============", file=sys.stderr)
                print(sentence, file=sys.stderr)

                ###### Gold tree #######
                G_V = [0]
                G_E = []
                trimed_sentence = []
                for token in sentence:
                    if "." in token[0]:
                        continue
                    trimed_sentence.append(token)
                    dependent = int(token[0])
                    # print(dependent)
                    head = int(token[6])

                    G_V.append(dependent)
                    e_tmp =(head, dependent)
                    G_E.append(e_tmp)
                
                ###### Guessing weights ##############
                V = copy(G_V)
                E = []
                F = {}
                context = self.START + [self._normalise(w[1]) for w in trimed_sentence] + self.END
                print("context ============\n", context, file=sys.stderr)
                tags = [w[3] for w in sentence]
                guessed_tags = {}
                POS_feats = {}
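                # Score every candidate arc (head i, dependent j) in both directions,
                # exactly as in parse(); maxspan() then extracts the best tree from the
                # complete scored graph, and the perceptron weights are updated from the
                # gold/guessed feature pairs collected for each dependent.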
                for i in range(0, len(V)):
                    for j in range(i + 1, len(V)):
                        print((i,j), file=sys.stderr)
                        dep = j
                        head = i
                        token = trimed_sentence[j-1]
                        depWord = token[1]
                        ########## depPOS = token[3]
                        
                        # head info
                        if i == 0:
                            headWord = "ROOT"
                            headPOS = "ROOT"
                        else:
                            h_token = trimed_sentence[i-1]
                            headWord = h_token[1]
                            headPOS = guessed_tags[head]

                        # prev retrieval
                        if j == 1:
                            prev = "ROOT"
                            prev2 = "START"
                        elif j == 2:
                            prev = guessed_tags[j-1]
                            prev2 = "ROOT"
                        else:
                            prev = guessed_tags[j-1]
                            prev2 = guessed_tags[j-2]

                        ########### depPOS
                        depPOS = self.tagdict.get(depWord)
                        # print("depPOS and word ==============\n", depPOS, depWord, file=sys.stderr)
                        if not depPOS:
                            depPOS = guessed_tags.setdefault(j, "current")
                            # print("TAGdict =========Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                            if depPOS == "current":
                                fPOS = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                                # print("fPOS =================================\n", fPOS, file=sys.stderr)
                                depPOS = self.model.POS_predict(fPOS)
                                # print("Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                                guessed_tags[dep] = depPOS
                                POS_feats[dep] = fPOS
                        else:
                            guessed_tags[dep] = depPOS
                        # print("guessed_tags =============\n", guessed_tags, file=sys.stderr)
                        # break
                        # get features
                        feats = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                        # print(feats)
                        guess = self.model.predict(feats, depPOS)

                        e_tmp =[head, dep, guess]
                        E.append(e_tmp)
                        F[(head,dep)] = feats
                    if i >= 2:
                        for j in range(1,i):
                            print((i,j), file=sys.stderr)
                            dep = j
                            head = i
                            token = trimed_sentence[j-1]
                            depWord = token[1]
                            ########## depPOS = token[3]
                            
                            # head info
                            if i == 0:
                                headWord = "ROOT"
                                headPOS = "ROOT"
                            else:
                                h_token = trimed_sentence[i-1]
                                headWord = h_token[1]
                                headPOS = guessed_tags[head]

                            # prev retrieval
                            if j == 1:
                                prev = "ROOT"
                                prev2 = "START"
                            elif j == 2:
                                prev = guessed_tags[j-1]
                                prev2 = "ROOT"
                            else:
                                prev = guessed_tags[j-1]
                                prev2 = guessed_tags[j-2]

                            ########### depPOS
                            depPOS = self.tagdict.get(depWord)
                            # print("depPOS and word ==============\n", depPOS, depWord, file=sys.stderr)
                            if not depPOS:
                                depPOS = guessed_tags.setdefault(j, "current")
                                # print("TAGdict =========Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                                if depPOS == "current":
                                    fPOS = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                                    # print("fPOS =================================\n", fPOS, file=sys.stderr)
                                    depPOS = self.model.POS_predict(fPOS)
                                    # print("Guessed depPOS and word =============\n", depPOS, depWord, file=sys.stderr)
                                    guessed_tags[dep] = depPOS
                                    POS_feats[dep] = fPOS
                            else:
                                guessed_tags[dep] = depPOS
                            # print("guessed_tags =============\n", guessed_tags, file=sys.stderr)
                            # break
                            # get features
                            feats = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                            # print(feats)
                            guess = self.model.predict(feats, depPOS)

                            e_tmp =[head, dep, guess]
                            E.append(e_tmp)
                            F[(head,dep)] = feats
                            # break
                            # dep = j
                            # head = i
                            # token = trimed_sentence[j-1]
                            # depWord = token[1]
                            # depPOS = token[3]

                            # # head info
                            # h_token = trimed_sentence[i-1]
                            # headWord = h_token[1]
                            # headPOS = h_token[3]

                            # # prev retrieval
                            # if j == 1:
                            #     prev = "ROOT"
                            #     prev2 = "START"
                            # elif j == 2:
                            #     prev = trimed_sentence[j-2][3]
                            #     prev2 = "ROOT"
                            # else:
                            #     prev = trimed_sentence[j-2][3]
                            #     prev2 = trimed_sentence[j-3][3]
                            
                            # # break

                            # # get features
                            # feats = self._get_features(self._normalise(depWord), prev, prev2, headPOS, self._normalise(headWord), depPOS, context, dep, head)
                            # # print(feats)
                            # guess = self.model.predict(feats)

                            # e_tmp =[head, dep, guess]
                            # E.append(e_tmp)
                            # F[(head,dep)] = feats
                print("Gold V =================", file=sys.stderr)
                print(G_V, file=sys.stderr)
                print("Gold E =================", file=sys.stderr)
                print(G_E, file=sys.stderr)
                print("V ======================", file=sys.stderr)
                print(V, file=sys.stderr)
                print("E ======================", file=sys.stderr)
                print(E, file=sys.stderr)
                print("F =====================", file=sys.stderr)
                print(F, file=sys.stderr)
                print("==============================================================", file=sys.stderr)
                print("Maxspan", file=sys.stderr)
                M = maxspan(V,E)
                print(M, file=sys.stderr)

                '''scores'''
                # Calculate the score for M
                M_score = 0
                for m in M:
                    if m in G_E:
                        c += 1
                    n += 1
                    # M_score += self.model.predict(F[m])
                
                # Calculate the score for G_E
                # G_score = 0
                # for g in G_E:
                #     G_score += self.model.predict(F[g])

                # print("M score ====================\n", M_score, file=sys.stderr)
                # print("G score ====================\n", G_score, file=sys.stderr)
                

                # g_feat = []
                # for g in G_E:
                #     g_feat += F[g]
                # print("G feat ====================\n", g_feat)
                # feat = []
                # for m in M:
                #     feat += F[m]
                # print("feat ======================\n", feat)

                # dictionary of features, keys = dependent
                GM_dep_feat = {}
                for g in G_E:
                    for m in M:
                        if g[1] == m[1]:
                            GM_dep_feat[g[1]] = [F[g].keys(),F[m].keys()]
                # print("GM_feat =====================\n", GM_dep_feat)

                # GM_head_feat = {}
                # for g in G_E:
                #     for m in M:
                #         if g[0] == m[0]:
                #             GM

                # count = 0
                # for m in M:
                #     if m in G_E:
                #         count += 1
                # print("count ===================\n", count)

                '''update for POS'''
                for k in POS_feats.keys():
                    self.model.POS_update(tags[k-1],guessed_tags[k], POS_feats[k])
                # print(self.model.weights)

                ###### update for weighting
                for k in GM_dep_feat.keys():
                    gold = GM_dep_feat[k][0]
                    guessed = GM_dep_feat[k][1]
                    # print(guessed_tags[k])
                    self.model.update(gold, guessed, tags[k-1], guessed_tags[k])
                # for m in M:
                #     if m in G_E:
                #         self.model.update(F[m], 1.0)
                #         c += 1
                #     else:
                #         self.model.update(F[m], -1.0)
                #     n += 1
                # for m in G_E:
                #     if not (m in M):
                #         self.model.update(F[m],1.0)
            # break
                
            random.shuffle(sentences)
            print()
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        # print("\nweights ============", file=sys.stderr)
        # print(self.model.weights, file=sys.stderr)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                         open(save_loc, 'wb'), -1)

        print("\nweights ============", file=sys.stderr)
        print(self.model.weights, file=sys.stderr)
        return None
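
`maxspan(V, E)` is not defined anywhere in this listing. From the call sites, V is the vertex list (0 = ROOT), E is a list of [head, dep, score] candidate arcs, and the result M is iterated as (head, dep) pairs, so it plays the role of a maximum-spanning-tree (arborescence) extractor over the scored graph. The following greedy stand-in, which simply keeps the best-scoring incoming arc for every dependent, illustrates the expected interface; it is a deliberately simplified sketch, not the project's implementation, and unlike full Chu-Liu/Edmonds it does not repair cycles.

def greedy_maxspan(V, E):
    '''Pick the highest-scoring head for every non-ROOT vertex in V.

    V: list of vertex ids, with 0 standing for ROOT.
    E: list of [head, dep, score] candidate arcs.
    Returns a list of (head, dep) pairs, one per dependent.
    '''
    best = {}                                    # dep -> (score, head)
    for head, dep, score in E:
        if dep not in best or score > best[dep][0]:
            best[dep] = (score, head)
    return [(head, dep) for dep, (score, head) in sorted(best.items())]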
Code example #3
    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of sentences, each a list of 10-field CoNLL-U token rows
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        # self._make_tagdict(sentences)
        # self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            # for words,tags in sentences:
            for sentence in sentences:
                # print(c, n, '|||', sentence);
                print(n, end='', file=sys.stderr)
                print("sentence =============", file=sys.stderr)
                print(sentence, file=sys.stderr)
                # prev, prev2 = self.START
                # context = self.START + [self._normalise(w[1]) for w in sentence] + self.END

                ###### Gold tree #######
                G_V = [0]
                G_E = []
                trimed_sentence = []
                for token in sentence:
                    if "." in token[0]:
                        continue
                    trimed_sentence.append(token)
                    dependent = int(token[0])
                    # print(dependent)
                    head = int(token[6])

                    # headPOS = sentence[head-1][3]
                    # headW = sentence[head-1][1]

                    # feats = self._get_features(dependent - 1, depWord, context, prev, prev2, headPOS, headW, depPOS)
                    # guess = self.model.predict(feats)

                    G_V.append(dependent)
                    e_tmp = (head, dependent)
                    G_E.append(e_tmp)

                    # prev2 = prev
                    # prev = depPOS

                ###### Guessing weights ##############
                V = copy(G_V)
                E = []
                F = {}
                context = self.START + [
                    self._normalise(w[1]) for w in trimed_sentence
                ] + self.END
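                # Unlike code example #2, this variant reads POS tags directly from
                # column 3 of the input instead of predicting them, and only learns
                # arc weights. Every (head i, dependent j) candidate pair is scored below.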
                for i in range(0, len(V)):
                    for j in range(i + 1, len(V)):
                        print((i, j), file=sys.stderr)
                        dep = j
                        head = i
                        token = trimed_sentence[j - 1]
                        depWord = token[1]
                        depPOS = token[3]
                        # head info
                        if i == 0:
                            headWord = "ROOT"
                            headPOS = "ROOT"
                        else:
                            h_token = trimed_sentence[i - 1]
                            headWord = h_token[1]
                            headPOS = h_token[3]

                        # prev retrieval
                        if j == 1:
                            prev = "ROOT"
                            prev2 = "START"
                        elif j == 2:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = "ROOT"
                        else:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = trimed_sentence[j - 3][3]

                        # get features
                        feats = self._get_features(self._normalise(depWord),
                                                   prev, prev2, headPOS,
                                                   self._normalise(headWord),
                                                   depPOS, context, dep, head)
                        # print(feats)
                        guess = self.model.predict(feats)

                        e_tmp = [head, dep, guess]
                        E.append(e_tmp)
                        F[(head, dep)] = feats
                    if i >= 2:
                        for j in range(1, i):
                            dep = j
                            head = i
                            token = trimed_sentence[j - 1]
                            depWord = token[1]
                            depPOS = token[3]

                            # head info
                            h_token = trimed_sentence[i - 1]
                            headWord = h_token[1]
                            headPOS = h_token[3]

                            # prev retrieval
                            if j == 1:
                                prev = "ROOT"
                                prev2 = "START"
                            elif j == 2:
                                prev = trimed_sentence[j - 2][3]
                                prev2 = "ROOT"
                            else:
                                prev = trimed_sentence[j - 2][3]
                                prev2 = trimed_sentence[j - 3][3]

                            # get features
                            feats = self._get_features(
                                self._normalise(depWord), prev, prev2, headPOS,
                                self._normalise(headWord), depPOS, context,
                                dep, head)
                            # print(feats)
                            guess = self.model.predict(feats)

                            e_tmp = [head, dep, guess]
                            E.append(e_tmp)
                            F[(head, dep)] = feats
                # print("Gold V =================", file=sys.stderr)
                # print(G_V, file=sys.stderr)
                # print("Gold E =================", file=sys.stderr)
                # print(G_E, file=sys.stderr)
                # print("V ======================", file=sys.stderr)
                # print(V, file=sys.stderr)
                # print("E ======================", file=sys.stderr)
                # print(E, file=sys.stderr)
                # print("F =====================", file=sys.stderr)
                # print(F, file=sys.stderr)
                # print("==============================================================", file=sys.stderr)
                # print("Maxspan", file=sys.stderr)
                M = maxspan(V, E)
                # print(M, file=sys.stderr)

                # Calculate the score for M
                M_score = 0
                for m in M:
                    if m in G_E:
                        c += 1
                    n += 1
                    M_score += self.model.predict(F[m])

                # Calculate the score for G_E
                G_score = 0
                for g in G_E:
                    G_score += self.model.predict(F[g])

                # print("M score ====================\n", M_score, file=sys.stderr)
                # print("G score ====================\n", G_score, file=sys.stderr)

                # g_feat = []
                # for g in G_E:
                #     g_feat += F[g]
                # print("G feat ====================\n", g_feat)
                # feat = []
                # for m in M:
                #     feat += F[m]
                # print("feat ======================\n", feat)

                # dictionary of features, keys = dependent
                GM_dep_feat = {}
                for g in G_E:
                    for m in M:
                        if g[1] == m[1]:
                            GM_dep_feat[g[1]] = [F[g].keys(), F[m].keys()]
                # print("GM_feat =====================\n", GM_dep_feat)

                # GM_head_feat = {}
                # for g in G_E:
                #     for m in M:
                #         if g[0] == m[0]:
                #             GM

                # count = 0
                # for m in M:
                #     if m in G_E:
                #         count += 1
                # print("count ===================\n", count)

                ###### update
                # self.model.i += 1
                for k in GM_dep_feat.keys():
                    gold = GM_dep_feat[k][0]
                    guessed = GM_dep_feat[k][1]
                    self.model.update(gold, guessed)
                # for m in M:
                #     if m in G_E:
                #         self.model.update(F[m], 1.0)
                #         c += 1
                #     else:
                #         self.model.update(F[m], -1.0)
                #     n += 1
                # for m in G_E:
                #     if not (m in M):
                #         self.model.update(F[m],1.0)

            random.shuffle(sentences)
            print()
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        # print("\nweights ============", file=sys.stderr)
        # print(self.model.weights, file=sys.stderr)
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights), open(save_loc, 'wb'), -1)

            # prev, prev2 = self.START
            # context = self.START + [self._normalise(w[1]) for w in sentence] + self.END
            # tags = [w[3] for w in sentence]
            # for i, token in enumerate(sentence):
            #     print("token ==========")
            #     print(token)
            #     if "." in token[0]:
            #         continue

            #     word = token[1]
            #     dependentPOS = token[3]
            #     print(token)
            #     head = int(token[6])
            #     # print("head ============")
            #     # print(head)
            #     # print(sentence[head-1])
            #     headPOS = sentence[head-1][3]
            #     headW = sentence[head-1][1]

            #     feats = self._get_features(i, word, context, prev, prev2, headPOS, headW, dependentPOS)
            #     # print("feats ===========")
            #     # print(feats)
            #     guess = self.model.predict(feats)
            #     # print("guess ===========")
            #     # print(guess)
            #     # Need to modify update function
            #     self.model.update(feats)
            #     # print("\nweights ============")
            #     # print(self.model.weights)

            #     prev2 = prev
            #     prev = dependentPOS
            #     c += guess == tags[i]
            #     n += 1
            # break
            # break
        print("\nweights ============", file=sys.stderr)
        print(self.model.weights, file=sys.stderr)
        #         print('\r', end='', file=sys.stderr)
        #     random.shuffle(sentences)
        #     print()
        #     print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)), file=sys.stderr)
        # self.model.average_weights()
        # # Pickle as a binary file
        # if save_loc is not None:
        #     pickle.dump((self.model.weights, self.tagdict, self.classes),
        #                  open(save_loc, 'wb'), -1)
        return None
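
The per-iteration report at the end of the training loop calls a helper `_pc(c, n)` that is not shown in the listing. Given the "Iter {0}: {1}/{2}={3}" format string, it presumably turns the c/n arc-accuracy count into a percentage. A minimal sketch under that assumption:

def _pc(n, d):
    '''Return n/d expressed as a percentage, guarding against a zero denominator.'''
    return (float(n) / d) * 100 if d else 0.0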
Code example #4
    def parse(self, corpus, tokenise=False):
        '''Parses each sentence in `corpus` (lists of CoNLL-U token rows), predicting heads, and prints the result in CoNLL-U format to stdout.'''
        # Assume untokenised corpus has \n between sentences and ' ' between words
        #s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        #w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()

        reading = True
        for sentence in corpus:
            # print(c, n, '|||', sentence);
            # print(n, end='', file=sys.stderr)
            print("sentence =============", file=sys.stderr)
            print(sentence, file=sys.stderr)

            # skip tokens whose id contains '.' (CoNLL-U empty nodes); keep the rest
            trimed_sentence = []
            V = [0]
            for token in sentence:
                if "." in token[0]:
                    continue
                V.append(int(token[0]))
                trimed_sentence.append(token)

            ###### Guessing weights ##############
            E = []
            context = self.START + [
                self._normalise(w[1]) for w in trimed_sentence
            ] + self.END
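            # This parse variant assumes POS tags are already present in column 3 of
            # the input; it only scores candidate arcs and selects heads via maxspan.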
            for i in range(0, len(V)):
                for j in range(i + 1, len(V)):
                    # print((i,j), file=sys.stderr)
                    dep = j
                    head = i
                    token = trimed_sentence[j - 1]
                    depWord = token[1]
                    depPOS = token[3]
                    # head info
                    if i == 0:
                        headWord = "ROOT"
                        headPOS = "ROOT"
                    else:
                        h_token = trimed_sentence[i - 1]
                        headWord = h_token[1]
                        headPOS = h_token[3]

                    # prev retrieval
                    if j == 1:
                        prev = "ROOT"
                        prev2 = "START"
                    elif j == 2:
                        prev = trimed_sentence[j - 2][3]
                        prev2 = "ROOT"
                    else:
                        prev = trimed_sentence[j - 2][3]
                        prev2 = trimed_sentence[j - 3][3]

                    # get features
                    feats = self._get_features(self._normalise(depWord), prev,
                                               prev2, headPOS,
                                               self._normalise(headWord),
                                               depPOS, context, dep, head)
                    # print(feats)
                    guess = self.model.predict(feats)

                    e_tmp = [head, dep, guess]
                    E.append(e_tmp)
                    # F[(head,dep)] = feats
                if i >= 2:
                    for j in range(1, i):
                        dep = j
                        head = i
                        token = trimed_sentence[j - 1]
                        depWord = token[1]
                        depPOS = token[3]

                        # head info
                        h_token = trimed_sentence[i - 1]
                        headWord = h_token[1]
                        headPOS = h_token[3]

                        # prev retrieval
                        if j == 1:
                            prev = "ROOT"
                            prev2 = "START"
                        elif j == 2:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = "ROOT"
                        else:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = trimed_sentence[j - 3][3]

                        # get features
                        feats = self._get_features(self._normalise(depWord),
                                                   prev, prev2, headPOS,
                                                   self._normalise(headWord),
                                                   depPOS, context, dep, head)
                        # print(feats)
                        guess = self.model.predict(feats)

                        e_tmp = [head, dep, guess]
                        E.append(e_tmp)
            print(V, file=sys.stderr)
            print(E, file=sys.stderr)
            M = maxspan(V, E)
            print("M ===================\n", M, file=sys.stderr)
            if M:
                print("#\n#", file=sys.stdout)
            for token in sentence:
                if "." in token[0]:
                    p_str = ""
                    i = 0
                    while i < len(token):
                        if i == 6:
                            tmp = str(token[i])
                        else:
                            tmp = token[i]
                        p_str += tmp + "\t"
                        i += 1
                    p_str = p_str[0:-1]
                    print(p_str, file=sys.stdout)
                elif token == []:
                    print("\n", file=sys.stdout)
                else:
                    dep = int(token[0])
                    for m in M:
                        if m[1] == dep:
                            # token[]
                            p_str = ""
                            i = 0
                            while i < len(token):
                                if i == 6:
                                    tmp = str(m[0])
                                    # print(tmp, file=sys.stderr)
                                else:
                                    tmp = token[i]
                                p_str += tmp + "\t"
                                i += 1
                            p_str = p_str[0:-1]
                            print(p_str, file=sys.stdout)
            if M:
                print("", file=sys.stdout)
            # break

        # sentence = []
#         line = corpus.readline()

#         while reading:
#             if line == '\n':
#                 # sentence boundary
#                 prev, prev2 = self.START
# #                print('s:',sentence)
#                 for words in sentence:
#                     context = self.START + [self._normalise(w[1]) for w in sentence] + self.END
#                     for i, token in enumerate(sentence):
#                         tag = self.tagdict.get(token[1])
#                         if not tag:
#                             # if the word isn't "unambiguous", extract features
#                             features = self._get_features(i, token[1], context, prev, prev2)
#                             # make the prediction
#                             tag = self.model.predict(features)
#                         sentence[i][3] = tag
#                         prev2 = prev
#                         prev = tag
#                 # print out the tokens and their tags
#                 for words in sentence:
#                     print('\t'.join(words))
#                 print()
#                 sentence = []
#             elif line == '':
#                 # we reached the end of the input
#                 reading = False
#             elif line[0] == '#':
#                 # line is a comment line
#                 print(line.strip())
#                 line = corpus.readline()
#                 continue
#             else:
#                 # normal conllu line
#                 row = line.strip().split('\t')
#                 sentence.append(row)

#             # read the next line
#             line = corpus.readline()

        return
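
`self._normalise` is applied to every word form before features are extracted, but its body is not in the listing. Averaged-perceptron taggers and parsers in this style usually lower-case the form and collapse numeric tokens into a handful of placeholder symbols so that rare surface variants share weights. The sketch below (written as a plain function; the exact placeholder strings are an assumption) shows that convention:

def _normalise(word):
    '''Map rare surface variation (hyphenation, digits, years) onto shared symbols.'''
    if '-' in word and word[0] != '-':
        return '!HYPHEN'
    if word.isdigit() and len(word) == 4:
        return '!YEAR'
    if word and word[0].isdigit():
        return '!DIGITS'
    return word.lower()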
Code example #5
File: Weighting.py  Project: kazzyabe/graph-parser
    def parse(self, corpus, tokenise=False):
        '''Parses each sentence in `corpus` (lists of CoNLL-U token rows), predicting heads, and prints the result in CoNLL-U format to stdout.'''
        # Assume untokenised corpus has \n between sentences and ' ' between words
        #s_split = SentenceTokenizer().tokenise if tokenise else lambda t: t.split('\n')
        #w_split = WordTokenizer().tokenise if tokenise else lambda s: s.split()

        reading = True
        for sentence in corpus:
            # print(c, n, '|||', sentence);
            # print(n, end='', file=sys.stderr)
            print("sentence =============", file=sys.stderr)
            print(sentence, file=sys.stderr)

            # skip tokens whose id contains '.' (CoNLL-U empty nodes); keep the rest
            trimed_sentence = []
            V = [0]
            for token in sentence:
                if "." in token[0]:
                    continue
                V.append(int(token[0]))
                trimed_sentence.append(token)

            ###### Guessing weights ##############
            E = []
            context = self.START + [
                self._normalise(w[1]) for w in trimed_sentence
            ] + self.END
            for i in range(0, len(V)):
                for j in range(i + 1, len(V)):
                    # print((i,j), file=sys.stderr)
                    dep = j
                    head = i
                    token = trimed_sentence[j - 1]
                    depWord = token[1]
                    depPOS = token[3]
                    # head info
                    if i == 0:
                        headWord = "ROOT"
                        headPOS = "ROOT"
                    else:
                        h_token = trimed_sentence[i - 1]
                        headWord = h_token[1]
                        headPOS = h_token[3]

                    # prev retrieval
                    if j == 1:
                        prev = "ROOT"
                        prev2 = "START"
                    elif j == 2:
                        prev = trimed_sentence[j - 2][3]
                        prev2 = "ROOT"
                    else:
                        prev = trimed_sentence[j - 2][3]
                        prev2 = trimed_sentence[j - 3][3]

                    # get features
                    feats = self._get_features(self._normalise(depWord), prev,
                                               prev2, headPOS,
                                               self._normalise(headWord),
                                               depPOS, context, dep, head)
                    # print(feats)
                    guess = self.model.predict(feats)

                    e_tmp = [head, dep, guess]
                    E.append(e_tmp)
                    # F[(head,dep)] = feats
                if i >= 2:
                    for j in range(1, i):
                        dep = j
                        head = i
                        token = trimed_sentence[j - 1]
                        depWord = token[1]
                        depPOS = token[3]

                        # head info
                        h_token = trimed_sentence[i - 1]
                        headWord = h_token[1]
                        headPOS = h_token[3]

                        # prev retrieval
                        if j == 1:
                            prev = "ROOT"
                            prev2 = "START"
                        elif j == 2:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = "ROOT"
                        else:
                            prev = trimed_sentence[j - 2][3]
                            prev2 = trimed_sentence[j - 3][3]

                        # get features
                        feats = self._get_features(self._normalise(depWord),
                                                   prev, prev2, headPOS,
                                                   self._normalise(headWord),
                                                   depPOS, context, dep, head)
                        # print(feats)
                        guess = self.model.predict(feats)

                        e_tmp = [head, dep, guess]
                        E.append(e_tmp)
            print(V, file=sys.stderr)
            print(E, file=sys.stderr)
            M = maxspan(V, E)
            print("M ===================\n", M, file=sys.stderr)
            if M:
                print("#\n#", file=sys.stdout)
            for token in sentence:
                if "." in token[0]:
                    p_str = ""
                    i = 0
                    while i < len(token):
                        if i == 6:
                            tmp = str(token[i])
                        else:
                            tmp = token[i]
                        p_str += tmp + "\t"
                        i += 1
                    p_str = p_str[0:-1]
                    print(p_str, file=sys.stdout)
                elif token == []:
                    print("\n", file=sys.stdout)
                else:
                    dep = int(token[0])
                    for m in M:
                        if m[1] == dep:
                            # token[]
                            p_str = ""
                            i = 0
                            while i < len(token):
                                if i == 6:
                                    tmp = str(m[0])
                                    # print(tmp, file=sys.stderr)
                                else:
                                    tmp = token[i]
                                p_str += tmp + "\t"
                                i += 1
                            p_str = p_str[0:-1]
                            print(p_str, file=sys.stdout)
            if M:
                print("", file=sys.stdout)

        return
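
`self._get_features(depWord, prev, prev2, headPOS, headWord, depPOS, context, dep, head)` is the other helper every example depends on but never shows. In perceptron parsers of this kind it returns a dict of binary indicator features describing the candidate arc, which the model scores against its weight table. The sketch below only illustrates the expected shape of that return value; the concrete feature templates are an assumption, not the project's actual templates.

def _get_features(depWord, prev, prev2, headPOS, headWord, depPOS, context, dep, head):
    '''Return a dict of indicator features for the candidate arc head -> dep.'''
    feats = {}

    def add(name, *values):
        feats[' '.join((name,) + tuple(values))] = 1

    add('bias')                                    # intercept feature
    add('dep word', depWord)
    add('dep pos', depPOS)
    add('head word', headWord)
    add('head pos', headPOS)
    add('head pos + dep pos', headPOS, depPOS)
    add('prev tag', prev)
    add('prev2 tag', prev2)
    add('direction', 'L' if head > dep else 'R')   # head to the right or left of dep
    add('distance', str(min(abs(head - dep), 5)))  # bucketed arc length
    return feats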