def tag(self, sentence, tag_only=True):
    """Tag every clause of *sentence* and return the combined result.

    The sentence is stripped of surrounding whitespace, split into clauses
    with ``common_lib.re_clause_findall``, and each clause is segmented
    with ``common_lib.cut``.  The segments are handed to ``self._tag``
    (the original docstring describes this step as the Viterbi algorithm)
    and the per-clause results are concatenated in clause order.

    :param sentence: text to tag; surrounding whitespace is ignored.
    :param tag_only: forwarded unchanged to ``self._tag``.
    :return: a list with the items produced by ``self._tag`` for every
        clause; an empty list when the sentence is empty or blank.
    """
    stripped = sentence.strip()
    if not stripped:
        # Nothing to segment, so skip the clause splitter entirely.
        return []

    collected = []
    for piece in common_lib.re_clause_findall.findall(stripped):
        collected.extend(self._tag(common_lib.cut(piece), tag_only))
    return collected
 def tag(self, sentence, tag_only=True):
     """Return the tags for all clauses found in *sentence*.

     After stripping surrounding whitespace, the sentence is broken into
     clauses by ``common_lib.re_clause_findall``; every clause is segmented
     by ``common_lib.cut`` and passed to ``self._tag`` (which the original
     docstring refers to as the Viterbi algorithm).  The results for all
     clauses are joined into one flat list, in clause order.

     :param sentence: text to tag; leading/trailing whitespace is ignored.
     :param tag_only: passed through to ``self._tag`` unchanged.
     :return: list of the items returned by ``self._tag`` for each clause,
         or an empty list for an empty/blank sentence.
     """
     text = sentence.strip()
     if not text:
         # Blank input: there are no clauses to look for.
         return []

     result = []
     for clause_text in common_lib.re_clause_findall.findall(text):
         segments = common_lib.cut(clause_text)
         result.extend(self._tag(segments, tag_only))
     return result
Beispiel #3
0
    def run(self):
        """Segment a range of input lines and append them to output files.

        Reads ``self._input_filename`` line by line (UTF-8).  Lines whose
        zero-based index is below ``self._start`` are skipped.  Every other
        line is split into clauses, each clause is segmented, and one row
        per clause is appended to an output file: the segments joined by
        tabs, each followed by ``/`` (an empty tag slot).  Two newlines are
        written after the rows of each input line.

        The output path is ``self._output_filepath`` formatted with
        ``(index - self._start) // self._gap``, so ``self._gap`` consecutive
        lines share one output file.  Reading stops once ``self._end`` lines
        have been consumed; a non-positive ``self._end`` never stops early.
        """
        with open(self._input_filename, encoding="utf-8") as source:
            for line_no, raw_line in enumerate(source):
                if line_no >= self._start:
                    clause_list = common_lib.re_clause_findall.findall(raw_line.strip())

                    # Lines are bucketed into output files in groups of `_gap`.
                    bucket = (line_no - self._start) // self._gap
                    with open(self._output_filepath % bucket,
                              "a", encoding="utf-8") as sink:
                        for clause_text in clause_list:
                            pieces = common_lib.cut(clause_text)
                            row = "\t".join([piece + "/" for piece in pieces])
                            sink.write("%s\n" % row)
                        sink.write("\n\n")

                # `line_no + 1` is the number of lines consumed so far.
                if 0 < self._end == line_no + 1:
                    break
Beispiel #4
0
def write_(sentence, which):
    """Append *sentence*, segmented and labelled, to the user corpus file.

    ``which`` selects the tag: 1 -> "E", 2 -> "P1", 3 -> "P2", 4 -> "N1",
    5 -> "N2", 6 -> "OT".  Any other value makes the function return
    without segmenting the sentence or touching the file.

    For tags other than "OT" the segments receive positional labels:
    a single segment is labelled ``I-<tag>``; with several segments the
    first is ``B-<tag>``, the last ``E-<tag>`` and the ones in between
    ``M-<tag>``.  For "OT" every segment is labelled plain ``OT``.

    The ``segment/label`` pairs are joined by tabs, stripped of surrounding
    whitespace, and appended as one line to
    ``common_lib.miner_hmm_user_add_corpus_filepath`` (UTF-8).  When the
    segmenter yields no segments an empty line is written for every valid
    ``which`` (previously only "OT" did so; the other tags raised
    ``IndexError``).

    :param sentence: text handed to ``common_lib.cut``.
    :param which: integer selector in 1..6, see above.
    :return: ``None``.
    """
    tag_by_choice = {1: "E", 2: "P1", 3: "P2", 4: "N1", 5: "N2", 6: "OT"}
    tag = tag_by_choice.get(which)
    if tag is None:
        # Unknown selector: bail out before doing any segmentation work.
        return

    cuts = common_lib.cut(sentence)
    count = len(cuts)

    if which == 6:
        labels = ["OT"] * count
    elif count <= 1:
        # Zero segments -> no labels; one segment -> a lone "I-" label.
        labels = ["I-" + tag] * count
    else:
        labels = ["B-" + tag] + ["M-" + tag] * (count - 2) + ["E-" + tag]

    output = "\t".join("%s/%s" % pair for pair in zip(cuts, labels))

    with open(common_lib.miner_hmm_user_add_corpus_filepath,
              "a",
              encoding="utf-8") as f:
        f.write("%s\n" % output.strip())
Beispiel #5
0
    def run(self):
        """Grow the training corpus by repeated two-model agreement.

        Each round calls ``self.distribute()`` (per the original comment it
        hands a random share of the training corpus to each HMM file),
        trains ``self.hmm1`` and ``self.hmm2`` from their files, and then
        tags every line of ``self.bootstrap_contents`` with both models.

        A line whose per-clause tag lists are identical for both models is
        appended to ``self.train_corpus_filepath`` (one row per clause,
        ``segment/tag`` pairs separated by tabs) and dropped from
        ``self.bootstrap_contents``.  The loop ends after the first round
        in which no line was added.

        Accepted lines are removed only after the whole pass has finished.
        Removing them from the list while iterating over it made the
        iterator skip the line that followed each accepted one.
        """
        while True:
            # Refresh each model's training file, then retrain both models.
            self.distribute()
            self.hmm1.train(self.hmm1_filepath)
            self.hmm2.train(self.hmm2_filepath)

            # Lines the two models disagree on; they stay for the next round.
            remaining = []
            with open(self.train_corpus_filepath, 'a', encoding="utf-8") as train_f:
                for line in self.bootstrap_contents:
                    hmm1_tags = []
                    hmm2_tags = []
                    clauses = []
                    for clause in common_lib.re_clause_findall.findall(line.strip()):
                        segments = common_lib.cut(clause)
                        clauses.append(segments)
                        hmm1_tags.append(self.hmm1.tag(segments, tag_only=True))
                        hmm2_tags.append(self.hmm2.tag(segments, tag_only=True))

                    if hmm1_tags != hmm2_tags:
                        remaining.append(line)
                        continue

                    # Both models agree: record the line as new training data.
                    self.added = True
                    print("Add a new data.")
                    rows = [
                        "\t".join("%s/%s" % pair
                                  for pair in zip(clause_segments, clause_tags)).strip()
                        for clause_segments, clause_tags in zip(clauses, hmm1_tags)
                    ]
                    train_f.write("%s\n" % "\n".join(rows).strip())

            # Update in place so other references to the list see the change.
            self.bootstrap_contents[:] = remaining

            print("Length of remaining corpus: %d" % len(self.bootstrap_contents))
            # Stop once a full round adds nothing; otherwise reset the flag.
            if not self.added:
                break
            self.added = False
def write_(sentence, which):
    """Segment *sentence*, label the segments, and append them to the corpus.

    ``which`` chooses the tag: 1 -> "E", 2 -> "P1", 3 -> "P2", 4 -> "N1",
    5 -> "N2", 6 -> "OT".  For any other value the function returns
    immediately, without segmenting the sentence or opening the file.

    Labelling rules:

    * tag "OT": every segment is labelled ``OT``;
    * other tags, one segment: ``I-<tag>``;
    * other tags, several segments: ``B-<tag>`` for the first,
      ``E-<tag>`` for the last, ``M-<tag>`` for those in between.

    The ``segment/label`` pairs are tab-joined, stripped of surrounding
    whitespace, and appended as a single line to
    ``common_lib.miner_hmm_user_add_corpus_filepath`` (UTF-8).  If the
    segmenter returns no segments, an empty line is written for every
    valid ``which``; before this fix the non-"OT" tags raised
    ``IndexError`` in that case.

    :param sentence: text handed to ``common_lib.cut``.
    :param which: integer selector in 1..6, see above.
    :return: ``None``.
    """
    tag = {1: "E", 2: "P1", 3: "P2", 4: "N1", 5: "N2", 6: "OT"}.get(which)
    if tag is None:
        # Invalid selector: nothing to write, so skip segmentation as well.
        return

    cuts = common_lib.cut(sentence)
    total = len(cuts)

    if which == 6:
        labels = ["OT"] * total
    elif total <= 1:
        # No segments -> no labels; a single segment -> one "I-" label.
        labels = ["I-" + tag] * total
    else:
        middle = ["M-" + tag] * (total - 2)
        labels = ["B-" + tag] + middle + ["E-" + tag]

    output = "\t".join("%s/%s" % (segment, label)
                       for segment, label in zip(cuts, labels))

    with open(common_lib.miner_hmm_user_add_corpus_filepath, "a", encoding="utf-8") as f:
        f.write("%s\n" % output.strip())