def tag(self, sentence, tag_only=True):
    """Tag every clause of *sentence* and return the results as one flat list.

    The sentence is stripped, split into clauses with
    ``common_lib.re_clause_findall``, and each clause is segmented with
    ``common_lib.cut``. The per-clause results of ``self._tag`` are
    concatenated in clause order.

    NOTE(review): the previous docstring called this "viterbi algorithm";
    any decoding happens inside ``self._tag``, which is not visible here.

    :param sentence: raw text to tag; surrounding whitespace is ignored.
    :param tag_only: forwarded unchanged to ``self._tag``.
    :return: a new list; empty when the sentence is blank.
    """
    text = sentence.strip()
    if not text:
        return []

    collected = []
    for clause in common_lib.re_clause_findall.findall(text):
        segments = common_lib.cut(clause)
        collected.extend(self._tag(segments, tag_only))
    return collected
def run(self):
    """Segment the input file clause by clause into numbered output files.

    Lines whose zero-based index is below ``self._start`` are read but
    skipped. Every other line is split into clauses; each clause is
    segmented and appended to an output file as one tab-separated row of
    ``segment/`` items, followed by two newline characters per input line.
    The output file is ``self._output_filepath`` formatted with
    ``(index - self._start) // self._gap``.

    Reading stops once ``self._end`` lines have been consumed; a
    non-positive ``self._end`` means "read to the end of the file".
    """
    with open(self._input_filename, encoding="utf-8") as source:
        for index, raw_line in enumerate(source):
            if index >= self._start:
                clauses = common_lib.re_clause_findall.findall(raw_line.strip())
                bucket = (index - self._start) // self._gap
                target = self._output_filepath % bucket
                # Opened in append mode for every line, so rows accumulate
                # in whichever bucket file the line belongs to.
                with open(target, "a", encoding="utf-8") as sink:
                    for clause in clauses:
                        row = "\t".join(
                            [piece + "/" for piece in common_lib.cut(clause)]
                        )
                        sink.write("%s\n" % row)
                    sink.write("\n\n")

            # ``index + 1`` is the number of lines consumed so far.
            consumed = index + 1
            if 0 < self._end == consumed:
                break
def write_(sentence, which):
    """Append *sentence*, segmented and labelled, to the user-added corpus.

    The sentence is segmented with ``common_lib.cut`` and written as one
    line of tab-separated ``segment/label`` items to
    ``common_lib.miner_hmm_user_add_corpus_filepath`` (append mode, UTF-8).

    ``which`` selects the tag: 1 -> E, 2 -> P1, 3 -> P2, 4 -> N1, 5 -> N2,
    6 -> OT. Any other value makes the function return without writing
    anything (and, unlike before, without segmenting the sentence).

    Labelling rules:
      * OT: every segment is labelled ``OT``.
      * other tags, exactly one segment: ``I-<tag>``.
      * other tags, several segments: ``B-<tag>`` for the first,
        ``E-<tag>`` for the last and ``M-<tag>`` for those in between.

    If the segmenter returns no segments, an empty line is written for
    every tag. Previously that case raised ``IndexError`` for tags other
    than OT, while OT already produced an empty line.

    :param sentence: text to segment and store.
    :param which: integer selector for the tag, see above.
    :return: ``None``.
    """
    tag_by_choice = {1: "E", 2: "P1", 3: "P2", 4: "N1", 5: "N2", 6: "OT"}
    try:
        tag = tag_by_choice[which]
    except (KeyError, TypeError):
        # Unknown (or unhashable) selector: nothing to record.
        return

    cuts = common_lib.cut(sentence)

    if tag == "OT":
        labels = ["OT"] * len(cuts)
    elif len(cuts) == 1:
        labels = ["I-" + tag]
    else:
        # Start with "middle" everywhere, then mark both ends. The
        # ``if cuts`` guard is what keeps an empty segmentation from
        # crashing.
        labels = ["M-" + tag] * len(cuts)
        if cuts:
            labels[0] = "B-" + tag
            labels[-1] = "E-" + tag

    output = "\t".join("%s/%s" % pair for pair in zip(cuts, labels))

    with open(common_lib.miner_hmm_user_add_corpus_filepath, "a", encoding="utf-8") as f:
        f.write("%s\n" % output.strip())
def run(self):
    """Grow the training corpus until the two taggers stop agreeing.

    Each round:
      1. ``self.distribute()`` is called and both ``self.hmm1`` and
         ``self.hmm2`` are trained from their respective files.
      2. Every line still in ``self.bootstrap_contents`` is split into
         clauses, segmented, and tagged by both models.
      3. When both models produce identical tags for all clauses of a
         line, the line is appended to ``self.train_corpus_filepath``
         (one ``segment/tag`` row per clause, tab-separated) and removed
         from ``self.bootstrap_contents``.

    The loop ends after the first round in which no line was added.

    NOTE(review): a line that yields no clauses produces two empty tag
    lists, which compare equal, so it is written as an empty line and
    removed. This matches the previous behaviour — confirm it is intended.
    """
    while True:
        # Redistribute the training corpus and retrain both models.
        self.distribute()
        self.hmm1.train(self.hmm1_filepath)
        self.hmm2.train(self.hmm2_filepath)

        with open(self.train_corpus_filepath, 'a', encoding="utf-8") as train_f:
            # Iterate over a snapshot: removing from the list that is being
            # iterated makes the loop skip the element following each
            # removed line.
            for line in list(self.bootstrap_contents):
                clauses = []
                hmm1_tags = []
                hmm2_tags = []
                for clause in common_lib.re_clause_findall.findall(line.strip()):
                    segments = common_lib.cut(clause)
                    clauses.append(segments)
                    hmm1_tags.append(self.hmm1.tag(segments, tag_only=True))
                    hmm2_tags.append(self.hmm2.tag(segments, tag_only=True))

                if hmm1_tags != hmm2_tags:
                    continue

                # Both taggers agree: promote the line to training data.
                self.added = True
                print("Add a new data.")
                rows = [
                    "\t".join(
                        "%s/%s" % pair for pair in zip(segments, tags)
                    ).strip()
                    for segments, tags in zip(clauses, hmm1_tags)
                ]
                train_f.write("%s\n" % "\n".join(rows).strip())
                self.bootstrap_contents.remove(line)

        print("Length of remaining corpus: %d" % len(self.bootstrap_contents))

        # Stop once a full round adds nothing; otherwise reset the flag
        # for the next round.
        if not self.added:
            break
        self.added = False