Ejemplo n.º 1
0
 def __init__(self, args):
     """Parse an option string and load the models it selects.

     ``args`` is a whitespace-separated option string. Recognised options:

     * ``-t2s``            set ``useT2S``
     * ``-seg_only``       set ``seg_only`` (load the ``cws_*`` model files
       instead of the ``model_c_*`` ones)
     * ``-filter``         set ``useFilter`` (also builds ``self.myfilter``)
     * ``-deli X``         use ``X`` as ``separator`` (default ``'_'``)
     * ``-model_dir DIR``  read model files from ``DIR`` instead of the
       ``models`` directory next to this file
     * ``-input FILE`` / ``-output FILE``  store the paths in
       ``input_file`` / ``output_file``

     Raises:
         IndexError: if an option that takes a value is the last token.

     NOTE(review): an unrecognised option makes ``__init__`` return
     immediately, leaving the instance without ``prefix``, decoders or
     dictionaries. That long-standing behaviour is preserved here;
     consider raising instead.
     """
     self.user_specified_dict_name = None
     self.model_path_char = None
     self.separator = '_'
     self.useT2S = False
     self.seg_only = False
     self.useFilter = False
     self.use_second = False
     self.input_file = ""
     self.output_file = ""
     self.coding = "utf-8"

     # split() without a separator collapses runs of whitespace. The old
     # split(" ") produced empty tokens for repeated spaces, and those
     # were treated as unknown options, silently aborting initialisation.
     if len(args) > 0:
         tokens = args.split()
     else:
         tokens = []

     # Options that just switch a boolean attribute on.
     flag_options = {
         "-t2s": "useT2S",
         "-seg_only": "seg_only",
         "-filter": "useFilter",
     }
     # Options whose value is the following token.
     # NOTE: a "-user" option (user dictionary) is currently disabled.
     value_options = {
         "-deli": "separator",
         "-model_dir": "model_path_char",
         "-input": "input_file",
         "-output": "output_file",
     }

     c = 0
     while c < len(tokens):
         arg = tokens[c]
         if arg in flag_options:
             setattr(self, flag_options[arg], True)
         elif arg in value_options:
             c += 1
             setattr(self, value_options[arg], tokens[c])
         else:
             # Unknown option: abort (see NOTE in the docstring).
             return
         c += 1

     # Directory prefix for every model file; always ends with "/".
     if self.model_path_char is not None:
         self.prefix = self.model_path_char
         if not self.prefix.endswith("/"):
             self.prefix = self.prefix + "/"
     else:
         self.prefix = os.path.dirname(os.path.realpath(__file__)) + "/models/"

     self.oiraw = ""
     self.raw = ""
     self.poc_cands = []

     # Exactly one of the two decoders is created, depending on seg_only.
     self.cws_tagging_decoder = None
     self.tagging_decoder = None
     if self.seg_only:
         self.cws_tagging_decoder = CBTaggingDecoder()
         self.cws_tagging_decoder.init(
             self.prefix + "cws_model.bin",
             self.prefix + "cws_dat.bin",
             self.prefix + "cws_label.txt")
         self.cws_tagging_decoder.threshold = 0
         self.cws_tagging_decoder.separator = self.separator
         self.cws_tagging_decoder.setLabelTrans()
     else:
         self.tagging_decoder = CBTaggingDecoder()
         self.tagging_decoder.init(
             self.prefix + "model_c_model.bin",
             self.prefix + "model_c_dat.bin",
             self.prefix + "model_c_label.txt")
         self.tagging_decoder.threshold = 10000
         self.tagging_decoder.separator = self.separator
         self.tagging_decoder.setLabelTrans()

     self.preprocesser = Preprocesser()
     self.preprocesser.setT2SMap(self.prefix + "t2s.dat")
     self.nsDict = Postprocesser(self.prefix + "ns.dat", "ns", False)
     self.idiomDict = Postprocesser(self.prefix + "idiom.dat", "i", False)
     # NOTE: the user dictionary (Postprocesser(<name>, "uw", True)) is
     # disabled together with the "-user" option.

     # The filter is only built when "-filter" was given.
     self.myfilter = None
     if self.useFilter:
         self.myfilter = Filter(self.prefix + "xu.dat",
                                self.prefix + "time.dat")
Ejemplo n.º 2
0
 def __init__(self, args):
     """Configure the instance from an option string and load its models.

     ``args`` is split on whitespace. Supported options:

     * ``-t2s``, ``-seg_only``, ``-filter``: switch ``useT2S``,
       ``seg_only`` and ``useFilter`` on, respectively.
     * ``-deli X``: store ``X`` in ``separator`` (default ``'_'``).
     * ``-model_dir DIR``: load model files from ``DIR`` rather than
       from the ``models`` directory beside this file.
     * ``-input FILE``, ``-output FILE``: store the paths in
       ``input_file`` and ``output_file``.

     Raises:
         IndexError: when a value-taking option is the final token.

     NOTE(review): an unknown option stops initialisation right away
     (plain ``return``), so ``prefix``, the decoders and the
     dictionaries are never created. Kept for backward compatibility;
     raising an error would be a clearer contract.
     """
     self.user_specified_dict_name = None
     self.model_path_char = None
     self.separator = '_'
     self.useT2S = False
     self.seg_only = False
     self.useFilter = False
     self.use_second = False
     self.input_file = ""
     self.output_file = ""
     self.coding = "utf-8"

     # Whitespace-agnostic split: split(" ") used to emit "" for doubled
     # spaces, which then hit the unknown-option branch and aborted
     # initialisation without any error.
     tokens = args.split() if len(args) > 0 else []

     # Boolean switches -> attribute set to True.
     switches = {
         "-t2s": "useT2S",
         "-seg_only": "seg_only",
         "-filter": "useFilter",
     }
     # Options followed by a value -> attribute receiving that value.
     # NOTE: the "-user" option (user dictionary) is currently disabled.
     valued = {
         "-deli": "separator",
         "-model_dir": "model_path_char",
         "-input": "input_file",
         "-output": "output_file",
     }

     c = 0
     while c < len(tokens):
         arg = tokens[c]
         if arg in switches:
             setattr(self, switches[arg], True)
         elif arg in valued:
             c += 1
             setattr(self, valued[arg], tokens[c])
         else:
             # Unknown option: give up (see NOTE in the docstring).
             return
         c += 1

     # Model directory, normalised to end with a slash.
     if self.model_path_char is not None:
         self.prefix = self.model_path_char
         if not self.prefix.endswith("/"):
             self.prefix = self.prefix + "/"
     else:
         self.prefix = os.path.dirname(
             os.path.realpath(__file__)) + "/models/"

     self.oiraw = ""
     self.raw = ""
     self.poc_cands = []

     # Only the decoder matching the selected mode is instantiated.
     self.cws_tagging_decoder = None
     self.tagging_decoder = None
     if self.seg_only:
         self.cws_tagging_decoder = CBTaggingDecoder()
         self.cws_tagging_decoder.init(self.prefix + "cws_model.bin",
                                       self.prefix + "cws_dat.bin",
                                       self.prefix + "cws_label.txt")
         self.cws_tagging_decoder.threshold = 0
         self.cws_tagging_decoder.separator = self.separator
         self.cws_tagging_decoder.setLabelTrans()
     else:
         self.tagging_decoder = CBTaggingDecoder()
         self.tagging_decoder.init(self.prefix + "model_c_model.bin",
                                   self.prefix + "model_c_dat.bin",
                                   self.prefix + "model_c_label.txt")
         self.tagging_decoder.threshold = 10000
         self.tagging_decoder.separator = self.separator
         self.tagging_decoder.setLabelTrans()

     self.preprocesser = Preprocesser()
     self.preprocesser.setT2SMap(self.prefix + "t2s.dat")
     self.nsDict = Postprocesser(self.prefix + "ns.dat", "ns", False)
     self.idiomDict = Postprocesser(self.prefix + "idiom.dat", "i", False)
     # NOTE: the user dictionary (Postprocesser(<name>, "uw", True)) is
     # disabled together with the "-user" option.

     # Filter resources are loaded only on request.
     self.myfilter = None
     if self.useFilter:
         self.myfilter = Filter(self.prefix + "xu.dat",
                                self.prefix + "time.dat")
Ejemplo n.º 3
0
class thulac:
    """Front end that wires a Preprocesser, a CBTaggingDecoder and the
    dictionary post-processors together.

    In ``seg_only`` mode :meth:`cut` returns the decoder's segmentation
    result; otherwise it returns the tagged items joined into strings.
    """

    def __init__(self, args):
        """Parse an option string and load the models it selects.

        ``args`` is a whitespace-separated option string. Recognised
        options:

        * ``-t2s``            set ``useT2S``
        * ``-seg_only``       set ``seg_only`` (load the ``cws_*`` model
          files instead of the ``model_c_*`` ones)
        * ``-filter``         set ``useFilter`` (also builds ``myfilter``)
        * ``-deli X``         use ``X`` as ``separator`` (default ``'_'``)
        * ``-model_dir DIR``  read model files from ``DIR`` instead of the
          ``models`` directory next to this file
        * ``-input FILE`` / ``-output FILE``  store the paths in
          ``input_file`` / ``output_file``

        Raises:
            IndexError: if an option that takes a value is the last token.

        NOTE(review): an unrecognised option makes ``__init__`` return
        immediately, leaving the instance without ``prefix``, decoders or
        dictionaries. That long-standing behaviour is preserved here;
        consider raising instead.
        """
        self.user_specified_dict_name = None
        self.model_path_char = None
        self.separator = '_'
        self.useT2S = False
        self.seg_only = False
        self.useFilter = False
        self.use_second = False
        self.input_file = ""
        self.output_file = ""
        self.coding = "utf-8"

        # split() without a separator collapses runs of whitespace. The
        # old split(" ") produced empty tokens for repeated spaces, and
        # those were treated as unknown options, silently aborting
        # initialisation.
        if len(args) > 0:
            tokens = args.split()
        else:
            tokens = []

        # Options that just switch a boolean attribute on.
        flag_options = {
            "-t2s": "useT2S",
            "-seg_only": "seg_only",
            "-filter": "useFilter",
        }
        # Options whose value is the following token.
        # NOTE: a "-user" option (user dictionary) is currently disabled.
        value_options = {
            "-deli": "separator",
            "-model_dir": "model_path_char",
            "-input": "input_file",
            "-output": "output_file",
        }

        c = 0
        while c < len(tokens):
            arg = tokens[c]
            if arg in flag_options:
                setattr(self, flag_options[arg], True)
            elif arg in value_options:
                c += 1
                setattr(self, value_options[arg], tokens[c])
            else:
                # Unknown option: abort (see NOTE in the docstring).
                return
            c += 1

        # Directory prefix for every model file; always ends with "/".
        if self.model_path_char is not None:
            self.prefix = self.model_path_char
            if not self.prefix.endswith("/"):
                self.prefix = self.prefix + "/"
        else:
            self.prefix = (os.path.dirname(os.path.realpath(__file__))
                           + "/models/")

        self.oiraw = ""
        self.raw = ""
        self.poc_cands = []

        # Exactly one of the two decoders is created, depending on
        # seg_only.
        self.cws_tagging_decoder = None
        self.tagging_decoder = None
        if self.seg_only:
            self.cws_tagging_decoder = CBTaggingDecoder()
            self.cws_tagging_decoder.init(
                self.prefix + "cws_model.bin",
                self.prefix + "cws_dat.bin",
                self.prefix + "cws_label.txt")
            self.cws_tagging_decoder.threshold = 0
            self.cws_tagging_decoder.separator = self.separator
            self.cws_tagging_decoder.setLabelTrans()
        else:
            self.tagging_decoder = CBTaggingDecoder()
            self.tagging_decoder.init(
                self.prefix + "model_c_model.bin",
                self.prefix + "model_c_dat.bin",
                self.prefix + "model_c_label.txt")
            self.tagging_decoder.threshold = 10000
            self.tagging_decoder.separator = self.separator
            self.tagging_decoder.setLabelTrans()

        self.preprocesser = Preprocesser()
        self.preprocesser.setT2SMap(self.prefix + "t2s.dat")
        self.nsDict = Postprocesser(self.prefix + "ns.dat", "ns", False)
        self.idiomDict = Postprocesser(self.prefix + "idiom.dat", "i", False)
        # NOTE: the user dictionary (Postprocesser(<name>, "uw", True)) is
        # disabled together with the "-user" option.

        # The filter is only built when "-filter" was given.
        self.myfilter = None
        if self.useFilter:
            self.myfilter = Filter(self.prefix + "xu.dat",
                                   self.prefix + "time.dat")

    def cut(self, oiraw):
        """Process one line of encoded text and return a list of strings.

        ``oiraw`` is decoded with ``self.coding``, cleaned by the
        preprocesser (and passed through ``T2S`` when ``useT2S`` is set),
        then handed to the decoder selected at construction time. Each
        returned item is encoded back with ``self.coding``.

        Returns an empty list when nothing is left after cleaning.
        Previously this case returned ``None``, which made :meth:`run`
        fail in ``" ".join(...)``.
        """
        oiraw = oiraw.decode(self.coding)
        if self.useT2S:
            traw, poc_cands = self.preprocesser.clean(oiraw)
            raw = self.preprocesser.T2S(traw)
        else:
            raw, poc_cands = self.preprocesser.clean(oiraw)

        if len(raw) == 0:
            return []

        if self.seg_only:
            # The return value is not needed here; the segmentation is
            # read back through get_seg_result().
            self.cws_tagging_decoder.segmentTag(raw, poc_cands)
            segged = self.cws_tagging_decoder.get_seg_result()
            if self.useFilter:
                self.nsDict.adjustSeg(segged)
                self.idiomDict.adjustSeg(segged)
                self.myfilter.adjustSeg(segged)
            return [word.encode(self.coding) for word in segged]

        _, tagged = self.tagging_decoder.segmentTag(raw, poc_cands)
        if self.useFilter:
            self.nsDict.adjustTag(tagged)
            self.idiomDict.adjustTag(tagged)
            self.myfilter.adjustTag(tagged)
        return ["".join(item).encode(self.coding) for item in tagged]

    def run(self):
        """Read lines, run :meth:`cut` on each and emit the joined result.

        Lines come from ``input_file`` when it is set, otherwise from
        standard input. Results go to ``output_file`` when it is set,
        otherwise they are printed. Processing stops at the first line
        that is empty after stripping (which includes end of file) or
        when standard input is exhausted. The elapsed time is printed at
        the end.
        """
        start = time.clock()
        input_f = None
        output_f = None
        try:
            if self.input_file:
                input_f = open(self.input_file, "r")
            if self.output_file:
                output_f = open(self.output_file, "w")
            while True:
                if input_f is not None:
                    oiraw = self.getRaw(input_f)
                else:
                    try:
                        oiraw = raw_input().strip()
                    except EOFError:
                        # End of piped/redirected stdin ends the run
                        # cleanly instead of aborting with a traceback.
                        break
                if not oiraw:
                    break
                # Both modes emit the space-joined result of cut().
                line = " ".join(self.cut(oiraw))
                if output_f is not None:
                    output_f.write(line)
                    output_f.write("\n")
                else:
                    print(line)
        finally:
            # Always release the file handles, even if processing fails.
            if input_f is not None:
                input_f.close()
            if output_f is not None:
                output_f.close()
        end = time.clock()
        print("Time used: %f s" % (end - start))

    def getRaw(self, inputfile):
        """Return the next line of ``inputfile`` with surrounding
        whitespace stripped (``""`` at end of file)."""
        return inputfile.readline().strip()
Ejemplo n.º 4
0
class thulac:
    """Front end combining a Preprocesser, a CBTaggingDecoder and the
    dictionary post-processors.

    With ``seg_only`` set, :meth:`cut` yields the decoder's segmentation
    result; otherwise it yields the tagged items joined into strings.
    """

    def __init__(self, args):
        """Configure the instance from an option string and load models.

        ``args`` is split on whitespace. Supported options:

        * ``-t2s``, ``-seg_only``, ``-filter``: switch ``useT2S``,
          ``seg_only`` and ``useFilter`` on, respectively.
        * ``-deli X``: store ``X`` in ``separator`` (default ``'_'``).
        * ``-model_dir DIR``: load model files from ``DIR`` rather than
          from the ``models`` directory beside this file.
        * ``-input FILE``, ``-output FILE``: store the paths in
          ``input_file`` and ``output_file``.

        Raises:
            IndexError: when a value-taking option is the final token.

        NOTE(review): an unknown option stops initialisation right away
        (plain ``return``), so ``prefix``, the decoders and the
        dictionaries are never created. Kept for backward compatibility;
        raising an error would be a clearer contract.
        """
        self.user_specified_dict_name = None
        self.model_path_char = None
        self.separator = '_'
        self.useT2S = False
        self.seg_only = False
        self.useFilter = False
        self.use_second = False
        self.input_file = ""
        self.output_file = ""
        self.coding = "utf-8"

        # Whitespace-agnostic split: split(" ") used to emit "" for
        # doubled spaces, which then hit the unknown-option branch and
        # aborted initialisation without any error.
        tokens = args.split() if len(args) > 0 else []

        # Boolean switches -> attribute set to True.
        switches = {
            "-t2s": "useT2S",
            "-seg_only": "seg_only",
            "-filter": "useFilter",
        }
        # Options followed by a value -> attribute receiving that value.
        # NOTE: the "-user" option (user dictionary) is currently disabled.
        valued = {
            "-deli": "separator",
            "-model_dir": "model_path_char",
            "-input": "input_file",
            "-output": "output_file",
        }

        c = 0
        while c < len(tokens):
            arg = tokens[c]
            if arg in switches:
                setattr(self, switches[arg], True)
            elif arg in valued:
                c += 1
                setattr(self, valued[arg], tokens[c])
            else:
                # Unknown option: give up (see NOTE in the docstring).
                return
            c += 1

        # Model directory, normalised to end with a slash.
        if self.model_path_char is not None:
            self.prefix = self.model_path_char
            if not self.prefix.endswith("/"):
                self.prefix = self.prefix + "/"
        else:
            self.prefix = os.path.dirname(
                os.path.realpath(__file__)) + "/models/"

        self.oiraw = ""
        self.raw = ""
        self.poc_cands = []

        # Only the decoder matching the selected mode is instantiated.
        self.cws_tagging_decoder = None
        self.tagging_decoder = None
        if self.seg_only:
            self.cws_tagging_decoder = CBTaggingDecoder()
            self.cws_tagging_decoder.init(self.prefix + "cws_model.bin",
                                          self.prefix + "cws_dat.bin",
                                          self.prefix + "cws_label.txt")
            self.cws_tagging_decoder.threshold = 0
            self.cws_tagging_decoder.separator = self.separator
            self.cws_tagging_decoder.setLabelTrans()
        else:
            self.tagging_decoder = CBTaggingDecoder()
            self.tagging_decoder.init(self.prefix + "model_c_model.bin",
                                      self.prefix + "model_c_dat.bin",
                                      self.prefix + "model_c_label.txt")
            self.tagging_decoder.threshold = 10000
            self.tagging_decoder.separator = self.separator
            self.tagging_decoder.setLabelTrans()

        self.preprocesser = Preprocesser()
        self.preprocesser.setT2SMap(self.prefix + "t2s.dat")
        self.nsDict = Postprocesser(self.prefix + "ns.dat", "ns", False)
        self.idiomDict = Postprocesser(self.prefix + "idiom.dat", "i", False)
        # NOTE: the user dictionary (Postprocesser(<name>, "uw", True)) is
        # disabled together with the "-user" option.

        # Filter resources are loaded only on request.
        self.myfilter = None
        if self.useFilter:
            self.myfilter = Filter(self.prefix + "xu.dat",
                                   self.prefix + "time.dat")

    def cut(self, oiraw):
        """Process one encoded line and return a list of encoded strings.

        The input is decoded with ``self.coding``, cleaned by the
        preprocesser (plus ``T2S`` when ``useT2S`` is set) and passed to
        the decoder chosen in ``__init__``. Every returned item is
        re-encoded with ``self.coding``.

        An empty list is returned when cleaning leaves nothing to
        process. This used to fall through and return ``None``, which
        broke ``" ".join(...)`` in :meth:`run`.
        """
        oiraw = oiraw.decode(self.coding)
        if self.useT2S:
            traw, poc_cands = self.preprocesser.clean(oiraw)
            raw = self.preprocesser.T2S(traw)
        else:
            raw, poc_cands = self.preprocesser.clean(oiraw)

        if len(raw) == 0:
            return []

        if self.seg_only:
            # Return value unused: the segmentation is fetched via
            # get_seg_result() right after.
            self.cws_tagging_decoder.segmentTag(raw, poc_cands)
            segged = self.cws_tagging_decoder.get_seg_result()
            if self.useFilter:
                self.nsDict.adjustSeg(segged)
                self.idiomDict.adjustSeg(segged)
                self.myfilter.adjustSeg(segged)
            return [word.encode(self.coding) for word in segged]

        _, tagged = self.tagging_decoder.segmentTag(raw, poc_cands)
        if self.useFilter:
            self.nsDict.adjustTag(tagged)
            self.idiomDict.adjustTag(tagged)
            self.myfilter.adjustTag(tagged)
        return ["".join(item).encode(self.coding) for item in tagged]

    def run(self):
        """Feed input lines through :meth:`cut` and emit the results.

        Input is read from ``input_file`` if set, else from standard
        input; output is written to ``output_file`` if set, else printed.
        The loop ends at the first line that is empty after stripping
        (this includes end of file) or when standard input is exhausted.
        The elapsed time is printed last.
        """
        start = time.clock()
        input_f = None
        output_f = None
        try:
            if self.input_file:
                input_f = open(self.input_file, "r")
            if self.output_file:
                output_f = open(self.output_file, "w")
            while True:
                if input_f is not None:
                    oiraw = self.getRaw(input_f)
                else:
                    try:
                        oiraw = raw_input().strip()
                    except EOFError:
                        # Exhausted stdin ends the run cleanly rather
                        # than aborting with a traceback.
                        break
                if not oiraw:
                    break
                # Segmentation and tagging modes share the same output
                # format: items joined by single spaces.
                line = " ".join(self.cut(oiraw))
                if output_f is not None:
                    output_f.write(line)
                    output_f.write("\n")
                else:
                    print(line)
        finally:
            # Release file handles even when processing raises.
            if input_f is not None:
                input_f.close()
            if output_f is not None:
                output_f.close()
        end = time.clock()
        print("Time used: %f s" % (end - start))

    def getRaw(self, inputfile):
        """Read one line from ``inputfile`` and strip surrounding
        whitespace; yields ``""`` at end of file."""
        return inputfile.readline().strip()