Beispiel #1
0
        WSAtom.set_encoding(encoding)
        for line in df:
            line = line.strip()
            #if len(line) == 0 : #! still handle it !
            #    continue
            try:
                uline = line.decode(encoding)
            except UnicodeDecodeError, e:
                logging.warning("decoding dataset error : %s " % (line))
                #continue
                uline = ""  #! still handle it !
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts:
                atom_list.append(
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(
                        uline_part))
            yield atom_list
        df.close()

    @staticmethod
    def read_predict_data(df):
        '''
        An Iteration generator for predict data
        Args :
            df : file , or a path str
        Returns :
            atom_list : [ WSAtom , WSAtom , ... ] 
            separator_position : list , the position where seperator exists
        '''
        if not isinstance(df, file):
            try:
Beispiel #2
0
class DatasetHandler(object):
    '''
    Static helpers for checking dataset paths, guessing a dataset file's
    encoding and loading the training data into WSAtom n-gram lists.
    '''

    @staticmethod
    def is_readable(path):
        '''
        Args :
            path : str , path to check
        Returns :
            bool ; True when `path` exists and is readable by this process
        '''
        return (os.access(path, os.F_OK) and os.access(path, os.R_OK))

    @staticmethod
    def is_writeable(path):
        '''
        Args :
            path : str , path to check ( may not exist yet )
        Returns :
            bool ; for an existing path , whether it is writeable ;
                   otherwise whether its parent directory exists and is
                   writeable ( i.e. the file could be created )
        '''
        if os.access(path, os.F_OK):
            return os.access(path, os.W_OK)
        # Path does not exist : check the directory that would contain it .
        # os.path.abspath is required , because os.path.dirname returns an
        # empty str for a bare relative file name .
        dir_path = os.path.dirname(os.path.abspath(path))
        return (os.access(dir_path, os.F_OK) and os.access(dir_path, os.W_OK))

    @staticmethod
    def get_file_encoding(f):
        '''
        get file's encoding ; simple and naive implementation : only the
        next line of the file is sampled , and the first candidate encoding
        that decodes it wins . Candidates are the file's own `encoding`
        attribute ( when present and not None ) , then "utf8" , then "gb18030" .
        The read position of `f` is restored before returning .
        Args :
            f : file ( or file-like object ) yielding byte strings ;
                must support tell / readline / seek
        Returns :
            encoding : str ;
        Attention :
            if no candidate decodes the sampled line , the error is logged ,
            "Exit" is written to stderr and SystemExit(1) is raised !
        '''
        # Peek at one line , then rewind so the caller still reads everything .
        start_pos = f.tell()
        sample = f.readline()
        f.seek(start_pos)
        candidates = []
        # Not every file-like object carries an `encoding` attribute .
        declared = getattr(f, "encoding", None)
        if declared is not None:
            candidates.append(declared)
        candidates.extend(["utf8", "gb18030"])
        for encoding in candidates:
            try:
                sample.decode(encoding)
            except (UnicodeError, LookupError):
                # UnicodeError : the bytes are not valid in this encoding ;
                # LookupError  : the declared encoding name is unknown .
                continue
            return encoding
        logging.error("failed to decode the training data . file path : '%s'" %
                      (getattr(f, "name", "<unknown>")))
        sys.stderr.write("Exit\n")
        sys.exit(1)

    @staticmethod
    def read_training_data(tf):
        '''
        read lines from training dataset
        Args:
            tf : file object of training data , or a path str to open .
                 The file is always closed before returning , even when it
                 was passed in already opened .

        Returns :
            data_lines : lines of dataset , each line is also a list , every element is also a list !
                        the most inner element is WSAtom .
                        => [ [ [ WSAtom("like") , WSAtom("我") , ... ] , [WSAtom("一") , WSAtom("样")] ,...  ] ]
                        what is this ? -> the most inner list is the result of
                                          WSAtomTranslator.trans_unicode_list2atom_gram_list for one
                                          whitespace-separated word ; the upper list is the sentence
                                          ( one line ) , the most outer one is the list of sentences .
                        Why use WSAtom ? -> because we want an English word to be a `single representation` instead of `list of letters` !
                        Empty lines and lines that fail to decode are skipped
                        ( the latter with a warning ) .
        Attention :
            if the path can not be opened , the traceback is printed and
            SystemExit(1) is raised ; encoding detection may exit as well .
        '''
        # Anything without a read() method is treated as a path to open .
        if not hasattr(tf, "read"):
            try:
                # Binary mode : every line is decoded explicitly below .
                tf = open(tf, "rb")
            except IOError:
                traceback.print_exc()
                sys.exit(1)
        try:
            logging.info("reading training data from '%s'" %
                         (getattr(tf, "name", "<unknown>")))
            encoding = DatasetHandler.get_file_encoding(tf)
            WSAtom.set_encoding(encoding)
            data_lines = []
            for line in tf:
                line = line.strip()
                if not line:
                    continue
                try:
                    uline = line.decode(encoding)
                except UnicodeError:
                    logging.warning("decoding dataset error : %s " % (line))
                    continue
                data_lines.append([
                    WSAtomTranslator.trans_unicode_list2atom_gram_list(part)
                    for part in uline.split()
                ])
            logging.info("%d lines read done ." % (len(data_lines)))
            return data_lines
        finally:
            # Release the handle even if parsing raised or detection exited .
            tf.close()
Beispiel #3
0
        encoding = DatasetHandler.get_file_encoding(df)
        WSAtom.set_encoding(encoding)
        for line in df :
            line = line.strip()
            #if len(line) == 0 : #! still handle it !
            #    continue
            try :
                uline = line.decode(encoding)
            except UnicodeDecodeError , e :
                logging.warning("decoding dataset error : %s " %(line))
                #continue
                uline = "" #! still handle it !
            uline_parts = uline.split()
            atom_list = []
            for uline_part in uline_parts :
                atom_list.append(WSAtomTranslator.trans_unicode_list2atom_gram_list(uline_part))
            yield atom_list
        df.close()

    @staticmethod
    def read_predict_data(df) :
        '''
        An Iteration generator for predict data
        Args :
            df : file , or a path str
        Returns :
            atom_list : [ WSAtom , WSAtom , ... ] 
            separator_position : list , the position where seperator exists
        '''
        if not isinstance(df , file) :
            try :