Esempio n. 1
0
def get_train_data(filename=None, word2id=None, usebigram=True):
    """Load a whitespace-segmented corpus and build per-character features.

    Parameters
    ----------
    filename : str
        File name, resolved relative to ``DATA_PATH``.  Each line is one
        sentence whose words are separated by whitespace.
    word2id : dict
        Maps characters (and bigrams) to integer ids; must contain ``UNK``.
    usebigram : bool
        When True, bigram-feature ids are appended after the character
        window ids for every position.

    Returns
    -------
    (x, y) : tuple of lists
        ``x[i]`` is a list of feature-id lists (one per character of line i);
        ``y[i]`` is the matching list of segmentation tags.
    """
    filename = os.path.join(DATA_PATH, filename)
    x, y = [], []
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            word_list = line.strip().split()
            line_y = []
            line_x = []
            # One segmentation tag per character of each word
            # (produced by word2tag).
            for word in word_list:
                line_y.extend(word2tag(word))
            y.append(line_y)
            # Strip ALL whitespace so the line becomes a plain character
            # sequence.  Raw string fixes the invalid '\s' escape the
            # original non-raw literal relied on.
            line = re.sub(r'\s+', '', line.strip())
            contexs = window(line)  # one context window per character
            for contex in contexs:
                charx = []
                # context-window feature: ids of the characters in the
                # window, unknown characters mapped to the UNK id
                charx.extend([word2id.get(c, word2id[UNK]) for c in contex])
                # bigram feature over the same window
                if usebigram:
                    charx.extend([word2id.get(bigram, word2id[UNK])
                                  for bigram in preprocess.ngram(contex)])
                line_x.append(charx)
            x.append(line_x)
            # sanity: one feature vector per tag
            assert len(line_x) == len(line_y)
    return x, y
Esempio n. 2
0
def get_train_data(filename=None, word2id=None, usebigram=True):
    """Build (x, y) training data from a pre-segmented corpus file.

    Typical call: ``X_train, y_train = get_train_data(train_data_path, word2id)``
    (e.g. ``pku_train``).

    Parameters
    ----------
    filename : str
        Corpus file name, joined onto ``DATA_PATH`` (e.g. ``data/pku_train``).
        Each line is a sentence of whitespace-separated words.
    word2id : dict
        Maps single characters and bigrams to integer ids,
        e.g. ``{'P': 1, 'E': 0, 'U': 3, 'S': 2, ...}``; must contain ``UNK``.
    usebigram : bool
        When True, append bigram-feature ids after the window ids.

    Returns
    -------
    (x, y) : tuple of lists
        For every line, ``x`` holds one feature-id list per character and
        ``y`` the matching segmentation tags.

    Example (tags from word2tag):
        line = "li peng zai beijing kaocha qiye" (6 words) gives per-word
        tag lists like [3], [3], [3], [0, 2], [0, 2], [0, 2]
        (3-char words give [0, 1, 2], 4-char words [0, 1, 1, 2]).
    """
    filename = os.path.join(DATA_PATH, filename)
    x, y = [], []
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            word_list = line.strip().split()  # words of this line
            line_y = []
            line_x = []
            # Concatenate each word's tag sequence into one per-line list,
            # e.g. [3, 3, 3, 0, 2, 0, 2, 0, 2].
            for word in word_list:
                line_y.extend(word2tag(word))
            y.append(line_y)
            # Remove all whitespace so the line is a plain character string.
            # Raw string fixes the invalid '\s' escape of the original
            # non-raw literal.
            line = re.sub(r'\s+', '', line.strip())

            # window() yields one size-5 context per character, padded with
            # S/E at the edges, e.g. 'SSabc', 'Sabcd', 'abcde', ...
            contexs = window(line)

            for contex in contexs:
                charx = []

                # Context-window feature: the id of each of the 5 characters,
                # unknown characters falling back to the UNK id.
                charx.extend([word2id.get(c, word2id[UNK]) for c in contex])

                # Bigram feature: ids of the window's adjacent character
                # pairs (4 bigrams for a size-5 window), UNK when missing.
                if usebigram:
                    charx.extend([
                        word2id.get(bigram, word2id[UNK])
                        for bigram in preprocess.ngram(contex)
                    ])
                # Final per-character vector: 5 window ids + 4 bigram ids.
                line_x.append(charx)

            # x collects one 9-id vector per character of the corpus.
            x.append(line_x)
            # sanity: feature vectors and tags must align one-to-one
            assert len(line_x) == len(line_y)
    return x, y
Esempio n. 3
0
 def _input_from_line(self, sentence, user_words=None):
     """Turn *sentence* into per-character feature ids plus dictionary features.

     For each character a fixed-size context window is taken; its feature
     vector is the window's character ids followed by the window's bigram
     ids (unknown entries map to the UNK id).  Dictionary features come
     from utils_data.tag_sentence with self.dict and optional user_words.
     """
     lookup = self.word2id
     unk_id = lookup[utils_data.UNK]
     line_x = []
     for ctx in utils_data.window(sentence):
         # character-window ids followed by bigram ids, UNK as fallback
         unigram_ids = [lookup.get(ch, unk_id) for ch in ctx]
         bigram_ids = [lookup.get(bg, unk_id) for bg in preprocess.ngram(ctx)]
         line_x.append(unigram_ids + bigram_ids)
     dict_feature = utils_data.tag_sentence(sentence, self.dict, user_words)
     return line_x, dict_feature