def get_train_data(filename=None, word2id=None, usebigram=True):
    """Load a whitespace-segmented corpus file and build (x, y) training pairs.

    NOTE(review): this definition is shadowed by an identical redefinition
    later in the file; consider removing one copy.

    Args:
        filename: name of a corpus file inside DATA_PATH; each line contains
            words separated by whitespace.
        word2id: mapping from characters / character-bigrams to integer ids;
            must contain the UNK key used as the out-of-vocabulary fallback.
        usebigram: when True, append bigram-feature ids after the unigram ids
            of each context window.

    Returns:
        (x, y) where x[i] is the list of per-character feature-id lists for
        line i, and y[i] is the matching list of segmentation tags.
    """
    filename = os.path.join(DATA_PATH, filename)
    x, y = [], []
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            word_list = line.strip().split()
            line_y = []
            line_x = []
            # One tag per character of every word on this line.
            for word in word_list:
                line_y.extend(word2tag(word))
            y.append(line_y)
            # Drop all whitespace so context windows slide over raw characters.
            # Bug fix: use a raw string — u'\s' is an invalid escape sequence
            # that modern Python flags with a SyntaxWarning.
            line = re.sub(r'\s+', '', line.strip())
            contexs = window(line)
            for contex in contexs:
                charx = []
                # Context window: unigram character ids (UNK fallback).
                charx.extend([word2id.get(c, word2id[UNK]) for c in contex])
                # Bigram feature ids drawn from the same window.
                if usebigram:
                    charx.extend([word2id.get(bigram, word2id[UNK])
                                  for bigram in preprocess.ngram(contex)])
                line_x.append(charx)
            x.append(line_x)
            # Invariant: one feature vector per character tag.
            assert len(line_x) == len(line_y)
    return x, y
def get_train_data(filename=None, word2id=None, usebigram=True):
    """Load a whitespace-segmented corpus (e.g. pku_train) into (x, y) pairs.

    Example (translated from the original inline notes): for the line
    "li peng zai beijing kaocha qiye", word2tag maps a 1-char word to [3]
    (single), a 2-char word to [0, 2] (begin, end), a 3-char word to
    [0, 1, 2], a 4-char word to [0, 1, 1, 2].

    Args:
        filename: corpus file name joined onto DATA_PATH.
        word2id: mapping from characters / bigrams to integer ids; must hold
            the UNK key used as the out-of-vocabulary fallback.
        usebigram: when True, each character's feature vector also gets the
            ids of the 4 bigrams from its 5-character window (9 ids total).

    Returns:
        (x, y): x[i] is a list of feature-id lists (one per character of
        line i); y[i] is the matching flat list of segmentation tags.
    """
    filename = os.path.join(DATA_PATH, filename)
    x, y = [], []
    with codecs.open(filename, 'r', 'utf-8') as f:
        for line in f:
            word_list = line.strip().split()
            line_y = []
            line_x = []
            # Flatten per-word tag sequences into one tag list for the line.
            for word in word_list:
                line_y.extend(word2tag(word))
            y.append(line_y)
            # Remove all whitespace so windows run over raw characters.
            # Bug fix: raw string for the pattern — u'\s' is an invalid
            # escape sequence (SyntaxWarning on modern Python).
            line = re.sub(r'\s+', '', line.strip())
            # window() yields one 5-character context per character, padded
            # with 'S'/'E' at the sentence boundaries.
            contexs = window(line)
            for contex in contexs:
                charx = []
                # Unigram ids for the 5 characters of the window.
                charx.extend([word2id.get(c, word2id[UNK]) for c in contex])
                # Bigram ids for the window's 4 adjacent pairs.
                if usebigram:
                    charx.extend([
                        word2id.get(bigram, word2id[UNK])
                        for bigram in preprocess.ngram(contex)
                    ])
                line_x.append(charx)
            x.append(line_x)
            # Invariant: one feature vector per character tag.
            assert len(line_x) == len(line_y)
    return x, y
def _input_from_line(self, sentence, user_words=None):
    """Turn a raw sentence into per-character feature ids plus dictionary features.

    Mirrors the training-time featurization: for every 5-character context
    window, collect the unigram ids followed by the bigram ids, falling back
    to the UNK id for out-of-vocabulary entries.
    """
    w2id = self.word2id
    unk = utils_data.UNK
    line_x = []
    for contex in utils_data.window(sentence):
        # Unigram ids for the window, then its bigram ids appended.
        feats = [w2id.get(ch, w2id[unk]) for ch in contex]
        feats.extend(w2id.get(bg, w2id[unk]) for bg in preprocess.ngram(contex))
        line_x.append(feats)
    # Dictionary-derived features for the whole sentence (optionally
    # augmented with caller-supplied user words).
    dict_feature = utils_data.tag_sentence(sentence, self.dict, user_words)
    return line_x, dict_feature