def user_format(userPath, userSavePath):
    '''Attach a gender label to follower/followee/weibo-count features
    (给粉丝|关注|微博数加性别标签).

    Reads a comma-separated user file where column 1 is the gender string
    and columns 4+ are numeric features, then writes a 1-based
    svmlight/libsvm file: label 0 for '女' (female), 1 otherwise.

    Parameters
    ----------
    userPath : str
        Path of the input user CSV file.
    userSavePath : str
        Path of the output svmlight file.
    '''
    print('读取用户列表')
    gender = []
    features = []
    # Single pass over the file (the original built a full string matrix,
    # transposed it twice, then re-parsed every row); `with` closes the file.
    with codecs.open(userPath) as userf:
        for line in userf:
            tokens = line.strip().split(',')
            # column 1 = gender string, columns 4+ = numeric features
            gender.append(0 if tokens[1] == '女' else 1)
            features.append([double(tk) for tk in tokens[4:]])
    data = np.array(features)
    genderlabel = np.array(gender)
    print(len(data[0]))
    print(len(genderlabel))
    dump_svmlight_file(data, genderlabel, userSavePath, zero_based=False)
def gender_label_format(userPath, contentPath, DataPath, WritePath):
    '''Attach gender labels to image features (加性别标签).

    DataPath rows: comma-separated, column 1 = image name, columns 2+ =
    numeric features. contentPath maps image name (column 1) to user id
    (column 0); userPath maps user id (column 0) to gender string
    (column 1). Writes a 1-based svmlight file with label 0 for '女',
    1 otherwise; rows whose image has no matching user are dropped.
    '''
    data = []
    label = []
    userlist = []
    imagename = []
    contentlist = []
    print('导入数据')
    with codecs.open(DataPath) as img:
        for line in img:
            tokens = line.strip().split(',')
            imagename.append(tokens[1])
            data.append([double(tk) for tk in tokens[2:]])
    imagename = np.array(imagename)
    data = np.array(data)
    print('导入用户信息')
    with codecs.open(userPath) as userf:
        for line in userf:
            userlist.append(line.strip().split(','))
    print('导入图片信息')
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contentlist.append(line.strip().split(','))
    print('填入标签')
    keep = []  # indices of rows that received a label
    for i in range(len(imagename)):
        name = imagename[i]
        print(name)
        flag = 0
        for li in contentlist:
            if name == li[1]:
                for user in userlist:
                    if user[0] == li[0]:
                        flag = 1
                        print(user[1])
                        label.append(0 if user[1] == '女' else 1)
                        break
                break
        if flag == 0:
            print(i)
            print(name + "没有对应的标签")
        else:
            keep.append(i)
    # BUG FIX: the original called np.delete(data, i, 0) and discarded the
    # returned array, so unlabeled rows were never actually removed and
    # data/label lengths could disagree; select labeled rows in one shot.
    data = data[keep]
    label = np.array(label)
    print(label)
    print(len(data))
    print(len(label))
    dump_svmlight_file(data, label, WritePath, zero_based=False)
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.
    sample_weight : {array-like}, shape (n_samples,), optional
        sample weight vector relative to X.

    Returns
    -------
    out : {array-like}, shape (n_samples, 1)
        Training vectors in VW string format
    """
    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    rows, cols = x.shape

    # check for invalid characters if array has string values
    # BUG FIX: rows/cols are plain ints; the original did "for row in rows"
    # which raises TypeError ('int' object is not iterable).
    if x.dtype.char == 'S':
        for row in range(rows):
            for col in range(cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format
    s = StringIO.StringIO()
    dump_svmlight_file(x, np.zeros(rows), s)

    # parse entries to construct VW format
    rows = s.getvalue().split('\n')[:-1]
    out = []
    for idx, row in enumerate(rows):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        # strip the dummy "0 " label that dump_svmlight_file wrote
        features = row.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight,
                                              ns=DEFAULT_NS, x=features))
    s.close()

    return out
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.
    sample_weight : {array-like}, shape (n_samples,), optional
        sample weight vector relative to X.

    Returns
    -------
    out : {array-like}, shape (n_samples, 1)
        Training vectors in VW string format
    """
    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    rows, cols = x.shape

    # check for invalid characters if array has string values
    # BUG FIX: rows/cols are plain ints; the original did "for row in rows"
    # which raises TypeError ('int' object is not iterable).
    if x.dtype.char == 'S':
        for row in range(rows):
            for col in range(cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format
    s = StringIO.StringIO()
    dump_svmlight_file(x, np.zeros(rows), s)

    # parse entries to construct VW format
    rows = s.getvalue().split('\n')[:-1]
    out = []
    for idx, row in enumerate(rows):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        # strip the dummy "0 " label that dump_svmlight_file wrote
        features = row.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight,
                                              ns=DEFAULT_NS, x=features))
    s.close()

    return out
def write_to_file(X, K):
    """Persist the NSPDK Gramian and feature matrix under data/derived/.

    Writes K as gzipped text via np.savetxt and X in svmlight/libSVM
    format with dummy zero labels, both under
    <olfaction_prediction_path>/data/derived/.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Feature matrix dumped in libSVM format.
    K : array-like
        Gramian (kernel) matrix saved as 'nspdk_r3_d4_unaug_gramian.mtx.gz'.
    """
    # FIX: import from the public API; sklearn.datasets.svmlight_format is a
    # private module and was removed in later sklearn releases. The unused
    # `from scipy import io` was dropped.
    from sklearn.datasets import dump_svmlight_file
    dd_path = olfaction_prediction_path + '/data/derived/'
    if not os.path.isdir(dd_path):
        # makedirs also creates missing parent directories (os.mkdir fails
        # when 'data/' itself does not exist yet)
        os.makedirs(dd_path)
    np.savetxt(dd_path + 'nspdk_r3_d4_unaug_gramian.mtx.gz', K)
    # Write features in standard libSVM format (labels are dummy zeros):
    dump_svmlight_file(X, np.zeros(X.shape[0]),
                       dd_path + 'nspdk_r3_d4_unaug.svm')
def write_to_file(X, K):
    """Save the Gramian matrix K and the feature matrix X to disk.

    K goes to 'nspdk_r3_d4_unaug_gramian.mtx.gz' (np.savetxt) and X to
    'nspdk_r3_d4_unaug.svm' (svmlight/libSVM format, zero dummy labels),
    both under <olfaction_prediction_path>/data/derived/.
    """
    from scipy import io
    from sklearn.datasets.svmlight_format import dump_svmlight_file

    derived_dir = olfaction_prediction_path + '/data/derived/'
    if not os.path.isdir(derived_dir):
        os.mkdir(derived_dir)

    gramian_path = derived_dir + 'nspdk_r3_d4_unaug_gramian.mtx.gz'
    np.savetxt(gramian_path, K)

    # Write features in standard libSVM format:
    dummy_labels = np.zeros(X.shape[0])
    dump_svmlight_file(X, dummy_labels, derived_dir + 'nspdk_r3_d4_unaug.svm')
def term_label_format(imagePath, termPath, termWritePath, contentPath=None):
    '''Attach terminal/device labels to image features (加终端标签).

    imagePath rows: comma-separated, column 1 = image name, columns 2+ =
    numeric features. termPath maps a terminal name (column 0) to an int
    label (column 1); contentPath maps image name (column 1) to terminal
    name (column 6). Writes a 1-based svmlight file; rows without a
    matching label are dropped.

    NOTE(review): the original referenced a module-level `contentPath`
    that was not a parameter (NameError unless defined globally); it is
    now an optional trailing parameter with a global fallback, which is
    backward compatible for 3-argument callers.
    '''
    if contentPath is None:
        contentPath = globals().get('contentPath')
        if contentPath is None:
            raise NameError("contentPath is not defined")
    data = []
    termlabel = []
    termlist = []
    imagename = []
    contentlist = []
    with codecs.open(imagePath) as img:
        for line in img:
            tokens = line.strip().split(',')
            imagename.append(tokens[1])
            data.append([double(tk) for tk in tokens[2:]])
    imagename = np.array(imagename)
    with codecs.open(termPath) as comf:
        for line in comf:
            termlist.append(line.strip().split(","))
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contentlist.append(line.strip().split(','))
    print('填入标签')
    keep = []  # indices of rows that received a label
    for i in range(len(imagename)):
        name = imagename[i]
        flag = 0
        for li in contentlist:
            if name == li[1]:
                for term in termlist:
                    if term[0] == li[6]:
                        flag = 1
                        termlabel.append(int(term[1]))
                        print('%s %s %s %s' % (i, name, term[0], term[1]))
                        break
                break
        if flag == 0:
            print(i)
            print(name + "没有对应的标签")
        else:
            keep.append(i)
    # BUG FIX: the original did `del data[i]` after data had become an
    # ndarray (ValueError: cannot delete array elements), and deleting by a
    # live index while iterating shifts later rows; filter once instead.
    data = np.array(data)[keep]
    termlabel = np.array(termlabel)
    print(termlabel)
    print(len(data))
    print(len(termlabel))
    dump_svmlight_file(data, termlabel, termWritePath, zero_based=False)
def cutUselessFeatures(Path, delline, writePath):
    """Drop low-discrimination feature columns (把区分度不大的特征删除).

    Loads an svmlight file, removes the columns listed in `delline`
    (0-based column indices of the dense matrix) and writes the result
    back in 1-based svmlight format.

    Parameters
    ----------
    Path : str
        Input svmlight file.
    delline : sequence of int
        Column indices to remove.
    writePath : str
        Output svmlight file.
    """
    from sklearn.datasets import load_svmlight_file
    data, target = load_svmlight_file(Path)
    dense = data.toarray()
    for i in delline:
        print(i)
    # BUG FIX: the original deleted one column per loop iteration, so each
    # deletion shifted the indices of the remaining columns and every
    # subsequent index removed the wrong column (unless delline happened to
    # be sorted descending). np.delete takes the whole index list at once.
    dense = np.delete(dense, delline, 1)
    dump_svmlight_file(dense, target, writePath, zero_based=False)
def transform_features(self):
    """Generate missing enabled features, fit/apply a DataFrameMapper and
    dump the training split in svmlight format.

    Walks self.feat_head rows (field, generator_func, transform,
    is_enable): generates any enabled field missing from the feature set,
    fits the mapper on the training prefix only (avoids test leakage),
    transforms train/test, drops the consumed source columns and writes
    X_train/y_train to `output_train_libsvm_file`.

    Returns
    -------
    (X_train, y_train, X_test)

    NOTE(review): on the no-transform path the original referenced
    X_train/X_test before assignment (NameError); that branch is kept
    unchanged pending clarification of the intended source frames.
    """
    totransform = []
    for item in self.feat_head:
        field = item[0]
        func_name = item[1]
        transform = item[2]
        is_enable = item[3]
        if is_enable:
            if field not in self.stumble_data.get_features():
                print('field not in feature..generating:' + field)
                func_name(field)
            totransform.append((field, transform))
    if len(totransform):
        mapper = DataFrameMapper(totransform)
        # fit on the training rows only
        mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_train = mapper.transform(
            self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_test = mapper.transform(
            self.stumble_data.all_pd[self.stumble_data.len_train:])
        # drop the source columns the mapper consumed
        for item in self.feat_head:
            field = item[0]
            is_enable = item[3]
            if is_enable and field in self.stumble_data.get_features():
                del self.stumble_data.all_pd[field]
        # BUG FIX: removed the leftover `import pdb; pdb.set_trace()` debug
        # stop and the unused `from scipy.sparse import hstack` import.
        X_train = X_transformed_train
        X_test = X_transformed_test
        y_train = self.stumble_data.all_pd[:self.stumble_data.len_train]['label']
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file)
        # dump_svmlight_file(X_test, pred, output_test_libsvm_file )
    else:
        X_train = X_train.as_matrix()
        X_test = X_test.as_matrix()
    return X_train, y_train, X_test
def transform_features(self):
    """Generate missing enabled features, fit/apply a DataFrameMapper and
    dump the training split in svmlight format; returns
    (X_train, y_train, X_test).

    NOTE(review): on the `else` path X_train/X_test are referenced before
    assignment (NameError) — presumably the raw frames were intended;
    confirm before relying on that branch.
    """
    totransform = []
    # feat_head rows are (field, generator_func, transform, is_enable)
    for index, item in enumerate(self.feat_head):
        field = item[0]
        func_name = item[1]
        transform = item[2]
        is_enable = item[3]
        if is_enable:
            if not field in self.stumble_data.get_features():
                print 'field not in feature..generating:' + field
                # generator callable materializes the missing column
                func_name(field)
            totransform.append((field, transform))
    if len(totransform):
        mapper = DataFrameMapper(totransform)
        # fit on the training prefix only, to avoid leaking test rows
        mapper.fit(self.stumble_data.all_pd[:self.stumble_data.len_train])
        #
        X_transformed_train = mapper.transform(
            self.stumble_data.all_pd[:self.stumble_data.len_train])
        X_transformed_test = mapper.transform(
            self.stumble_data.all_pd[self.stumble_data.len_train:])
        # drop the source columns the mapper consumed
        for index, item in enumerate(self.feat_head):
            field = item[0]
            is_enable = item[3]
            if is_enable and field in self.stumble_data.get_features():
                del self.stumble_data.all_pd[field]
        # NOTE(review): leftover interactive debug stop — should be removed
        import pdb
        pdb.set_trace()
        from scipy.sparse import hstack
        X_train = X_transformed_train
        X_test = X_transformed_test
        y_train = self.stumble_data.all_pd[:self.stumble_data.
                                           len_train]['label']
        # print 'Dumping train in SVMLight.'
        dump_svmlight_file(X_train, y_train, output_train_libsvm_file )
        # print 'Dumping test in SVMLight.'
        # dump_svmlight_file(X_test, pred, output_test_libsvm_file )
    else:
        X_train = X_train.as_matrix()
        X_test = X_test.as_matrix()
    return X_train, y_train, X_test
def sina_content_format(contentPath, userPath, contentSavePath):
    '''Label image-bearing weibo rows with the poster's gender
    (图片微博加标签).

    contentPath rows: comma-separated; columns 2-4 are numeric counters,
    column 5 is a 'date time' string (the hour is used as a feature) and
    column 7 another numeric feature. userPath maps user id (column 0) to
    gender string (column 1). Writes a 1-based svmlight file with label
    0 for '女', 1 otherwise; rows whose user is unknown are dropped.
    '''
    data = []
    userlist = []
    contentlist = []
    genderlabel = []
    print('读取发图微博')
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contentlist.append(line.strip().split(','))
    print('读取用户列表')
    with codecs.open(userPath) as userf:
        for line in userf:
            userlist.append(line.strip().split(','))
    print('填入数据和标签')
    for row in contentlist:
        # feature vector: columns 2-4, the hour of column 5, column 7
        temp = [int(tk) for tk in row[2:5]]
        clock = row[5].split(" ")[1].split(":")
        temp.append(int(clock[0]))
        temp.append(int(row[7]))
        # BUG FIX: the original appended the feature row before knowing
        # whether a matching user exists, so a missing user left data and
        # genderlabel with different lengths; append both only on a match.
        for user in userlist:
            if user[0] == row[0]:
                genderlabel.append(0 if user[1] == '女' else 1)
                data.append(temp)
                break
    data = np.array(data)
    genderlabel = np.array(genderlabel)
    print(len(data))
    print(len(genderlabel))
    dump_svmlight_file(data, genderlabel, contentSavePath, zero_based=False)
# category vec = DictVectorizer() print 'Transforming to dict.' X_2_cat_feat = vec.fit_transform(category_rows) from scipy.sparse import hstack Y_temp = hstack((X_2_cat_feat, X_1_norm_feat)) Y_temp_2 = hstack((Y_temp, X_0_text_feat_bus_name)) Y = Y_temp_2.tocsr() dump_group_names( vec.get_feature_names(), feature_name_bus_name, 'bus_name', float_header_list, output_train_libsvm_file + '.grp', y_shape, ) print 'Dumping train in SVMLight.' dump_svmlight_file(Y[0:len_train], rating_rows[0:len_train], output_train_libsvm_file) print 'Dumping test in SVMLight.' dump_svmlight_file(Y[len_train:], rating_rows[len_train:], output_test_libsvm_file) print 'done... Dumping in SVMLight.'
# category vec = DictVectorizer() print 'Transforming to dict.' X_2_cat_feat = vec.fit_transform(category_rows) from scipy.sparse import hstack if(len(X_1_norm_feat) > 0 ): Y_temp = hstack((X_2_cat_feat,X_1_norm_feat)) else: Y_temp = X_2_cat_feat if ( len(text_rows) > 0 ): Y_temp_2 = hstack((Y_temp,X_0_text_feat_bus_name)) else: Y_temp_2 = Y_temp Y = Y_temp_2.tocsr() #dump_group_names(vec.get_feature_names(), feature_name_bus_name, 'bus_name',float_header_list, output_train_libsvm_file + '.grp', y_shape, ) print 'Dumping train in SVMLight.' dump_svmlight_file(Y[0:len_train], rating_rows[0:len_train], output_train_libsvm_file ) print 'Dumping test in SVMLight.' dump_svmlight_file(Y[len_train:], rating_rows[len_train:], output_test_libsvm_file ) print 'done... Dumping in SVMLight.'
def tovw(x, y=None, sample_weight=None):
    """Convert array or sparse matrix to Vowpal Wabbit format

    Parameters
    ----------
    x : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : {array-like}, shape (n_samples,), optional
        Target vector relative to X.
    sample_weight : {array-like}, shape (n_samples,), optional
        sample weight vector relative to X.

    Returns
    -------
    out : {array-like}, shape (n_samples, 1)
        Training vectors in VW string format

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> from vowpalwabbit.sklearn_vw import tovw
    >>> X = pd.Series(['cat', 'dog', 'cat', 'cat'], name='catdog')
    >>> y = pd.Series([-1, 1, -1, -1], name='label')
    >>> hv = HashingVectorizer()
    >>> hashed = hv.fit_transform(X)
    >>> tovw(x=hashed, y=y)
    """
    use_truth = y is not None
    use_weight = sample_weight is not None

    # convert to numpy array if needed
    if not isinstance(x, (np.ndarray, csr_matrix)):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # make sure this is a 2d array
    if x.ndim == 1:
        x = x.reshape(1, -1)
    if y.ndim == 0:
        y = y.reshape(1)

    rows, cols = x.shape

    # check for invalid characters if array has string values
    # BUG FIX: rows/cols are plain ints; the original did "for row in rows"
    # which raises TypeError ('int' object is not iterable).
    if x.dtype.char == 'S':
        for row in range(rows):
            for col in range(cols):
                x[row, col] = INVALID_CHARS.sub('.', x[row, col])

    # convert input to svmlight format
    s = io.BytesIO()
    dump_svmlight_file(x, np.zeros(rows), s)

    # parse entries to construct VW format
    rows = s.getvalue().decode('ascii').split('\n')[:-1]
    out = []
    for idx, row in enumerate(rows):
        truth = y[idx] if use_truth else 1
        weight = sample_weight[idx] if use_weight else 1
        # strip the dummy "0 " label that dump_svmlight_file wrote
        features = row.split('0 ', 1)[1]
        # only using a single namespace and no tags
        out.append('{y} {w} |{ns} {x}'.format(y=truth, w=weight,
                                              ns=DEFAULT_NS, x=features))
    s.close()

    return out
def feature_format(sinadataPath, userPath, contentPath, sinaGenderPath):
    '''Join image features with per-user gender labels (性别标签格式化).

    sinadataPath rows: comma-separated, column 1 = image name, columns 2+
    = numeric features. contentPath maps image name (column 1) to user id
    (column 0); userPath maps user id (column 0) to gender string
    (column 1). Writes a 1-based svmlight file with label 0 for '女',
    1 otherwise; rows whose image has no matching user are dropped.

    NOTE: input files are expected to be UTF-8 so the Chinese gender
    comparison works (原注释: 注意读取的格式编码).
    '''
    imagename = []    # image name of each data row
    data = []         # feature matrix
    contentlist = []  # weibo/content rows
    genderlabel = []  # gender labels
    userlist = []     # user rows
    with codecs.open(sinadataPath) as f:
        for line in f:
            tokens = line.strip().split(',')
            imagename.append(tokens[1])
            data.append([double(tk) for tk in tokens[2:]])
    imagename = np.array(imagename)
    data = np.array(data)
    print('读取发图微博')
    with codecs.open(contentPath) as contentf:
        for line in contentf:
            contentlist.append(line.strip().split(','))
    print('读取用户列表')
    with codecs.open(userPath) as userf:
        for line in userf:
            userlist.append(line.strip().split(','))
    print('填入标签')
    keep = []  # indices of rows that received a label
    for i in range(len(imagename)):
        name = imagename[i]
        print(name)
        flag = 0
        for li in contentlist:
            if name == li[1]:
                for user in userlist:
                    if user[0] == li[0]:
                        flag = 1
                        genderlabel.append(0 if user[1] == '女' else 1)
                        break
                break
        if flag == 0:
            print(i)
            print(name + "没有对应的标签")
        else:
            keep.append(i)
    # BUG FIX: the original called np.delete(data, i, 0) and discarded the
    # returned array, so unlabeled rows were never removed and data/label
    # lengths could disagree; select the labeled rows in one shot instead.
    data = data[keep]
    genderlabel = np.array(genderlabel)
    print(genderlabel)
    print(data.shape[0])
    print(genderlabel.shape[0])
    # FIX: the original quoting ("''''构建libsvm数据'''") prepended a stray
    # apostrophe to the printed message.
    print('构建libsvm数据')
    dump_svmlight_file(data, genderlabel, sinaGenderPath, zero_based=False)
def wechat_fomat(dataPath, labelPath, writeGenderPath, writeLocPath):
    '''Format WeChat data into svmlight files (微信数据格式化).

    dataPath rows: space-separated, token 0 = image name, rest numeric
    features. labelPath rows: space-separated, token 3 = image name,
    token 1 = gender string, token 5 = location code. Writes two 1-based
    svmlight files: gender (0 = '女', 1 otherwise) and location
    (0 when the code is '2', 1 otherwise). Rows without a matching label
    line are dropped.
    '''
    imagename = []   # image name of each data row
    data = []        # feature matrix
    genderlabel = [] # gender labels
    loclabel = []    # location labels
    labelfile = []   # parsed label lines
    with codecs.open(dataPath) as f:
        for line in f:
            tokens = line.strip().split(' ')
            imagename.append(tokens[0])
            data.append([double(tk) for tk in tokens[1:]])
    imagename = np.array(imagename)
    print(imagename)
    with codecs.open(labelPath) as labelf:
        for line in labelf:
            labelfile.append(line.strip().split(' '))
    keep = []  # indices of rows that received labels
    for i in range(len(imagename)):
        name = imagename[i]
        flag = 0
        for li in labelfile:
            if name == li[3]:
                flag = 1
                genderlabel.append(0 if li[1] == '女' else 1)
                loclabel.append(0 if li[5] == '2' else 1)
                # BUG FIX: the original kept scanning after a match, so a
                # name appearing on several label lines appended duplicate
                # labels and desynchronized labels from data rows.
                break
        if flag == 0:
            print(i)
            print(name + "没有对应的标签")
        else:
            keep.append(i)
    # BUG FIX: the original called np.delete(data, i, 0) and discarded the
    # returned array, so unlabeled rows were never removed; keep only the
    # labeled rows instead.
    data = np.array(data)[keep]
    genderlabel = np.array(genderlabel)
    loclabel = np.array(loclabel)
    # sizes must agree, otherwise labels and data are misaligned
    print(data.shape[0])
    print(genderlabel.shape[0])
    print(loclabel.shape[0])
    dump_svmlight_file(data, genderlabel, writeGenderPath, zero_based=False)
    dump_svmlight_file(data, loclabel, writeLocPath, zero_based=False)
    print("Wechat format End!")