def Embed(dat, train_dat, test_dat):
    content = dat.contents
    temp_content = []
    for i in content:
        temp = []
        for x in i[0].split(" "):
            temp.append(x)
        temp_content.append(temp)
    tokenizer = Tokenizer(num_words=5000, lower=True)
    tokenizer.fit_on_texts(temp_content)
    word_index = tokenizer.word_index
    Embeded = tokenizer.texts_to_sequences(temp_content)
    Embeded = pad_sequences(Embeded, maxlen=250)
    X_train, X_test, y_train, y_test = train_test_split(
        Embeded, dat.cat, test_size=0.20, random_state=42)
    train_Bunch = base.Bunch(cat=y_train, contents=X_train, vocabulary=word_index)
    test_Bunch = base.Bunch(cat=y_test, contents=X_test)
    with open("./dat/" + train_dat, "wb") as file_obj:
        pickle.dump(train_Bunch, file_obj)
    with open("./dat/" + test_dat, "wb") as file_obj:
        pickle.dump(test_Bunch, file_obj)
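As a rough usage sketch (the Bunch layout and file names below are assumptions, not part of the original code), Embed expects dat.contents to hold one-element lists of whitespace-separated text and dat.cat the matching labels; it also assumes a ./dat/ directory exists and that Tokenizer/pad_sequences come from Keras:

# Hypothetical usage sketch: the Bunch layout and file names are illustrative only.
from sklearn.datasets import base

dat = base.Bunch(
    cat=["news_tech", "news_sports"],             # one label per document
    contents=[["chip maker unveils new gpu"],     # each content item is a one-element list of text
              ["local team wins the cup final"]],
)
Embed(dat, "train_embed.dat", "test_embed.dat")   # writes ./dat/train_embed.dat and ./dat/test_embed.dat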
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342,
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]
    # note: this ShuffleSplit signature (n_iter=, indices=) is the pre-0.18 sklearn.cross_validation API
    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent,
                           indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind],
                                             target=data.target[train_ind]),
                           test=bunch.Bunch(data=[data.data[i] for i in test_ind],
                                            target=data.target[test_ind]))
    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target
    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
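The ShuffleSplit(n_iter=..., indices=True) call above relies on the pre-0.18 sklearn.cross_validation module. A minimal sketch of the same one-shot split against the current sklearn.model_selection API, for readers on newer scikit-learn (variable names reuse the ones above):

# Sketch only: current scikit-learn equivalent of ShuffleSplit(n_iter=1, ..., indices=True).
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=1, test_size=percent, random_state=rnd)
train_ind, test_ind = next(splitter.split(data.data))  # index arrays are returned directly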
def list_3_ngrams():
    # sample Chinese news snippets, kept verbatim, used as test input for list_3_ngram()
    test = [
        "原标题:不法商人“围猎”领导干部,“套路”背后是什么? 半月谈评论员字强 “我们就是猎人,领导就是猎物。” “"
        "在我们眼中,他就是我们获取利益的一个工具。” “他首先获取了你的信任,之后才跟你一步一步提出小事情的帮忙,再到大事情"
        "帮忙,之后再跟上重金贿赂,利益输送。” “不管他说得多甜言蜜语,喊你爹妈,喊你大爷,喊你恩人,你都不要当真,我就是当真"
        "了。” …… 近期,云南省纪委监委推出反腐警示专题片《围猎:行贿者说》,从“围猎”者和被“围猎”者双方角度,揭示了官商之"
        "间过从甚密、利益勾连、蝇营狗苟的乱象,不法商人与被“围猎”官员现身说法,声泪俱下地道出了“围猎”的"
        "本质和“大梦初醒”的悔恨心理,令人唏嘘,催人警醒。 专题片中,不法商人程绪库在一次饭局上认识了云南省人大财政经济委员会",
        "周四指数红盘收,可市场情绪非常差,出逃资金明显。周五当低吸赚不到钱时候,大家都选择了一键清仓,指数也就崩了,这算不算人为干预的股灾呢?早上盘"
        "面的下砸基本把大部分人吓尿了,如果是做短线的基本又是少不了割肉。然后尾盘指数拉起来点,心态也彻底崩了。以前行情好的时候,当天是拉大金融,之后是"
        "拉科技股,大家一起赚钱high。今年下半年的行情是,没有增量资金,那么就先拉大金融,过些日子拉科技股。现在行情是局部科技股走牛,大金融歇菜。缺乏赚钱"
        "效应,或者你低吸当天吃肉,第二天直接砸盘,让你没利润。这种奇葩行情是不值得留念的。别看大盘指数跌得凄凄惨惨,短线接力的情绪其实是回暖的。周五没有出"
        "现核按钮的局面,高位票和低位票承接都非常不错,这种承接的强度其实是超预期的。创业板天山生物涨停,大家知道这是20cm的首个妖",
        "原标题:31省份11月CPI出炉:22地物价降了!海南降最猛 中新经纬客户端12月12日电(董湘依)国家统计局11日公布31省份2020年11月居民消费"
        "价格指数(CPI),数据显示,22个省份11月CPI同比录得负增长,而海南为跌幅最大的省份,降幅达1.9%。仅8省份CPI同比上涨,西藏涨0.9%领涨全国。 "
        " 降降降!22省份物价负增长 国家统计局日前公布的数据显示,11月全国CPI同比下降0.5%,为时隔11年后再现负增长。各地物价也纷纷骤降,海南、湖北"
        "、湖南、山东、河北等22个省份11月CPI同比录得负增长,其中海南CPI同比降1.9%,为全国降幅最大省份。 17省份CPI涨幅数据超过了全国水平,其中,"
        "、青海、云南、甘肃、北京、新疆、山西、浙江这8个省份的CPI为正增长,西",
        " 原标题:四川威远男子杀害失足女潜逃21年被抓 犯故意杀人罪被判12年 21年前,四川威远男子罗某持匕首杀害一名失足女子,随后潜逃。直到今年6月,罗"
        "某被警方抓获归案。12月11日,中国裁判文书网公开了罗某犯故意杀人罪一案的判决书,他被判处有期徒刑12年。 今年41岁的罗某系内江市威远人,住相邻的"
        "自贡市大安区,从事个体废品收购。据公诉机关指控,1999年3月23日下午,罗某从内江出发乘坐公共汽车到威远县城找朋友余某玩耍。到了后无法联系上余某,便"
        "入住县城的渔业招待所。当晚10时左右,罗某经招待所承包人介绍,与卖淫女曹某发生了卖淫嫖娼行为,之后曹某离开房间。又过了一段时间,被害人唐某经招待所"
        "承包人介绍,前往罗某所住房间从事卖淫活动,双方发生性关系后,因费用问题发生争执。 公诉机关指控还称,"]
    Raw = base.Bunch(contents=[])
    for t in test:
        Raw.contents.append(list_3_ngram(t))
    return Raw
def corpus_to_bunch(bunch_path, seg_path):
    '''
    :param bunch_path: path where the Bunch is stored
    :param seg_path: path of the segmented (tokenized) corpus
    '''
    seg_class_list = listdir_nohidden(seg_path)
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(seg_class_list)
    for seg_class_dir in bunch.target_name:
        seg_class_path = seg_path + "/" + seg_class_dir + "/"
        seg_file_list = listdir_nohidden(seg_class_path)
        for seg_file in seg_file_list:
            seg_full_path = seg_class_path + seg_file
            bunch.label.append(seg_class_dir)
            bunch.filenames.append(seg_file)
            bunch.contents.append(read_file(seg_full_path))
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("===================*****====================")
    print("corpus_to_bunch end")
    print("===================*****====================")
def jsonDataFilter(fileInfo):
    # Reads the input data; the reading code inside this function needs adjusting for other storage layouts.
    rootPath = list(fileInfo.keys())  # root directory of the data files to read
    # print(rootPath)
    dataName = flatten_lst(list(fileInfo.values()))  # list of file names of the data files to read
    # print(dataName)
    coodiDic = []
    for fName in dataName:
        # Read each JSON file and collect the required fields: POI latitude/longitude and the
        # top-level category name. Note the coordinates use the Baidu coordinate system and are
        # not converted to WGS84.
        f = open(os.path.join(rootPath[0], fName))
        jsonDecodes = json.load(f)
        coodiDic.append([(coordi['location']['lat'], coordi['location']['lng'], fName[:-5])
                         for coordi in jsonDecodes])
    coodiDic = flatten_lst(coodiDic)  # the parsed data is nested, so flatten it
    # print(coodiDic)
    data = np.array([(v[0], v[1]) for v in coodiDic])  # latitude/longitude
    targetNames = np.array([v[2] for v in coodiDic])  # top-level category
    # print(data)
    # print(targetNames)
    class_label = LabelEncoder()  # encode the top-level category names as integers
    targetLabel = class_label.fit_transform(targetNames)
    class_mapping = [(idx, label) for idx, label in enumerate(class_label.classes_)]  # mapping between category names and integer codes
    # print(class_mapping)
    dataBunch = base.Bunch(DESCR=r'spatial points datasets of poi',
                           data=data,
                           feature_names=["XCoordinate", "yCoordinate"],
                           target=targetLabel,
                           target_names=class_mapping)  # build a scikit-learn Bunch
    return dataBunch, class_mapping  # return the Bunch and the category-name mapping
def import_dataset_from_csv(input_dict):
    """ Imports a CSV file and creates a scikit-learn dataset.
        The target value must be in the last column of the CSV file. """
    output_dict = {}
    # convert the CSV data into a scikit-learn Bunch and return it
    import numpy as np
    from io import StringIO  # was `from StringIO import StringIO` on Python 2
    # my_data = np.genfromtxt(input_dict['fileIn'], delimiter=',')
    my_data = np.genfromtxt(StringIO(input_dict['fileIn']), delimiter=',')
    num_samples, num_attributes = np.shape(my_data)
    num_targets = 1
    data = np.empty((num_samples, num_attributes - num_targets))
    target = np.empty((num_samples,))
    for i in range(0, num_samples):
        data[i] = np.asarray(my_data[i][:-1])
        target[i] = np.asarray(my_data[i][-1])
    from sklearn.datasets import base as ds
    dataset = ds.Bunch(data=data, target=target,
                       feature_names=[], DESCR="", target_names="")
    output_dict['dataset'] = dataset
    return output_dict
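A small usage sketch, assuming fileIn carries the raw CSV text with the class label in the last column (the values below are made up):

# Usage sketch with made-up data: two features per row, class label in the last column.
csv_text = "5.1,3.5,0\n4.9,3.0,0\n6.2,3.4,1\n"
result = import_dataset_from_csv({'fileIn': csv_text})
ds = result['dataset']
print(ds.data.shape, ds.target)  # (3, 2) [0. 0. 1.]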
def parse_val(temp="cnews.vocab.txt"):
    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(vocab=[])
    for line in lines:
        Bunch.vocab.append(line)
    with open("./dat/" + "cnews.vocab.dat", "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
def split_dataset_randomly(input_dict):
    """ Randomly splits a given dataset into a train and test dataset. """
    inst = input_dict['data']
    test_size = 1 - float(input_dict["p"])
    # train/test split (sklearn.cross_validation is the pre-0.18 module;
    # in current scikit-learn this lives in sklearn.model_selection)
    from sklearn.cross_validation import train_test_split
    data_train, data_test, target_train, target_test = train_test_split(
        inst['data'], inst['target'], test_size=test_size, random_state=1)
    from sklearn.datasets import base as ds
    if dataset.is_target_nominal(inst):
        a_train = ds.Bunch(data=data_train, target=target_train,
                           feature_names=inst.feature_names, DESCR=inst.DESCR,
                           target_names=inst.target_names)
        a_test = ds.Bunch(data=data_test, target=target_test,
                          feature_names=inst.feature_names, DESCR=inst.DESCR,
                          target_names=inst.target_names)
    else:
        a_train = ds.Bunch(data=data_train, target=target_train,
                           feature_names=inst.feature_names, DESCR=inst.DESCR)
        a_test = ds.Bunch(data=data_test, target=target_test,
                          feature_names=inst.feature_names, DESCR=inst.DESCR)
    if "feature_value_names" in inst:  # was inst.has_key(...) on Python 2
        a_train["feature_value_names"] = inst.feature_value_names
        a_test["feature_value_names"] = inst.feature_value_names
    return {'train_data': a_train, 'test_data': a_test}
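For reference, the same split expressed against the current sklearn.model_selection API; note that p above is the training fraction, so the test fraction is 1 - p. This is a sketch under that assumption, not part of the original module:

# Sketch: modern equivalent of the split above, with `inst` being the same Bunch-style dataset.
from sklearn.model_selection import train_test_split

p = 0.7  # hypothetical training fraction
data_train, data_test, target_train, target_test = train_test_split(
    inst['data'], inst['target'], test_size=1 - p, random_state=1)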
def load_testotto(fname=testFileName, fpath=parentDirPath):
    data = np.loadtxt(os.path.join(fpath, fname), delimiter=',', dtype=float)
    flat_data = data[:, :]
    images = flat_data.view()
    return base.Bunch(
        data=flat_data,
        target=None,
        target_names=np.arange(1, 10),
        images=images,
    )
def combine():
    Bunch = base.Bunch(cat=[], contents=[])
    aa = read_dat("dataset_aa.dat")
    ab = read_dat("dataset_ab.dat")
    ac = read_dat("dataset_ac.dat")
    # Bunch.cat = aa.cat + ab.cat
    # Bunch.contents = aa.contents + ab.contents
    Bunch.cat = aa.cat + ab.cat + ac.cat
    Bunch.contents = aa.contents + ab.contents + ac.contents
    with open("./dat/dataset.dat", "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
def load_otto(fname=trainFileName, fpath=parentDirPath):
    '''load_data from fname'''
    data = np.loadtxt(os.path.join(fpath, fname), delimiter=',', dtype=float)
    flat_data = data[:, :-1]
    target = data[:, -1]
    images = flat_data.view()
    return base.Bunch(
        data=flat_data,
        target=target.astype(np.int),
        target_names=np.arange(1, 10),
        images=images,
    )
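A usage sketch, assuming the CSV has the class label (1–9) in its last column; the file name and directory below are hypothetical:

# Usage sketch: file name and directory are hypothetical.
otto = load_otto(fname='otto_train.csv', fpath='./data')
print(otto.data.shape, otto.target[:5], otto.target_names)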
def TDM(bunch, temp_dat):
    temp = []
    idfBunch = base.Bunch(tdm=[])
    for content in bunch.contents:
        temp = temp + content
    train_tfidfbunch = read_dat("train_tfidf.dat")
    idfBunch.vocabulary = train_tfidfbunch.vocabulary
    vectorizer = CountVectorizer(vocabulary=train_tfidfbunch.vocabulary)  # count frequency
    transformer = TfidfTransformer()  # tf-idf weight
    freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
    tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tf-idf matrix
    idfBunch.tdm = tfidf
    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(idfBunch, file_obj)
def tf_idf(bunch, temp_dat, train_tfidf_path=None):
    temp = []
    idfBunch = base.Bunch(cat=bunch.cat, contents=bunch.contents, tdm=[], vocabulary=[])
    count, _count = 0, 0
    partial = []
    last = []
    for content in bunch.contents:
        print(content)
        partial = partial + content
        last = partial
        count += 1
        if count // 20000 == 1:
            temp = temp + partial
            partial = []
            count = 0
            _count += 1
            print(_count)
    temp = temp + last
    if train_tfidf_path is None:
        vectorizer = CountVectorizer()  # count frequency
        transformer = TfidfTransformer()  # tf-idf weight
        freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
        # print(freq)
        tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tf-idf matrix
        tfidf = csr_matrix(tfidf, dtype=np.float32)
        idfBunch.tdm = tfidf
        idfBunch.vocabulary = vectorizer.vocabulary_
    else:
        train_tfidfbunch = read_dat("train_tfidf.dat")
        idfBunch.vocabulary = train_tfidfbunch.vocabulary
        vectorizer = CountVectorizer(vocabulary=train_tfidfbunch.vocabulary)  # count frequency
        print(train_tfidfbunch.vocabulary)
        transformer = TfidfTransformer()  # tf-idf weight
        freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
        tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tf-idf matrix
        tfidf = csr_matrix(tfidf, dtype=np.float32)
        print(tfidf)
        idfBunch.tdm = tfidf
    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(idfBunch, file_obj)
def parse_dataset(temp, num=None):
    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(cat=[], contents=[])
    count_ = 0
    for line in lines:
        t = ""
        cat = ""
        count = 0
        for element in line.split("|")[2:]:
            if len(element) > 1 and element != ",":
                if count == 0:
                    # print(element)
                    if "news_game" in element or "news_comic" in element:
                        cat = "news_entertainment"
                        # print("--news_entertainment")
                    if "digital" in element:
                        cat = "news_tech"
                        # print("--news_tech")
                    if "news_history" in element:
                        cat = "news_culture"
                        # print("--news_culture")
                    if "news_politics" in element:
                        cat = "news_society"
                    else:
                        for categorie in categories:
                            if categorie in element:
                                cat = categorie
                    count += 1
                else:
                    # print(element+"\n")
                    t = t + element
        gram_3 = list_3_ngram(t)
        if num:
            if count_ > int(num):
                break
        if len(gram_3[0]) != 0 and cat:
            Bunch.cat.append(cat)
            Bunch.contents.append(gram_3)
            count_ += 1
            print("So far: " + str(count_))
    # note: `temp_dat` is expected to be defined at module level; it is not a parameter of this function
    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
def parse_dataset(temp, num):
    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(cat=[], contents=[])
    for line in lines:
        cat = line[0:2]
        content = line[3:103]
        if categories_dict[cat] <= num:
            gram_3 = list_3_ngram(content)
            # content = stopwords(content)
            Bunch.cat.append(cat)
            Bunch.contents.append(gram_3)
            categories_dict[cat] += 1
            print("So far: " + cat + " " + str(categories_dict[cat]))
    # note: `temp_dat` is expected to be defined at module level; it is not a parameter of this function
    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)
    # note: the Bunch fields must be named filenames/contents to match the appends below
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    # collect every file under each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # full path of the category sub-directory
        file_list = os.listdir(class_path)   # all files under class_path
        for file_path in file_list:          # iterate over the files in this category
            fullname = class_path + file_path  # full path of the file
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
    # append(element) adds a single element to a Python list; note the difference from extend()
    # store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text Bunch object!")
def json2bunch(fName):
    # Reads the input data; the reading code inside this function needs adjusting for other storage layouts.
    infoDic = []
    f = open(fName)
    jsonDecodes = json.load(f)
    j = 0
    for info in jsonDecodes:
        condiKeys = info['detail_info'].keys()
        # only keep a record when every required key is present; otherwise skip it
        if 'price' in condiKeys and 'overall_rating' in condiKeys and 'service_rating' in condiKeys \
                and 'facility_rating' in condiKeys and 'hygiene_rating' in condiKeys \
                and 'image_num' in condiKeys and 'comment_num' in condiKeys and 'favorite_num' in condiKeys:
            if 50 < float(info['detail_info']['price']) < 1000:  # only keep records within this price range
                j += 1
                infoDic.append([info['location']['lat'], info['location']['lng'],
                                info['detail_info']['price'], info['detail_info']['overall_rating'],
                                info['detail_info']['service_rating'], info['detail_info']['facility_rating'],
                                info['detail_info']['hygiene_rating'], info['detail_info']['image_num'],
                                info['detail_info']['comment_num'], info['detail_info']['favorite_num'],
                                info['detail_info']['checkin_num'], info['name']])
            else:
                pass
        else:
            pass
    print('.....................................', j)
    data = np.array([(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10])
                     for v in infoDic], dtype='float')  # explanatory variables (features)
    targetInfo = np.array([v[11] for v in infoDic])  # target variable (labels)
    dataBunch = base.Bunch(DESCR=r'info of poi',
                           data=data,
                           feature_names=['lat', 'lng', 'price', 'overall_rating', 'service_rating',
                                          'facility_rating', 'hygiene_rating', 'image_num',
                                          'comment_num', 'favorite_num', 'checkin_num'],
                           target=targetInfo,
                           target_names=['price', 'name'])  # build a scikit-learn Bunch
    return dataBunch  # return the Bunch
def load_myDigits(fpath='F:/ML/digits/', fDataName='train-images-idx3-ubyte',
                  fLabelName='train-labels-idx1-ubyte'):
    with open(join(fpath, fDataName), 'rb') as fin:
        magicNumber, n_sample, n_row, n_col = struct.unpack('>IIII', fin.read(struct.calcsize("=IIII")))
        n_size = n_row * n_col
        vectors = []
        for i in range(n_sample):
            vectors.append(list(struct.unpack('B' * n_size, fin.read(struct.calcsize('B' * n_size)))))
        # print(vectors[:10])
        flat_data = np.array(vectors)
    with open(join(fpath, fLabelName), 'rb') as fin:
        magicNumber, n_sample = struct.unpack('>II', fin.read(struct.calcsize("=II")))
        # print('magicNumber =', magicNumber)
        # print('n_sample =', n_sample)
        # pres = struct.unpack('bbbbbbbb', fin.read(struct.calcsize("=bbbbbbbb")))
        # print('pres =', pres)
        labels = struct.unpack('B' * n_sample, fin.read(struct.calcsize('B' * n_sample)))
        # print(labels[:10])
        # print(len(labels), labels[0])
        target = np.array(labels)
    images = flat_data.view()
    images.shape = (-1, n_row, n_col)
    # for index, (image, prediction) in enumerate(list(zip(flat_data[:4], target[:4]))):
    #     plt.subplot(1, 4, index+1)
    #     plt.axis('off')
    #     imageData = image.reshape(28, 28)
    #     plt.imshow(imageData, cmap=plt.cm.gray_r, interpolation='nearest')
    #     plt.title('Prediction: %i' % prediction)
    # plt.show()
    return base.Bunch(
        data=flat_data,
        target=target.astype(np.int),
        target_names=np.arange(10),
        images=images
    )
""" Bunch和字典结构类似,也是由键值对组成,和字典区别:其键值可以被实例对象当作属性使用。 """ from sklearn.datasets import base buch = base.Bunch(A=1,B=2,c=3) print(buch.A)
def convert_weka_instances_to_bunch(instances):
    '''
    Converts WEKA Instances to the scikit Bunch format
    :param instances: WEKA dataset (Instances)
    :return:
    '''
    if not jp.isThreadAttachedToJVM():
        jp.attachThreadToJVM()
    if instances.classIndex() < 0:
        instances.setClassIndex(instances.numAttributes() - 1)
    target_att = instances.classAttribute()
    target_names = []
    if target_att.isNominal():
        for j in range(0, target_att.numValues()):
            target_names.append(target_att.value(j))
    feature_names = []
    num_samples = instances.numInstances()
    num_attributes = instances.numAttributes()
    num_targets = 1
    data = np.empty((num_samples, num_attributes - num_targets))
    target = np.empty((num_samples,), dtype=np.int)
    fdescr = instances.relationName()
    feature_value_names = []
    for j in range(0, num_attributes - num_targets):
        myatt = instances.attribute(j)
        # mtype = 1 if myatt.isNumeric() else 0
        mname = myatt.name()
        feature_names.append(mname)
        num_vals = myatt.numValues()
        f_vals = []
        for k in range(0, num_vals):
            f_vals.append(myatt.value(k))
        feature_value_names.append(f_vals)
    for i in range(0, num_samples):
        arr = []
        for j in range(0, num_attributes):
            arr.append(instances.get(i).value(j))
        data[i] = np.asarray(arr[:-1], dtype=np.float)
        if target_att.isNominal():
            target[i] = np.asarray(arr[-1], dtype=np.int)
        else:
            target[i] = np.asarray(arr[-1], dtype=np.float)
    return sk.Bunch(data=data,
                    target=target,
                    target_names=target_names,
                    DESCR=fdescr,
                    feature_value_names=feature_value_names,
                    feature_names=feature_names)
    # feature_names example: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
encoding="latin1", load_content=True, random_state=rnd) data.data = [remove_header_subject(text) for text in data.data] print "Data loaded in %f." % (time() - t0) print 'Total number of data: %d' % len(data.data) indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd) for train_ind, test_ind in indices: data = bunch.Bunch(train=bunch.Bunch( data=[data.data[i] for i in train_ind], target=data.target[train_ind]), test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind])) t0 = time() print sep print "Extracting features from the training dataset using a sparse vectorizer..." print "Feature extraction technique is %s." % vect X_tr = vect.fit_transform(data.train.data) y_tr = data.train.target duration = time() - t0 print "done in %fs" % duration print "n_samples: %d, n_features: %d" % X_tr.shape t0 = time() print sep print "Extracting features from the test dataset using the same vectorizer..."
feature_names = list(comm_conns[0])[0].split(",")[1:92]  # column names split by commas, without subject_id
raw_data = list(comm_conns[1:len(comm_conns)])
data = []
for row in raw_data:
    row_as_list_subj_id = row[0].split(",")  # there are 92 items per row
    row_as_list = row_as_list_subj_id[1:92]  # remove subject_id
    row_as_list = list(map(float, row_as_list))
    data.append(row_as_list)
data = np.asarray(data)
target = np.asarray(beh.maltreatment)
classify_adv_df = base.Bunch(target_names=target_names,
                             feature_names=feature_names,
                             target=target,
                             data=data)

# Train and test adversity classifier
# Specify the hyperparameter space, test code here:
# parameters = {'SVM__C': [1, 10, 100],
#               'SVM__gamma': [0.1, 0.01]}
#
# c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
# temp = GridSearchCV(svc, param_grid, cv=5)

# Create splits with leave-one-out cross validation
loo.get_n_splits(classify_adv_df.data)
def simulation_result(self, print_result=False, plot_result=False, save=False):
    # print('\n============ Simulation Result ==============')
    print('\n-------- {} ----------'.format(self.name))
    # X_o and y_o
    print('\nPredicting on training set:')
    X_train, y_train, y_train_pred = self.predict_one_by_one(self.X_train, self.y_train)
    print('\nPredicting on testing set:')
    X_test, y_test, y_test_pred = self.predict_one_by_one(self.X_test, self.y_test)

    if True:
        from sklearn.metrics import r2_score
        y_train_mean = np.mean(y_train)
        y_train_pred_mean = np.mean(y_train_pred)
        self.result_train = {}
        self.result_train['r2'] = round(r2_score(y_train, y_train_pred), 4)
        self.result_train['mae'] = round(np.mean(np.abs(y_train - y_train_pred)), 2)
        self.result_train['sae'] = round(np.abs(y_train_mean - y_train_pred_mean) / y_train_mean, 4)
        self.result_train['y_mean'] = round(y_train_mean, 2)
        self.result_train['y_pred_mean'] = round(y_train_pred_mean, 2)

        y_test_mean = np.mean(y_test)
        y_test_pred_mean = np.mean(y_test_pred)
        self.result_test = {}
        self.result_test['r2'] = round(r2_score(y_test, y_test_pred), 4)
        self.result_test['mae'] = round(np.mean(np.abs(y_test - y_test_pred)), 2)
        self.result_test['sae'] = round(np.abs(y_test_mean - y_test_pred_mean) / y_test_mean, 4)
        self.result_test['y_mean'] = round(y_test_mean, 2)
        self.result_test['y_pred_mean'] = round(y_test_pred_mean, 2)

    if print_result == True:
        print('\nTraining:')
        print('SAE =', self.result_train['sae'])
        print('MAE =', self.result_train['mae'])
        print('R^2 =', self.result_train['r2'])
        print('GT_mean =', self.result_train['y_mean'])
        print('pred_mean =', self.result_train['y_pred_mean'])
        print('\nTesting:')
        print('SAE =', self.result_test['sae'])
        print('MAE =', self.result_test['mae'])
        print('R^2 =', self.result_test['r2'])
        print('GT_mean =', self.result_test['y_mean'])
        print('pred_mean =', self.result_test['y_pred_mean'])

    if plot_result == True:
        plt.figure()
        plt.suptitle(self.name)
        plt.subplot(2, 1, 1)
        plt.title('Results on Training Set ($R^2$ score={:.3f})'.format(self.result_train['r2']))
        plt.plot(X_train, label='Aggregate Data')
        plt.plot(y_train, label='({}) Ground Truth'.format(self.appliance))
        plt.plot(y_train_pred, label='({}) Prediction'.format(self.appliance))
        plt.legend(loc='upper right')
        plt.ylabel('Power/W')
        plt.subplot(2, 1, 2)
        plt.title('Results on Testing Set ($R^2$ score={:.3f})'.format(self.result_test['r2']))
        plt.plot(X_test, label='Aggregate Data')
        plt.plot(y_test, label='({}) Ground Truth'.format(self.appliance))
        plt.plot(y_test_pred, label='({}) Prediction'.format(self.appliance))
        plt.legend(loc='upper right')
        plt.xlabel('Time/s')
        plt.ylabel('Power/W')
        plt.show()

    if save == True:
        from sklearn.datasets import base
        import pickle
        data = base.Bunch(name=self.name,
                          appliance=self.appliance,
                          X_train=X_train, y_train=y_train, y_train_pred=y_train_pred,
                          result_train=self.result_train,
                          X_test=X_test, y_test=y_test, y_test_pred=y_test_pred,
                          result_test=self.result_test,
                          train_history=self.history)
        resultPath = r'resultData/{}.pickle'.format(self.name)
        with open(resultPath, 'wb') as file:
            pickle.dump(data, file)
        print('\nResult saved to', resultPath)
def get_immunized_nodes_and_movement(epidemic_graph, movement_info, n_immunized_nodes):
    # im_bunch = immunized nodes in a Bunch format:
    im_bunch = base.Bunch()
    # Choose immunization by neighbor of a random node:
    im_bunch.neighbors = get_immunized_neighbors(epidemic_graph, n_immunized_nodes)
    im_bunch.neighbors_move = get_immunized_movement_info(movement_info, im_bunch.neighbors)
    # Choose immunization at random:
    im_bunch.random = get_immunized_random(epidemic_graph, n_immunized_nodes)
    im_bunch.random_move = get_immunized_movement_info(movement_info, im_bunch.random)
    # Choose immunization by k-shell (core):
    core_dict = nx.core_number(epidemic_graph)
    im_bunch.core = dict_to_sorted_list(core_dict, n_immunized_nodes)
    im_bunch.core_move = get_immunized_movement_info(movement_info, im_bunch.core)
    # Choose immunization by unweighted clustering coefficient c:
    clustering_dict = nx.clustering(epidemic_graph)
    im_bunch.clustering = dict_to_sorted_list(clustering_dict, n_immunized_nodes)
    im_bunch.clustering_move = get_immunized_movement_info(movement_info, im_bunch.clustering)
    # Choose immunization by degree k:
    degree_dict = epidemic_graph.degree()
    im_bunch.degree = dict_to_sorted_list(degree_dict, n_immunized_nodes)
    im_bunch.degree_move = get_immunized_movement_info(movement_info, im_bunch.degree)
    # Choose immunization by strength (weighted degree):
    strenght_dict = epidemic_graph.degree(weight='weight')
    im_bunch.strenght = dict_to_sorted_list(strenght_dict, n_immunized_nodes)
    im_bunch.strenght_move = get_immunized_movement_info(movement_info, im_bunch.strenght)
    # Choose immunization by betweenness:
    betweenness_dict = nx.betweenness_centrality(epidemic_graph)
    im_bunch.betweenness = dict_to_sorted_list(betweenness_dict, n_immunized_nodes)
    im_bunch.betweenness_move = get_immunized_movement_info(movement_info, im_bunch.betweenness)
    # Choose immunization by closeness:
    closeness_dict = nx.closeness_centrality(epidemic_graph)
    im_bunch.closeness = dict_to_sorted_list(closeness_dict, n_immunized_nodes)
    im_bunch.closeness_move = get_immunized_movement_info(movement_info, im_bunch.closeness)
    # All immunized nodes:
    neighbors_set = set(im_bunch.neighbors)
    random_set = set(im_bunch.random)
    core_set = set(im_bunch.core)
    clustering_set = set(im_bunch.clustering)
    degree_set = set(im_bunch.degree)
    strenght_set = set(im_bunch.strenght)
    betweenness_set = set(im_bunch.betweenness)
    closeness_set = set(im_bunch.closeness)
    all_immunized_nodes = neighbors_set.union(random_set, core_set, clustering_set, degree_set,
                                              strenght_set, betweenness_set, closeness_set)
    im_bunch.all_immunized_nodes = list(all_immunized_nodes)
    return im_bunch
def update_to_pickle(data):
    Full_stations, City_only, ErrorCities = data
    # reorder the columns for readability
    columns = [
        'time_point', 'area', 'position_name', 'station_code', 'aqi', 'quality',
        'primary_pollutant', 'pm2_5', 'pm2_5_24h', 'pm10', 'pm10_24h', 'co', 'co_24h',
        'no2', 'no2_24h', 'o3', 'o3_24h', 'o3_8h', 'o3_8h_24h', 'so2', 'so2_24h'
    ]
    Full_stations = Full_stations.reindex(columns=columns)
    City_only = City_only.reindex(columns=columns)
    City_only.pop('position_name')
    City_only.pop('station_code')
    time_point = City_only.iloc[0, 0]
    # time_point = City_only.ix[0, 'time_point']

    # save the data from this update
    with open('AQIsData/update.pickle', 'wb') as file:
        data = base.Bunch(full=Full_stations, city=City_only, time=time_point,
                          notUpdatedCity=ErrorCities)
        pickle.dump(data, file)

    # merge the update into the historical data
    month = time_point[:7]
    his_filename = '{}.pickle'.format(month)  # one file per month of data
    filepath = 'AQIsData/' + his_filename
    import os
    if os.path.exists(filepath):
        try:
            # Full_his, City_his, time_his = pd.read_pickle(filepath)
            his = pd.read_pickle(filepath)
            Full_his = his.full
            City_his = his.city
            time_his = his.time
        except Exception as e:
            # If the month's historical data cannot be loaded, avoid overwriting it by mistake:
            # save this update to a separate pickle to be merged manually later.
            filename = 'not-merged-Data-{}.pickle'.format(time_point)
            with open(r'AQIsData/' + filename, 'wb') as file:
                # pickle.dump([Full_stations, City_only, time_point], file)
                data = base.Bunch(full=Full_stations, city=City_only, time=time_point,
                                  notUpdatedCity=ErrorCities)
                pickle.dump(data, file)
            log('[Error] Fail to load [{}] and unable to merge into his data. '
                'Create an extra file:{}. ({})'.format(his_filename, filename, e))
            return
    else:
        # otherwise create a new pickle file for the new month
        Full_his = City_his = pd.DataFrame()
        time_his = pd.Series()
        log('=======================================================================================')
        log('[New his pickle] Create {}'.format(his_filename))
    # merge the update into the history
    Full_his = pd.concat([Full_stations, Full_his], axis=0, join='outer', ignore_index=True)
    City_his = pd.concat([City_only, City_his], axis=0, join='outer', ignore_index=True)
    time_his = pd.Series(time_point).append(time_his, ignore_index=True)
    with open(filepath, 'wb') as file:
        # pickle.dump([Full_his, City_his, time_his], file)
        data = base.Bunch(full=Full_his, city=City_his, time=time_his)
        pickle.dump(data, file)