Example #1
def Embed(dat, train_dat, test_dat):
    content = dat.contents
    temp_content = []

    for i in content:
        temp = []
        for x in i[0].split(" "):
            temp.append(x)
        temp_content.append(temp)

    tokenizer = Tokenizer(num_words=5000, lower=True)
    tokenizer.fit_on_texts(temp_content)
    word_index = tokenizer.word_index
    Embeded = tokenizer.texts_to_sequences(temp_content)
    Embeded = pad_sequences(Embeded, maxlen=250)

    X_train, X_test, y_train, y_test = train_test_split(Embeded,
                                                        dat.cat,
                                                        test_size=0.20,
                                                        random_state=42)
    train_Bunch = base.Bunch(cat=y_train,
                             contents=X_train,
                             vocabulary=word_index)
    test_Bunch = base.Bunch(cat=y_test, contents=X_test)
    with open("./dat/" + train_dat, "wb") as file_obj:
        pickle.dump(train_Bunch, file_obj)
    with open("./dat/" + test_dat, "wb") as file_obj:
        pickle.dump(test_Bunch, file_obj)
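The function above assumes Keras and scikit-learn imports that are not shown. Below is a minimal, hypothetical driver; the import paths, the `./dat/` output directory, and the toy Bunch are assumptions, not part of the original example.

# Sketch only: imports the snippet appears to rely on, plus a hypothetical call to Embed().
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.datasets import base  # removed in newer scikit-learn; use `from sklearn.utils import Bunch` there
from tensorflow.keras.preprocessing.text import Tokenizer          # legacy Keras preprocessing API
from tensorflow.keras.preprocessing.sequence import pad_sequences

os.makedirs("./dat", exist_ok=True)  # Embed() pickles its output under ./dat/

# Toy input: each `contents` entry is a one-element list holding a whitespace-separated string.
toy = base.Bunch(
    contents=[["some short document"], ["another short document"],
              ["a third document"], ["and one more"]],
    cat=["news_tech", "news_culture", "news_tech", "news_culture"],
)
Embed(toy, "train_toy.dat", "test_toy.dat")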
Example #2
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342, \
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                              test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target

    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
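Note that the `ShuffleSplit(len(...), n_iter=..., indices=True)` call above uses the pre-0.18 scikit-learn cross-validation API, and `get_feature_names()` has since been replaced by `get_feature_names_out()`. A rough sketch of the split step with the current `sklearn.model_selection` API, reusing the names from the snippet:

# Sketch: the same one-shot train/test split with the modern ShuffleSplit API.
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=1, test_size=percent, random_state=rnd)
train_ind, test_ind = next(splitter.split(data.data))
data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                   test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))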
Example #3
def list_3_ngrams():
    test = ["原标题:不法商人“围猎”领导干部,“套路”背后是什么?  半月谈评论员字强  “我们就是猎人,领导就是猎物。”  “"
            "在我们眼中,他就是我们获取利益的一个工具。”  “他首先获取了你的信任,之后才跟你一步一步提出小事情的帮忙,再到大事情"
            "帮忙,之后再跟上重金贿赂,利益输送。”  “不管他说得多甜言蜜语,喊你爹妈,喊你大爷,喊你恩人,你都不要当真,我就是当真"
            "了。”  ……  近期,云南省纪委监委推出反腐警示专题片《围猎:行贿者说》,从“围猎”者和被“围猎”者双方角度,揭示了官商之"
            "间过从甚密、利益勾连、蝇营狗苟的乱象,不法商人与被“围猎”官员现身说法,声泪俱下地道出了“围猎”的"
            "本质和“大梦初醒”的悔恨心理,令人唏嘘,催人警醒。  专题片中,不法商人程绪库在一次饭局上认识了云南省人大财政经济委员会",

            "周四指数红盘收,可市场情绪非常差,出逃资金明显。周五当低吸赚不到钱时候,大家都选择了一键清仓,指数也就崩了,这算不算人为干预的股灾呢?早上盘"
            "面的下砸基本把大部分人吓尿了,如果是做短线的基本又是少不了割肉。然后尾盘指数拉起来点,心态也彻底崩了。以前行情好的时候,当天是拉大金融,之后是"
            "拉科技股,大家一起赚钱high。今年下半年的行情是,没有增量资金,那么就先拉大金融,过些日子拉科技股。现在行情是局部科技股走牛,大金融歇菜。缺乏赚钱"
            "效应,或者你低吸当天吃肉,第二天直接砸盘,让你没利润。这种奇葩行情是不值得留念的。别看大盘指数跌得凄凄惨惨,短线接力的情绪其实是回暖的。周五没有出"
            "现核按钮的局面,高位票和低位票承接都非常不错,这种承接的强度其实是超预期的。创业板天山生物涨停,大家知道这是20cm的首个妖",

            "原标题:31省份11月CPI出炉:22地物价降了!海南降最猛  中新经纬客户端12月12日电(董湘依)国家统计局11日公布31省份2020年11月居民消费"
            "价格指数(CPI),数据显示,22个省份11月CPI同比录得负增长,而海南为跌幅最大的省份,降幅达1.9%。仅8省份CPI同比上涨,西藏涨0.9%领涨全国。 "
            " 降降降!22省份物价负增长  国家统计局日前公布的数据显示,11月全国CPI同比下降0.5%,为时隔11年后再现负增长。各地物价也纷纷骤降,海南、湖北"
            "、湖南、山东、河北等22个省份11月CPI同比录得负增长,其中海南CPI同比降1.9%,为全国降幅最大省份。  17省份CPI涨幅数据超过了全国水平,其中,"
            "、青海、云南、甘肃、北京、新疆、山西、浙江这8个省份的CPI为正增长,西",

            "  原标题:四川威远男子杀害失足女潜逃21年被抓 犯故意杀人罪被判12年  21年前,四川威远男子罗某持匕首杀害一名失足女子,随后潜逃。直到今年6月,罗"
            "某被警方抓获归案。12月11日,中国裁判文书网公开了罗某犯故意杀人罪一案的判决书,他被判处有期徒刑12年。  今年41岁的罗某系内江市威远人,住相邻的"
            "自贡市大安区,从事个体废品收购。据公诉机关指控,1999年3月23日下午,罗某从内江出发乘坐公共汽车到威远县城找朋友余某玩耍。到了后无法联系上余某,便"
            "入住县城的渔业招待所。当晚10时左右,罗某经招待所承包人介绍,与卖淫女曹某发生了卖淫嫖娼行为,之后曹某离开房间。又过了一段时间,被害人唐某经招待所"
            "承包人介绍,前往罗某所住房间从事卖淫活动,双方发生性关系后,因费用问题发生争执。  公诉机关指控还称,"]
    Raw = base.Bunch(contents=[])
    for t in test:
        Raw.contents.append(list_3_ngram(t))
    return Raw
Example #4
def corpus_to_bunch(bunch_path, seg_path):
    '''
    :param bunch_path: path where the Bunch is stored
    :param seg_path:  path of the segmented (word-tokenised) corpus
    '''
    seg_class_list = listdir_nohidden(seg_path)
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(seg_class_list)

    for seg_class_dir in bunch.target_name:

        seg_class_path = seg_path + "/" + seg_class_dir + "/"
        seg_file_list = listdir_nohidden(seg_class_path)

        for seg_file in seg_file_list:
            seg_full_path = seg_class_path + seg_file
            bunch.label.append(seg_class_dir)
            bunch.filenames.append(seg_file)
            bunch.contents.append(read_file(seg_full_path))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

    print("===================*****====================")
    print("corpus_to_bunch end")
    print("===================*****====================")
Example #5
def jsonDataFilter(fileInfo):  # adjust the reading code inside this function to match how the input data is stored
    rootPath = list(fileInfo.keys())  # root directory of the data files to read
    #    print(rootPath)
    dataName = flatten_lst(list(fileInfo.values()))  # list of file names to read
    #    print(dataName)
    coodiDic = []
    for fName in dataName:  # read the JSON files one by one and collect the needed fields; here: POI latitude/longitude and top-level industry category. Note: Baidu coordinate system, not converted to WGS84.
        f = open(os.path.join(rootPath[0], fName))
        jsonDecodes = json.load(f)
        coodiDic.append([(coordi['location']['lat'], coordi['location']['lng'],
                          fName[:-5]) for coordi in jsonDecodes])
        coodiDic = flatten_lst(coodiDic)  # the parsed data is nested, so flatten it


#    print(coodiDic)
    data = np.array([(v[0], v[1]) for v in coodiDic])  # latitude/longitude
    targetNames = np.array([v[2] for v in coodiDic])  # top-level category
    #    print(data)
    #    print(targetNames)
    class_label = LabelEncoder()  # encode the category names as integers
    targetLabel = class_label.fit_transform(targetNames)
    class_mapping = [
        (idx, label) for idx, label in enumerate(class_label.classes_)
    ]  # mapping between category names and their integer codes
    #    print(class_mapping)
    dataBunch = base.Bunch(DESCR=r'spatial points datasets of poi',
                           data=data,
                           feature_names=["XCoordinate", "yCoordinate"],
                           target=targetLabel,
                           target_names=class_mapping)  # build a scikit-learn Bunch
    return dataBunch, class_mapping  # return the Bunch and the category-name mapping
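For clarity, the `fileInfo` argument maps one root directory to the JSON file names inside it (the file name, minus `.json`, becomes the category). A hypothetical example:

# Sketch: the fileInfo mapping expected by jsonDataFilter(); directory and file names are made up.
fileInfo = {
    './data/poi': ['hotel.json', 'restaurant.json', 'school.json'],
}
dataBunch, class_mapping = jsonDataFilter(fileInfo)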
Example #6
def import_dataset_from_csv(input_dict):
    """ Imports CSV file, and creates a Scikit dataset. """

    # the target value must be in the last column of the CSV file
    output_dict = {}
    # this code converts data from the CSV file into a scikit-learn dataset and returns it inside a dict
    import numpy as np

    # my_data = np.genfromtxt(input_dict['fileIn'], delimiter=',')
    from io import StringIO  # Python 3; the original Python 2 code used `from StringIO import StringIO`
    my_data = np.genfromtxt(StringIO(input_dict['fileIn']), delimiter=',')

    num_samples, num_attributes = np.shape(my_data)
    num_targets = 1

    data = np.empty((num_samples, num_attributes - num_targets))
    target = np.empty((num_samples,))

    for i in range(0, num_samples):
        data[i] = np.asarray(my_data[i][:-1])
        target[i] = np.asarray(my_data[i][-1])

    from sklearn.datasets import base as ds
    dataset = ds.Bunch(data=data,
                       target=target,
                       feature_names=[],
                       DESCR="",
                       target_names="")

    output_dict['dataset'] = dataset
    return output_dict
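A hypothetical call, feeding the CSV content as a string; the values are made up, and the last column is the target, as the comment above notes:

# Sketch: drive import_dataset_from_csv() with an in-memory CSV string.
csv_text = "5.1,3.5,1.4,0\n4.9,3.0,1.3,0\n6.2,2.9,4.3,1\n5.9,3.0,5.1,1\n"
out = import_dataset_from_csv({'fileIn': csv_text})
ds = out['dataset']
print(ds.data.shape)  # (4, 3) feature matrix
print(ds.target)      # last CSV column: [0. 0. 1. 1.]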
Example #7
def parse_val(temp="cnews.vocab.txt"):
    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(vocab=[])
    for line in lines:
        Bunch.vocab.append(line)
    with open("./dat/" + "cnews.vocab.dat", "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
Example #8
def split_dataset_randomly(input_dict):
    """ Randomly splits a given dataset into a train and test dataset."""

    inst = input_dict['data']
    test_size = 1 - float(input_dict["p"])

    # train/test split
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
    data_train, data_test, target_train, target_test = train_test_split(
            inst['data'],
            inst['target'],
            test_size=test_size,
            random_state=1)

    from sklearn.datasets import base as ds

    if dataset.is_target_nominal(inst):
        a_train = ds.Bunch(data=data_train,
                           target=target_train,
                           feature_names=inst.feature_names,
                           DESCR=inst.DESCR,
                           target_names=inst.target_names)

        a_test = ds.Bunch(data=data_test,
                          target=target_test,
                          feature_names=inst.feature_names,
                          DESCR=inst.DESCR,
                          target_names=inst.target_names)
    else:
        a_train = ds.Bunch(data=data_train,
                           target=target_train,
                           feature_names=inst.feature_names,
                           DESCR=inst.DESCR)

        a_test = ds.Bunch(data=data_test,
                          target=target_test,
                          feature_names=inst.feature_names,
                          DESCR=inst.DESCR)

    if "feature_value_names" in inst:  # dict.has_key() no longer exists in Python 3
        a_train["feature_value_names"] = inst.feature_value_names
        a_test["feature_value_names"] = inst.feature_value_names

    return {'train_data': a_train, 'test_data': a_test}
Example #9
def load_testotto(fname=testFileName, fpath=parentDirPath):
	data = np.loadtxt(os.path.join(fpath, fname), delimiter=',', dtype=float)
	flat_data = data[:,:]
	images = flat_data.view()
	return base.Bunch(
		data=flat_data,
		target=None,
		target_names=np.arange(1,10),
		images=images,
	)
Example #10
def combine():
    Bunch = base.Bunch(cat=[], contents=[])
    aa = read_dat("dataset_aa.dat")
    ab = read_dat("dataset_ab.dat")
    ac = read_dat("dataset_ac.dat")
    # Bunch.cat = aa.cat + ab.cat
    # Bunch.contents = aa.contents + ab.contents
    Bunch.cat = aa.cat + ab.cat + ac.cat
    Bunch.contents = aa.contents + ab.contents + ac.contents
    with open("./dat/dataset.dat", "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
Example #11
def load_otto(fname=trainFileName, fpath=parentDirPath):

	'''load_data from fname'''
	data = np.loadtxt(os.path.join(fpath, fname), delimiter=',', dtype=float)
	flat_data = data[:,:-1]
	target = data[:,-1]
	images = flat_data.view()
	return base.Bunch(
		data=flat_data,
		target=target.astype(int),  # np.int was removed in NumPy 1.24
		target_names=np.arange(1,10),
		images=images,
	)
Example #12
def TDM(bunch, temp_dat):
    temp = []
    idfBunch = base.Bunch(tdm=[])

    for content in bunch.contents:
        temp = temp + content
    train_tfidfbunch = read_dat("train_tfidf.dat")
    idfBunch.vocabulary = train_tfidfbunch.vocabulary
    vectorizer = CountVectorizer(vocabulary=train_tfidfbunch.vocabulary)  # count frequency
    transformer = TfidfTransformer()  # tf-idf weight
    freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
    tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tfidf matrix
    idfBunch.tdm = tfidf

    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(idfBunch, file_obj)
Example #13
def tf_idf(bunch, temp_dat, train_tfidf_path=None):
    temp = []
    idfBunch = base.Bunch(cat=bunch.cat,
                          contents=bunch.contents,
                          tdm=[],
                          vocabulary=[])
    count, _count = 0, 0
    partial = []
    for content in bunch.contents:
        print(content)
        partial = partial + content
        count += 1
        if count == 20000:  # flush every 20000 documents and report progress
            temp = temp + partial
            partial = []
            count = 0
            _count += 1
            print(_count)
    temp = temp + partial  # append whatever is left after the last full chunk

    if train_tfidf_path is None:
        vectorizer = CountVectorizer()  # count frequency
        transformer = TfidfTransformer()  # tf-idf weight
        freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
        # print(freq)
        tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tfidf matrix
        tfidf = csr_matrix(tfidf, dtype=np.float32)
        idfBunch.tdm = tfidf
        idfBunch.vocabulary = vectorizer.vocabulary_
    else:
        train_tfidfbunch = read_dat("train_tfidf.dat")
        idfBunch.vocabulary = train_tfidfbunch.vocabulary
        vectorizer = CountVectorizer(
            vocabulary=train_tfidfbunch.vocabulary)  # count frequency
        print(train_tfidfbunch.vocabulary)
        transformer = TfidfTransformer()  # tf-idf weight
        freq = vectorizer.fit_transform(temp)  # freq.toarray() frequency array
        tfidf = transformer.fit_transform(freq)  # tfidf.toarray() tfidf matrix
        tfidf = csr_matrix(tfidf, dtype=np.float32)
        print(tfidf)
        idfBunch.tdm = tfidf

    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(idfBunch, file_obj)
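As an aside, the CountVectorizer + TfidfTransformer pair used above can be collapsed into scikit-learn's TfidfVectorizer; a minimal sketch on toy documents (not the project's data):

# Sketch: TfidfVectorizer = CountVectorizer followed by TfidfTransformer.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["some tokenised text", "another tokenised document"]
vec = TfidfVectorizer()
tdm = vec.fit_transform(docs)  # sparse tf-idf term-document matrix, like idfBunch.tdm
vocabulary = vec.vocabulary_   # plays the role of idfBunch.vocabulary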
Example #14
def parse_dataset(temp, num=None):

    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(cat=[], contents=[])
    count_ = 0
    for line in lines:
        t = ""
        cat = ""
        count = 0
        for element in line.split("|")[2:]:
            if len(element) > 1 and element != ",":
                if count == 0:
                    # print(element)
                    if "news_game" in element or "news_comic" in element:
                        cat = "news_entertainment"
                        # print("--news_entertainment")
                    if "digital" in element:
                        cat = "news_tech"
                        # print("--news_tech")
                    if "news_history" in element:
                        cat = "news_culture"
                        # print("--news_culture")
                    if "news_politics" in element:
                        cat = "news_society"
                    else:
                        for categorie in categories:
                            if categorie in element:
                                cat = categorie
                    count += 1
                else:
                    # print(element+"\n")
                    t = t + element
        gram_3 = list_3_ngram(t)
        if num:
            if count_ > int(num):
                break
        if len(gram_3[0]) != 0 and cat:
            Bunch.cat.append(cat)
            Bunch.contents.append(gram_3)
            count_ += 1
            print("So far: " + str(count_))

    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
Example #15
def parse_dataset(temp, num):

    with open("./data/" + temp, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    Bunch = base.Bunch(cat=[], contents=[])
    for line in lines:
        cat = line[0:2]
        content = line[3:103]
        if categories_dict[cat] <= num:
            gram_3 = list_3_ngram(content)
            # content = stopwords(content)
            Bunch.cat.append(cat)
            Bunch.contents.append(gram_3)
            categories_dict[cat] += 1
            print("So far: " + cat + " " + str(categories_dict[cat]))

    with open("./dat/" + temp_dat, "wb") as file_obj:
        pickle.dump(Bunch, file_obj)
Example #16
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    # collect every file in each class directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # build the path of the class subdirectory
        file_list = os.listdir(class_path)  # list all files under class_path
        for file_path in file_list:  # iterate over the files of this class
            fullname = class_path + file_path  # full path of the file
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
            # append(element) adds a single element to a list; note the difference from extend()
    # store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text bunch!")
Example #17
def json2bunch(fName):  # adjust the reading code inside this function to match how the input data is stored
    infoDic = []
    with open(fName) as f:
        jsonDecodes = json.load(f)
    j = 0
    required_keys = ('price', 'overall_rating', 'service_rating', 'facility_rating',
                     'hygiene_rating', 'image_num', 'comment_num', 'favorite_num')
    for info in jsonDecodes:
        detail = info['detail_info']
        if all(key in detail for key in required_keys):  # keep a record only if every required key is present
            if 50 < float(detail['price']) < 1000:  # keep only records within this price range
                j += 1
                infoDic.append([info['location']['lat'], info['location']['lng'],
                                detail['price'], detail['overall_rating'],
                                detail['service_rating'], detail['facility_rating'],
                                detail['hygiene_rating'], detail['image_num'],
                                detail['comment_num'], detail['favorite_num'],
                                detail['checkin_num'], info['name']])
    print('.....................................', j)

    data = np.array([(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10])
                     for v in infoDic], dtype='float')  # explanatory variables (features)
    targetInfo = np.array([v[11] for v in infoDic])  # target variable (labels)
    dataBunch = base.Bunch(DESCR=r'info of poi',
                           data=data,
                           feature_names=['lat', 'lng', 'price', 'overall_rating', 'service_rating',
                                          'facility_rating', 'hygiene_rating', 'image_num',
                                          'comment_num', 'favorite_num', 'checkin_num'],
                           target=targetInfo,
                           target_names=['price', 'name'])  # build a scikit-learn Bunch
    return dataBunch  # return the Bunch
Example #18
def load_myDigits(fpath='F:/ML/digits/', fDataName='train-images-idx3-ubyte', fLabelName='train-labels-idx1-ubyte'):
	with open(join(fpath, fDataName), 'rb') as fin:
		magicNumber, n_sample, n_row, n_col = struct.unpack('>IIII', fin.read(struct.calcsize("=IIII")))
		n_size = n_row * n_col
		vectors = []
		for i in range(n_sample):  # xrange exists only in Python 2
			vectors.append(list(struct.unpack('B'*n_size, fin.read(struct.calcsize('B'*n_size)))))
		# print vectors[:10]
	flat_data = np.array(vectors)
		
	with open(join(fpath, fLabelName), 'rb') as fin:
		magicNumber, n_sample = struct.unpack('>II', fin.read(struct.calcsize("=II")))
		# print 'magicNumber =', magicNumber
		# print 'n_sample =', n_sample
		# pres = struct.unpack('bbbbbbbb', fin.read(struct.calcsize("=bbbbbbbb")))
		# print 'pres =', pres
		labels = struct.unpack('B'*n_sample, fin.read(struct.calcsize('B'*n_sample)))
		# print labels[:10]
		# print len(labels), labels[0]
	target = np.array(labels)
	images = flat_data.view()
	images.shape = (-1, n_row, n_col)
	
	# for index, (image, prediction) in enumerate(list(zip(flat_data[:4], target[:4]))):
		# plt.subplot(1, 4, index+1)
		# plt.axis('off')
		# imageData = image.reshape(28,28)
		# plt.imshow(imageData, cmap=plt.cm.gray_r, interpolation='nearest')
		# plt.title('Prediction: %i' % prediction)
	# plt.show()
	
	return base.Bunch(
		data=flat_data,
		target=target.astype(int),  # np.int was removed in NumPy 1.24
		target_names=np.arange(10),
		images=images
	)
Example #19

"""
Bunch和字典结构类似,也是由键值对组成,和字典区别:其键值可以被实例对象当作属性使用。
"""
from sklearn.datasets import base

buch = base.Bunch(A=1,B=2,c=3)

print(buch.A)
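In current scikit-learn releases the `sklearn.datasets.base` module no longer exists; `Bunch` lives in `sklearn.utils`, so the same demonstration becomes:

# Same idea with the current import path.
from sklearn.utils import Bunch

b = Bunch(A=1, B=2, c=3)
print(b.A)     # attribute access...
print(b["A"])  # ...and plain dict-style access both work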
Example #20
def convert_weka_instances_to_bunch(instances):
    ''' Converts WEKA Instances to the scikit Bunch format

    :param instances: WEKA dataset (Instances)
    :return:
    '''
    if not jp.isThreadAttachedToJVM():
        jp.attachThreadToJVM()

    if instances.classIndex() < 0:
        instances.setClassIndex(instances.numAttributes() - 1)

    target_att = instances.classAttribute()
    target_names = []
    if target_att.isNominal():
        for j in range(0, target_att.numValues()):
            target_names.append(target_att.value(j))

    feature_names = []

    num_samples = instances.numInstances()
    num_attributes = instances.numAttributes()
    num_targets = 1

    data = np.empty((num_samples, num_attributes - num_targets))
    target = np.empty((num_samples, ), dtype=int)  # np.int was removed in NumPy 1.24

    fdescr = instances.relationName()

    feature_value_names = []
    for j in range(0, num_attributes - num_targets):
        myatt = instances.attribute(j)
        # mtype = 1 if myatt.isNumeric() else 0
        mname = myatt.name()
        feature_names.append(mname)

        num_vals = myatt.numValues()
        f_vals = []
        for k in range(0, num_vals):
            f_vals.append(myatt.value(k))

        feature_value_names.append(f_vals)

    for i in range(0, num_samples):
        arr = []
        for j in range(0, num_attributes):
            arr.append(instances.get(i).value(j))

        data[i] = np.asarray(arr[:-1], dtype=float)  # np.float/np.int aliases were removed in NumPy 1.24
        if target_att.isNominal():
            target[i] = np.asarray(arr[-1], dtype=int)
        else:
            target[i] = np.asarray(arr[-1], dtype=float)

    return sk.Bunch(
        data=data,
        target=target,
        target_names=target_names,
        DESCR=fdescr,
        feature_value_names=feature_value_names,
        feature_names=feature_names
    )  # ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
Example #21
                  encoding="latin1",
                  load_content=True,
                  random_state=rnd)
data.data = [remove_header_subject(text) for text in data.data]
print "Data loaded in %f." % (time() - t0)

print 'Total number of data: %d' % len(data.data)

indices = ShuffleSplit(len(data.data),
                       n_iter=1,
                       test_size=percent,
                       indices=True,
                       random_state=rnd)
for train_ind, test_ind in indices:
    data = bunch.Bunch(train=bunch.Bunch(
        data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                       test=bunch.Bunch(data=[data.data[i] for i in test_ind],
                                        target=data.target[test_ind]))

t0 = time()
print(sep)
print("Extracting features from the training dataset using a sparse vectorizer...")
print("Feature extraction technique is %s." % vect)
X_tr = vect.fit_transform(data.train.data)
y_tr = data.train.target
duration = time() - t0
print("done in %fs" % duration)
print("n_samples: %d, n_features: %d" % X_tr.shape)

t0 = time()
print(sep)
print("Extracting features from the test dataset using the same vectorizer...")
feature_names = list(comm_conns[0])[0].split(",")[
    1:92]  # make a list of the column names split by commas, remove subject_id
raw_data = list(comm_conns[1:len(comm_conns)])

data = []
for row in raw_data:
    row_as_list_subj_id = row[0].split(",")  # there are 92 items per row
    row_as_list = row_as_list_subj_id[1:92]  # remove subject_id
    row_as_list = list(map(float, row_as_list))
    data.append(row_as_list)
data = np.asarray(data)

target = np.asarray(beh.maltreatment)

classify_adv_df = base.Bunch(target_names=target_names,
                             feature_names=feature_names,
                             target=target,
                             data=data)

# Train and test adversity classifier

# Specify the hyperparameter space, test code here:
# parameters = {'SVM__C':[1, 10, 100],
#               'SVM__gamma':[0.1, 0.01]}
#
# c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}
# temp = GridSearchCV(svc, param_grid, cv = 5)

# Create splits with leave-one-out cross validation
loo.get_n_splits(classify_adv_df.data)
    def simulation_result(self,
                          print_result=False,
                          plot_result=False,
                          save=False):
        #            print('\n============ Simulation Result  ==============')
        print('\n-------- {} ----------'.format(self.name))

        # X_o and y_o
        print('\nPredicting on training set:')
        X_train, y_train, y_train_pred = self.predict_one_by_one(
            self.X_train, self.y_train)
        print('\nPredicting on testing set:')
        X_test, y_test, y_test_pred = self.predict_one_by_one(
            self.X_test, self.y_test)

        if True:
            from sklearn.metrics import r2_score

            y_train_mean = np.mean(y_train)
            y_train_pred_mean = np.mean(y_train_pred)

            self.result_train = {}
            self.result_train['r2'] = round(r2_score(y_train, y_train_pred), 4)
            self.result_train['mae'] = round(
                np.mean(np.abs(y_train - y_train_pred)), 2)
            self.result_train['sae'] = round(
                np.abs(y_train_mean - y_train_pred_mean) / y_train_mean, 4)
            self.result_train['y_mean'] = round(y_train_mean, 2)
            self.result_train['y_pred_mean'] = round(y_train_pred_mean, 2)

            y_test_mean = np.mean(y_test)
            y_test_pred_mean = np.mean(y_test_pred)

            self.result_test = {}
            self.result_test['r2'] = round(r2_score(y_test, y_test_pred), 4)
            self.result_test['mae'] = round(
                np.mean(np.abs(y_test - y_test_pred)), 2)
            self.result_test['sae'] = round(
                np.abs(y_test_mean - y_test_pred_mean) / y_test_mean, 4)
            self.result_test['y_mean'] = round(y_test_mean, 2)
            self.result_test['y_pred_mean'] = round(y_test_pred_mean, 2)

            if print_result == True:
                print('\nTraining:')
                print('SAE       =', self.result_train['sae'])
                print('MAE       =', self.result_train['mae'])
                print('R^2       =', self.result_train['r2'])
                print('GT_mean   =', self.result_train['y_mean'])
                print('pred_mean =', self.result_train['y_pred_mean'])

                print('\nTesting:')
                print('SAE       =', self.result_test['sae'])
                print('MAE       =', self.result_test['mae'])
                print('R^2       =', self.result_test['r2'])
                print('GT_mean   =', self.result_test['y_mean'])
                print('pred_mean =', self.result_test['y_pred_mean'])

            if plot_result == True:
                plt.figure()

                plt.suptitle(self.name)
                plt.subplot(2, 1, 1)
                plt.title(
                    'Results on Training Set      ($R^2$ score={:.3f})'.format(
                        self.result_train['r2']))
                plt.plot(X_train, label='Aggregate Data')
                plt.plot(y_train,
                         label='({}) Ground Truth'.format(self.appliance))
                plt.plot(y_train_pred,
                         label='({}) Prediction'.format(self.appliance))
                plt.legend(loc='upper right')
                plt.ylabel('Power/W')

                plt.subplot(2, 1, 2)
                plt.title(
                    'Results on Testing Set      ($R^2$ score={:.3f})'.format(
                        self.result_test['r2']))
                plt.plot(X_test, label='Aggregate Data')
                plt.plot(y_test,
                         label='({}) Ground Truth'.format(self.appliance))
                plt.plot(y_test_pred,
                         label='({}) Prediction'.format(self.appliance))
                plt.legend(loc='upper right')
                plt.xlabel('Time/s')
                plt.ylabel('Power/W')

                plt.show()

            if save == True:
                from sklearn.datasets import base
                import pickle

                data = base.Bunch(name=self.name,
                                  appliance=self.appliance,
                                  X_train=X_train,
                                  y_train=y_train,
                                  y_train_pred=y_train_pred,
                                  result_train=self.result_train,
                                  X_test=X_test,
                                  y_test=y_test,
                                  y_test_pred=y_test_pred,
                                  result_test=self.result_test,
                                  train_history=self.history)

                resultPath = r'resultData/{}.pickle'.format(self.name)
                with open(resultPath, 'wb') as file:
                    pickle.dump(data, file)
                print('\nSaved result to', resultPath)
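The three scores computed above are the coefficient of determination (`r2`), the mean absolute error (`mae`), and a normalised difference of means (`sae`). A toy illustration with made-up arrays:

# Sketch: the same three scores on toy data.
import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([100.0, 120.0, 90.0, 110.0])
y_pred = np.array([95.0, 125.0, 85.0, 110.0])

r2 = r2_score(y_true, y_pred)                              # coefficient of determination
mae = np.mean(np.abs(y_true - y_pred))                     # mean absolute error
sae = abs(y_true.mean() - y_pred.mean()) / y_true.mean()   # relative error of the means
print(round(r2, 4), round(mae, 2), round(sae, 4))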
def get_immunized_nodes_and_movement(epidemic_graph, movement_info,
                                     n_immunized_nodes):
    # im_bunch = Immunized nodes in a Bunch format:
    im_bunch = base.Bunch()

    # Choose immunization by neighbor of a random node:
    im_bunch.neighbors = get_immunized_neighbors(epidemic_graph,
                                                 n_immunized_nodes)
    im_bunch.neighbors_move = get_immunized_movement_info(
        movement_info, im_bunch.neighbors)

    # Choose immunization by random:
    im_bunch.random = get_immunized_random(epidemic_graph, n_immunized_nodes)
    im_bunch.random_move = get_immunized_movement_info(movement_info,
                                                       im_bunch.random)

    # Choose immunization by k-shell (core):
    core_dict = nx.core_number(epidemic_graph)
    im_bunch.core = dict_to_sorted_list(core_dict, n_immunized_nodes)
    im_bunch.core_move = get_immunized_movement_info(movement_info,
                                                     im_bunch.core)

    # Choose immunization by unweighted clustering coefficient c:
    clustering_dict = nx.clustering(epidemic_graph)
    im_bunch.clustering = dict_to_sorted_list(clustering_dict,
                                              n_immunized_nodes)
    im_bunch.clustering_move = get_immunized_movement_info(
        movement_info, im_bunch.clustering)

    # Choose immunization by degree k:
    degree_dict = epidemic_graph.degree()
    im_bunch.degree = dict_to_sorted_list(degree_dict, n_immunized_nodes)
    im_bunch.degree_move = get_immunized_movement_info(movement_info,
                                                       im_bunch.degree)

    # Choose immunization by strength:
    strenght_dict = epidemic_graph.degree(weight='weight')
    im_bunch.strenght = dict_to_sorted_list(strenght_dict, n_immunized_nodes)
    im_bunch.strenght_move = get_immunized_movement_info(
        movement_info, im_bunch.strenght)

    # Choose immunization by betweenness:
    betweenness_dict = nx.betweenness_centrality(epidemic_graph)
    im_bunch.betweenness = dict_to_sorted_list(betweenness_dict,
                                               n_immunized_nodes)
    im_bunch.betweenness_move = get_immunized_movement_info(
        movement_info, im_bunch.betweenness)

    # Choose immunization by closeness:
    closeness_dict = nx.closeness_centrality(epidemic_graph)
    im_bunch.closeness = dict_to_sorted_list(closeness_dict, n_immunized_nodes)
    im_bunch.closeness_move = get_immunized_movement_info(
        movement_info, im_bunch.closeness)

    # All immunized nodes:
    neighbors_set = set(im_bunch.neighbors)
    random_set = set(im_bunch.random)
    core_set = set(im_bunch.core)
    clustering_set = set(im_bunch.clustering)
    degree_set = set(im_bunch.degree)
    strenght_set = set(im_bunch.strenght)
    betweenness_set = set(im_bunch.betweenness)
    closeness_set = set(im_bunch.closeness)

    all_immunized_nodes = neighbors_set.union(random_set, core_set,
                                              clustering_set, degree_set,
                                              strenght_set, betweenness_set,
                                              closeness_set)
    im_bunch.all_immunized_nodes = list(all_immunized_nodes)

    return im_bunch
Example #25
def update_to_pickle(data):
    Full_stations, City_only, ErrorCities = data
    # reorder the columns for readability
    columns = [
        'time_point', 'area', 'position_name', 'station_code', 'aqi',
        'quality', 'primary_pollutant', 'pm2_5', 'pm2_5_24h', 'pm10',
        'pm10_24h', 'co', 'co_24h', 'no2', 'no2_24h', 'o3', 'o3_24h', 'o3_8h',
        'o3_8h_24h', 'so2', 'so2_24h'
    ]
    Full_stations = Full_stations.reindex(columns=columns)
    City_only = City_only.reindex(columns=columns)
    City_only.pop('position_name')
    City_only.pop('station_code')
    time_point = City_only.iloc[0, 0]
    #      time_point = City_only.ix[0,'time_point']

    # save the data from this update
    with open('AQIsData/update.pickle', 'wb') as file:
        data = base.Bunch(full=Full_stations,
                          city=City_only,
                          time=time_point,
                          notUpdatedCity=ErrorCities)
        pickle.dump(data, file)

    # merge this update into the historical data
    month = time_point[:7]
    his_filename = '{}.pickle'.format(month)  # one pickle file per month
    filepath = 'AQIsData/' + his_filename
    import os
    if os.path.exists(filepath):
        try:
            #                  Full_his, City_his, time_his = pd.read_pickle(filepath)
            his = pd.read_pickle(filepath)
            Full_his = his.full
            City_his = his.city
            time_his = his.time
        except Exception as e:
            # if the month's historical data cannot be loaded, avoid accidentally overwriting it: save this update to a separate pickle to be merged manually later
            filename = 'not-merged-Data-{}.pickle'.format(time_point)
            with open(r'AQIsData/' + filename, 'wb') as file:
                #                        pickle.dump([Full_stations, City_only, time_point], file)
                data = base.Bunch(full=Full_stations,
                                  city=City_only,
                                  time=time_point,
                                  notUpdatedCity=ErrorCities)
                pickle.dump(data, file)
            log('[Error]  Fail to load [{}] and unable to merge into his data. \
                                     Create an extra file:{}.  ({})'.format(
                his_filename, filename, e))
            return
    else:
        # otherwise create a pickle file for the new month
        Full_his = City_his = pd.DataFrame()
        time_his = pd.Series(dtype=object)  # explicit dtype avoids the empty-Series warning
        log('======================================================================================='
            )
        log('[New his pickle] Create {}'.format(his_filename))
    # merge the new data with the history
    Full_his = pd.concat([Full_stations, Full_his],
                         axis=0,
                         join='outer',
                         ignore_index=True)
    City_his = pd.concat([City_only, City_his],
                         axis=0,
                         join='outer',
                         ignore_index=True)
    time_his = pd.concat([pd.Series([time_point]), time_his], ignore_index=True)  # Series.append was removed in pandas 2.0
    with open(filepath, 'wb') as file:
        #            pickle.dump([Full_his, City_his, time_his], file)
        data = base.Bunch(full=Full_his, city=City_his, time=time_his)
        pickle.dump(data, file)
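Finally, a short sketch of reading the update pickle back; the file name and fields are the ones used above, and this is illustrative only:

# Sketch: load the Bunch written for the latest update and inspect it.
import pickle

with open('AQIsData/update.pickle', 'rb') as file:
    snapshot = pickle.load(file)

print(snapshot.time)            # time_point of the update
print(snapshot.city.head())     # per-city AQI table
print(snapshot.notUpdatedCity)  # cities that failed to update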