def create_species_bunch(species_name, train, test, coverages, xgrid, ygrid):
    """Create a Bunch with information about a particular organism.

    This will use the test/train record arrays to extract the data specific
    to the given species name.
    """
    bunch = Bunch(name=' '.join(species_name.split("_")[:2]))
    points = dict(test=test, train=train)

    for label, pts in points.items():
        # choose points associated with the desired species
        pts = pts[pts['species'] == species_name]
        bunch['pts_%s' % label] = pts

        # determine coverage values for each of the training & testing points
        ix = np.searchsorted(xgrid, pts['dd long'])
        iy = np.searchsorted(ygrid, pts['dd lat'])
        bunch['cov_%s' % label] = coverages[:, -iy, ix].T

    return bunch
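# Usage sketch (illustrative, not from the original source): build a bunch for
# one of the two species in scikit-learn's species-distribution example.
# `xgrid` and `ygrid` are assumed to have been constructed from the coverage
# grid as in that example; the species name follows its naming convention.
from sklearn.datasets import fetch_species_distributions

data = fetch_species_distributions()
bv = create_species_bunch("bradypus_variegatus_0",
                          data.train, data.test,
                          data.coverages, xgrid, ygrid)
print(bv.name, bv.cov_train.shape)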
def _fetch_surf_fsaverage(data_dir=None):
    """Helper function to ship fsaverage (highest-resolution) surfaces and
    sulcal information with Nilearn.

    The data are downloaded from NITRC.
    """
    dataset_dir = _get_dataset_dir('fsaverage', data_dir=data_dir)
    url = 'https://www.nitrc.org/frs/download.php/10846/fsaverage.tar.gz'
    if not os.path.isdir(os.path.join(dataset_dir, 'fsaverage')):
        _fetch_files(dataset_dir, [('fsaverage.tar.gz', url, {})])
        _uncompress_file(os.path.join(dataset_dir, 'fsaverage.tar.gz'))

    result = {
        name: os.path.join(dataset_dir, 'fsaverage', '{}.gii'.format(name))
        for name in ['pial_right', 'sulc_right', 'sulc_left', 'pial_left']
    }
    result['infl_left'] = os.path.join(
        dataset_dir, 'fsaverage', 'inflated_left.gii')
    result['infl_right'] = os.path.join(
        dataset_dir, 'fsaverage', 'inflated_right.gii')
    result['description'] = str(_get_dataset_descr('fsaverage'))

    return Bunch(**result)
def fetch_coords_power_2011():
    """Download and load the Power et al. brain atlas composed of 264 ROIs.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        Dictionary-like object, contains:

        - "rois": coordinates of 264 ROIs in MNI space

    References
    ----------
    Power, Jonathan D., et al. "Functional network organization of the
    human brain." Neuron 72.4 (2011): 665-678.
    """
    dataset_name = 'power_2011'
    fdescr = _get_dataset_descr(dataset_name)
    package_directory = os.path.dirname(os.path.abspath(__file__))
    csv = os.path.join(package_directory, "data", "power_2011.csv")
    params = dict(rois=np.recfromcsv(csv), description=fdescr)

    return Bunch(**params)
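# Usage sketch (illustrative): stack the ROI coordinates into an (n, 3) array.
# The field names 'x', 'y', 'z' are assumed to match the power_2011.csv header.
power = fetch_coords_power_2011()
coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
print(coords.shape)  # expected: (264, 3)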
def fetch_TR9856():
    """
    Fetch the TR9856 dataset for testing multi-word term relatedness.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015.
    """
    data = pd.read_csv(
        os.path.join(
            _fetch_file(
                "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip",
                "similarity",
                uncompress=True,
                verbose=0,
            ),
            "IBM_Debater_(R)_TR9856.v0.2",
            "TermRelatednessResults.csv",
        ),
        encoding="iso-8859-1",
    )

    # Select all the available columns of interest
    X = data[["term1", "term2"]].values
    y = data["score"].values
    topic = data["topic"].values

    return Bunch(X=X.astype("object"), y=y, topic=topic)
def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    # build the TF-IDF term vector space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                 max_df=0.5)

    # fit_transform() takes the corpus and returns the term-document matrix;
    # tdm therefore holds the TF-IDF weight matrix
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF term vector space created successfully!")
def _ising_case(n_samples=100, n_dim_obs=100, T=10, time_on_axis='first',
                update_theta='l2', responses=(-1, 1), **kwargs):
    thetas = ising_theta_generator(n_dim_obs=n_dim_obs, n=n_samples, T=T,
                                   mode=update_theta, **kwargs)
    # pass `responses` through instead of hard-coding [-1, 1]
    samples = [
        ising_sampler(t, np.zeros(n_dim_obs), n=n_samples, responses=responses)
        for t in thetas
    ]
    data = np.array(samples)
    X = np.vstack(data)
    y = np.repeat(range(len(thetas)), n_samples).astype(int)
    if time_on_axis == "last":
        data = data.transpose(1, 2, 0)
    return Bunch(data=data, thetas=np.array(thetas), X=X, y=y)
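# Usage sketch (illustrative): generate a small time-varying Ising dataset.
# `ising_theta_generator` and `ising_sampler` are assumed to come from the
# surrounding module.
ds = _ising_case(n_samples=20, n_dim_obs=10, T=5)
print(ds.data.shape, ds.X.shape, ds.y.shape)  # (5, 20, 10), (100, 10), (100,)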
def _load_camcan_scores(filename_csv, subjects_selected):
    """Load the scores from the Cam-CAN data set.

    Parameters
    ----------
    filename_csv : str
        Path to the CSV file containing the participants' information.

    subjects_selected : list of str
        The IDs of the patients to be selected. Each string should follow
        the BIDS standard (e.g., 'sub-******').

    Returns
    -------
    data : Bunch
        Dictionary-like object. The interesting attributes are:
        - 'age', the age of the patient;
        - 'hand', handedness of the patient;
        - 'gender_text', gender of the patient.
    """
    if not isfile(filename_csv):
        raise ValueError('The file {} does not exist.'.format(filename_csv))
    if not filename_csv.endswith('.csv'):
        raise ValueError('The file {} is not a CSV file.'.format(
            filename_csv))

    patients_info = pd.read_csv(filename_csv,
                                usecols=COLUMN_SELECT_PATIENTS_INFO)
    # the IDs in the CSV are missing the 'sub-' prefix
    patients_info['Observations'] = 'sub-' + patients_info['Observations']
    # keep only the requested subjects, aligned to the requested order
    patients_info = (
        patients_info.set_index('Observations').loc[subjects_selected])

    return Bunch(**patients_info.to_dict('list'))
def build_training_set(file_list):
    """Alternative to scikit-learn's folder-based loaders: builds a Bunch
    from an explicit list of files output by another script, rather than
    from the folder structure those loaders prescribe."""
    import json
    from sklearn.datasets.base import Bunch

    b = Bunch()
    b['filenames'] = file_list

    def target_function(filepath):
        # e.g. /home/dhrumil/Desktop/PoliticalFraming/data/immigration/D/123.json
        # the character before the last '/' is the party directory (D or R)
        if filepath[filepath.rfind("/") - 1] == 'D':
            return 0
        elif filepath[filepath.rfind("/") - 1] == 'R':
            return 1
        else:
            print("file must be categorized as D or R: " + str(filepath))

    b['target'] = []
    for filepath in file_list:
        b['target'].append(target_function(filepath))
    b['target_names'] = ['D', 'R']

    b['data'] = []
    for filepath in file_list:
        with open(filepath, 'r') as f:
            jdata = json.loads(f.read())
        speech_string = ""
        for sentence in jdata['speaking']:
            speech_string += sentence
        b['data'].append(speech_string)

    b['DESCR'] = ""
    return b
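# Usage sketch (illustrative; the file paths are hypothetical). Labels are
# inferred from the 'D'/'R' directory each file sits in.
speeches = build_training_set([
    "data/immigration/D/123.json",
    "data/immigration/R/456.json",
])
print(speeches.target_names, speeches.target)  # ['D', 'R'] [0, 1]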
def _compute_fit(series, fitter):
    if cfg.verbosity > 0:
        print('Computing fit for {}@{} using {}'.format(
            series.gene_name, series.region_name, fitter))
    x = series.ages
    y = series.single_expression
    if np.count_nonzero(
            abs(y) > cfg.nonzero_threshold) < cfg.min_nonzero_points_for_fitting:
        print('Not enough non-zero data points to fit for {}@{}. Skipping...'.format(
            series.gene_name, series.region_name))
        theta = None
        sigma = None
        fit_predictions = None
        LOO_predictions = None
        theta_samples = None
    else:
        theta, sigma, LOO_predictions, LOO_fits = fitter.fit(x, y, loo=True)
        if theta is None:
            print('WARNING: Optimization failed during overall fit for {}@{} using {}'.format(
                series.gene_name, series.region_name, fitter))
            fit_predictions = None
            theta_samples = None
        else:
            fit_predictions = fitter.shape.f(theta, x)
            if fitter.shape.parameter_type() == object:
                theta_samples = None
            else:
                theta_samples = fitter.parametric_bootstrap(x, theta, sigma)

    return Bunch(
        fitter=fitter,
        seed=cfg.random_seed,
        theta=theta,
        sigma=sigma,
        fit_predictions=fit_predictions,
        LOO_predictions=LOO_predictions,
        theta_samples=theta_samples,
    )
def _add_dataset_correlation_fits_from_results_dictionary(
        dataset, ds_fits, dct_results):
    """Convert the results of the job splitting, which is a flat dictionary,
    to structures which are easier to use, and integrate them into the
    dataset fits.
    """
    region_to_ix_original_inds = {}
    for ir, r in enumerate(dataset.region_names):
        series = dataset.get_several_series(dataset.gene_names, r)
        region_to_ix_original_inds[r] = series.original_inds

    for (ir, loo_point), levels in dct_results.items():
        n_iterations = len(levels)
        r = dataset.region_names[ir]
        if loo_point is None:
            # Global fit - collect the parameters (theta, sigma, L) and
            # compute a correlation matrix for the region. The hack of using
            # the key (None, r) to store these results can be removed if/when
            # dataset fits is changed from a dictionary to a class with
            # several fields.
            k = (None, r)
            if k not in ds_fits:
                ds_fits[k] = n_iterations * [None]
            for iLevel, level in enumerate(levels):
                ds_fits[k][iLevel] = level
                level.correlations = covariance_to_correlation(level.sigma)
        else:
            # LOO point - collect the predictions
            ix, iy = loo_point
            g = dataset.gene_names[iy]
            fit = ds_fits[(g, r)]
            if not hasattr(fit, 'with_correlations'):
                # NOTE: we place the predictions at the original indexes
                # (before NaNs were removed by get_series)
                fit.with_correlations = [
                    Bunch(LOO_predictions=init_array(np.nan, len(dataset.ages)))
                    for _ in range(n_iterations)
                ]
            for iLevel, level_prediction in enumerate(levels):
                orig_ix = region_to_ix_original_inds[r][ix]
                fit.with_correlations[iLevel].LOO_predictions[orig_ix] = \
                    level_prediction
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """Download a first-level localizer fMRI dataset.

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, with the keys:
        epi_img: the input 4D image
        events: a csv file describing the paradigm
    """
    url = 'ftp://ftp.cea.fr/pub/dsv/madic/download/nipy'

    dataset_name = "localizer_first_level"
    files = dict(epi_img="s12069_swaloc1_corr.nii.gz",
                 events="localizer_paradigm.csv")
    # The options needed for _fetch_files
    options = [(filename, os.path.join(url, filename), {})
               for _, filename in sorted(files.items())]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    sub_files = _fetch_files(data_dir, options, resume=True, verbose=verbose)

    params = dict(zip(sorted(files.keys()), sub_files))
    try:
        _check_events_file_uses_tab_separators(params['events'])
    except ValueError:
        _make_events_file_localizer_first_level(events_file=params['events'])

    return Bunch(**params)
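# Usage sketch (illustrative): fetch the dataset and inspect the local paths.
localizer = fetch_localizer_first_level()
print(localizer.epi_img)
print(localizer.events)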
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})

    if train_tfidf_path is not None:
        # reuse the training vocabulary so that the test vectors share the
        # same feature space as the training vectors
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    writebunchobj(space_path, tfidfspace)
    print("TF-IDF term vector space created successfully!")
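# Usage sketch (illustrative; the paths are hypothetical). The test-side call
# passes the pickled training TF-IDF space so both term-document matrices
# share a single vocabulary.
vector_space("stopwords.txt", "train_wordbag.dat", "train_tfidf.dat")
vector_space("stopwords.txt", "test_wordbag.dat", "test_tfidf.dat",
             train_tfidf_path="train_tfidf.dat")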
def load_corpus(path):
    """
    Loads and wrangles the passed in text corpus by path.
    """
    # Check if the data exists, otherwise download or raise
    if not os.path.exists(path):
        raise ValueError((
            "'{}' dataset has not been downloaded, "
            "use the yellowbrick.download module to fetch datasets"
        ).format(path))

    # Read the directories in the directory as the categories.
    categories = [
        cat for cat in os.listdir(path)
        if os.path.isdir(os.path.join(path, cat))
    ]

    files = []   # holds the file names relative to the root
    data = []    # holds the text read from the file
    target = []  # holds the string of the category

    # Load the data from the files in the corpus
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )
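# Usage sketch (illustrative; 'hobbies' is one of the corpora shipped by
# yellowbrick.download). Each subdirectory of the path becomes a category.
corpus = load_corpus("data/hobbies")
print(corpus.categories, len(corpus.data))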
def fetch_TR9856():
    """
    Fetch the TR9856 dataset for testing multi-word term relatedness.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015.
    """
    # Mirror of the original archive at
    # https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip
    data = pd.read_csv(
        os.path.join(
            _fetch_file(
                'http://homes.cs.washington.edu/~febrahim/files/IBM_Debater_(R)_TR9856.v2.zip',
                'similarity', uncompress=True, verbose=0),
            'IBM_Debater_(R)_TR9856.v0.2', 'TermRelatednessResults.csv'),
        encoding="iso-8859-1")

    # Select all the available columns of interest
    X = data[['term1', 'term2']].values
    y = data['score'].values
    topic = data['topic'].values

    return Bunch(X=X.astype("object"), y=y, topic=topic)
def load_datasets(data_home=DATA_HOME_BASIC):
    """Load the benchmark datasets.

    :param data_home: Default directory in which the data is stored in
        .tar.gz format.

    :returns: OrderedDict of Bunch objects. Each Bunch object, referred to
        as a dataset, has the following attributes:

        * dataset.data : ndarray, shape (n_samples, n_features)
        * dataset.target : ndarray, shape (n_samples, )
        * dataset.DESCR : string
            Description of each dataset.
    """
    extracted_dir = join(data_home, "extracted")
    datasets = OrderedDict()
    filter_data_ = MAP_NAME_ID.keys()

    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(extracted_dir, filename)
        available = isfile(filename)

        if not available:
            makedirs(extracted_dir, exist_ok=True)
            with open(f'{data_home}data.tar.gz', 'rb') as fin:
                f = BytesIO(fin.read())
                tar = tarfile.open(fileobj=f)
                tar.extractall(path=extracted_dir)

        data = np.load(filename)
        X, y = data['data'], data['label']
        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
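# Usage sketch (illustrative): iterate over the benchmark datasets and report
# their shapes.
datasets = load_datasets()
for name, ds in datasets.items():
    print(name, ds.data.shape, ds.target.shape)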
def load_graphs_LMdata():
    """Load the LMdata graph dataset for graph classification.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes:
        'graphs', the graphs in the dataset in NetworkX format,
        'target', the classification labels for each sample.
    """
    input_target_url = 'http://www.math.unipd.it/~nnavarin/datasets/LMdata/labels.txt.standardized'
    input_data_url = 'http://www.math.unipd.it/~nnavarin/datasets//LMdata/graphs.gspan.standardized'
    _target = load_target(input_target_url)
    label_dict = {}
    counter = [1]
    g_it = instance_to_graph(input_data_url, label_dict, counter)
    print('Loaded LMdata graph dataset for graph classification.')
    return Bunch(graphs=[i for i in g_it],
                 label_dict=label_dict,
                 target=_target,
                 labels=True,
                 veclabels=False)
def confidence_calculate(label, content, stop_word_list, clf, train_path,
                         url, alexa_dict):
    # pack the input into a Bunch
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.append(label)
    bunch.label.append(label)
    bunch.filenames.append(label)
    bunch.contents.append(content)

    # compute TF-IDF features and predict
    bunch = vector_space(bunch, train_path, stop_word_list)
    predicted = clf.predict_proba(bunch.tdm)
    predict_score = predicted[0][0]

    # derive an initial score for the URL from its Alexa rank
    alexa_score = 0
    for domain in alexa_dict:
        if domain in url:
            alexa_score = alexa_dict[domain]

    # combine the two into the final confidence
    confidence = predict_score * 0.95 + (alexa_score - 1) * 0.05
    return confidence
def _fetch_surf_fsaverage5_sphere(data_dir=None):
    """Helper function to ship fsaverage5 spherical meshes.

    These meshes can be used for visualization purposes, but also to run
    cortical surface-based searchlight decoding.

    The source of the data is downloaded from OSF.
    """
    fsaverage_dir = _get_dataset_dir('fsaverage', data_dir=data_dir)
    dataset_dir = _get_dataset_dir('fsaverage5_sphere',
                                   data_dir=fsaverage_dir)
    url = 'https://osf.io/b79fy/download'
    opts = {'uncompress': True}
    names = ['sphere_right', 'sphere_left']
    filenames = [('{}.gii'.format(name), url, opts) for name in names]
    _fetch_files(dataset_dir, filenames)
    result = {
        name: os.path.join(dataset_dir, '{}.gii'.format(name))
        for name in names
    }
    result['description'] = str(_get_dataset_descr('fsaverage5_sphere'))
    return Bunch(**result)
def corpus2Bunch(wordbag_path, seg_path):
    # list the subdirectories of seg_path; these are the category names
    catelist = os.listdir(seg_path)
    # create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    # extend() grows the list with every element of catelist, unlike
    # append(), which adds a single element
    bunch.target_name.extend(catelist)
    # collect every file under each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"    # path of the category dir
        file_list = os.listdir(class_path)     # all files in class_path
        for file_path in file_list:            # iterate the category's files
            fullname = class_path + file_path  # full path of the file
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # read file contents
    # pickle the Bunch to wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text corpus object!")
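# Usage sketch (illustrative; the paths are hypothetical): pickle a Bunch
# built from a directory of pre-segmented texts, one subdirectory per class.
corpus2Bunch("train_wordbag.dat", "data/train_seg/")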
def fetch_thai_simlex999():
    """
    added by Gerhard Wohlgenannt, ([email protected], [email protected]), 2019

    Get the SimLex-999 dataset for the Thai language

    The dataset is in Thai (!) for the evaluation of Thai embedding models

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
    """
    data = _get_as_pd(
        'https://www.dropbox.com/s/nlct64af7qmhc49/thaiSimLex-999-v2.csv?dl=1',
        'similarity', header=None, sep=",").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
def fetch_thai_wordsim353():
    """
    added by Gerhard Wohlgenannt, ([email protected], [email protected]), 2019

    Get the WordSim-353 dataset for the Thai language

    The dataset is in Thai (!) for the evaluation of Thai embedding models

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
    """
    data = _get_as_pd(
        'https://www.dropbox.com/s/h8c3ll1764d7akf/thai-wordsim353-v2.csv?dl=1',
        'similarity', header=None, sep=",").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
def word_to_bunch(train_save_path, train_bunch_path):
    bunch = Bunch(label=[], filepath=[], contents=[])
    all_labels = os.listdir(train_save_path)
    for label in all_labels:
        detail_path = train_save_path + label + '/'
        all_details = os.listdir(detail_path)
        for all_detail in all_details:
            file_detail_path = detail_path + all_detail  # full path of the file
            bunch.label.append(label)
            bunch.filepath.append(file_detail_path)
            bunch.contents.append(read_file(file_detail_path))
    with open(train_bunch_path, "wb+") as fp:
        pickle.dump(bunch, fp)
    print("Bunch created successfully")
def load_yeast():
    """
    Yeast
    1484 instances
    1 sequence number + 8 real attributes
    10 classes (localization site of protein):
        CYT (cytosolic or cytoskeletal)                 463
        NUC (nuclear)                                   429
        MIT (mitochondrial)                             244
        ME3 (membrane protein, no N-terminal signal)    163
        ME2 (membrane protein, uncleaved signal)         51
        ME1 (membrane protein, cleaved signal)           44
        EXC (extracellular)                              37
        VAC (vacuolar)                                   30
        POX (peroxisomal)                                20
        ERL (endoplasmic reticulum lumen)                 5

    Note: the first attribute (sequence number) is IGNORED.
    """
    data = pd.read_csv(get_data_path('yeast.data'), delim_whitespace=True,
                       header=None)
    flat_data = data.iloc[:, 1:9].values  # the 8 real attributes
    labels = data.iloc[:, 9].values       # the class column
    return Bunch(data=flat_data, target=labels, name='yeast',
                 dataset_type='classification')
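# Usage sketch (illustrative): assumes the UCI 'yeast.data' file is available
# via get_data_path and numpy is imported as np.
yeast = load_yeast()
print(yeast.data.shape)         # expected: (1484, 8)
print(np.unique(yeast.target))  # the 10 localization-site classes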
def _glob_fsl_feeds_data(subject_dir):
    """glob data from subject_dir."""
    if not os.path.exists(subject_dir):
        return None
    for file_name in FSL_FEEDS_DATA_FILES:
        file_path = os.path.join(subject_dir, file_name)
        if os.path.exists(file_path) or os.path.exists(
                file_path.rstrip(".gz")):
            file_name = re.sub(r"(?:\.nii\.gz|\.txt)", "", file_name)
        else:
            if not os.path.basename(subject_dir) == 'data':
                return _glob_fsl_feeds_data(
                    os.path.join(subject_dir, 'feeds/data'))
            else:
                print("%s missing from filelist!" % file_name)
                return None
    return Bunch(data_dir=subject_dir,
                 func=os.path.join(subject_dir, "fmri.nii.gz"),
                 anat=os.path.join(subject_dir, "structural_brain.nii.gz"))
def load_sample_images():
    """Load sample images for image manipulation.

    Loads ``sloth``, ``sloth_closeup``, ``cat_and_dog``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes:
        'images', the sample images, 'filenames', the file names for the
        images, and 'DESCR' the full description of the dataset.
    """
    module_path = os.path.join(os.path.dirname(__file__), "images")
    with open(os.path.join(module_path, 'README.txt')) as f:
        descr = f.read()
    filenames = [
        os.path.join(module_path, filename)
        for filename in os.listdir(module_path)
        if filename.endswith(".jpg")
    ]
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]
    return Bunch(images=images, filenames=filenames, DESCR=descr)
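# Usage sketch (illustrative): assumes Pillow is installed and the bundled
# images directory exists next to this module.
sample = load_sample_images()
for fname, img in zip(sample.filenames, sample.images):
    print(os.path.basename(fname), img.shape)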
def load_data(root='data'):
    with open(os.path.join(root, 'meta.json'), 'r') as f:
        meta = json.load(f)

    train = pd.read_excel(os.path.join(root, 'mergewothdkom.xlsx'),
                          sheet_name='Sheet1')
    train = train.values

    test = pd.read_excel(os.path.join(root, 'merge_test1.xlsx'),
                         sheet_name='Sheet1')
    test = test.values

    return Bunch(
        data=train[:, :-2],
        target=train[:, -2],
        data_test=test[:, :-1],
        target_test=test[:, -1],
        target_names=meta['target_names'],
        feature_names=meta['features'],
    )
def fetch_RG65():
    """
    Fetch the Rubenstein and Goodenough dataset for testing attributional
    and relatedness similarity.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of std of scores if available (for set1 and set2)

    References
    ----------
    Rubenstein, Goodenough, "Contextual correlates of synonymy", 1965

    Notes
    -----
    Scores were scaled by a factor of 10/4.
    """
    data = _get_as_pd(
        'https://www.dropbox.com/s/chopke5zqly228d/EN-RG-65.txt?dl=1',
        'similarity', header=None, sep="\t").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float) * 10.0 / 4.0)
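# Usage sketch (illustrative): score a deliberately crude similarity baseline
# on RG-65 with Spearman correlation; scipy is assumed available.
from scipy.stats import spearmanr

rg = fetch_RG65()
pred = [len(set(a) & set(b)) for a, b in rg.X]  # character-overlap baseline
print(spearmanr(pred, rg.y).correlation)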
def fetch_asirra(image_count=1000):
    """
    Parameters
    ----------
    image_count : positive integer
        Number of images to load.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes:
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
def load_fuman_rant(file_path, target_func=fuman_gvb_target):
    data = list()
    target = list()
    parse_errors = 0
    n_samples = 0
    with open(file_path, newline='') as csv_file:
        data_file = csv.reader(csv_file, delimiter=',', quotechar="'")
        next(data_file)  # skip the header row
        for row in data_file:
            if not check_row_format(row[0], row):
                parse_errors += 1
                continue
            data.append(unicodedata.normalize('NFKC', row[5]))
            status = int(row[6])
            # use '==' here: 'is' on ints relies on CPython small-int caching
            if len(row) == 16:
                price = int(row[15])
            else:
                price = 0
            target.append(target_func(status, price))
            n_samples += 1
    logging.info('Finished loading data. (read: {} errors: {})'.format(
        n_samples, parse_errors))
    return Bunch(data=data, target=target, DESCR="Fuman DB csv dump dataset")
def load_train_file(fname, selected_categorys=None, description=None):
    import json
    data = []
    target = []
    segmentor = Segmentor()
    with open(fname) as ifd:
        for line in ifd:
            obj = json.loads(line)
            if 'topic' not in obj or 'title' not in obj:
                continue
            title = obj['title']
            L = bow(title, skip_unigram=False, segmentor=segmentor)
            title_words = [i.word for i in L]
            for tname, tidstr in obj['topic'].items():
                if selected_categorys and tname not in selected_categorys:
                    continue
                data.append(title_words)
                target.append(tname)
    print('%s file name[%s] load %d records' % (datetime.now(), fname,
                                                len(data)))
    return Bunch(fname=fname, data=data, target=target, DESCR=description)