def get_data_info(filename, distance_constructor, do_normalize_distances=True):
    """Load a DataInfo from ``filename``, picking the loader by extension.

    ``.csv``, ``.tab`` and ``.arff`` files are handed to ``orange_load``
    together with ``distance_constructor``.  ``.pcdt`` and ``.pcdb`` files
    (optionally wrapped in ``.gz``) are read with ``DataInfo.deserialize``
    using the ``csv`` and ``proto`` deserialization methods respectively;
    ``distance_constructor`` is not used on that path.

    Args:
        filename: Path of the file to load.
        distance_constructor: Passed through to ``orange_load``.
        do_normalize_distances: When true, the loaded object is replaced by
            the result of its ``get_distance_normalized()`` method.

    Returns:
        The loaded (and optionally distance-normalized) DataInfo.

    Raises:
        NotImplementedError: If the (inner) extension is not supported.
    """
    ext = os.path.splitext(filename)[1].lower()
    if ext in (".csv", ".tab", ".arff"):
        di = orange_load(filename, distance_constructor)
    else:
        open_op = open
        if ext == ".gz":
            # Compressed input: look at the extension underneath ".gz".
            open_op = gzip.open
            ext = os.path.splitext(os.path.splitext(filename)[0])[1].lower()

        if ext == ".pcdt":
            method = DataInfo.DeserializationMethod.csv
            open_args = (filename,)
        elif ext == ".pcdb":
            method = DataInfo.DeserializationMethod.proto
            # NOTE(review): the proto payload is presumably binary (the
            # writer in main() uses "wb"), so read it in binary mode to avoid
            # newline translation / text decoding -- confirm against
            # DataInfo.deserialize.
            open_args = (filename, "rb")
        else:
            raise NotImplementedError("No implementation for ext %s" % ext)

        with open_op(*open_args) as f:
            di = DataInfo.deserialize(f, method)

    if do_normalize_distances:
        di = di.get_distance_normalized()
    return di
def main(*argv):
    """Convert an input data file into a serialized DataInfo file.

    Positional arguments (``argv[0]`` is not read):
        argv[1]: Input path; cleaned with ``clean_path`` and loaded through
            ``get_data_info``.
        argv[2]: Output path; cleaned with ``clean_path``.
        argv[3]: Suffix selecting the ``orange`` attribute
            ``ExamplesDistanceConstructor_<suffix>``.
        argv[4]: Name of an attribute of ``DataInfo.SerializationMethod``.
        argv[5]: Optional integer flag; ``1`` writes the output through
            ``gzip.open`` instead of ``open``.

    Raises:
        IndexError: If fewer than five arguments are supplied.
        ValueError: If ``argv[5]`` is present but not an integer literal.
        AttributeError: If ``argv[3]`` or ``argv[4]`` does not name an
            existing attribute.
    """
    in_file = clean_path(argv[1])
    out_file = clean_path(argv[2])
    dist_constructor_str = argv[3]
    serialization_method_str = argv[4]

    dist_constructor_generator = getattr(
        orange, "ExamplesDistanceConstructor_%s" % dist_constructor_str)

    def dist_constructor(data):
        # A fresh constructor instance is created for every call.
        return dist_constructor_generator()(list(data))

    data_info = get_data_info(in_file, dist_constructor)

    # The flag is compared as an integer; the previous comparison of an int
    # against the string '1' was always False, so gzip was never enabled.
    do_zip = len(argv) > 5 and int(argv[5]) == 1
    open_func = gzip.open if do_zip else open

    serialization_method = getattr(DataInfo.SerializationMethod,
                                   serialization_method_str)
    with open_func(out_file, "wb") as ofs:
        data_info.serialize(ofs, serialization_method,
                            DataInfo.get_numeric_str_repr_getter())
def get_20newsgroups_data_info_for_categories(categories):
    """Build a DataInfo of pairwise distances for 20 newsgroups documents.

    The documents are fetched with ``fetch_20newsgroups`` (``subset='all'``,
    unshuffled), vectorized with ``Vectorizer`` and compared pairwise via the
    product of the vectorized matrix with its transpose.  Each similarity
    ``s`` becomes the distance ``1 - s``.  Only the lower triangle (up to and
    including the diagonal) of the distance matrix is kept.

    Args:
        categories: Forwarded as the ``categories`` argument of
            ``fetch_20newsgroups``.

    Returns:
        The object produced by ``DataInfo.deserialize_pcd_tuples`` from
        ``(payload, label, distance_row)`` tuples, where the payload is the
        last three path components of the document's filename and the label
        is the document's target name.
    """
    data = fetch_20newsgroups(subset='all', categories=categories,
                              shuffle=False)
    vectorizer = Vectorizer()

    t0 = time()
    # NOTE(review): variable name suggests TF-IDF features -- depends on what
    # Vectorizer is bound to; confirm at the import site.
    tfidf = vectorizer.fit_transform(data.data)
    pairwise_similarity = (tfidf * tfidf.T).todense().tolist()
    # Parenthesized single-argument form prints the same text on Python 2
    # and is also valid syntax on Python 3.
    print("done in %fs" % (time() - t0))

    labels = [data.target_names[i] for i in data.target]
    payloads = [os.sep.join(e.split(os.sep)[-3:]) for e in data.filenames]

    # Similarity is from zero to one, so (1 - s) gives a distance from 0 to 1.
    # Row i keeps columns 0..i only (lower triangle including the diagonal).
    distances = [[(1 - s) for s in row[:col_to + 1]]
                 for (col_to, row) in enumerate(pairwise_similarity)]

    # The last kept entry of each row is the self-distance; force it to
    # exactly 0 to remove the slight off-ness from floating-point precision.
    for row in distances:
        row[-1] = 0

    # list() keeps this a concrete list even where zip() returns an iterator.
    pcd_tuples = list(zip(payloads, labels, distances))
    return DataInfo.deserialize_pcd_tuples(pcd_tuples)