예제 #1
0
def get_data_info(filename, distance_constructor, do_normalize_distances=True):
    """Load a data-info object from ``filename``, chosen by file extension.

    ``.csv``, ``.tab`` and ``.arff`` files are handed to ``orange_load``
    together with ``distance_constructor``.  Any other file is read through
    ``DataInfo.deserialize``: ``.pcdt`` selects the ``csv`` deserialization
    method and ``.pcdb`` selects ``proto``.  A trailing ``.gz`` makes the
    file be opened with ``gzip.open`` and the extension underneath it picks
    the method.  Extension matching is case-insensitive.

    Args:
        filename: Path of the file to load.
        distance_constructor: Passed through to ``orange_load``; not used on
            the ``DataInfo.deserialize`` path.
        do_normalize_distances: When true, the result of calling
            ``get_distance_normalized()`` on the loaded object is returned
            instead of the loaded object itself.

    Returns:
        The loaded (and optionally distance-normalized) object.

    Raises:
        NotImplementedError: If the extension is not one of the above.
    """
    suffix = os.path.splitext(filename)[1].lower()

    if suffix in (".csv", ".tab", ".arff"):
        info = orange_load(filename, distance_constructor)
    else:
        opener = open
        if suffix == ".gz":
            # Compressed input: look at the extension hiding under ".gz".
            opener = gzip.open
            inner_name = os.path.splitext(filename)[0]
            suffix = os.path.splitext(inner_name)[1].lower()

        # Resolved lazily so an unsupported extension fails before any
        # DataInfo attribute is looked up or any file is opened.
        if suffix == ".pcdt":
            method = DataInfo.DeserializationMethod.csv
        elif suffix == ".pcdb":
            method = DataInfo.DeserializationMethod.proto
        else:
            raise NotImplementedError("No implementation for ext %s" % suffix)

        with opener(filename) as handle:
            info = DataInfo.deserialize(handle, method)

    if not do_normalize_distances:
        return info
    return info.get_distance_normalized()
            
        
         
    
    
    

        
예제 #2
0
def main(*argv):
    """Load a data file and write it back out in a serialized form.

    The arguments are read by position, ``sys.argv`` style (index 0 is
    never read):

        argv[1]: Input path; passed through ``clean_path`` and then loaded
            with ``get_data_info``.
        argv[2]: Output path; passed through ``clean_path``.
        argv[3]: Suffix appended to ``"ExamplesDistanceConstructor_"`` to
            look up the distance-constructor class on ``orange``.
        argv[4]: Attribute name looked up on ``DataInfo.SerializationMethod``.
        argv[5]: Optional integer flag; ``1`` writes the output through
            ``gzip.open`` instead of the plain ``open``.

    Raises:
        IndexError: If fewer than five arguments are supplied.
        ValueError: If ``argv[5]`` is present but is not an integer literal.
        AttributeError: If ``argv[3]`` or ``argv[4]`` does not name an
            existing attribute.
    """
    in_file = clean_path(argv[1])
    out_file = clean_path(argv[2])
    dist_constructor_str = argv[3]
    serialization_method_str = argv[4]

    dist_constructor_generator = getattr(
        orange, "ExamplesDistanceConstructor_%s" % dist_constructor_str)
    # A fresh constructor instance is created for every call and applied to
    # a list copy of the data.
    dist_constructor = lambda data: dist_constructor_generator()(list(data))
    data_info = get_data_info(in_file, dist_constructor)

    # The flag arrives as text, so parse it and compare against the integer
    # 1.  Comparing the parsed int with the string '1' is never true, which
    # silently made gzip output impossible to enable.
    do_zip = len(argv) > 5 and int(argv[5]) == 1
    open_func = gzip.open if do_zip else open

    serialization_method = getattr(
        DataInfo.SerializationMethod, serialization_method_str)

    with open_func(out_file, "wb") as ofs:
        data_info.serialize(
            ofs, serialization_method, DataInfo.get_numeric_str_repr_getter())
예제 #3
0
def get_20newsgroups_data_info_for_categories(categories):
    """Build a ``DataInfo`` from pairwise distances of 20-newsgroups posts.

    The ``'all'`` subset is fetched unshuffled for ``categories``, the
    documents are vectorized with ``Vectorizer``, and the product of the
    resulting matrix with its transpose is used as pairwise similarity.
    Each similarity ``s`` becomes the distance ``1 - s``; only the lower
    triangle (diagonal included) is kept, one growing row per document.

    Args:
        categories: Passed to ``fetch_20newsgroups`` as ``categories``.

    Returns:
        The result of ``DataInfo.deserialize_pcd_tuples`` applied to
        ``(payload, label, distance_row)`` tuples, where the payload is the
        last three path components of the document's filename and the label
        is its target name.
    """
    dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=False)
    vectorizer = Vectorizer()

    started_at = time()
    doc_term_matrix = vectorizer.fit_transform(dataset.data)
    similarity_product = doc_term_matrix * doc_term_matrix.T
    pairwise_similarity = similarity_product.todense().tolist()
    # Parenthesized single-argument form: prints the same text whether
    # ``print`` is the statement or the function.
    print("done in %fs" % (time() - started_at))

    labels = [dataset.target_names[target] for target in dataset.target]
    payloads = []
    for path in dataset.filenames:
        path_parts = path.split(os.sep)
        payloads.append(os.sep.join(path_parts[-3:]))

    # Similarity runs from zero to one, so (1 - s) is a distance in [0, 1].
    distances = []
    for row_index, similarity_row in enumerate(pairwise_similarity):
        triangle_row = [(1 - s) for s in similarity_row[:row_index + 1]]
        # The last kept entry is the self-distance; force it to exactly 0
        # to hide the tiny error left over from precision conversion.
        triangle_row[-1] = 0
        distances.append(triangle_row)

    pcd_tuples = zip(payloads, labels, distances)
    return DataInfo.deserialize_pcd_tuples(pcd_tuples)