def prop4da(dataset):
    """Export the descriptor CSV files that accompany *dataset* as ARFF and tab files.

    The directory and the extension-less base name of *dataset* locate the inputs:

    - ``<root>/<name>-master.csv`` is read with ``read_y_from_master`` to get the
      labels ``y``; the classes are derived from them with ``infer_classes``.
    - Every file matching ``<root>/*-cdk-*.csv`` is converted. Files whose first
      line starts with ``Title`` are parsed with ``cdkdeskui2dense`` (which also
      returns the feature names); all other files are parsed with
      ``cdkdeskuifps2dense`` (which returns a relation name instead) and get
      generated feature names.
    - ``<root>/<name>-ob-spectrophores.csv`` is parsed here as comma separated
      floats, one matrix row per line.

    Each input is written next to itself through ``mlio.save_arff`` (``.arff``)
    and ``mlio.save_tab`` (``.txt``).

    Parameters
    ----------
    dataset : str
        Path whose directory and base name (extension dropped) identify the
        dataset files described above.

    Raises
    ------
    StopIteration
        If one of the CDK files is empty (there is no header line to read).
    IndexError
        If the spectrophores file contains no lines.
    ValueError
        If a spectrophores field cannot be parsed as a float.
    """
    root, name = op.split(dataset)
    name = op.splitext(name)[0]
    y = read_y_from_master(op.join(root, name + "-master.csv"))
    classes = infer_classes(y)

    # Process CDK descriptors
    for descs in glob.glob(op.join(root, "*-cdk-*.csv")):
        # Only the first line is needed to decide which parser applies.
        # next(reader) works on both Python 2 and 3, unlike reader.next().
        with open(descs) as reader:
            header = next(reader)
        base = op.splitext(descs)[0]
        if header.startswith("Title"):
            x, features = cdkdeskui2dense(descs)
            mlio.save_arff(x, y, base + ".arff", feature_names=features, classes=classes)
        else:
            x, relation_name = cdkdeskuifps2dense(descs)
            # One name per column of x. The row count (x.shape[0]) used before
            # yields one name per sample, which disagrees with how the
            # spectrophores names below are sized (len of one row).
            features = mlio.generate_names(x.shape[1], descs)
            mlio.save_arff(
                x,
                y,
                base + ".arff",
                relation_name=relation_name,
                classes=classes,
                feature_names=features,
            )
        mlio.save_tab(x, y, base + ".txt", classes=classes)

    # Process ob spectrophores
    specs_base = op.join(root, name + "-ob-spectrophores")
    with open(specs_base + ".csv") as reader:
        # Real lists (not map objects) so that len() and np.array behave the
        # same on Python 2 and 3.
        rows = [[float(field.strip()) for field in line.split(",")] for line in reader]
    x = np.array(rows)
    feature_names = mlio.generate_names(len(rows[0]))
    mlio.save_arff(x, y, specs_base + ".arff", classes=classes, feature_names=feature_names)
    mlio.save_tab(x, y, specs_base + ".txt", classes=classes)
# For every dataset directory under ``root``: read its labels, then export the
# spectrophores CSV and each CDK CSV through mlio.save_arff (.arff) and
# mlio.save_tab (.txt), writing the outputs next to the inputs.
for dataset in datasets:
    # Parenthesised form prints the same text on Python 2 and is valid on 3.
    print(dataset)
    y = read_y(root, dataset)
    classes = infer_classes(y)

    #Process ob spectrophores
    specs_base = op.join(root, dataset, dataset + '-ob-spectrophores')
    with open(specs_base + '.csv') as reader:
        # One matrix row per line, comma separated floats. Built as real lists
        # so np.array gets numbers on Python 3 too (map() is lazy there).
        specs = [[float(field.strip()) for field in line.split(',')] for line in reader]
    x = np.array(specs)
    mlio.save_arff(x, y, specs_base + '.arff', classes=classes)
    mlio.save_tab(x, y, specs_base + '.txt', classes=classes)

    #Process CDK descriptors
    for descs in glob.glob(op.join(root, dataset, '*-cdk-*.csv')):
        # Only the first line is needed to decide which parser applies.
        # next(reader) works on both Python 2 and 3, unlike reader.next().
        with open(descs) as reader:
            header = next(reader)
        base = op.splitext(descs)[0]
        if header.startswith('Title'):
            # This parser also returns the feature names.
            x, features = cdkdeskui2dense(descs)
            mlio.save_arff(x, y, base + '.arff', feature_names=features, classes=classes)
        else:
            # This parser returns a relation name instead of feature names.
            x, name = cdkdeskuifps2dense(descs)
            mlio.save_arff(x, y, base + '.arff', relation_name=name, classes=classes)
        mlio.save_tab(x, y, base + '.txt', classes=classes)
    #TODO: Save the compound ID too