Example #1
import os.path as op
import numpy as np

# mlio and the helpers read_y_from_master, infer_classes and floatOrNaN
# come from the surrounding project.

def cdk_desc_to_arff(directory, master_file, desc_csv, to_predict):
    """Convert a tab-separated CDK descriptor table into an ARFF file."""
    y = read_y_from_master(op.join(directory, master_file))
    classes = infer_classes(y)
    descs = op.join(directory, desc_csv)
    data = []
    with open(descs, "r") as f:
        # The header row holds the descriptor names; the first column is the molecule ID.
        header = f.readline()
        feature_names = ["ID"] + header.split()[1:]
        for line in f:
            if line.strip():
                data.append([floatOrNaN(token) for token in line.strip().split("\t")])
    x = np.array(data)
    arff_file = op.join(directory, op.splitext(desc_csv)[0] + ".arff")
    mlio.save_arff(x, y, arff_file, relation_name=to_predict, feature_names=feature_names, classes=classes)
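
The parsing loop above leans on a floatOrNaN helper defined elsewhere in the project; a minimal sketch of what it presumably does (an assumption, not the original implementation):

import numpy as np

def floatOrNaN(token):
    # Parse a field as a float; empty or malformed fields become NaN so that
    # np.array(data) still yields a purely numeric matrix.
    try:
        return float(token)
    except ValueError:
        return np.nan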
Example #2
import os.path as op
import numpy as np

def spectrophores_to_arff(directory, master_file, spec_csv, to_predict):
    """Convert a comma-separated spectrophore table into an ARFF file."""
    y = read_y_from_master(op.join(directory, master_file))
    classes = infer_classes(y)
    specs = op.join(directory, spec_csv)
    data = []
    with open(specs, "r") as f:
        for line in f:
            if line.strip():
                # Drop the last comma-separated field on each line.
                data.append([floatOrNaN(token) for token in line.strip().split(",")[:-1]])
    x = np.array(data)
    # One ID column followed by the 48 spectrophore values.
    feature_names = ["ID"] + ["Spec" + str(i) for i in range(48)]
    arff_file = op.join(directory, op.splitext(spec_csv)[0] + ".arff")
    mlio.save_arff(x, y, arff_file, relation_name=to_predict, feature_names=feature_names, classes=classes)
Example #3
import os.path as op

def cdk_fpt_to_arff(directory, master_file, fpt_csv, to_predict, fpt_type):
    """Convert a CDK fingerprint file into an ARFF file."""
    y = read_y_from_master(op.join(directory, master_file))
    classes = infer_classes(y)
    fpts = op.join(directory, fpt_csv)
    # cdkdeskuifps2dense reads the file itself; no need to open it here.
    x, _ = cdkdeskuifps2dense(fpts, keep_id=True)
    # Number of bits per supported fingerprint type.
    fpt_sizes = {"maccs": 166, "estate": 79, "extended": 1024}
    feature_names = ["ID"]
    if fpt_type in fpt_sizes:
        feature_names += [fpt_type + str(i) for i in range(fpt_sizes[fpt_type])]
    else:
        print("Fingerprint type currently not supported.")
    arff_file = op.join(directory, op.splitext(fpt_csv)[0] + ".arff")
    mlio.save_arff(x, y, arff_file, relation_name=to_predict, feature_names=feature_names, classes=classes)
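
A hypothetical driver for the three converters above; the directory layout, file names, and the to_predict label are illustrative assumptions, not taken from the original project:

data_dir = "/data/assays/ahr"   # hypothetical dataset directory
master = "ahr-master.csv"       # hypothetical master file holding the labels y

cdk_desc_to_arff(data_dir, master, "ahr-cdk-descriptors.csv", "activity")
spectrophores_to_arff(data_dir, master, "ahr-ob-spectrophores.csv", "activity")
cdk_fpt_to_arff(data_dir, master, "ahr-cdk-maccs.csv", "activity", "maccs")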
Example #4
import glob
import os.path as op
import numpy as np

def prop4da(dataset):
    """Build ARFF and tab-separated copies of every descriptor table of a dataset."""
    root, name = op.split(dataset)
    name = op.splitext(name)[0]
    masterfile = op.join(root, name + "-master.csv")
    y = read_y_from_master(masterfile)
    classes = infer_classes(y)

    # Process CDK descriptors: a "Title" header means a descriptor table,
    # anything else is treated as a fingerprint file.
    for descs in glob.glob(op.join(root, "*-cdk-*.csv")):
        with open(descs) as reader:
            header = next(reader)
            if header.startswith("Title"):
                x, features = cdkdeskui2dense(descs)
                mlio.save_arff(x, y, op.splitext(descs)[0] + ".arff", feature_names=features, classes=classes)
                mlio.save_tab(x, y, op.splitext(descs)[0] + ".txt", classes=classes)
            else:
                x, relation_name = cdkdeskuifps2dense(descs)
                # One generated name per feature column.
                features = mlio.generate_names(x.shape[1], descs)
                mlio.save_arff(
                    x,
                    y,
                    op.splitext(descs)[0] + ".arff",
                    relation_name=relation_name,
                    classes=classes,
                    feature_names=features,
                )
                mlio.save_tab(x, y, op.splitext(descs)[0] + ".txt", classes=classes)

    # Process OpenBabel spectrophores (comma-separated, one molecule per line).
    specs_csv = op.join(root, name + "-ob-spectrophores.csv")
    with open(specs_csv) as reader:
        rows = []
        for line in reader:
            rows.append([float(token.strip()) for token in line.split(",")])
    x = np.array(rows)
    # One generated name per spectrophore component.
    feature_names = mlio.generate_names(len(rows[0]))
    mlio.save_arff(x, y, op.join(root, name + "-ob-spectrophores.arff"), classes=classes, feature_names=feature_names)
    mlio.save_tab(x, y, op.join(root, name + "-ob-spectrophores.txt"), classes=classes)
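
infer_classes is another project helper used in every example here; a minimal sketch of one plausible reading (an assumption, not the original code) is that it collects the distinct labels so that save_arff can declare the nominal class attribute:

def infer_classes(y):
    # Distinct class labels, in a stable order (assumed behaviour).
    return sorted(set(y))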
Example #5
    datasets = sorted([name for name in os.listdir(root) if op.isdir(op.join(root, name))])

    for dataset in datasets:
        print(dataset)

        y = read_y(root, dataset)
        classes = infer_classes(y)

        # Process OpenBabel spectrophores (comma-separated, one molecule per line)
        specs_csv = op.join(root, dataset, dataset + '-ob-spectrophores.csv')
        with open(specs_csv) as reader:
            rows = []
            for line in reader:
                rows.append([float(token.strip()) for token in line.split(',')])
        x = np.array(rows)
        mlio.save_arff(x, y, op.join(root, dataset, dataset + '-ob-spectrophores.arff'), classes=classes)
        mlio.save_tab(x, y, op.join(root, dataset, dataset + '-ob-spectrophores.txt'), classes=classes)

        # Process CDK descriptors
        for descs in glob.glob(op.join(root, dataset, '*-cdk-*.csv')):
            with open(descs) as reader:
                header = next(reader)
                if header.startswith('Title'):
                    x, features = cdkdeskui2dense(descs)
                    mlio.save_arff(x, y, op.splitext(descs)[0] + '.arff', feature_names=features, classes=classes)
                    mlio.save_tab(x, y, op.splitext(descs)[0] + '.txt', classes=classes)
                else:
                    x, name = cdkdeskuifps2dense(descs)
                    mlio.save_arff(x, y, op.splitext(descs)[0] + '.arff', relation_name=name, classes=classes)
                    mlio.save_tab(x, y, op.splitext(descs)[0] + '.txt', classes=classes)