Beispiel #1
0
print("loading..")

# Fail fast if any required input file is missing.
for f in files:
    assert_exists(f)  # check for files

# Extract studyid column: csv_split is expected to write <cohort_file>_studyid
# next to the input. NOTE(review): os.system with string concatenation breaks
# on paths containing spaces/shell metacharacters — consider
# subprocess.run([...], shell=False) if paths can vary.
cohort_id_file = cohort_file + "_studyid"
if not os.path.exists(cohort_id_file):
    os.system("csv_split " + cohort_file)
assert_exists(cohort_id_file)  # make sure we got the result

# Load the filtered student cohort table, caching the parsed result as a
# pickle so repeated runs skip re-parsing the CSV.
# NOTE(review): pickle.load on a local cache file is fine as long as the
# cache is never sourced from untrusted input.
if not os.path.exists('dat_cohort.p'):
    print("dat_cohort.p not found. Creating..")  # was a Python-2 print statement
    dat_cohort, datf_cohort = load_fields([cohort_file, 'dob'])
    with open('dat_cohort.p', 'wb') as fh:  # context manager: no leaked handle
        pickle.dump([dat_cohort, datf_cohort], fh)
else:
    with open('dat_cohort.p', 'rb') as fh:
        dat_cohort, datf_cohort = pickle.load(fh)

studyid, dob = list(dat_cohort.keys()), {}
# Map field name -> column index for the row lookups below.
fdat_cohort = {name: idx for idx, name in enumerate(datf_cohort)}

# Express dob as a function of studyid.
# Assumes dat_cohort[sid] is a list of per-field value lists — TODO confirm
# against load_fields.
for sid in studyid:
    dob[sid] = dat_cohort[sid][fdat_cohort['dob']][0]


def filter_table_for_cohort(cohort_id_file, table_file):
    select_file = table_file + "_select.csv"
    if not os.path.exists(select_file):
        a = os.system("csv_select " + cohort_id_file + " studyid " +
Beispiel #2
0
# guess data types of cols of csv. Ignore data with no info (one outcome only)
import os
import sys
from misc import load_fields
# CLI: argv[1] = csv path (defaults to ../test/merge.csv),
#      argv[2] = optional type filter ('str' or 'float'),
#      argv[3]+ = any extra argument enables verbose per-column output.
args = sys.argv
info = len(args) > 3
# Presumably load_fields returns {column_name: [values]} — verify against misc.
dat = load_fields("../test/merge.csv" if len(args) < 2 else args[1])

# Group column names by guessed type: 'float' if every value parses as a
# float, otherwise 'str'.
types = {}
for name, column in dat.items():
    is_float = True
    for value in column:
        try:
            float(value)
        except (TypeError, ValueError):
            is_float = False
            break  # one non-numeric value settles the column; skip the rest

    kind = "float" if is_float else "str"
    distinct = len(set(column))
    # Skip constant columns — a single outcome carries no information.
    if distinct > 1:
        if info:
            # High-cardinality columns print just the count; small ones the values.
            print(kind, name, distinct if distinct > 12 else set(column))
        types.setdefault(kind, []).append(name)

if len(args) < 3:
    print(types)
else:
    if args[2] in ['str', 'float']:
        print(','.join(types[args[2]]))