def merger(fin, fout, verbose=False):
    "Merge CSV files with identical headers into a single output file"
    filelist = []
    if fin.find(',') != -1: # list of files
        filelist = fin.split(',')
    elif fin.find('*') != -1: # pattern
        filelist = glob.glob(fin)
    elif os.path.isdir(fin): # we got directory name
        for ext in ['.csv.gz', '.csv', '.csv.bz2']:
            filelist = [f for f in files(fin, ext)]
            if len(filelist):
                break
    if not filelist:
        print("ERROR: unable to create filelist from %s" % fin)
        sys.exit(1)
    headers = None
    with fopen(fout, 'wb') as ostream:
        for fname in filelist:
            if verbose:
                print("Read", fname)
            with fopen(fname, 'r') as istream:
                while True:
                    line = istream.readline()
                    if not line: # check EOF first, otherwise an empty
                        break    # first file loops forever below
                    if not headers:
                        headers = line
                        ostream.write(headers)
                        continue
                    if line == headers: # skip repeated header lines
                        continue
                    ostream.write(line)

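# Minimal usage sketch for merger() (illustrative only; the input paths
# below are hypothetical examples, not files shipped with this code):
#
#   merger('2014-01.csv.gz,2014-02.csv.gz', 'merged.csv.gz')  # explicit list
#   merger('data/2014-*.csv.gz', 'merged.csv.gz')             # glob pattern
#   merger('data/', 'merged.csv.gz', verbose=True)            # whole directory
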
def select(pred, data, ids):
    "Select prediction and data based on provided ids"
    sids = [i.replace('\n', '').split(',') \
            for i in fopen(ids).readlines() if not i.startswith('dataset')]
    ipred = fopen(pred, 'r')
    idata = fopen(data, 'r')
    pheaders = []
    dheaders = []
    didx = 0
    with fopen('new_pred.txt', 'wb') as opred:
        with fopen('new_data.txt', 'wb') as odata:
            while True:
                pline = ipred.readline().replace('\n', '').split(',')
                dline = idata.readline().replace('\n', '').split(',')
                if not dheaders:
                    pheaders = pline
                    dheaders = dline
                    didx = dheaders.index('dataset')
                    ddbs = dheaders.index('dbs')
                    dpre = dheaders.index('target')
                    opred.write(','.join(pheaders) + '\n')
                    odata.write(','.join(dheaders) + '\n')
                    continue
                if len(pline) == 1:
                    break
                dataset, dbs, prediction = pline
                if [dataset, dbs] in sids and \
                   [dline[didx], dline[ddbs]] in sids and dline[dpre] == '1':
                    opred.write(','.join(pline) + '\n')
                    odata.write(','.join(dline) + '\n')

def calc_logloss(fpred, fobs):
    "Calculate logloss value for input prediction/actual value files"
    obs = [float(r.replace('\n', '').split(',')[-1]) \
           for r in fopen(fobs) if not r.lower().startswith('id')]
    # predictions are probabilities, parse them as float; the former int()
    # truncated them and raised ValueError on values such as '0.42'
    pred = [float(r.replace('\n', '').split(',')[-1]) \
            for r in fopen(fpred) if not r.lower().startswith('id')]
    print("obs", obs[:5])
    print("pred", pred[:5])
    print("LogLoss", logloss(obs, pred))

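# logloss() is defined elsewhere in this codebase; a minimal sketch of a
# scalar log-loss consistent with its progressive-validation use in run()
# further below (an assumption about the helper, not its actual code):
#
#   from math import log
#
#   def logloss_sketch(p, y):
#       "Bounded log-loss for a single prediction p and label y in {0, 1}"
#       p = max(min(p, 1. - 1e-15), 1e-15)  # clip to avoid log(0)
#       return -log(p) if y == 1. else -log(1. - p)
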
def mknfold(fin, match=1, nfold=5):
    "Split input file into train/test parts; roughly 1/nfold of lines go to test"
    random.seed(123)
    with fopen(fin, 'r') as istream:
        with fopen(fin + '.train', 'w') as otrain:
            with fopen(fin + '.test', 'w') as otest:
                for line in istream:
                    if random.randint(1, nfold) == match:
                        otest.write(line)
                    else:
                        otrain.write(line)

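# Usage sketch (hypothetical file name): with nfold=5 about 20% of lines go
# to train.csv.test and the rest to train.csv.train; varying match from 1 to
# nfold under the fixed seed yields the other folds, but each call rewrites
# the same two output files, so process them between calls:
#
#   for fold in range(1, 6):
#       mknfold('train.csv', match=fold, nfold=5)
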
def find_drops(file1, file2, idrops=None):
    "Find difference in headers of two files and print it out"
    with fopen(file1) as istream1, fopen(file2) as istream2:
        headers1 = istream1.readline().replace('\n', '').split(',')
        headers2 = istream2.readline().replace('\n', '').split(',')
        drops = set(headers1) - set(headers2)
        drops = drops.union(set(headers2) - set(headers1))
        if idrops:
            print(','.join(set(list(drops) + idrops.split(','))))
        else:
            print(','.join(drops))

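# Usage sketch (hypothetical file names): prints the symmetric difference of
# the two header sets, optionally merged with extra comma-separated drops:
#
#   find_drops('train.csv.gz', 'valid.csv.gz', idrops='id,dataset')
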
def data(path, D, ndim, extra_dim, label_path=None, hcols=[], misses=[]):
    for t, line in enumerate(fopen(path)):
        # initialize our generator
        if t == 0:
            # create a static x,
            # so we don't have to construct a new x for every instance
            x = [0] * (ndim + extra_dim)
            if label_path:
                label = fopen(label_path)
                label.readline() # we don't need the headers
            continue
        # parse x
        counter = 0
        for m, feat in enumerate(line.rstrip().split(',')):
            if m in misses:
                continue
            if m == 0:
                try:
                    ID = int(feat)
                except:
                    continue
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note, the built-in hash(), although fast, is not stable,
                # i.e. the same value won't always have the same hash
                # on different machines
                if misses:
                    idx = counter
                else:
                    idx = m
                x[idx] = abs(hash(str(m) + '_' + str(feat))) % D
            counter += 1
        row = line.rstrip().split(',')
        tidx = ndim
        for i in xrange(len(hcols)):
            for j in xrange(i + 1, len(hcols)):
                tidx += 1
                try:
                    val = str(tidx) + '_' + row[hcols[i]] + "_" + row[hcols[j]]
                    x[tidx] = abs(hash(val)) % D
                except Exception as exc:
                    print("tidx=%s, i=%s, j=%s" % (tidx, i, j))
                    print(str(exc))
                    raise
        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)

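# Usage sketch (illustrative; train.csv/labels.csv are hypothetical names).
# The generator yields (ID, x, y) with labels and (ID, x) without; x holds
# hashed feature indices in [0, D) produced by the hashing trick above:
#
#   D = 2 ** 20
#   for ID, x, y in data('train.csv', D, ndim=146, extra_dim=1,
#                        label_path='labels.csv'):
#       pass  # feed (x, y) into predict()/update()
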
def read_popdb(popdb):
    "Read popdb data"
    headers = []
    pdict = {}
    with fopen(popdb, 'r') as istream:
        while True:
            if not headers:
                for row in istream.readline().replace('\n', '').split(','):
                    if row == 'COLLNAME':
                        headers.append('dataset')
                    elif row == 'NACC':
                        headers.append('naccess')
                    elif row == 'RNACC':
                        headers.append('rnaccess')
                    else:
                        headers.append(row.lower())
                continue
            vals = istream.readline().replace('\n', '').split(',')
            if len(vals) < 2:
                break
            row = dict(zip(headers, vals))
            try:
                dataset = row.pop('dataset')
                pdict[dataset] = row
            except:
                pass
    return pdict

def line_offsets(fname):
    """Read in the file once and return a list of line offsets"""
    line_offset = []
    offset = 0
    for line in fopen(fname):
        line_offset.append(offset)
        offset += len(line)
    return line_offset

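# Usage sketch: the offsets enable O(1) random access to any line without
# loading the whole file into memory (file name is hypothetical):
#
#   offsets = line_offsets('data.csv')
#   with fopen('data.csv', 'r') as istream:
#       istream.seek(offsets[42])  # jump straight to line 43
#       line = istream.readline()
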
def convert(fin, fout, datasets, sep=','):
    """
    Convert input prediction file (id,prediction) into (dataset,prediction)
    by using datasets file
    """
    headers = None
    df = pd.read_csv(datasets)
    with fopen(fin, 'r') as istream, fopen(fout, 'w') as ostream:
        for line in istream.readlines():
            did, dbs, pred = line.replace('\n', '').split(sep)
            if not headers:
                headers = '%s,%s,%s' % (did, dbs, pred)
                continue
            res = df[(df.hash == int(did)) & (df.dbsinst == int(dbs))]
            if not res.empty:
                dataset = res.get_value(res.index[0], 'dataset')
                ostream.write("%5.3f%s%s\n" % (float(pred), sep, dataset))

def new_datasets(fin, fnew, fout):
    "Find out which datasets were new in provided train/valid files"
    train_ids = dataset_ids(fin)
    valid_ids = dataset_ids(fnew)
    ids = valid_ids - train_ids
    with fopen(fout, 'wb') as ostream:
        ostream.write('dataset,dbs\n')
        for pair in ids:
            ostream.write(','.join(pair) + '\n')

def convert(fin, fout, uri, sep=','):
    """
    Convert input prediction file (id,prediction) into (dataset,prediction)
    by using DCAFPilot cache
    """
    client = MongoClient(uri)
    mgr = client['analytics']['datasets']
    headers = None
    with fopen(fin, 'r') as istream:
        with fopen(fout, 'w') as ostream:
            for line in istream.readlines():
                did, dbs, pred = line.replace('\n', '').split(sep)
                if not headers:
                    headers = '%s,%s,%s' % (did, dbs, pred)
                    continue
                spec = {'dataset_id': int(did), 'dbs_instance': int(dbs)}
                res = mgr.find_one(spec)
                if res:
                    ostream.write("%5.3f%s%s\n" % (float(pred), sep, res['dataset']))

def transform(fin, fout, target, thr, drops, verbose=0):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    new_headers = []
    for line in istream.readlines():
        if not headers:
            headers = line.replace('\n', '').split(',')
            # build new headers regardless of drops, otherwise the
            # write loop below would reference an undefined list
            for idx, val in enumerate(headers):
                if (drops and val in drops) or val == target:
                    continue
                new_headers.append(val)
            ostream.write(','.join(new_headers) + ',target\n')
            continue
        vals = [eval(v) for v in line.replace('\n', '').split(',')]
        row = dict(zip(headers, vals))
        if thr == -1: # keep regression
            tval = row[target]
        else: # do classification
            if INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    tval = 1 if cond else 0
                except:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
        new_vals = []
        for key in new_headers:
            if (drops and key in drops) or key == target:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        ostream.write(','.join(new_vals) + '\n')
    istream.close()
    ostream.close()

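# Usage sketch (hypothetical file names): classification via a numeric
# threshold string, or via an arbitrary python condition over the row dict:
#
#   transform('data.csv', 'out.csv', 'naccess', '10', drops=['id'])
#   transform('data.csv', 'out.csv', 'naccess',
#             "row['naccess']>10 and row['nusers']>5", drops=['id'])
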
def merger(fin, fout, verbose=False):
    "Merge CSV files which may have different sets of headers"
    filelist = []
    if fin.find(',') != -1: # list of files
        filelist = fin.split(',')
    elif fin.find('*') != -1: # pattern
        filelist = glob.glob(fin)
    elif os.path.isdir(fin): # we got directory name
        for ext in ['.csv.gz', '.csv', '.csv.bz2']:
            filelist = [f for f in files(fin, ext)]
            if len(filelist):
                break
    elif os.path.isfile(fin): # we got file name
        filelist = [fin]
    if not filelist:
        print("ERROR: unable to create filelist from %s" % fin)
        sys.exit(1)
    # sort all files
    filelist.sort()
    headers = find_headers(filelist)
    with fopen(fout, 'wb') as ostream:
        ostream.write(','.join(headers) + '\n')
        for fname in filelist:
            if verbose:
                print("Read", fname)
            with fopen(fname, 'r') as istream:
                keys = istream.readline().replace('\n', '').split(',') # headers
                while True:
                    line = istream.readline()
                    if not line:
                        break
                    vals = line.replace('\n', '').split(',')
                    row = dict(zip(keys, vals))
                    srow = ','.join([str(row.get(k, 0)) for k in headers])
                    ostream.write(srow + '\n')

def find_headers(files):
    "Scan all files and extract full set of attributes"
    headers = []
    for fname in files:
        with fopen(fname, 'r') as istream:
            line = istream.readline()
            fheaders = line.replace('\n', '').split(',')
            if not headers:
                headers = fheaders
            if headers != fheaders: # take a union of two sets
                headers = list(set(headers) | set(fheaders))
    if 'id' in headers:
        headers.remove('id')
    return ['id'] + sorted(headers)

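# Usage sketch: find_headers() backs merger() above; rows missing an
# attribute are filled with 0 when written out (file names hypothetical):
#
#   headers = find_headers(['2014-01.csv.gz', '2014-02.csv.gz'])
#   # -> ['id', <sorted union of all other attributes>]
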
def run(fin, fout, split=30, seed=0):
    "Read input file and write train/validation files"
    if seed:
        random.seed(seed)
    base, ext = fout.rsplit('.', 1) # split on last dot, names may contain dots
    ftest = '%s_valid.%s' % (base, ext)
    offsets = line_offsets(fin)
    nlines = len(offsets)
    indices = range(1, nlines)
    random.shuffle(indices)
    with fopen(fin, 'r') as istream, fopen(fout, 'wb') as ostream, \
         fopen(ftest, 'wb') as tstream:
        headers = istream.readline()
        ostream.write(headers)
        tstream.write(headers)
        count = 0
        for idx in indices:
            istream.seek(offsets[idx])
            line = istream.readline()
            if count > (nlines - round(nlines * split / 100.)):
                tstream.write(line)
            else:
                ostream.write(line)
            count += 1

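# Usage sketch (hypothetical file names): a 30% split writes ~70% of the
# data rows to train.csv and ~30% to train_valid.csv, header line in both:
#
#   run('data.csv', 'train.csv', split=30, seed=123)
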
def loader(ifile, target, sep=',', threshold=None):
    "Load prediction from given file"
    headers = None
    tidx = None
    arr = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').replace('\r', '').split(sep)
            if not headers:
                headers = row
                tidx = headers.index(target)
                continue
            if len(row) < 2:
                break
            # changed since the previous approach did not recognize
            # scientific notation
            val = str_to_num(row[tidx])
            if threshold:
                val = 1 if val >= threshold else 0
            arr.append(val)
    return arr

def loader(ifile, target, sep=',', threshold=None):
    "Load prediction from given file"
    headers = None
    tidx = None
    arr = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').replace('\r', '').split(sep)
            if not headers:
                headers = row
                tidx = headers.index(target)
                continue
            # if len(row) < 2:
            if not row or (isinstance(row, list) and len(row) == 1 and not row[0]):
                break
            # changed since the previous approach did not recognize
            # scientific notation
            val = str_to_num(row[tidx])
            if threshold:
                val = 1 if val >= threshold else 0
            arr.append(val)
    return arr

def dataset_ids(fin):
    "Return dataset ids from given file"
    ids = set()
    headers = []
    didx = None
    tidx = None
    dbsidx = None
    with fopen(fin, 'r') as istream:
        while True:
            if not headers:
                headers = istream.readline().replace('\n', '').split(',')
                didx = headers.index('dataset')
                tidx = headers.index('target')
                dbsidx = headers.index('dbs')
                continue
            row = istream.readline().replace('\n', '').split(',')
            if len(row) < 2:
                break
            if float(row[tidx]) > 0:
                ids.add((row[didx], row[dbsidx]))
    return ids

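# Usage sketch: dataset_ids() feeds new_datasets() above, returning the set
# of (dataset, dbs) pairs whose target is positive (file name hypothetical):
#
#   ids = dataset_ids('train.csv.gz')
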
def loader(ifile, target, sep=','):
    "Load prediction from given file"
    headers = None
    tidx = None
    arr = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').split(sep)
            if not headers:
                headers = row
                tidx = headers.index(target)
                continue
            if len(row) < 2:
                break
            val = row[tidx]
            if INT_PAT.match(val):
                val = int(val)
            elif FLT_PAT.match(val):
                val = float(val)
            else:
                raise Exception("Parsed value '%s' has unknown data-type" % val)
            arr.append(val)
    return arr

def verify_csv(fout, verbose):
    "Verify that every row of given CSV file has as many fields as its header"
    err = None
    try:
        with fopen(fout, 'r') as istream:
            nc = len(istream.readline().split(','))
            while err is None: # loop until an error is found or EOF is reached
                line = istream.readline()
                if not line:
                    break
                if len(line.split(',')) != nc:
                    err = 'Number of headers differs from number of data in rows.'
                    break
    except IOError:
        err = 'IOError raised while reading the file.'
    except Exception as ex:
        err = 'Exception raised while reading: %s. Arguments: %s.' % (ex, ex.args)
    if err is not None:
        err = ("Verification of file %s failed. " % fout) + err
        print(err)
    else:
        if verbose:
            print("Verification of %s passed successfully." % fout)

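# Usage sketch: report malformed rows in a merged file (name hypothetical):
#
#   verify_csv('merged.csv.gz', verbose=True)
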
def read_data(fin, fbeg, fend, timein):
    "Select files and pick structured data for plotting"
    if fin == ".":
        files = [f for f in os.listdir('.') \
                 if (len(fbeg) == 0 or f.startswith(fbeg)) and \
                    (len(fend) == 0 or f.endswith(fend))]
    elif os.path.isdir(fin):
        files = [os.path.join(fin, f) for f in os.listdir(fin) \
                 if (len(fbeg) == 0 or f.startswith(fbeg)) and \
                    (len(fend) == 0 or f.endswith(fend))]
    elif os.path.isfile(fin): # a single file is taken as-is, its name is not checked
        files = [fin]
    else:
        print("Cannot produce file list, not a file or directory: \"%s\"" % fin)
        raise SystemExit
    ### data structure:
    # { dftypes: { scorers: { classifiers: [ ordered weekly results for
    #   particular dftype, scorer and classifier ] } } }
    # dftype is dataframe type: old or new
    # dates variable stores ordered dates, assuming dates in csv are stored
    # in the same order for all dftype and classifier combinations
    data = {}
    dates = {}
    scorers = []
    headers = []
    rundata = {}
    # read classifiers scores
    for f in files:
        first = True
        for line in fopen(f).readlines():
            vals = line.strip(" \n\r").split(',')
            if first:
                first = False
                if not headers:
                    headers = vals
                    if headers[0] != 'dftype' \
                            or headers[1] != 'clf' \
                            or headers[2] != 'date' \
                            or len(headers) < 4:
                        print("Error: check structure of file %s, headers should be: dftype,clf,date" % f)
                        sys.exit(1)
                    scorers = headers[3:]
                elif headers != vals:
                    print("Original headers: %s" % headers)
                    print("Headers in %s: %s" % (f, vals))
                    sys.exit(1)
            else:
                row = dict(zip(headers, vals))
                if not row['dftype'] in data:
                    data[row['dftype']] = {}
                if not row['dftype'] in dates:
                    dates[row['dftype']] = []
                if not row['date'] in dates[row['dftype']]:
                    dates[row['dftype']].append(row['date'])
                for s in scorers:
                    if not s in data[row['dftype']]:
                        data[row['dftype']][s] = {}
                    if not row['clf'] in data[row['dftype']][s]:
                        data[row['dftype']][s][row['clf']] = []
                    data[row['dftype']][s][row['clf']].append(str_to_num(row[s]))
    # read classifiers running time
    if timein:
        head = []
        for line in open(timein, 'r').readlines():
            if not head:
                head = line
                continue
            line = line.strip(" \r\n").split(',')
            rundata[line[0]] = float(line[1])
    return data, dates, rundata

def run(train, test, label, bits, alpha, hcols, no_out, misses, glf):
    "Run train/test program"
    print("Train model with b=%s, a=%s, cols=%s, misses=%s, host=%s" \
            % (bits, alpha, hcols, misses, socket.gethostname()))
    sys.__stdout__.flush()
    start = datetime.now()
    # find out number of dimensions
    ndim = 146 # number of features in our dataset plus 1 bias feature
    extra_dim = 1 # extra dimension for hcols interactions
    for i in xrange(len(hcols)):
        for j in xrange(i + 1, len(hcols)):
            extra_dim += 1
    if misses:
        ndim -= len(misses)
    print('Dim: %s, extra %s' % (ndim, extra_dim))
    sys.__stdout__.flush()
    # number of weights used for each model, we have 32 of them
    D = 2 ** bits
    # a list for range(0, 33) - 13, no need to learn y14 since it is always 0
    K = [k for k in range(33) if k != 13]
    # initialize our model, all 32 of them, again ignoring y14
    w = [[0.] * D if k != 13 else None for k in range(33)]
    n = [[0.] * D if k != 13 else None for k in range(33)]
    loss = 0.
    loss_y14 = log(1. - 10**-16)
    for ID, x, y in data(train, D, ndim, extra_dim, label, hcols, misses=misses):
        # get predictions and train on all labels
        for k in K:
            p = predict(x, w[k], glf)
            update(alpha, w[k], n[k], x, p, y[k])
            loss += logloss(p, y[k]) # for progressive validation
        loss += loss_y14 # the loss of y14, logloss is never zero
        # print out progress, so that we know everything is working
        if ID % 100000 == 0:
            print('%s\tencountered: %d\tcurrent logloss: %f' % (
                datetime.now(), ID, (loss / 33.) / ID))
            sys.__stdout__.flush()
    print("Final loss", (loss / 33.) / ID)
    if no_out:
        print("No output request")
        sys.__stdout__.flush()
    else:
        oname = 'b%s_a%s.csv' % (bits, alpha)
        print("Yield %s" % oname)
        sys.__stdout__.flush()
        with fopen(oname, 'w') as outfile:
            outfile.write('id_label,pred\n')
            for ID, x in data(test, D, ndim, extra_dim, hcols=hcols, misses=misses):
                predSum = 1.0
                for k in K:
                    p = predict(x, w[k], glf)
                    outfile.write('%s_y%d,%s\n' % (ID, k + 1, str(p)))
                    predSum -= p
                    if k == 12:
                        outfile.write('%s_y14,0.0\n' % ID)
                    if k == 31:
                        p = max(0.01, predSum)
                        outfile.write('%s_y33,%s\n' % (ID, str(p)))
    print('Done, elapsed time: %s' % str(datetime.now() - start))
    sys.__stdout__.flush()

def write_data(dfr, fout, comp):
    "Write dataframe as CSV into given file"
    csv = dfr.to_csv(index=False, sep=',')
    with fopen(fout, 'w') as ostream: # close the file explicitly
        ostream.write(csv)

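# Usage sketch (assumes pandas is imported as pd elsewhere in this module;
# the comp argument is accepted but not used by the function body above):
#
#   dfr = pd.DataFrame({'dataset': ['/a/b/c'], 'naccess': [10]})
#   write_data(dfr, 'out.csv', comp=None)
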
def transform(fin, fout, target, thr, drops, verbose=0, logcols='',
              logall=False, logbias=2, logthr=None, logignore=''):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    new_headers = []
    eno = 0
    logignore = logignore.split(',')
    if logthr and not logcols:
        logcols = get_log_cols(fin, logthr, logignore)
    for line in istream.readlines():
        if not headers:
            headers = line.replace('\n', '').split(',')
            # build new headers regardless of drops, the write loop
            # below relies on this list
            for idx, val in enumerate(headers):
                if (drops and val in drops) or val == target:
                    continue
                new_headers.append(val)
            ostream.write(','.join(new_headers) + ',target\n')
            continue
        try:
            item = line.replace('\n', '').replace('<nil>', '-1')
            vals = [eval(v) for v in item.split(',')]
        except Exception as exp:
            print("Unable to parse the line", line, type(line), exp)
            vals = []
            for item in line.split(','):
                try:
                    vals.append(eval(item))
                except:
                    vals.append(-1)
        if len(vals) != len(headers):
            raise Exception("Unable to parse line '%s', #values != #headers" % line)
        row = dict(zip(headers, vals))
        if thr == -1: # keep regression
            tval = row[target]
        else: # do classification
            if INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    tval = 1 if cond else 0
                except:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
        new_vals = []
        for key in new_headers:
            if (drops and key in drops) or key == target:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        if logcols or logall:
            if logall:
                logcols = new_headers[:]
            logcols = list(set(logcols) - set(logignore))
            for i in xrange(len(new_headers)):
                if new_headers[i] in logcols:
                    new_vals[i] = str(log(eval(new_vals[i]) + logbias))
        ostream.write(','.join(new_vals) + '\n')
    istream.close()
    ostream.close()

def run(fout, tframe, seed, dformat, dbsextra, newdata):
    "Write dataframe rows provided by module-level mgr into given file"
    # use the fout argument, the former opts.fout silently ignored it
    with fopen(fout, 'w') as ostream:
        for row in mgr.dataframe(tframe, seed, dformat, dbsextra, newdata):
            ostream.write(row + '\n')

def transform(fin, fout, idcol, target, thr, drops, verbose=0, logcols='',
              logall=False, logbias=2, logthr=None, logignore=''):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    new_headers = []
    eno = 0
    logignore = logignore.split(',')
    if logthr and not logcols:
        logcols = get_log_cols(fin, logthr, logignore)
    for line in istream.readlines():
        if not headers:
            headers = line.replace('\n', '').split(',')
            # build new headers regardless of drops, the write loop
            # below relies on this list
            for idx, val in enumerate(headers):
                if (drops and val in drops) or val == target or val == idcol:
                    continue
                new_headers.append(val)
            ostream.write('id,' + ','.join(new_headers) + ',target\n')
            continue
        try:
            item = line.replace('\n', '').replace('<nil>', '-1')
            vals = [eval(v) for v in item.split(',')]
        except Exception as exp:
            print("Unable to parse the line", line, type(line), exp)
            vals = []
            for item in line.split(','):
                try:
                    vals.append(eval(item))
                except:
                    vals.append(-1)
        if len(vals) != len(headers):
            raise Exception("Unable to parse line '%s', #values != #headers" % line)
        row = dict(zip(headers, vals))
        if thr == -1: # keep regression
            tval = row[target]
        else: # do classification
            if INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    tval = 1 if cond else 0
                except:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
        if idcol in row.keys():
            new_vals = [str(int(row[idcol]))]
        else:
            new_vals = [getuid()]
        for key in new_headers:
            if (drops and key in drops) or key == target or key == idcol:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        if logcols or logall:
            if logall:
                logcols = new_headers[:]
            logcols = list(set(logcols) - set(logignore))
            for i in xrange(len(new_headers)):
                if new_headers[i] in logcols:
                    # new_vals[0] is the id column, so header i maps to
                    # value i+1; indexing by i would transform the wrong
                    # columns (including the id itself)
                    new_vals[i + 1] = str(log(eval(new_vals[i + 1]) + logbias))
        ostream.write(','.join(new_vals) + '\n')
    istream.close()
    ostream.close()

def verify_prediction(pred, popdb, oformat, verbose, target, thr):
    "Verify prediction file against popdb one"
    pdict = read_popdb(popdb)
    total = 0
    popular = 0
    totpop = 0
    tpos = 0
    tneg = 0
    fpos = 0
    fneg = 0
    tp_list = []
    tn_list = []
    fp_list = []
    fn_list = []
    def is_popular(dataset, pdict):
        return (target is None or float(pdict[dataset][target]) > thr)
    for line in fopen(pred, 'r').readlines():
        prob, dataset = line.replace('\n', '').split(',')
        total += 1
        if float(prob) > 0:
            popular += 1
        if dataset in pdict and is_popular(dataset, pdict):
            totpop += 1
            if float(prob) > 0:
                tpos += 1
                tp_list.append(dataset)
                if verbose > 1:
                    print('TP, prob=%s dataset=%s' % (prob, dataset))
            else:
                fneg += 1
                fn_list.append(dataset)
                if verbose > 1:
                    print('FN, prob=%s dataset=%s' % (prob, dataset))
        else:
            if float(prob) > 0:
                fpos += 1
                fp_list.append(dataset)
                if verbose > 1:
                    print('FP, prob=%s dataset=%s' % (prob, dataset))
            else:
                tneg += 1
                tn_list.append(dataset)
                if verbose > 1:
                    print('TN, prob=%s dataset=%s' % (prob, dataset))
    accuracy, precision, recall, f1score = metrics(tpos, tneg, fpos, fneg)
    print("# datasets in popdb sample :", len(pdict.keys()))
    print("# datasets we predict :", total)
    print("# datasets in popular set :", totpop)
    print("Predicted as popular :", popular)
    print()
    def perc(vvv):
        return '%s' % round(vvv * 100. / total, 1)
    def space(vvv):
        return '%s%s' % (vvv, ' ' * (len(str(total)) - len(str(vvv))))
    tptiers = datasets2tiers(tp_list)
    tntiers = datasets2tiers(tn_list)
    fptiers = datasets2tiers(fp_list)
    fntiers = datasets2tiers(fn_list)
    alltiers = set(tptiers.keys() + tntiers.keys() + fptiers.keys() + fntiers.keys())
    if oformat == 'csv':
        out = 'cls,tier,tp,tn,fp,fn\n' # header matches the 6-field rows below
        out += '%s,ALL,%s,%s,%s,%s\n' \
                % (pred.split('.')[0], perc(tpos), perc(tneg), perc(fpos), perc(fneg))
        for tier in sorted(alltiers):
            tp, tn, fp, fn = percentage(tptiers.get(tier, 0), tntiers.get(tier, 0),
                                        fptiers.get(tier, 0), fntiers.get(tier, 0))
            out += '%s,%s,%s,%s,%s,%s\n' \
                    % (pred.split('.')[0], tier, tp, tn, fp, fn)
        print(out)
    else:
        if verbose:
            print("True positive : %s, %s%%" % (space(tpos), perc(tpos)))
            print("True negative : %s, %s%%" % (space(tneg), perc(tneg)))
            print("False positive : %s, %s%%" % (space(fpos), perc(fpos)))
            print("False negative : %s, %s%%" % (space(fneg), perc(fneg)))
            print()
            classify_all(tp_list, tn_list, fp_list, fn_list)
        width = max([len(t) for t in alltiers])
        msg = 'ALL tiers'
        pad = ' ' * (width - len(msg))
        tp, tn, fp, fn = percentage(tpos, tneg, fpos, fneg)
        print("%s %s %6.2f %6.2f %6.2f %6.2f\n" % (msg, pad, tp, tn, fp, fn))
        if verbose:
            print("Classification of TP sample")
            classify(tp_list)
            print("Classification of TN sample")
            classify(tn_list)
            print("Classification of FP sample")
            classify(fp_list)
            print("Classification of FN sample")
            classify(fn_list)
    print("Accuracy :", accuracy)
    print("Precision :", precision)
    print("Recall :", recall)
    print("F1-score :", f1score)
