Example #1
def merger(fin, fout, verbose=False):
    "Merger function"
    filelist = []
    if  fin.find(',') != -1: # list of files
        filelist = fin.split(',')
    elif fin.find('*') != -1: # pattern
        filelist = glob.glob(fin)
    elif os.path.isdir(fin): # we got directory name
        for ext in ['.csv.gz', '.csv', '.csv.bz2']:
            filelist = [f for f in files(fin, ext)]
            if  len(filelist):
                break
    if  not filelist:
        print("ERROR; unable to create filelist from %s" % fin)
        sys.exit(1)

    headers = None
    with fopen(fout, 'wb') as ostream:
        for fname in filelist:
            if  verbose:
                print("Read", fname)
            with fopen(fname, 'r') as istream:
                while True:
                    line = istream.readline()
                    if  not line:
                        break
                    if  not headers:
                        headers = line
                        ostream.write(headers)
                        continue
                    if  line == headers:
                        continue
                    ostream.write(line)
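Note: nearly every example here relies on helpers such as fopen and files that are not part of the snippets. A minimal sketch of what they plausibly do, assuming fopen picks a decompressor by file extension and files yields matching entries from a directory:

import os
import gzip
import bz2

def fopen(fname, mode='r'):
    "Open a file transparently, choosing gzip/bz2 by extension (sketch)"
    if fname.endswith('.gz'):
        return gzip.open(fname, mode)
    if fname.endswith('.bz2'):
        return bz2.BZ2File(fname, mode)
    return open(fname, mode)

def files(idir, ext):
    "Yield full paths of files in idir whose names end with ext (sketch)"
    for entry in os.listdir(idir):
        if entry.endswith(ext):
            yield os.path.join(idir, entry)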
Example #2
def select(pred, data, ids):
    "Select prediction and data based on provided ids"
    sids = [i.replace('\n','').split(',') for i in fopen(ids).readlines() if not i.startswith('dataset')]
    ipred = fopen(pred, 'r')
    idata = fopen(data, 'r')
    pheaders = []
    dheaders = []
    didx = 0
    with fopen('new_pred.txt', 'wb') as opred:
        with fopen('new_data.txt', 'wb') as odata:
            while True:
                pline = ipred.readline().replace('\n', '').split(',')
                dline = idata.readline().replace('\n', '').split(',')
                if  not dheaders:
                    pheaders = pline
                    dheaders = dline
                    didx = dheaders.index('dataset')
                    ddbs = dheaders.index('dbs')
                    dpre = dheaders.index('target')
                    opred.write(','.join(pheaders)+'\n')
                    odata.write(','.join(dheaders)+'\n')
                    continue
                if  len(pline) == 1:
                    break
                dataset, dbs, prediction = pline
                if  [dataset, dbs] in sids and \
                    [dline[didx], dline[ddbs]] in sids and dline[dpre]=='1':
                    opred.write(','.join(pline)+'\n')
                    odata.write(','.join(dline)+'\n')
    ipred.close()
    idata.close()
Example #3
def select(pred, data, ids):
    "Select prediction and data based on provided ids"
    sids = [
        i.replace('\n', '').split(',') for i in fopen(ids).readlines()
        if not i.startswith('dataset')
    ]
    ipred = fopen(pred, 'r')
    idata = fopen(data, 'r')
    pheaders = []
    dheaders = []
    didx = 0
    with fopen('new_pred.txt', 'wb') as opred:
        with fopen('new_data.txt', 'wb') as odata:
            while True:
                pline = ipred.readline().replace('\n', '').split(',')
                dline = idata.readline().replace('\n', '').split(',')
                if not dheaders:
                    pheaders = pline
                    dheaders = dline
                    didx = dheaders.index('dataset')
                    ddbs = dheaders.index('dbs')
                    dpre = dheaders.index('target')
                    opred.write(','.join(pheaders) + '\n')
                    odata.write(','.join(dheaders) + '\n')
                    continue
                if len(pline) == 1:
                    break
                dataset, dbs, prediction = pline
                if  [dataset, dbs] in sids and \
                    [dline[didx], dline[ddbs]] in sids and dline[dpre]=='1':
                    opred.write(','.join(pline) + '\n')
                    odata.write(','.join(dline) + '\n')
    ipred.close()
    idata.close()
Example #4
def calc_logloss(fpred, fobs):
    "Calculate logloss value for input prediction/actual value files"
    obs = [float(r.replace('\n','').split(',')[-1]) for r in fopen(fobs) if not
            r.lower().startswith('id')]
    pred = [int(r.replace('\n','').split(',')[-1]) for r in fopen(fpred) if not
            r.lower().startswith('id')]
    print("obs", obs[:5])
    print("pred", pred[:5])
    print("LogLoss", logloss(obs, pred))
Example #5
def mknfold(fin, match=1, nfold=5):
    "Split input file into few"
    random.seed(123)
    with fopen(fin, 'r') as istream:
        with fopen(fin + '.train', 'w') as otrain:
            with fopen(fin + '.test', 'w') as otest:
                for line in istream:
                    if random.randint(1, nfold) == match:
                        otest.write(line)
                    else:
                        otrain.write(line)
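A hypothetical invocation, assuming a plain CSV file data.csv; with nfold=5 roughly one fifth of the lines land in the .test file and the rest in .train:

mknfold('data.csv', match=1, nfold=5)
# -> writes data.csv.train (~4/5 of lines) and data.csv.test (~1/5)

Note that the header line is routed randomly like any other line, so downstream code must tolerate a missing header in either split.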
Example #6
def mknfold(fin, match=1, nfold=5):
    "Split input file into few"
    random.seed(123)
    with fopen(fin, 'r') as istream:
        with fopen(fin+'.train', 'w') as otrain:
            with fopen(fin+'.test', 'w') as otest:
                for line in istream:
                    if  random.randint(1, nfold) == match:
                        otest.write(line)
                    else:
                        otrain.write(line)
Example #7
def find_drops(file1, file2, idrops=None):
    "Find difference in headers of two files and print it out"
    with fopen(file1) as istream1, fopen(file2) as istream2:
        headers1 = istream1.readline().replace('\n', '').split(',')
        headers2 = istream2.readline().replace('\n', '').split(',')
        drops = set(headers1) - set(headers2)
        drops = drops.union(set(headers2) - set(headers1))
        if idrops:
            print(','.join(set(list(drops) + idrops.split(','))))
        else:
            print(','.join(drops))
Example #8
def find_drops(file1, file2, idrops=None):
    "Find difference in headers of two files and print it out"
    with fopen(file1) as istream1, fopen(file2) as istream2:
        headers1 = istream1.readline().replace('\n', '').split(',')
        headers2 = istream2.readline().replace('\n', '').split(',')
        drops = set(headers1)-set(headers2)
        drops = drops.union(set(headers2)-set(headers1))
        if  idrops:
            print(','.join(set(list(drops)+idrops.split(','))))
        else:
            print(','.join(drops))
Example #9
def data(path, D, ndim, extra_dim, label_path=None, hcols=[], misses=[]):
    for t, line in enumerate(fopen(path)):
        # initialize our generator
        if t == 0:
            # create a static x,
            # so we don't have to construct a new x for every instance
            x = [0] * (ndim + extra_dim)
            if label_path:
                label = fopen(label_path)
                label.readline()  # we don't need the headers
            continue
        # parse x
        counter = 0
        for m, feat in enumerate(line.rstrip().split(',')):
            if m in misses:
                continue
            if m == 0:
                try:
                    ID = int(feat)
                except ValueError:
                    continue
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note, the built-in hash(), although fast, is not stable,
                #       i.e., the same value won't always have the same hash
                #       on different machines
                if misses:
                    idx = counter
                else:
                    idx = m
                x[idx] = abs(hash(str(m) + '_' + str(feat))) % D
            counter += 1
        row = line.rstrip().split(',')
        tidx = ndim
        for i in xrange(len(hcols)):
            for j in xrange(i + 1, len(hcols)):
                tidx += 1
                try:
                    val = str(tidx) + '_' + row[hcols[i]] + "_" + row[hcols[j]]
                    x[tidx] = abs(hash(val)) % D
                except Exception as exc:
                    print("tidx=%s, i=%s, j=%s" % (tidx, i, j))
                    print(str(exc))
                    raise

        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
Example #10
def data(path, D, ndim, extra_dim, label_path=None, hcols=[], misses=[]):
    for t, line in enumerate(fopen(path)):
        # initialize our generator
        if t == 0:
            # create a static x,
            # so we don't have to construct a new x for every instance
            x = [0] * (ndim + extra_dim)
            if label_path:
                label = fopen(label_path)
                label.readline()  # we don't need the headers
            continue
        # parse x
        counter = 0
        for m, feat in enumerate(line.rstrip().split(',')):
            if  m in misses:
                continue
            if  m == 0:
                try:
                    ID = int(feat)
                except ValueError:
                    continue
            else:
                # one-hot encode everything with hash trick
                # categorical: one-hotted
                # boolean: ONE-HOTTED
                # numerical: ONE-HOTTED!
                # note, the built-in hash(), although fast, is not stable,
                #       i.e., the same value won't always have the same hash
                #       on different machines
                if  misses:
                    idx = counter
                else:
                    idx = m
                x[idx] = abs(hash(str(m) + '_' + str(feat))) % D
            counter += 1
        row = line.rstrip().split(',')
        tidx = ndim
        for i in xrange(len(hcols)):
            for j in xrange(i+1, len(hcols)):
                tidx += 1
                try:
                    val = str(tidx)+'_'+row[hcols[i]]+"_"+row[hcols[j]]
                    x[tidx] = abs(hash(val)) % D
                except Exception as exc:
                    print("tidx=%s, i=%s, j=%s" % (tidx, i, j))
                    print(str(exc))
                    raise

        # parse y, if provided
        if label_path:
            # use float() to prevent future type casting, [1:] to ignore id
            y = [float(y) for y in label.readline().split(',')[1:]]
        yield (ID, x, y) if label_path else (ID, x)
Example #11
def read_popdb(popdb):
    "Read popdb data"
    headers = []
    pdict = {}
    with fopen(popdb, 'r') as istream:
        while True:
            if  not headers:
                for row in istream.readline().replace('\n', '').split(','):
                    if  row == 'COLLNAME':
                        headers.append('dataset')
                    elif row == 'NACC':
                        headers.append('naccess')
                    elif row == 'RNACC':
                        headers.append('rnaccess')
                    else:
                        headers.append(row.lower())
                continue
            vals = istream.readline().replace('\n', '').split(',')
            if  len(vals) < 2:
                break
            row = dict(zip(headers, vals))
            try:
                dataset = row.pop('dataset')
                pdict[dataset] = row
            except KeyError:
                pass
    return pdict
Example #12
def line_offsets(fname):
    """Read in the file once and return a list of line offsets"""
    line_offset = []
    offset = 0
    for _, line in enumerate( fopen(fname) ):
        line_offset.append(offset)
        offset += len(line)
    return line_offset
Example #13
def line_offsets(fname):
    """Read in the file once and return a list of line offsets"""
    line_offset = []
    offset = 0
    for _, line in enumerate(fopen(fname)):
        line_offset.append(offset)
        offset += len(line)
    return line_offset
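A hypothetical usage, assuming an uncompressed file so that the byte offsets accumulated from len(line) agree with seek positions:

offsets = line_offsets('data.csv')
with open('data.csv') as stream:
    stream.seek(offsets[100])  # jump straight to line 100 (0-based)
    print(stream.readline())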
Example #14
def convert(fin, fout, datasets, sep=','):
    """
    Convert input prediction file (id,prediction) into (dataset,prediction)
    by using datasets file
    """
    headers = None
    df = pd.read_csv(datasets)
    with fopen(fin, 'r') as istream, fopen(fout, 'w') as ostream:
        for line in istream.readlines():
            did, dbs, pred = line.replace('\n', '').split(sep)
            if not headers:
                headers = '%s,%s,%s' % (did, dbs, pred)
                continue
            res = df[(df.hash == int(did)) & (df.dbsinst == int(dbs))]
            if not res.empty:
                dataset = res.get_value(res.index[0], 'dataset')
                ostream.write("%5.3f%s%s\n" % (float(pred), sep, dataset))
Example #15
def convert(fin, fout, datasets, sep=','):
    """
    Convert input prediction file (id,prediction) into (dataset,prediction)
    by using datasets file
    """
    headers = None
    df = pd.read_csv(datasets)
    with fopen(fin, 'r') as istream, fopen(fout, 'w') as ostream:
        for line in istream.readlines():
            did, dbs, pred = line.replace('\n', '').split(sep)
            if  not headers:
                headers = '%s,%s,%s' % (did, dbs, pred)
                continue
            res = df[(df.hash==int(did)) & (df.dbsinst==int(dbs))]
            if  not res.empty:
                dataset = res.get_value(res.index[0], 'dataset')
                ostream.write("%5.3f%s%s\n" % (float(pred), sep, dataset))
Example #16
def new_datasets(fin, fnew, fout):
    "Find out which datasets were new in provided train/valid files"
    train_ids = dataset_ids(fin)
    valid_ids = dataset_ids(fnew)
    ids = valid_ids - train_ids
    with fopen(fout, 'wb') as ostream:
        ostream.write('dataset,dbs\n')
        for pair in ids:
            ostream.write(','.join(pair) + '\n')
Example #17
def new_datasets(fin, fnew, fout):
    "Find out which datasets were new in provided train/valid files"
    train_ids = dataset_ids(fin)
    valid_ids = dataset_ids(fnew)
    ids = valid_ids - train_ids
    with fopen(fout, 'wb') as ostream:
        ostream.write('dataset,dbs\n')
        for pair in ids:
            ostream.write(','.join(pair)+'\n')
Example #18
def convert(fin, fout, uri, sep=','):
    """
    Convert input prediction file (id,prediction) into (dataset,prediction)
    by using DCAFPilot cache
    """
    client = MongoClient(uri)
    mgr = client['analytics']['datasets']
    headers = None
    with fopen(fin, 'r') as istream:
        with fopen(fout, 'w') as ostream:
            for line in istream.readlines():
                did, dbs, pred = line.replace('\n', '').split(sep)
                if  not headers:
                    headers = '%s,%s,%s' % (did, dbs, pred)
                    continue
                spec = {'dataset_id':int(did), 'dbs_instance':int(dbs)}
                res = mgr.find_one(spec)
                if  res:
                    ostream.write("%5.3f%s%s\n" % (float(pred), sep, res['dataset']))
Example #19
def transform(fin, fout, target, thr, drops, verbose=0):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    for line in istream.readlines():
        if  not headers:
            headers = line.replace('\n', '').split(',')
            new_headers = []  # always defined, even when drops is empty
            for idx, val in enumerate(headers):
                if  (drops and val in drops) or val == target:
                    continue
                new_headers.append(val)
            ostream.write(','.join(new_headers)+',target\n')
            continue
        vals = [eval(v) for v in line.replace('\n', '').split(',')]
        row = dict(zip(headers, vals))
        if  thr==-1: # keep regression
            tval = row[target]
        else: # do classification
            if  INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target])>float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    if  cond:
                        tval = 1
                    else:
                        tval = 0
                except:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
        new_vals = []
        for key in new_headers:
            if  (drops and key in drops) or key == target:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        ostream.write(','.join(new_vals)+'\n')
    istream.close()
    ostream.close()
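The eval(v) calls above execute arbitrary expressions taken from the CSV. A safer drop-in, assuming the cells hold plain Python literals (numbers, quoted strings), would be ast.literal_eval:

import ast

def parse_value(val):
    "Parse a CSV cell as a Python literal, falling back to the raw string (sketch)"
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val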
Example #20
def merger(fin, fout, verbose=False):
    "Merger function"
    filelist = []
    if fin.find(',') != -1:  # list of files
        filelist = fin.split(',')
    elif fin.find('*') != -1:  # pattern
        filelist = glob.glob(fin)
    elif os.path.isdir(fin):  # we got directory name
        for ext in ['.csv.gz', '.csv', '.csv.bz2']:
            filelist = [f for f in files(fin, ext)]
            if len(filelist):
                break
    elif os.path.isfile(fin):  # we got file name
        filelist = [fin]
    if not filelist:
        print("ERROR; unable to create filelist from %s" % fin)
        sys.exit(1)
    # sort all files
    filelist.sort()

    headers = find_headers(filelist)

    with fopen(fout, 'wb') as ostream:
        ostream.write(','.join(headers) + '\n')
        for fname in filelist:
            if verbose:
                print("Read", fname)
            with fopen(fname, 'r') as istream:
                keys = istream.readline().replace('\n', '').split(',')  # headers
                while True:
                    line = istream.readline()
                    if not line:
                        break
                    vals = line.replace('\n', '').split(',')
                    row = dict(zip(keys, vals))
                    srow = ','.join([str(row.get(k, 0)) for k in headers])
                    ostream.write(srow + '\n')
Example #21
def find_headers(files):
    "Scan all files and extract full set of attributes"
    headers = []
    for fname in files:
        with fopen(fname, 'r') as istream:
            line = istream.readline()
            fheaders = line.replace('\n','').split(',')
            if  not headers:
                headers = fheaders
            if  headers != fheaders: # take a union of two sets
                headers = list(set(headers) | set(fheaders))
    if  'id' in headers:
        headers.remove('id')
    return ['id'] + sorted(headers)
Example #22
def find_headers(files):
    "Scan all files and extract full set of attributes"
    headers = []
    for fname in files:
        with fopen(fname, 'r') as istream:
            line = istream.readline()
            fheaders = line.replace('\n', '').split(',')
            if not headers:
                headers = fheaders
            if headers != fheaders:  # take a union of two sets
                headers = list(set(headers) | set(fheaders))
    if 'id' in headers:
        headers.remove('id')
    return ['id'] + sorted(headers)
Example #23
def merger(fin, fout, verbose=False):
    "Merger function"
    filelist = []
    if  fin.find(',') != -1: # list of files
        filelist = fin.split(',')
    elif fin.find('*') != -1: # pattern
        filelist = glob.glob(fin)
    elif os.path.isdir(fin): # we got directory name
        for ext in ['.csv.gz', '.csv', '.csv.bz2']:
            filelist = [f for f in files(fin, ext)]
            if  len(filelist):
                break
    elif os.path.isfile(fin): # we got file name
        filelist = [fin]
    if  not filelist:
        print("ERROR; unable to create filelist from %s" % fin)
        sys.exit(1)
    # sort all files
    filelist.sort()

    headers = find_headers(filelist)

    with fopen(fout, 'wb') as ostream:
        ostream.write(','.join(headers)+'\n')
        for fname in filelist:
            if  verbose:
                print("Read", fname)
            with fopen(fname, 'r') as istream:
                keys = istream.readline().replace('\n', '').split(',') # headers
                while True:
                    line = istream.readline()
                    if  not line:
                        break
                    vals = line.replace('\n', '').split(',')
                    row = dict(zip(keys, vals))
                    srow = ','.join([str(row.get(k, 0)) for k in headers])
                    ostream.write(srow+'\n')
Example #24
def run(fin, fout, split=30, seed=0):
    "Read input file and write train/validation files"
    if seed:
        random.seed(seed)
    base, ext = fout.rsplit('.', 1)
    ftest = '%s_valid.%s' % (base, ext)
    offsets = line_offsets(fin)
    nlines = len(offsets)
    indices = list(range(1, nlines))  # skip the header line at index 0
    random.shuffle(indices)
    with fopen(fin, 'r') as istream, fopen(fout, 'wb') as ostream, \
        fopen(ftest, 'wb') as tstream:
        headers = istream.readline()
        ostream.write(headers)
        tstream.write(headers)
        count = 0
        for idx in indices:
            istream.seek(offsets[idx])
            line = istream.readline()
            if count > (nlines - round(nlines * split / 100.)):
                tstream.write(line)
            else:
                ostream.write(line)
            count += 1
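A hypothetical call, assuming train.csv starts with a header line; it writes out.csv with roughly 70% of the data rows and out_valid.csv with the remaining 30%:

run('train.csv', 'out.csv', split=30, seed=123)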
Example #25
def run(fin, fout, split=30, seed=0):
    "Read input file and write train/validation files"
    if  seed:
        random.seed(seed)
    base, ext = fout.rsplit('.', 1)
    ftest = '%s_valid.%s' % (base, ext)
    offsets = line_offsets(fin)
    nlines = len(offsets)
    indices = list(range(1, nlines))  # skip the header line at index 0
    random.shuffle(indices)
    with fopen(fin, 'r') as istream, fopen(fout, 'wb') as ostream, \
        fopen(ftest, 'wb') as tstream:
        headers = istream.readline()
        ostream.write(headers)
        tstream.write(headers)
        count = 0
        for idx in indices:
            istream.seek(offsets[idx])
            line = istream.readline()
            if  count > (nlines-round(nlines*split/100.)):
                tstream.write(line)
            else:
                ostream.write(line)
            count += 1
Example #26
def loader(ifile, target, sep=',', threshold=None):
    "Load prediction from given file"
    headers = None
    tidx  = None
    arr   = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').replace('\r', '').split(sep)
            if  not headers:
                headers = row
                tidx = headers.index(target)
                continue
            if  len(row) < 2:
                break
            # changed since the previous approach did not recognize scientific notation
            val  = str_to_num(row[tidx])
            if  threshold:
                val = 1 if val >= threshold else 0
            arr.append(val)
    return arr
Example #27
def loader(ifile, target, sep=',', threshold=None):
    "Load prediction from given file"
    headers = None
    tidx  = None
    arr   = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').replace('\r', '').split(sep)
            if  not headers:
                headers = row
                tidx = headers.index(target)
                continue
#             if  len(row) < 2:
            if  not row or (isinstance(row, list) and len(row)==1 and not row[0]):
                break
            # changed since the previous approach did not recognize scientific notation
            val  = str_to_num(row[tidx])
            if  threshold:
                val = 1 if val >= threshold else 0
            arr.append(val)
    return arr
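str_to_num is not shown; a minimal sketch that, as the in-code comment suggests, accepts scientific notation because float() parses it natively:

def str_to_num(val):
    "Convert a string to int when possible, else float (handles '1e-3' etc.)"
    try:
        return int(val)
    except ValueError:
        return float(val)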
Example #28
def dataset_ids(fin):
    "Return dataset ids from given file"
    ids = set()
    headers = []
    didx = None
    tidx = None
    dbsidx = None
    with fopen(fin, 'r') as istream:
        while True:
            if  not headers:
                headers = istream.readline().replace('\n', '').split(',')
                didx = headers.index('dataset')
                tidx = headers.index('target')
                dbsidx = headers.index('dbs')
                continue
            row = istream.readline().replace('\n', '').split(',')
            if  len(row) < 2:
                break
            if  float(row[tidx]) > 0:
                ids.add((row[didx],row[dbsidx]))
    return set(ids)
Example #29
def dataset_ids(fin):
    "Return dataset ids from given file"
    ids = set()
    headers = []
    didx = None
    tidx = None
    dbsidx = None
    with fopen(fin, 'r') as istream:
        while True:
            if not headers:
                headers = istream.readline().replace('\n', '').split(',')
                didx = headers.index('dataset')
                tidx = headers.index('target')
                dbsidx = headers.index('dbs')
                continue
            row = istream.readline().replace('\n', '').split(',')
            if len(row) < 2:
                break
            if float(row[tidx]) > 0:
                ids.add((row[didx], row[dbsidx]))
    return set(ids)
Example #30
def loader(ifile, target, sep=','):
    "Load prediction from given file"
    headers = None
    tidx = None
    arr = []
    with fopen(ifile, 'r') as istream:
        while True:
            row = istream.readline().replace('\n', '').split(sep)
            if  not headers:
                headers = row
                tidx = headers.index(target)
                continue
            if  len(row) < 2:
                break
            val = row[tidx]
            if  INT_PAT.match(val):
                val = int(val)
            elif FLT_PAT.match(val):
                val = float(val)
            else:
                raise Exception("Parsed value '%s' has unknown data-type" % val)
            arr.append(val)
    return arr
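INT_PAT and FLT_PAT are assumed to be module-level regular expressions; plausible definitions are below (note they reject scientific notation, which is precisely what motivated the str_to_num variant of the loader above):

import re

INT_PAT = re.compile(r'^-?\d+$')
FLT_PAT = re.compile(r'^-?\d+\.\d*$')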
Example #31
def verify_csv(fout, verbose):
    err = None
    try:
        with fopen(fout, 'r') as istream:
            nc = len(istream.readline().split(','))
            while err is None:
                line = istream.readline()
                if  not line:
                    break
                if  len(line.split(',')) != nc:
                    err = 'Number of headers differs from number of data in rows.'
                    break
    except IOError:
        err  = 'IOError raised while reading the file.'
    except Exception as ex:
        err  = 'Exception raised while reading: '
        err += str(ex) + '. Arguments: '
        err += str(ex.args) + '.'
    if  err is not None:
        err  = ("Verification of file %s failed. " % fout) + err
        print(err)
    else:
        if  verbose:
            print("Verification of %s passed successfully." % fout)
Example #32
def verify_csv(fout, verbose):
    err = None
    try:
        with fopen(fout, 'r') as istream:
            nc = len(istream.readline().split(','))
            while err is None:
                line = istream.readline()
                if not line:
                    break
                if len(line.split(',')) != nc:
                    err = 'Number of headers differs from number of data in rows.'
                    break
    except IOError:
        err = 'IOError raised while reading the file.'
    except Exception as ex:
        err = 'Exception raised while reading: '
        err += str(ex) + '. Arguments: '
        err += str(ex.args) + '.'
    if err is not None:
        err = ("Verification of file %s failed. " % fout) + err
        print(err)
    else:
        if verbose:
            print("Verification of %s passed successfully." % fout)
Example #33
def read_data(fin, fbeg, fend, timein):
    "Selects files and picks structured data for plotting"
    if  fin == ".":
        files = [f for f in os.listdir('.') \
                    if (len(fbeg) == 0 or f.startswith(fbeg))
                    and (len(fend) == 0 or f.endswith(fend))]
    elif os.path.isdir(fin):
        files = [os.path.join(fin, f) for f in os.listdir(fin)  \
                    if (len(fbeg) == 0 or f.startswith(fbeg))
                    and (len(fend) == 0 or f.endswith(fend))]        
    elif os.path.isfile(fin):
        # if a single file is passed, its name is not checked against fbeg/fend
        files = [fin]
    else:
        print("Cannot produce file list, not a file or directory: \"%s\"" % fin)
        raise SystemExit
    ### data structure:
    #   { dftypes: { scorers: { classifiers: [ ordered weekly results for particular dftype, scorer and classifier ] } } }
    # dftype is dataframe type: old or new
    # dates variable stores ordered dates with an idea that dates in csv
    #   are stored respectfully in same order for all dftype and
    #   classifier combinations
    data    = {}
    dates   = {}
    scorers = []
    headers = []
    rundata = {}
    # read classifiers scores
    for f in files:
        first = True
        for line in fopen(f).readlines():
            vals = line.strip(" \n\r").split(',')
            if  first:
                first = False
                if  not headers:
                    headers = vals
                    if      headers[0] != 'dftype'  \
                        or  headers[1] != 'clf'     \
                        or  headers[2] != 'date'    \
                        or  len(headers) < 4:
                        print("Error: check structure of file %s, headers should be: dftype,clf,date" % f)
                        sys.exit(1)
                    scorers = headers[3:]
                elif headers != vals:
                    print("Original headers: %s" % headers)
                    print("Headers in %s: %s" % (f, vals))
                    sys.exit(1)
            else:
                row = dict(zip(headers, vals))
                if  not row['dftype'] in data:
                    data[row['dftype']] = {}
                if  not row['dftype'] in dates:
                    dates[row['dftype']] = []
                if  not row['date'] in dates[row['dftype']]:
                    dates[row['dftype']].append(row['date'])
                for s in scorers:
                    if  not s in data[row['dftype']]:
                        data[row['dftype']][s] = {}
                    if  not row['clf'] in data[row['dftype']][s]:
                        data[row['dftype']][s][row['clf']] = []
                    data[row['dftype']][s][row['clf']].append(str_to_num(row[s]))
    # read classifiers running time
    if  timein:
        head = []
        for line in open(timein, 'r').readlines():
            if  not head:
                head = line
                continue
            line = line.strip(" \r\n").split(',')
            rundata[line[0]] = float(line[1])
    return data, dates, rundata
Example #34
def run(train, test, label, bits, alpha, hcols, no_out, misses, glf):
    "Run train/test program"
    print("Train model with b=%s, a=%s, cols=%s, misses=%s, host=%s"\
            % (bits, alpha, hcols, misses, socket.gethostname()))
    sys.__stdout__.flush()
    start = datetime.now()

    # find out number of dimensions
    ndim = 146  # number of features in our dataset plus 1 bias feature
    extra_dim = 1  # extra dimension for hcols interactions
    for i in xrange(len(hcols)):
        for j in xrange(i + 1, len(hcols)):
            extra_dim += 1
    if misses:
        ndim -= len(misses)
    print('Dim: %s, extra %s' % (ndim, extra_dim))
    sys.__stdout__.flush()

    # number of weights used for each model; we have 32 of them
    D = 2**bits

    # a list for range(0, 33) - 13, no need to learn y14 since it is always 0
    K = [k for k in range(33) if k != 13]

    # initialize our model, all 32 of them, again ignoring y14
    w = [[0.] * D if k != 13 else None for k in range(33)]
    n = [[0.] * D if k != 13 else None for k in range(33)]

    loss = 0.
    loss_y14 = log(1. - 10**-16)

    for ID, x, y in data(train,
                         D,
                         ndim,
                         extra_dim,
                         label,
                         hcols,
                         misses=misses):
        # get predictions and train on all labels
        for k in K:
            p = predict(x, w[k], glf)
            update(alpha, w[k], n[k], x, p, y[k])
            loss += logloss(p, y[k])  # for progressive validation
        loss += loss_y14  # the loss of y14, logloss is never zero
        # print out progress, so that we know everything is working
        if ID % 100000 == 0:
            print('%s\tencountered: %d\tcurrent logloss: %f' %
                  (datetime.now(), ID, (loss / 33.) / ID))
            sys.__stdout__.flush()
    print("Final loss", (loss / 33.) / ID)

    if no_out:
        print("No output request")
        sys.__stdout__.flush()
    else:
        oname = 'b%s_a%s.csv' % (bits, alpha)
        print("Yield %s" % oname)
        sys.__stdout__.flush()
        with fopen(oname, 'w') as outfile:
            outfile.write('id_label,pred\n')
            for ID, x in data(test,
                              D,
                              ndim,
                              extra_dim,
                              hcols=hcols,
                              misses=misses):
                predSum = 1.0
                for k in K:
                    p = predict(x, w[k], glf)
                    outfile.write('%s_y%d,%s\n' % (ID, k + 1, str(p)))
                    predSum -= p
                    if k == 12:
                        outfile.write('%s_y14,0.0\n' % ID)
                    if k == 31:
                        p = max(0.01, predSum)
                        outfile.write('%s_y33,%s\n' % (ID, str(p)))

    print('Done, elapsed time: %s' % str(datetime.now() - start))
    sys.__stdout__.flush()
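predict and update are not shown. The example closely follows the classic hashed logistic-regression recipe, so a plausible sketch uses a bounded sigmoid and a per-coordinate adaptive learning rate; everything below is an assumption, not the project's confirmed implementation (the glf flag, perhaps a generalized link function, is ignored here):

from math import exp, sqrt

def predict(x, w, glf=False):
    "Sigmoid of the sum of weights at the hashed feature indices in x (sketch)"
    wtx = sum(w[idx] for idx in x)
    wtx = max(min(wtx, 35.), -35.)  # bound the raw score to avoid overflow in exp()
    return 1. / (1. + exp(-wtx))

def update(alpha, w, n, x, p, y):
    "Per-coordinate adaptive gradient update for logistic loss (sketch)"
    g = p - y  # gradient of log loss with respect to the raw score
    for idx in x:
        n[idx] += g * g
        w[idx] -= g * alpha / (sqrt(n[idx]) + 1.)

Here logloss(p, y) would be the scalar per-sample loss, unlike the list version sketched after Example #4.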
Example #35
def write_data(dfr, fout, comp):
    "Write given dataframe as CSV into fout; comp (compression) is unused here"
    csv = dfr.to_csv(index=False, sep=',')
    fopen(fout, 'w').write(csv)
Example #36
def transform(fin, fout, target, thr, drops, verbose=0, logcols='', logall=False, logbias=2, logthr=None, logignore=''):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    eno     = 0
    logignore = logignore.split(',')
    if  logthr and not logcols:
        logcols = get_log_cols(fin, logthr, logignore)
    for line in istream.readlines():
        if  not headers:
            headers = line.replace('\n', '').split(',')
            new_headers = []  # always defined, even when drops is empty
            for idx, val in enumerate(headers):
                if  (drops and val in drops) or val == target:
                    continue
                new_headers.append(val)
            ostream.write(','.join(new_headers)+',target\n')
            continue
        try:
            item = line.replace('\n', '').replace('<nil>', '-1')
            vals = [eval(v) for v in item.split(',')]
        except Exception as exp:
            print("Unable to parse the line", line, type(line), exp)
            vals = []
            for item in line.split(','):
                try:
                    vals.append(eval(item))
                except:
                    vals.append(-1)
            if  len(vals) != len(headers):
                raise Exception("Unable to parse line '%s', #values != #headers" % line)
        row = dict(zip(headers, vals))
        if  thr==-1: # keep regression
            tval = row[target]
        else: # do classification
            if  INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target])>float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    if  cond:
                        tval = 1
                    else:
                        tval = 0
                except:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
        new_vals = []
        for key in new_headers:
            if  (drops and key in drops) or key == target:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        if  logcols or logall:
            if  logall:
                logcols = new_headers[:]
                logcols = list(set(logcols)-set(logignore))
            for i in xrange(len(new_headers)):
                if  new_headers[i] in logcols:
                    new_vals[i] = str(log(eval(new_vals[i])+logbias))
        ostream.write(','.join(new_vals)+'\n')
    istream.close()
    ostream.close()
Example #37
def run(fout, tframe, seed, dformat, dbsextra, newdata):
    # mgr is assumed to be a dataframe manager available in the enclosing scope
    with fopen(fout, 'w') as ostream:
        for row in mgr.dataframe(tframe, seed, dformat, dbsextra, newdata):
            ostream.write(row + '\n')
Example #38
def read_data(fin, fbeg, fend, timein):
    "Selects files and picks structured data for plotting"
    if fin == ".":
        files = [f for f in os.listdir('.') \
                    if (len(fbeg) == 0 or f.startswith(fbeg))
                    and (len(fend) == 0 or f.endswith(fend))]
    elif os.path.isdir(fin):
        files = [os.path.join(fin, f) for f in os.listdir(fin)  \
                    if (len(fbeg) == 0 or f.startswith(fbeg))
                    and (len(fend) == 0 or f.endswith(fend))]
    elif os.path.isfile(fin):
        # if a single file is passed, its name is not checked against fbeg/fend
        files = [fin]
    else:
        print("Cannot produce file list, not a file or directory: \"%s\"" %
              fin)
        raise SystemExit
    ### data structure:
    #   { dftypes: { scorers: { classifiers: [ ordered weekly results for particular dftype, scorer and classifier ] } } }
    # dftype is dataframe type: old or new
    # dates variable stores ordered dates with an idea that dates in csv
    #   are stored respectfully in same order for all dftype and
    #   classifier combinations
    data = {}
    dates = {}
    scorers = []
    headers = []
    rundata = {}
    # read classifiers scores
    for f in files:
        first = True
        for line in fopen(f).readlines():
            vals = line.strip(" \n\r").split(',')
            if first:
                first = False
                if not headers:
                    headers = vals
                    if      headers[0] != 'dftype'  \
                        or  headers[1] != 'clf'     \
                        or  headers[2] != 'date'    \
                        or  len(headers) < 4:
                        print(
                            "Error: check structure of file %s, headers should be: dftype,clf,date"
                            % f)
                        sys.exit(1)
                    scorers = headers[3:]
                elif headers != vals:
                    print("Original headers: %s" % headers)
                    print("Headers in %s: %s" % (f, vals))
                    sys.exit(1)
            else:
                row = dict(zip(headers, vals))
                if not row['dftype'] in data:
                    data[row['dftype']] = {}
                if not row['dftype'] in dates:
                    dates[row['dftype']] = []
                if not row['date'] in dates[row['dftype']]:
                    dates[row['dftype']].append(row['date'])
                for s in scorers:
                    if not s in data[row['dftype']]:
                        data[row['dftype']][s] = {}
                    if not row['clf'] in data[row['dftype']][s]:
                        data[row['dftype']][s][row['clf']] = []
                    data[row['dftype']][s][row['clf']].append(
                        str_to_num(row[s]))
    # read classifiers running time
    if timein:
        head = []
        for line in open(timein, 'r').readlines():
            if not head:
                head = line
                continue
            line = line.strip(" \r\n").split(',')
            rundata[line[0]] = float(line[1])
    return data, dates, rundata
Example #39
def transform(fin,
              fout,
              idcol,
              target,
              thr,
              drops,
              verbose=0,
              logcols='',
              logall=False,
              logbias=2,
              logthr=None,
              logignore=''):
    "Perform transformation on given CSV file"
    istream = fopen(fin, 'r')
    ostream = fopen(fout, 'wb')
    headers = False
    eno = 0
    logignore = logignore.split(',')
    if logthr and not logcols:
        logcols = get_log_cols(fin, logthr, logignore)
    for line in istream.readlines():
        if not headers:
            headers = line.replace('\n', '').split(',')
            new_headers = []  # always defined, even when drops is empty
            for idx, val in enumerate(headers):
                if (drops and val in drops) or val == target or val == idcol:
                    continue
                new_headers.append(val)
            ostream.write('id,' + ','.join(new_headers) + ',target\n')
            continue
        try:
            item = line.replace('\n', '').replace('<nil>', '-1')
            vals = [eval(v) for v in item.split(',')]
        except Exception as exp:
            print("Unable to parse the line", line, type(line), exp)
            vals = []
            for item in line.split(','):
                try:
                    vals.append(eval(item))
                except:
                    vals.append(-1)
            if len(vals) != len(headers):
                raise Exception(
                    "Unable to parse line '%s', #values != #headers" % line)
        row = dict(zip(headers, vals))
        if thr == -1:  # keep regression
            tval = row[target]
        else:  # do classification
            if INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                try:
                    cond = eval(thr)
                    if cond:
                        tval = 1
                    else:
                        tval = 0
                except:
                    print(
                        "Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5"
                    )
                    sys.exit(1)
        if idcol in row.keys():
            new_vals = [str(int(row[idcol]))]
        else:
            new_vals = [getuid()]
        for key in new_headers:
            if (drops and key in drops) or key == target or key == idcol:
                continue
            new_vals.append(str(row[key]))
        new_vals.append(str(tval))
        if logcols or logall:
            if logall:
                logcols = new_headers[:]
                logcols = list(set(logcols) - set(logignore))
            for i in xrange(len(new_headers)):
                if new_headers[i] in logcols:
                    new_vals[i] = str(log(eval(new_vals[i]) + logbias))
        ostream.write(','.join(new_vals) + '\n')
    istream.close()
    ostream.close()
Example #40
def run(fout, tframe, seed, dformat, dbsextra, newdata):
    # mgr is assumed to be a dataframe manager available in the enclosing scope
    with fopen(fout, 'w') as ostream:
        for row in mgr.dataframe(tframe, seed, dformat, dbsextra, newdata):
            ostream.write(row+'\n')
Example #41
def verify_prediction(pred, popdb, oformat, verbose, target, thr):
    "Verify prediction file against popdb one"
    pdict = read_popdb(popdb)
    total = 0
    popular = 0
    totpop = 0
    tpos = 0
    tneg = 0
    fpos = 0
    fneg = 0
    tp_list = []
    tn_list = []
    fp_list = []
    fn_list = []
    def is_popular(dataset, pdict):
        return (target is None or float(pdict[dataset][target]) > thr)
    for line in fopen(pred, 'r').readlines():
        prob, dataset = line.replace('\n', '').split(',')
        total += 1
        if  float(prob)>0:
            popular += 1
        if  dataset in pdict and is_popular(dataset, pdict):
            totpop += 1
            if  float(prob)>0:
                tpos += 1
                tp_list.append(dataset)
                if  verbose>1:
                    print('TP, prob=%s dataset=%s' % (prob, dataset))
            else:
                fneg += 1
                fn_list.append(dataset)
                if  verbose>1:
                    print('FN, prob=%s dataset=%s' % (prob, dataset))
        else:
            if  float(prob)>0:
                fpos += 1
                fp_list.append(dataset)
                if  verbose>1:
                    print('FP, prob=%s dataset=%s' % (prob, dataset))
            else:
                tneg += 1
                tn_list.append(dataset)
                if  verbose>1:
                    print('TN, prob=%s dataset=%s' % (prob, dataset))
    accuracy, precision, recall, f1score = metrics(tpos, tneg, fpos, fneg)
    print("# dataset in popdb sample :", len(pdict.keys()))
    print("# datasets we predict     :", total)
    print("# datasets in popular set :", totpop)
    print("Predicted as popular      :", popular)
    print()
    def perc(vvv):
        return '%s' % round(vvv*100./total, 1)
    def space(vvv):
        return '%s%s' % (vvv, ' '*(len(str(total))-len(str(vvv))))
    tptiers = datasets2tiers(tp_list)
    tntiers = datasets2tiers(tn_list)
    fptiers = datasets2tiers(fp_list)
    fntiers = datasets2tiers(fn_list)
    alltiers = set(tptiers.keys()+tntiers.keys()+fptiers.keys()+fntiers.keys())
    if  oformat=='csv':
        out = 'cls,tp,tn,fp,fn\n'
        out += '%s,ALL,%s,%s,%s,%s\n' \
                % (pred.split('.')[0], perc(tpos), perc(tneg), perc(fpos), perc(fneg))
        for tier in sorted(alltiers):
            tp, tn, fp, fn = percentage(tptiers.get(tier, 0),
                                        tntiers.get(tier, 0),
                                        fptiers.get(tier, 0),
                                        fntiers.get(tier, 0))
            out += '%s,%s,%s,%s,%s,%s\n' \
                    % (pred.split('.')[0], tier, tp, tn, fp, fn)
        print(out)
    else:
        if  verbose:
            print("True positive             : %s, %s%%" % (space(tpos), perc(tpos)))
            print("True negative             : %s, %s%%" % (space(tneg), perc(tneg)))
            print("False positive            : %s, %s%%" % (space(fpos), perc(fpos)))
            print("False negative            : %s, %s%%" % (space(fneg), perc(fneg)))
            print()
        classify_all(tp_list, tn_list, fp_list, fn_list)
        width = max([len(t) for t in alltiers])
        msg = 'ALL tiers'
        pad = ' '*(width-len(msg))
        tp, tn, fp, fn = percentage(tpos, tneg, fpos, fneg)
        print("%s %s %6.2f %6.2f %6.2f %6.2f\n" % (msg, pad, tp, tn, fp, fn))
        if  verbose:
            print("Classification of TP sample")
            classify(tp_list)
            print("Classification of TN sample")
            classify(tn_list)
            print("Classification of FP sample")
            classify(fp_list)
            print("Classification of FN sample")
            classify(fn_list)
        print("Accuracy                  :", accuracy)
        print("Precision                 :", precision)
        print("Recall                    :", recall)
        print("F1-score                  :", f1score)
Example #42
def write_data(dfr, fout, comp):
    "Write given dataframe as CSV into fout; comp (compression) is unused here"
    csv = dfr.to_csv(index=False, sep=',')
    fopen(fout, 'w').write(csv)
Example #43
def run(train, test, label, bits, alpha, hcols, no_out, misses, glf):
    "Run train/test program"
    print("Train model with b=%s, a=%s, cols=%s, misses=%s, host=%s"\
            % (bits, alpha, hcols, misses, socket.gethostname()))
    sys.__stdout__.flush()
    start = datetime.now()

    # find out number of dimensions
    ndim = 146 # number of features in our dataset plus 1 bias feature
    extra_dim = 1 # extra dimension for hcols interactions
    for i in xrange(len(hcols)):
        for j in xrange(i+1, len(hcols)):
            extra_dim += 1
    if  misses:
        ndim -= len(misses)
    print('Dim: %s, extra %s' % (ndim, extra_dim))
    sys.__stdout__.flush()

    # number of weights used for each model; we have 32 of them
    D = 2 ** bits

    # a list for range(0, 33) - 13, no need to learn y14 since it is always 0
    K = [k for k in range(33) if k != 13]

    # initialize our model, all 32 of them, again ignoring y14
    w = [[0.] * D if k != 13 else None for k in range(33)]
    n = [[0.] * D if k != 13 else None for k in range(33)]

    loss = 0.
    loss_y14 = log(1. - 10**-16)

    for ID, x, y in data(train, D, ndim, extra_dim, label, hcols, misses=misses):
        # get predictions and train on all labels
        for k in K:
            p = predict(x, w[k], glf)
            update(alpha, w[k], n[k], x, p, y[k])
            loss += logloss(p, y[k])  # for progressive validation
        loss += loss_y14  # the loss of y14, logloss is never zero
        # print out progress, so that we know everything is working
        if ID % 100000 == 0:
            print('%s\tencountered: %d\tcurrent logloss: %f' % (
                datetime.now(), ID, (loss/33.)/ID))
            sys.__stdout__.flush()
    print("Final loss", (loss/33.)/ID)

    if  no_out:
        print("No output request")
        sys.__stdout__.flush()
    else:
        oname = 'b%s_a%s.csv' % (bits, alpha)
        print("Yield %s" % oname)
        sys.__stdout__.flush()
        with fopen(oname, 'w') as outfile:
            outfile.write('id_label,pred\n')
            for ID, x in data(test, D, ndim, extra_dim, hcols=hcols, misses=misses):
                predSum = 1.0
                for k in K:
                    p = predict(x, w[k], glf)
                    outfile.write('%s_y%d,%s\n' % (ID, k+1, str(p)))
                    predSum -= p
                    if k == 12:
                        outfile.write('%s_y14,0.0\n' % ID)
                    if k == 31:
                        p = max(0.01,predSum)
                        outfile.write('%s_y33,%s\n' % (ID, str(p)))

    print('Done, elapsed time: %s' % str(datetime.now() - start))
    sys.__stdout__.flush()