コード例 #1
0
ファイル: transform_csv.py プロジェクト: rqm5/DMWMAnalytics
def transform(fin, fout, target, thr, drops, verbose=0):
    """Perform transformation on given CSV file.

    The first line of ``fin`` is taken as the comma-separated header. Every
    other line is parsed value by value and written to ``fout`` without the
    ``target`` column and without the columns listed in ``drops``; a derived
    ``target`` column is appended as the last column.

    :param fin: input file name, opened with ``fopen(fin, 'r')``
    :param fout: output file name, opened with ``fopen(fout, 'wb')``
    :param target: header name of the column the target value comes from
    :param thr: ``-1`` copies the target value unchanged; a string matching
        ``INT_PAT`` or ``FLOAT_PAT`` is used as a numeric threshold (output
        is 1 when the target value is greater than it, otherwise 0); any
        other value is evaluated as a Python expression that may refer to
        the local dict ``row`` (header name -> parsed value)
    :param drops: collection of header names to omit; may be empty or None
    :param verbose: accepted for interface compatibility, not used here

    The process exits with status 1 when ``thr`` cannot be evaluated.
    """
    # Function-scope import keeps the block self-contained.
    from contextlib import closing

    dropped = drops or ()
    headers = None
    new_headers = []
    # closing() guarantees both streams are closed even if a row fails.
    with closing(fopen(fin, 'r')) as istream, \
            closing(fopen(fout, 'wb')) as ostream:
        for line in istream.readlines():
            if headers is None:
                headers = line.replace('\n', '').split(',')
                # Always build and emit the output header; previously this
                # only happened when ``drops`` was non-empty, which left
                # ``new_headers`` undefined for the data rows.
                new_headers = [val for val in headers
                               if val not in dropped and val != target]
                ostream.write(','.join(new_headers) + ',target\n')
                continue
            # NOTE(security): eval() runs arbitrary code found in the file;
            # only use this on trusted input.
            vals = [eval(v) for v in line.replace('\n', '').split(',')]
            # ``row`` must keep this name: a user supplied ``thr`` expression
            # refers to it.
            row = dict(zip(headers, vals))
            if thr == -1:  # keep regression
                tval = row[target]
            elif INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                # do classification against a numeric threshold
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                # do classification with a user supplied condition
                try:
                    # NOTE(security): ``thr`` is executed as Python code.
                    tval = 1 if eval(thr) else 0
                except Exception:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
            # new_headers is already filtered, no need to re-check drops.
            new_vals = [str(row[key]) for key in new_headers]
            new_vals.append(str(tval))
            ostream.write(','.join(new_vals) + '\n')
コード例 #2
0
ファイル: utils.py プロジェクト: KiprasKancys/DMWMAnalytics
def cmssw_test(rserie, rmajor, rminor):
    """Check the three CMSSW release parts against INT_PAT.

    Each part is converted with str() and matched in order (serie, major,
    minor). The result of the first failing match is returned (falsy);
    when all three match, the match result of the minor part is returned.
    """
    matched = None
    for part in (rserie, rmajor, rminor):
        matched = INT_PAT.match(str(part))
        if not matched:
            # stop at the first part that does not match
            break
    return matched
コード例 #3
0
def cmssw_test(rserie, rmajor, rminor):
    """Return a truthy value only if serie, major and minor match INT_PAT.

    Parts are stringified and checked one after another; checking stops at
    the first part that fails and that (falsy) match result is returned.
    Otherwise the match result for the minor part is returned.
    """
    serie_match = INT_PAT.match(str(rserie))
    if not serie_match:
        return serie_match
    major_match = INT_PAT.match(str(rmajor))
    if not major_match:
        return major_match
    return INT_PAT.match(str(rminor))
コード例 #4
0
ファイル: transform_csv.py プロジェクト: dmwm/DMWMAnalytics
def transform(fin,
              fout,
              idcol,
              target,
              thr,
              drops,
              verbose=0,
              logcols='',
              logall=False,
              logbias=2,
              logthr=None,
              logignore=''):
    """Perform transformation on given CSV file.

    The first line of ``fin`` is taken as the comma-separated header. Each
    data row is written to ``fout`` as: an ``id`` column, the remaining
    columns (without ``idcol``, ``target`` and the ``drops`` columns) and a
    derived ``target`` column. Selected columns can be replaced by
    ``log(value + logbias)``.

    :param fin: input file name, opened with ``fopen(fin, 'r')``
    :param fout: output file name, opened with ``fopen(fout, 'wb')``
    :param idcol: header name of the id column; when the row has it the id
        is ``int(row[idcol])``, otherwise ``getuid()`` supplies the id
    :param target: header name of the column the target value comes from
    :param thr: ``-1`` copies the target value unchanged; a string matching
        ``INT_PAT`` or ``FLOAT_PAT`` is used as a numeric threshold (output
        is 1 when the target value is greater than it, otherwise 0); any
        other value is evaluated as a Python expression that may refer to
        the local dict ``row`` (header name -> parsed value)
    :param drops: collection of header names to omit; may be empty or None
    :param verbose: accepted for interface compatibility, not used here
    :param logcols: columns to log-transform, either a collection of names
        or a comma-separated string
    :param logall: when true, log-transform every output column that is not
        listed in ``logignore`` (overrides ``logcols``)
    :param logbias: value added to a column value before taking the log
    :param logthr: when set and ``logcols`` is empty, the columns are
        obtained from ``get_log_cols(fin, logthr, logignore)``
    :param logignore: comma-separated column names excluded from ``logall``
        and passed on to ``get_log_cols``

    :raises Exception: when a line cannot be parsed into as many values as
        there are headers. The process exits with status 1 when ``thr``
        cannot be evaluated.
    """
    # Function-scope import keeps the block self-contained.
    from contextlib import closing

    dropped = drops or ()
    headers = None
    new_headers = []
    # closing() guarantees both streams are closed even if a row fails.
    with closing(fopen(fin, 'r')) as istream, \
            closing(fopen(fout, 'wb')) as ostream:
        logignore = logignore.split(',')
        if logthr and not logcols:
            logcols = get_log_cols(fin, logthr, logignore)
        if isinstance(logcols, str):
            # A raw comma-separated string would otherwise be matched by
            # substring ('id' in 'width'); compare whole names instead.
            logcols = [col.strip() for col in logcols.split(',')
                       if col.strip()]
        for line in istream.readlines():
            if headers is None:
                headers = line.replace('\n', '').split(',')
                # Always build and emit the output header; previously this
                # only happened when ``drops`` was non-empty, which left
                # ``new_headers`` undefined for the data rows.
                new_headers = [
                    val for val in headers
                    if val not in dropped and val != target and val != idcol
                ]
                ostream.write('id,' + ','.join(new_headers) + ',target\n')
                if logall:
                    # Loop invariant: computed once instead of once per row.
                    logcols = set(new_headers) - set(logignore)
                continue
            # NOTE(security): eval() runs arbitrary code found in the file;
            # only use this on trusted input.
            try:
                item = line.replace('\n', '').replace('<nil>', '-1')
                vals = [eval(v) for v in item.split(',')]
            except Exception as exp:
                print("Unable to parse the line", line, type(line), exp)
                # Second attempt: parse value by value, unparsable -> -1.
                vals = []
                for item in line.split(','):
                    try:
                        vals.append(eval(item))
                    except Exception:
                        vals.append(-1)
                if len(vals) != len(headers):
                    # Interpolate the line; it used to be passed as a second
                    # exception argument and never formatted.
                    raise Exception(
                        "Unable to parse line '%s', #values != #headers"
                        % line)
            # ``row`` must keep this name: a user supplied ``thr`` expression
            # refers to it.
            row = dict(zip(headers, vals))
            if thr == -1:  # keep regression
                tval = row[target]
            elif INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                # do classification against a numeric threshold
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                # do classification with a user supplied condition
                try:
                    # NOTE(security): ``thr`` is executed as Python code.
                    tval = 1 if eval(thr) else 0
                except Exception:
                    print(
                        "Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5"
                    )
                    sys.exit(1)
            if idcol in row:
                new_vals = [str(int(row[idcol]))]
            else:
                new_vals = [getuid()]
            for key in new_headers:
                val = str(row[key])
                if logcols and key in logcols:
                    # Transform the value that belongs to this header. The
                    # old index-based loop ignored the leading id column and
                    # therefore transformed the neighbouring column.
                    val = str(log(eval(val) + logbias))
                new_vals.append(val)
            new_vals.append(str(tval))
            ostream.write(','.join(new_vals) + '\n')
コード例 #5
0
def transform(fin, fout, target, thr, drops, verbose=0, logcols='', logall=False, logbias=2, logthr=None, logignore=''):
    """Perform transformation on given CSV file.

    The first line of ``fin`` is taken as the comma-separated header. Each
    data row is written to ``fout`` without the ``target`` column and the
    ``drops`` columns, followed by a derived ``target`` column. Selected
    columns can be replaced by ``log(value + logbias)``.

    :param fin: input file name, opened with ``fopen(fin, 'r')``
    :param fout: output file name, opened with ``fopen(fout, 'wb')``
    :param target: header name of the column the target value comes from
    :param thr: ``-1`` copies the target value unchanged; a string matching
        ``INT_PAT`` or ``FLOAT_PAT`` is used as a numeric threshold (output
        is 1 when the target value is greater than it, otherwise 0); any
        other value is evaluated as a Python expression that may refer to
        the local dict ``row`` (header name -> parsed value)
    :param drops: collection of header names to omit; may be empty or None
    :param verbose: accepted for interface compatibility, not used here
    :param logcols: columns to log-transform, either a collection of names
        or a comma-separated string
    :param logall: when true, log-transform every output column that is not
        listed in ``logignore`` (overrides ``logcols``)
    :param logbias: value added to a column value before taking the log
    :param logthr: when set and ``logcols`` is empty, the columns are
        obtained from ``get_log_cols(fin, logthr, logignore)``
    :param logignore: comma-separated column names excluded from ``logall``
        and passed on to ``get_log_cols``

    :raises Exception: when a line cannot be parsed into as many values as
        there are headers. The process exits with status 1 when ``thr``
        cannot be evaluated.
    """
    # Function-scope import keeps the block self-contained.
    from contextlib import closing

    dropped = drops or ()
    headers = None
    new_headers = []
    # closing() guarantees both streams are closed even if a row fails.
    with closing(fopen(fin, 'r')) as istream, \
            closing(fopen(fout, 'wb')) as ostream:
        logignore = logignore.split(',')
        if logthr and not logcols:
            logcols = get_log_cols(fin, logthr, logignore)
        if isinstance(logcols, str):
            # A raw comma-separated string would otherwise be matched by
            # substring ('id' in 'width'); compare whole names instead.
            logcols = [col.strip() for col in logcols.split(',')
                       if col.strip()]
        for line in istream.readlines():
            if headers is None:
                headers = line.replace('\n', '').split(',')
                # Always build and emit the output header; previously this
                # only happened when ``drops`` was non-empty, which left
                # ``new_headers`` undefined for the data rows.
                new_headers = [val for val in headers
                               if val not in dropped and val != target]
                ostream.write(','.join(new_headers) + ',target\n')
                if logall:
                    # Loop invariant: computed once instead of once per row.
                    logcols = set(new_headers) - set(logignore)
                continue
            # NOTE(security): eval() runs arbitrary code found in the file;
            # only use this on trusted input.
            try:
                item = line.replace('\n', '').replace('<nil>', '-1')
                vals = [eval(v) for v in item.split(',')]
            except Exception as exp:
                print("Unable to parse the line", line, type(line), exp)
                # Second attempt: parse value by value, unparsable -> -1.
                vals = []
                for item in line.split(','):
                    try:
                        vals.append(eval(item))
                    except Exception:
                        vals.append(-1)
                if len(vals) != len(headers):
                    # Interpolate the line; it used to be passed as a second
                    # exception argument and never formatted.
                    raise Exception("Unable to parse line '%s', #values != #headers" % line)
            # ``row`` must keep this name: a user supplied ``thr`` expression
            # refers to it.
            row = dict(zip(headers, vals))
            if thr == -1:  # keep regression
                tval = row[target]
            elif INT_PAT.match(thr) or FLOAT_PAT.match(thr):
                # do classification against a numeric threshold
                tval = 1 if float(row[target]) > float(thr) else 0
            else:
                # do classification with a user supplied condition
                try:
                    # NOTE(security): ``thr`` is executed as Python code.
                    tval = 1 if eval(thr) else 0
                except Exception:
                    print("Please supply valid python condition, e.g. row['naccess']>10 and row['nusers']>5")
                    sys.exit(1)
            new_vals = []
            for key in new_headers:
                val = str(row[key])
                if logcols and key in logcols:
                    # Same value the old code produced: the stringified
                    # column is parsed back before taking the log.
                    val = str(log(eval(val) + logbias))
                new_vals.append(val)
            new_vals.append(str(tval))
            ostream.write(','.join(new_vals) + '\n')