Example #1
0
def pyunit_apply():
    """Exercise ``fr.apply`` with a series of row-wise and default-axis lambdas.

    Imports ``smalldata/logreg/prostate.csv`` and runs a sequence of ``apply``
    calls on it, displaying most results with ``show()``.  Nothing is
    asserted; the function only verifies that the calls complete.
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # NOTE(review): every lambda below is deliberately written out literally.
    # apply() presumably inspects the lambda itself, so generating them in a
    # loop (getattr, closures, ...) may not be equivalent -- confirm before
    # refactoring.

    # show() already displays the frame; wrapping it in a print only emitted
    # its return value (a stray "None"), so the frames are shown directly.
    # print("") produces the same blank line on Python 2 and Python 3.
    fr.apply(lambda x: x["PSA"], axis=1).show()
    print("")
    print("")
    fr.apply(lambda x: x['PSA'] > x['VOL'], axis=1).show()
    print("")

    # No axis argument: uses apply()'s default axis.
    zz = fr.apply(lambda x: x.mean(na_rm=True))
    zz.show()

    zz = fr.apply(lambda row: row + 2, axis=1)
    zz.show()

    zz = fr.apply(lambda row: h2o.ifelse(row[0] == 1, row[2], row[3]), axis=1)
    zz.show()

    # Unary math functions, one apply per function.
    fr.apply(lambda col: col.abs()).show()
    fr.apply(lambda col: col.cos()).show()
    fr.apply(lambda col: col.sin()).show()
    fr.apply(lambda col: col.ceil()).show()
    fr.apply(lambda col: col.floor()).show()
    fr.apply(lambda col: col.cosh()).show()
    fr.apply(lambda col: col.exp()).show()
    fr.apply(lambda col: col.log()).show()
    fr.apply(lambda col: col.sqrt()).show()
    fr.apply(lambda col: col.tan()).show()
    fr.apply(lambda col: col.tanh()).show()

    # A compound arithmetic expression.
    fr.apply(lambda col: (col * col - col * 5 * col).abs() - 55 / col).show()

    # ifelse with an expression in both branches, then in only the "yes"
    # branch, then in only the "no" branch.  The results are neither stored
    # nor shown.
    fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2] - 3).expm1(),
                                    (row[2] - 999).expm1()),
             axis=1)
    fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2] - 3).expm1(), 55),
             axis=1)
    fr.apply(lambda row: h2o.ifelse(row[0] < 5, 3, (row[2] - 1).expm1()),
             axis=1)
def pyunit_apply():
  """Exercise ``fr.apply`` with a series of row-wise and default-axis lambdas.

  Imports ``smalldata/logreg/prostate.csv`` and runs a sequence of ``apply``
  calls on it, displaying most results with ``show()``.  Nothing is asserted;
  the function only verifies that the calls complete.
  """
  fr = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))

  # NOTE(review): every lambda below is deliberately written out literally.
  # apply() presumably inspects the lambda itself, so generating them in a
  # loop (getattr, closures, ...) may not be equivalent -- confirm before
  # refactoring.

  # show() already displays the frame; wrapping it in a print only emitted its
  # return value (a stray "None"), so the frames are shown directly.
  # print("") produces the same blank line on Python 2 and Python 3.
  fr.apply(lambda x: x["PSA"], axis=1).show()
  print("")
  print("")
  fr.apply(lambda x: x['PSA'] > x['VOL'], axis=1).show()
  print("")

  # No axis argument: uses apply()'s default axis.
  zz = fr.apply(lambda x: x.mean(na_rm=True))
  zz.show()

  zz = fr.apply(lambda row: row + 2, axis=1)
  zz.show()

  zz = fr.apply(lambda row: h2o.ifelse(row[0] == 1, row[2], row[3]), axis=1)
  zz.show()

  # Unary math functions, one apply per function.
  fr.apply(lambda col: col.abs()).show()
  fr.apply(lambda col: col.cos()).show()
  fr.apply(lambda col: col.sin()).show()
  fr.apply(lambda col: col.ceil()).show()
  fr.apply(lambda col: col.floor()).show()
  fr.apply(lambda col: col.cosh()).show()
  fr.apply(lambda col: col.exp()).show()
  fr.apply(lambda col: col.log()).show()
  fr.apply(lambda col: col.sqrt()).show()
  fr.apply(lambda col: col.tan()).show()
  fr.apply(lambda col: col.tanh()).show()

  # A compound arithmetic expression.
  fr.apply(lambda col: (col*col - col*5*col).abs() - 55/col ).show()

  # ifelse with an expression in both branches, then in only the "yes" branch,
  # then in only the "no" branch.  The results are neither stored nor shown.
  fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2]-3).expm1(), (row[2] - 999).expm1()), axis=1)
  fr.apply(lambda row: h2o.ifelse(row[0] < 5, (row[2]-3).expm1(), 55), axis=1)
  fr.apply(lambda row: h2o.ifelse(row[0] < 5, 3, (row[2] - 1).expm1()), axis=1)
Example #3
0
def refine_date_col(data, col, pattern):
  """Parse ``data[col]`` as a date and add calendar feature columns to ``data``.

  ``data`` is modified in place: ``col`` is overwritten with the result of
  ``as_date(pattern)`` and the columns "Day", "Month", "Year", "WeekNum",
  "WeekDay", "HourOfDay", "Weekend" and "Season" are assigned.  Nothing is
  returned.

  Args:
    data: frame-like object that supports column get/set by name
      (presumably an H2O frame -- confirm against the caller).
    col: name of the column holding the date values.
    pattern: date format handed unchanged to ``as_date``.
  """
  data[col]         = data[col].as_date(pattern)
  data["Day"]       = data[col].day()
  data["Month"]     = data[col].month() + 1    # Since H2O indexes from 0
  data["Year"]      = data[col].year() + 1900  # Start of epoch is 1900
  data["WeekNum"]   = data[col].week()
  data["WeekDay"]   = data[col].dayOfWeek()
  data["HourOfDay"] = data[col].hour()

  # Weekend flag: 1 when WeekDay is "Sun" or "Sat", otherwise 0.
  # Each comparison is evaluated on its own before being combined: `|` binds
  # tighter than `==`, so the unparenthesised form
  #   a == "Sun" | a == "Sat"
  # parses as the chained comparison a == ("Sun" | a) == "Sat".
  is_sunday   = data["WeekDay"] == "Sun"
  is_saturday = data["WeekDay"] == "Sat"
  data["Weekend"] = h2o.ifelse(is_sunday | is_saturday, 1, 0)[0]

  # Season column.
  # Intended mapping: Spring = Mar, Apr, May. Summer = Jun, Jul, Aug.
  # Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
  # NOTE(review): if cut() uses right-closed bins, the break at 7 places
  # August in "Autumn", which disagrees with the mapping above -- confirm
  # whether the break should be 8.
  data["Season"]  = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
Example #4
0
def refine_date_col(data, col, pattern):
  """Parse ``data[col]`` as a date and add calendar feature columns to ``data``.

  ``data`` is modified in place: ``col`` is overwritten with the result of
  ``as_date(pattern)`` and the columns "Day", "Month", "Year", "WeekNum",
  "WeekDay", "HourOfDay", "Weekend" and "Season" are assigned.  Nothing is
  returned.

  Args:
    data: frame-like object that supports column get/set by name
      (presumably an H2O frame -- confirm against the caller).
    col: name of the column holding the date values.
    pattern: date format handed unchanged to ``as_date``.
  """
  data[col]         = data[col].as_date(pattern)
  data["Day"]       = data[col].day()
  data["Month"]     = data[col].month() + 1    # Since H2O indexes from 0
  data["Year"]      = data[col].year() + 1900  # Start of epoch is 1900
  data["WeekNum"]   = data[col].week()
  data["WeekDay"]   = data[col].dayOfWeek()
  data["HourOfDay"] = data[col].hour()

  # Weekend flag: 1 when WeekDay is "Sun" or "Sat", otherwise 0.
  # Each comparison is evaluated on its own before being combined: `|` binds
  # tighter than `==`, so the unparenthesised form
  #   a == "Sun" | a == "Sat"
  # parses as the chained comparison a == ("Sun" | a) == "Sat".
  is_sunday   = data["WeekDay"] == "Sun"
  is_saturday = data["WeekDay"] == "Sat"
  data["Weekend"] = h2o.ifelse(is_sunday | is_saturday, 1, 0)[0]

  # Season column.
  # Intended mapping: Spring = Mar, Apr, May. Summer = Jun, Jul, Aug.
  # Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
  # NOTE(review): if cut() uses right-closed bins, the break at 7 places
  # August in "Autumn", which disagrees with the mapping above -- confirm
  # whether the break should be 8.
  data["Season"]  = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
Example #5
0
def insert_missing(ip,port):
  """Import the airlines sample, add a "TravelTime" column and show the frame.

  Both "CRSArrTime" and "CRSDepTime" are converted with
  ``(value / 100) * 60 + value % 100``.  "TravelTime" is the arrival figure
  minus the departure figure where that difference is positive, and NaN
  elsewhere.

  Args:
    ip: not used in this body.
    port: not used in this body.
  """
  data = h2o.import_file(path=[h2o.locate("smalldata/airlines/allyears2k_headers.zip")])

  def _to_minutes(column_name):
    # NOTE(review): `/ 100` yields a fractional hour unless the frame's
    # division is integer division -- confirm this is the intended value.
    hours   = data[column_name] / 100
    minutes = data[column_name] % 100
    return hours*60 + minutes

  arrival   = _to_minutes("CRSArrTime")
  departure = _to_minutes("CRSDepTime")

  # Non-positive differences become NaN; [0] takes the first element of the
  # ifelse result.
  travel = h2o.ifelse(arrival - departure > 0, arrival - departure, float("nan"))
  data["TravelTime"] = travel[0]

  data.show()
Example #6
0
def refine_date_col(data, col, pattern):
  """Parse ``data[col]`` as a date and add calendar feature columns to ``data``.

  ``data`` is modified in place: ``col`` is overwritten with the result of
  ``as_date(pattern)`` and the columns "Day", "Month", "Year", "WeekNum",
  "WeekDay", "HourOfDay", "Weekend" and "Season" are assigned.  Nothing is
  returned.

  Args:
    data: frame-like object that supports column get/set by name
      (presumably an H2O frame -- confirm against the caller).
    col: name of the column holding the date values.
    pattern: date format handed unchanged to ``as_date``.
  """
  data[col]         = data[col].as_date(pattern)
  data["Day"]       = data[col].day()
  data["Month"]     = data[col].month() + 1    # Since H2O indexes from 0
  data["Year"]      = data[col].year() + 1900  # Start of epoch is 1900
  data["WeekNum"]   = data[col].week()
  data["WeekDay"]   = data[col].dayOfWeek()
  data["HourOfDay"] = data[col].hour()

  # HACK: force evaluation before the ifelse and cut below.  The current
  # Expr-based execution scheme is too restrictive (and is slated to go away),
  # so this works around type issues and the like.
  data.describe()

  # Weekend flag: 1 when WeekDay is "Sun" or "Sat", otherwise 0.
  # Python's `or` cannot be overloaded: it tests the truthiness of the left
  # comparison object and returns one operand unchanged, so the two tests were
  # never combined element-wise.  `|` is the element-wise operator; the
  # comparisons are evaluated first because `|` binds tighter than `==`.
  is_sunday   = data["WeekDay"] == "Sun"
  is_saturday = data["WeekDay"] == "Sat"
  data["Weekend"] = h2o.ifelse(is_sunday | is_saturday, 1, 0)[0]

  # Season column.
  # Intended mapping: Spring = Mar, Apr, May. Summer = Jun, Jul, Aug.
  # Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
  # NOTE(review): if cut() uses right-closed bins, the break at 7 places
  # August in "Autumn", which disagrees with the mapping above -- confirm
  # whether the break should be 8.
  data["Season"]  = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
Example #7
0
def insert_missing(ip,port):
  """Import the airlines sample, add a "TravelTime" column and show the frame.

  Both "CRSArrTime" and "CRSDepTime" are converted with
  ``(value / 100) * 60 + value % 100``.  "TravelTime" is the arrival figure
  minus the departure figure where that difference is positive, and NaN
  elsewhere.

  Args:
    ip: not used in this body.
    port: not used in this body.
  """
  data = h2o.import_file(path=[h2o.locate("smalldata/airlines/allyears2k_headers.zip")])

  def _to_minutes(column_name):
    # NOTE(review): `/ 100` yields a fractional hour unless the frame's
    # division is integer division -- confirm this is the intended value.
    hours   = data[column_name] / 100
    minutes = data[column_name] % 100
    return hours*60 + minutes

  arrival   = _to_minutes("CRSArrTime")
  departure = _to_minutes("CRSDepTime")

  # Non-positive differences become NaN; [0] takes the first element of the
  # ifelse result.
  travel = h2o.ifelse(arrival - departure > 0, arrival - departure, float("nan"))
  data["TravelTime"] = travel[0]

  data.show()
Example #8
0
def refine_date_col(data, col, pattern):
    """Parse ``data[col]`` as a date and add calendar feature columns.

    ``data`` is modified in place: ``col`` is overwritten with the result of
    ``as_date(pattern)`` and the columns "Day", "Month", "Year", "WeekNum",
    "WeekDay", "HourOfDay", "Weekend" and "Season" are assigned.  Nothing is
    returned.

    Args:
        data: frame-like object that supports column get/set by name
            (presumably an H2O frame -- confirm against the caller).
        col: name of the column holding the date values.
        pattern: date format handed unchanged to ``as_date``.
    """
    data[col] = data[col].as_date(pattern)
    data["Day"] = data[col].day()
    data["Month"] = data[col].month() + 1  # Since H2O indexes from 0
    data["Year"] = data[col].year() + 1900  # Start of epoch is 1900
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()

    # HACK: force evaluation before the ifelse and cut below.  The current
    # Expr-based execution scheme is too restrictive (and is slated to go
    # away), so this works around type issues and the like.
    data.describe()

    # Weekend flag: 1 when WeekDay is "Sun" or "Sat", otherwise 0.
    # Python's `or` cannot be overloaded: it tests the truthiness of the left
    # comparison object and returns one operand unchanged, so the two tests
    # were never combined element-wise.  `|` is the element-wise operator;
    # the comparisons are evaluated first because `|` binds tighter than `==`.
    is_sunday = data["WeekDay"] == "Sun"
    is_saturday = data["WeekDay"] == "Sat"
    data["Weekend"] = h2o.ifelse(is_sunday | is_saturday, 1, 0)[0]

    # Season column.
    # Intended mapping: Spring = Mar, Apr, May. Summer = Jun, Jul, Aug.
    # Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # NOTE(review): if cut() uses right-closed bins, the break at 7 places
    # August in "Autumn", which disagrees with the mapping above -- confirm
    # whether the break should be 8.
    data["Season"] = data["Month"].cut(
        [0, 2, 5, 7, 10, 12],
        ["Winter", "Spring", "Summer", "Autumn", "Winter"])