def main():
    """Merge x.csv and y.csv column-wise into a single output CSV.

    get_header_data() returns (has_header, header_row, data_rows) for
    each input.  Raises if the two inputs have different row counts.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    left_part = get_header_data(settings.Input.x)
    right_part = get_header_data(settings.Input.y)
    # Validate row counts BEFORE opening the output, so a mismatch does
    # not leave a partially written merged file behind.
    if len(left_part[2]) != len(right_part[2]):
        raise Exception(
            "The x.csv has a different line number from y.csv. Can not merge.")
    # "wb" is the correct mode for the csv module on Python 2; the
    # context manager guarantees the file is closed even on error.
    with open(settings.Output.merged_file, "wb") as file_merged:
        writer = csv.writer(file_merged)
        if left_part[0] and right_part[0]:
            left_part[1].extend(right_part[1])
            writer.writerow(left_part[1])
        # zip() pairs rows directly instead of indexing with range(len()).
        for l, r in zip(left_part[2], right_part[2]):
            l.extend(r)
            writer.writerow(l)
    print("Done")
def main():
    """Split each row of the input CSV into two output files.

    Columns [0, pivot) go to output1 and columns [pivot, end) go to
    output2, where pivot is Param.split_index.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    pivot = int(settings.Param.split_index)
    # One with-statement for all three files: every handle is closed even
    # if an error occurs mid-stream (the original leaked the outputs and
    # redundantly re-closed fin after its with-block).
    with open(settings.Output.output1, "wb") as foutput1, \
         open(settings.Output.output2, "wb") as foutput2, \
         open(settings.Input.input_file, "r") as fin:
        output1_writer = csv.writer(foutput1, lineterminator='\n')
        output2_writer = csv.writer(foutput2, lineterminator='\n')
        # Iterate the file object directly instead of a manual
        # while/readline loop.
        for line in fin:
            columns = line.rstrip('\n').split(',')
            output1_writer.writerow(columns[:pivot])
            output2_writer.writerow(columns[pivot:])
    print("Done")
def main():
    """Write an X-by-Y matrix of random values as CSV with a generated header."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # int(float(...)) tolerates spec values written as "3.0".
    intx = int(float(settings.Param.X))
    inty = int(float(settings.Param.Y))
    imax = int(float(settings.Param.Max))
    intType = settings.Param.Type == 'int'
    # The with-statement closes (and flushes) the file even on error;
    # the explicit flush()/close() pair is no longer needed.
    with open(settings.Output.Matrix, 'w') as f:
        # Header row: COLUMN_0 .. COLUMN_{Y-1}.
        f.write('COLUMN_%d' % 0)
        for iy in range(inty - 1):
            f.write(',COLUMN_%d' % (iy + 1))
        # One data row per line; GetRandom returns a string token.
        for ix in range(intx):
            f.write('\n')
            f.write(GetRandom(imax, intType))
            for iy in range(inty - 1):
                f.write(',%s' % GetRandom(imax, intType))
    print("Done")
def main():
    """Compare two CSV matrices cell-by-cell and write the match percentage."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Read Matrix here
    m1 = ReadMatrix(settings.Input.File1, ',', 1)
    m2 = ReadMatrix(settings.Input.File2, ',', 1)
    # Only the overlapping region of the two matrices is compared.
    minx = min(len(m1), len(m2))
    same = total = 0
    for ix in range(minx):
        miny = min(len(m1[ix]), len(m2[ix]))
        for iy in range(miny):
            if m1[ix][iy] == m2[ix][iy]:
                same += 1
            total += 1
    with open(settings.Output.Result, 'w') as f:
        # Guard against empty inputs: report 0.00% instead of raising
        # ZeroDivisionError when total == 0.
        percent = (float(same) / total * 100) if total else 0.0
        f.write('%.2f%%' % percent)
    print("Done")
def main(): settings = get_settings_from_file("spec.json") print(settings) conclusion = readcolumn(settings.Input.conclusion) label = readcolumn(settings.Input.label) precision_list = [] recall_list = [] hits = 0 if (math.fabs(len(label) - len(conclusion)) > 1): raise Exception( "The conclusion size is different from the label size. Can not plot. Check input please." ) if (len(label) - len(conclusion) == 1): #label file got a header on the top label = label[1:] print "There is a header on your label csv file." if (len(conclusion) - len(label) == 1): #conculsion may got a header on the top conclusion = conclusion[1:] print "There is a header on your conclusion csv file." for i in range(len(label)): if conclusion[i] == label[i]: hits += 1 precision_list.append(1.0 * hits / (i + 1)) recall_list.append(1.0 * hits / (len(label))) drawPrecisionRecall(precision_list, recall_list, settings.Output.report) print("Done")
def getSchema():
    """Fetch column schemas of the two input Hive tables for a JOIN.

    Returns [a_schema, b_schema] where each entry is a list of
    "alias.col AS alias_col" select items.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    conn = pyhs2.connect(host=settings.Param.HiveServer2_Host,
                         port=int(settings.Param.HiveServer2_Port),
                         authMechanism="PLAIN",
                         user="******",
                         password="",
                         database="default")
    query_sql = "DESCRIBE %s" % settings.Input.table_a.val
    cur = conn.cursor()
    cur.execute(query_sql)
    a_schema = []
    for row in cur.fetch():
        # row[0] is the column name from DESCRIBE output.
        a_schema.append(("a.%s AS a_%s") % (row[0], row[0]))
    # Close the first cursor before opening the second one; the original
    # leaked it by rebinding the name.
    cur.close()
    query_sql = "DESCRIBE %s" % settings.Input.table_b.val
    cur = conn.cursor()
    cur.execute(query_sql)
    b_schema = []
    for row in cur.fetch():
        b_schema.append(("b.%s AS b_%s") % (row[0], row[0]))
    cur.close()
    conn.close()
    return [a_schema, b_schema]
def main():
    """Compare two CSV matrices cell-by-cell and write the match percentage."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Read Matrix here
    m1 = ReadMatrix(settings.Input.File1, ',', 1)
    m2 = ReadMatrix(settings.Input.File2, ',', 1)
    # Only the overlapping region of the two matrices is compared.
    minx = min(len(m1), len(m2))
    same = total = 0
    for ix in range(minx):
        miny = min(len(m1[ix]), len(m2[ix]))
        for iy in range(miny):
            if m1[ix][iy] == m2[ix][iy]:
                same += 1
            total += 1
    with open(settings.Output.Result, 'w') as f:
        # Guard against empty inputs: report 0.00% instead of raising
        # ZeroDivisionError when total == 0.
        percent = (float(same) / total * 100) if total else 0.0
        f.write('%.2f%%' % percent)
    print("Done")
def main():
    """Load the block settings, echo them, and report completion (stub)."""
    cfg = get_settings_from_file("spec.json")
    print(cfg)
    # TODO: Add your code here
    print("Done")
def main():
    """Split each row of the input CSV into two output files.

    Columns [0, pivot) go to output1 and columns [pivot, end) go to
    output2, where pivot is Param.split_index.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    pivot = int(settings.Param.split_index)
    # One with-statement for all three files: every handle is closed even
    # if an error occurs mid-stream (the original leaked the outputs and
    # redundantly re-closed fin after its with-block).
    with open(settings.Output.output1, "wb") as foutput1, \
         open(settings.Output.output2, "wb") as foutput2, \
         open(settings.Input.input_file, "r") as fin:
        output1_writer = csv.writer(foutput1, lineterminator='\n')
        output2_writer = csv.writer(foutput2, lineterminator='\n')
        # Iterate the file object directly instead of a manual
        # while/readline loop.
        for line in fin:
            columns = line.rstrip('\n').split(',')
            output1_writer.writerow(columns[:pivot])
            output2_writer.writerow(columns[pivot:])
    print("Done")
def main():
    """Merge x.csv and y.csv column-wise into a single output CSV.

    get_header_data() returns (has_header, header_row, data_rows) for
    each input.  Raises if the two inputs have different row counts.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    left_part = get_header_data(settings.Input.x)
    right_part = get_header_data(settings.Input.y)
    # Validate row counts BEFORE opening the output, so a mismatch does
    # not leave a partially written merged file behind.
    if len(left_part[2]) != len(right_part[2]):
        raise Exception(
            "The x.csv has a different line number from y.csv. Can not merge.")
    # "wb" is the correct mode for the csv module on Python 2; the
    # context manager guarantees the file is closed even on error.
    with open(settings.Output.merged_file, "wb") as file_merged:
        writer = csv.writer(file_merged)
        if left_part[0] and right_part[0]:
            left_part[1].extend(right_part[1])
            writer.writerow(left_part[1])
        # zip() pairs rows directly instead of indexing with range(len()).
        for l, r in zip(left_part[2], right_part[2]):
            l.extend(r)
            writer.writerow(l)
    print("Done")
def main():
    """Load a trained SVC model and write its integer predictions for X."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Feature matrix: CSV with one header row to skip.
    features = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    model = joblib.load(settings.Input.MODEL)
    predictions = model.predict(features)
    np.savetxt(settings.Output.Y, predictions, fmt="%d", delimiter=",")
    print("Done")
def main():
    """Load a trained logistic-regression model and write predictions for X."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Feature matrix: CSV with one header row to skip.
    features = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    model = joblib.load(settings.Input.MODEL)
    predictions = model.predict(features)
    np.savetxt(settings.Output.Y, predictions, fmt="%d", delimiter=",")
    print("Done")
def main():
    # Export an HDFS directory into an external SQL table through Sqoop2:
    # create a connection, truncate the target table, run the export job,
    # then tear down the job and connection.
    settings = get_settings_from_file("spec.json")
    print(settings)
    sqoop = MySqoop(settings.Param.Sqoop2Server_Host, int(settings.Param.Sqoop2Server_Port))
    # 1. Create an connection
    # The name embeds job/block ids so concurrent runs do not collide.
    conn_name = "exporter_job%s_blk%s" % (
        settings.GlobalParam["jobId"], settings.GlobalParam["blockId"])
    conn_ret = sqoop.create_connection(conn_name=conn_name,
                                       conn_str=settings.Param.connection_string,
                                       username=settings.Param.connection_username,
                                       password=settings.Param.connection_password)
    # 2. empty the table
    # The client library is chosen by sniffing the driver name parsed out
    # of the JDBC connection string.
    print "Deleting the Table %s" % settings.Param.table_name
    conn_str = settings.Param.connection_string
    cfg = parse_jdbc(conn_str)
    cfg["username"] = settings.Param.connection_username
    cfg["password"] = settings.Param.connection_password
    print cfg
    if "postgresql" in cfg["name"]:
        psycopg2_delete_table(cfg,settings.Param.table_name)
        print "delete table %s in POSTGRES" % settings.Param.table_name
    if "sqlserver" in cfg["name"]:
        pymssql_delete_table(cfg, settings.Param.table_name)
        print "delete table %s in MS SQL" % settings.Param.table_name
    # NOTE(review): other JDBC drivers fall through with no delete — confirm
    # that is intentional.
    # 3. Run sqoop export job
    print "Running Sqoop2 Job to Export"
    fw_ps = {
        "input.inputDirectory": settings.Input.hdfs_path.val
    }
    job_ps = {
        "table.tableName": settings.Param.table_name,
        "table.columns": settings.Param.table_columns
    }
    job_name = "export job :: username(%s) job %s, block %s" % (
        settings.GlobalParam["userName"], settings.GlobalParam["jobId"],
        settings.GlobalParam["blockId"])
    r = sqoop.create_export_job(job_name=job_name,
                                connection_id=conn_ret["id"],
                                framework_params=fw_ps,
                                job_params=job_ps)
    pp(r)
    sqoop.run_job(r['id'])
    # wait_job blocks until the export finishes before tearing down.
    sqoop.wait_job(r['id'])
    sqoop.delete_job(r['id'])
    # Finally, Delete connection we created
    sqoop.delete_connection_by_id(conn_ret["id"])
    # Signal downstream blocks that the exported data is ready.
    settings.Output.signal.val="ready"
    print("Done")
def main():
    """Train a LinearSVC on (X, Y) and pickle the fitted model.

    C is the SVM regularization strength from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    # Echo settings for the job log, consistent with the sibling blocks.
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    Y = np.genfromtxt(settings.Input.Y, delimiter=',', skip_header=1)
    svc = LinearSVC(C=float(settings.Param.C))
    svc.fit(X, Y)
    # "wb": pickle output is a byte stream; text mode can corrupt it on
    # platforms that translate newlines.
    with open(settings.Output.MODEL, "wb") as f:
        pickle.dump(svc, f)
    print("Done")
def main():
    """Download the file described by the input datasource JSON to Output.O."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # The datasource descriptor is a JSON document with a "URL" key.
    with open(settings.Input.DS) as handle:
        descriptor = json.load(handle)
    print("Downloading '%s'..." % settings.Input.DS)
    urllib.urlretrieve(descriptor["URL"], filename=settings.Output.O)
    print("Done")
def main():
    """Train a logistic-regression model on (X, Y) and pickle it.

    C and penalty come from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    Y = np.genfromtxt(settings.Input.Y, delimiter=',', skip_header=1)
    lr = linear_model.LogisticRegression(C=float(settings.Param.C),
                                         penalty=settings.Param.penalty)
    lr.fit(X, Y)
    # "wb": pickle output is a byte stream; text mode can corrupt it on
    # platforms that translate newlines.
    with open(settings.Output.MODEL, "wb") as f:
        pickle.dump(lr, f)
    print("Done")
def main():
    """Download the file described by the input datasource JSON to Output.O."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    # The datasource descriptor is a JSON document with a 'URL' key.
    with open(settings.Input.DS) as handle:
        descriptor = json.load(handle)
    print("Downloading '%s'..." % settings.Input.DS)
    urllib.urlretrieve(descriptor['URL'], filename=settings.Output.O)
    print("Done")
def main():
    """Train a LinearSVC on (X, Y) and pickle the fitted model.

    C, loss, and penalty come from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    Y = np.genfromtxt(settings.Input.Y, delimiter=',', skip_header=1)
    svc = LinearSVC(C=float(settings.Param.C),
                    loss=settings.Param.loss,
                    penalty=settings.Param.penalty)
    svc.fit(X, Y)
    # "wb": pickle output is a byte stream; text mode can corrupt it on
    # platforms that translate newlines.
    with open(settings.Output.MODEL, "wb") as f:
        pickle.dump(svc, f)
    print("Done")
def main():
    """Train a logistic-regression model on (X, Y) and pickle it.

    C and penalty come from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    Y = np.genfromtxt(settings.Input.Y, delimiter=',', skip_header=1)
    lr = linear_model.LogisticRegression(C=float(settings.Param.C),
                                         penalty=settings.Param.penalty)
    lr.fit(X, Y)
    # "wb": pickle output is a byte stream; text mode can corrupt it on
    # platforms that translate newlines.
    with open(settings.Output.MODEL, "wb") as f:
        pickle.dump(lr, f)
    print("Done")
def write_Main_hql(columns):
    """Generate main.hql that JOINs ${INPUT_table_a} and ${INPUT_table_b}.

    `columns` is the pre-built SELECT column list; the join type and ON
    condition come from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    # Renamed the handle so it no longer shadows the `file` builtin.
    with open("main.hql", "w") as hql_file:
        hql_file.write(
            """
DROP TABLE IF EXISTS ${OUTPUT_joined_table};
CREATE TABLE ${OUTPUT_joined_table}
AS
SELECT %s FROM ${INPUT_table_a} a
%s JOIN ${INPUT_table_b} b
ON %s
;
""" % (columns, settings.Param.join_type, settings.Param.on_condition)
        )
def main():
    # Drive the forecast.R script: input/output paths and the forecasting
    # parameters are passed as positional command-line arguments.
    settings = get_settings_from_file("spec.json")
    print(settings)
    # NOTE(review): shell=True with interpolated spec values is shell-injection
    # prone if the spec can contain untrusted text — confirm the spec source
    # is trusted before reusing this pattern.
    command = '/usr/bin/Rscript forecast.R {} {} "{}" "{}" "{}" "{}" "{}" {}'.format(
        settings.Input.TimeSeries, settings.Output.Forecast,
        settings.Param.column_name, settings.Param.start, settings.Param.end,
        settings.Param.frequency, settings.Param.forecast_number,
        settings.Output.PlotPdf)
    ret = call(command, shell=True)
    # Propagate a non-zero Rscript exit code as this process's exit code.
    if ret != 0:
        sys.exit(ret)
    # command = "/usr/bin/Rscript forecast.R {} {}".format(settings.Input.TimeSeries, settings.Output.Forecast)
    # print(command)
    # os.system(command);
    print("Done")
def main():
    """Train a LinearSVC on (X, Y) and pickle the fitted model.

    C, loss, and penalty come from the spec parameters.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    Y = np.genfromtxt(settings.Input.Y, delimiter=',', skip_header=1)
    svc = LinearSVC(C=float(settings.Param.C),
                    loss=settings.Param.loss,
                    penalty=settings.Param.penalty)
    svc.fit(X, Y)
    # "wb": pickle output is a byte stream; text mode can corrupt it on
    # platforms that translate newlines.
    with open(settings.Output.MODEL, "wb") as f:
        pickle.dump(svc, f)
    print("Done")
def main():
    """Upload SOURCE_URL into the job's S3 working directory and record
    the resulting remote directory path in Output.dest_s3."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    params = settings.Param
    conn = boto.connect_s3(params.AWS_ACCESS_KEY_ID, params.AWS_ACCESS_KEY_SECRET)
    bucket = conn.get_bucket(params.S3_BUCKET)
    # Destination key inside the block's working directory on S3.
    target_key = get_s3_working_dir(settings, bucket, "OUTPUT_dest_s3/dest_s3")
    uploaded_key = s3_multipart_upload(bucket, params.SOURCE_URL, target_key)
    remote_dir = os.path.dirname(uploaded_key)
    with open(settings.Output.dest_s3, "w") as out:
        out.write(remote_dir)
    print("Done")
def main():
    # Drive the forecast.R script: input/output paths and the forecasting
    # parameters are passed as positional command-line arguments.
    settings = get_settings_from_file("spec.json")
    print(settings)
    # NOTE(review): shell=True with interpolated spec values is shell-injection
    # prone if the spec can contain untrusted text — confirm the spec source
    # is trusted before reusing this pattern.
    command = '/usr/bin/Rscript forecast.R {} {} "{}" "{}" "{}" "{}" "{}" {}'.format(
        settings.Input.TimeSeries, settings.Output.Forecast,
        settings.Param.column_name, settings.Param.start, settings.Param.end,
        settings.Param.frequency, settings.Param.forecast_number,
        settings.Output.PlotPdf)
    ret = call(command, shell=True)
    # Propagate a non-zero Rscript exit code as this process's exit code.
    if ret != 0:
        sys.exit(ret)
    # command = "/usr/bin/Rscript forecast.R {} {}".format(settings.Input.TimeSeries, settings.Output.Forecast)
    # print(command)
    # os.system(command);
    print("Done")
def write_Main_hql():
    """Generate main.hql that orders ${INPUT_from_table}, optionally
    adding a LIMIT clause when Param.limit is a non-blank value."""
    settings = get_settings_from_file("spec.json")
    limit_string = ""
    # Only emit a LIMIT clause when the parameter is present and not
    # just whitespace (strip(' ') matches the original blank test).
    if settings.Param.limit is not None and settings.Param.limit.strip(' ') != '':
        limit_string = "limit " + settings.Param.limit
    # Renamed the handle so it no longer shadows the `file` builtin.
    with open("main.hql", "w") as hql_file:
        hql_file.write(
            """
DROP TABLE IF EXISTS ${OUTPUT_ordered_table};
CREATE TABLE ${OUTPUT_ordered_table}
AS
SELECT * FROM ${INPUT_from_table}
ORDER BY ${PARAM_order_by_columns}
%s
;
""" % limit_string
        )
def main():
    """Dispatch an input datasource descriptor to the matching S3 uploader."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    ds = json.load(open(settings.Input.DS))
    # Build Output.  NOTE(review): the original list contained "Http"
    # twice; the duplicate was removed (it may have been intended as
    # "Https" — confirm with the datasource producers).
    if ds['Type'] in ["Http", "LocalFile", "Ftp"]:
        ds_output = upload_url_to_s3(settings, ds)
    elif ds['Type'] in ["AWS_S3"]:
        ds_output = upload_s3_to_s3(settings, ds)
    else:
        raise ValueError("Invalid type for input datasource: '%s'" % ds['Type'])
    settings.Output.dest_s3.val = json.dumps(ds_output)
    print("Done")
def main():
    """Shuffle the input CSV and split it into train/test files by ratio.

    A detected header row is copied into both outputs.  A positive
    Param.random_seed makes the shuffle reproducible.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Read everything up front; the with-block closes the input (the
    # original's trailing f.close()/train_f.close()/test_f.close() calls
    # after the with-blocks were redundant and have been removed).
    with open(settings.Input.input_file, "rb") as f:
        dialect, has_header = sniff_csv(f)
        data = list(csv.reader(f, dialect))
    if has_header:
        data_header = data[0]
        data = data[1:]
    # Positive seed -> reproducible shuffle; otherwise system entropy.
    if int(settings.Param.random_seed) > 0:
        random.seed(int(settings.Param.random_seed))
    else:
        random.seed(None)
    random.shuffle(data)
    total = len(data)
    pivot = int(math.floor(total * float(settings.Param.train_ratio)))
    train_data = data[:pivot]
    test_data = data[pivot:]
    with open(settings.Output.train_file, "w") as train_f:
        train_writer = csv.writer(train_f, lineterminator='\n')
        if has_header:
            train_writer.writerow(data_header)
        for val in train_data:
            train_writer.writerow(val)
    with open(settings.Output.test_file, "w") as test_f:
        test_writer = csv.writer(test_f, lineterminator='\n')
        if has_header:
            test_writer.writerow(data_header)
        for val in test_data:
            test_writer.writerow(val)
    print("Done")
def main():
    """Shuffle the input CSV and split it into train/test files by ratio.

    A detected header row is copied into both outputs.  A positive
    Param.random_seed makes the shuffle reproducible.
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    # Read everything up front; the with-block closes the input (the
    # original's trailing f.close()/train_f.close()/test_f.close() calls
    # after the with-blocks were redundant and have been removed).
    with open(settings.Input.input_file, "rb") as f:
        dialect, has_header = sniff_csv(f)
        data = list(csv.reader(f, dialect))
    if has_header:
        data_header = data[0]
        data = data[1:]
    # Positive seed -> reproducible shuffle; otherwise system entropy.
    if int(settings.Param.random_seed) > 0:
        random.seed(int(settings.Param.random_seed))
    else:
        random.seed(None)
    random.shuffle(data)
    total = len(data)
    pivot = int(math.floor(total * float(settings.Param.train_ratio)))
    train_data = data[:pivot]
    test_data = data[pivot:]
    with open(settings.Output.train_file, "w") as train_f:
        train_writer = csv.writer(train_f, lineterminator="\n")
        if has_header:
            train_writer.writerow(data_header)
        for val in train_data:
            train_writer.writerow(val)
    with open(settings.Output.test_file, "w") as test_f:
        test_writer = csv.writer(test_f, lineterminator="\n")
        if has_header:
            test_writer.writerow(data_header)
        for val in test_data:
            test_writer.writerow(val)
    print("Done")
def getSchema():
    """Build aligned SELECT items for a UNION of table_a and table_b.

    table_b's columns are aliased to table_a's column names; raises when
    the column counts differ.  Returns [a_select_item, b_select_item].
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    conn = pyhs2.connect(host=settings.Param.HiveServer2_Host,
                         port=int(settings.Param.HiveServer2_Port),
                         authMechanism="PLAIN",
                         user="******",
                         password="",
                         database="default")
    query_sql = "DESCRIBE %s" % settings.Input.table_a.val
    cur = conn.cursor()
    cur.execute(query_sql)
    a_schema = []
    a_select_item = []
    for row in cur.fetch():
        # row[0] is the column name from DESCRIBE output.
        a_schema.append(row[0])
        a_select_item.append(("%s AS %s") % (row[0], row[0]))
    # Close the first cursor before opening the second one; the original
    # leaked it by rebinding the name.
    cur.close()
    query_sql = "DESCRIBE %s" % settings.Input.table_b.val
    cur = conn.cursor()
    cur.execute(query_sql)
    b_select_item = []
    i = 0
    for row in cur.fetch():
        if i >= len(a_schema):
            raise Exception(
                "The two table to be unioned have different column numbers")
        # Alias b's column to a's column name at the same position.
        b_select_item.append(("%s AS %s") % (row[0], a_schema[i]))
        i = i + 1
    if i != len(a_schema):
        raise Exception(
            "The two table to be unioned have different column numbers")
    cur.close()
    conn.close()
    return [a_select_item, b_select_item]
def getSchema():
    """Build aligned SELECT items for a UNION of table_a and table_b.

    table_b's columns are aliased to table_a's column names; raises when
    the column counts differ.  Returns [a_select_item, b_select_item].
    """
    settings = get_settings_from_file("spec.json")
    print(settings)
    conn = pyhs2.connect(
        host=settings.Param.HiveServer2_Host,
        port=int(settings.Param.HiveServer2_Port),
        authMechanism="PLAIN",
        user="******",
        password="",
        database="default",
    )
    query_sql = "DESCRIBE %s" % settings.Input.table_a.val
    cur = conn.cursor()
    cur.execute(query_sql)
    a_schema = []
    a_select_item = []
    for row in cur.fetch():
        # row[0] is the column name from DESCRIBE output.
        a_schema.append(row[0])
        a_select_item.append(("%s AS %s") % (row[0], row[0]))
    # Close the first cursor before opening the second one; the original
    # leaked it by rebinding the name.
    cur.close()
    query_sql = "DESCRIBE %s" % settings.Input.table_b.val
    cur = conn.cursor()
    cur.execute(query_sql)
    b_select_item = []
    i = 0
    for row in cur.fetch():
        if i >= len(a_schema):
            raise Exception("The two table to be unioned have different column numbers")
        # Alias b's column to a's column name at the same position.
        b_select_item.append(("%s AS %s") % (row[0], a_schema[i]))
        i = i + 1
    if i != len(a_schema):
        raise Exception("The two table to be unioned have different column numbers")
    cur.close()
    conn.close()
    return [a_select_item, b_select_item]
def main():
    """Predict with a saved SVC model and plot a precision/recall curve
    of the predictions against the labels in Input.Y."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=',', skip_header=1)
    svc = joblib.load(settings.Input.MODEL)
    Y_out = svc.predict(X)
    # Removed the unused `Y_list` local from the original.
    # Round-trip through a CSV so readcolumn parses both columns the
    # same way before comparing.
    np.savetxt("./conclusion.csv", Y_out, fmt="%d", delimiter=",")
    conclusion = readcolumn("./conclusion.csv")
    label = readcolumn(settings.Input.Y)
    precision_list = []
    recall_list = []
    hits = 0
    for i in range(len(label)):
        if conclusion[i] == label[i]:
            hits += 1
        # Precision over items seen so far; recall over all labels.
        precision_list.append(1.0 * hits / (i + 1))
        recall_list.append(1.0 * hits / (len(label)))
    drawPrecisionRecall(precision_list, recall_list, settings.Output.report)
    print("Done")
def main():
    """Predict with a saved SVC model and plot a precision/recall curve
    of the predictions against the labels in Input.Y."""
    settings = get_settings_from_file("spec.json")
    print(settings)
    X = np.genfromtxt(settings.Input.X, delimiter=",", skip_header=1)
    svc = joblib.load(settings.Input.MODEL)
    Y_out = svc.predict(X)
    # Removed the unused `Y_list` local from the original.
    # Round-trip through a CSV so readcolumn parses both columns the
    # same way before comparing.
    np.savetxt("./conclusion.csv", Y_out, fmt="%d", delimiter=",")
    conclusion = readcolumn("./conclusion.csv")
    label = readcolumn(settings.Input.Y)
    precision_list = []
    recall_list = []
    hits = 0
    for i in range(len(label)):
        if conclusion[i] == label[i]:
            hits += 1
        # Precision over items seen so far; recall over all labels.
        precision_list.append(1.0 * hits / (i + 1))
        recall_list.append(1.0 * hits / (len(label)))
    drawPrecisionRecall(precision_list, recall_list, settings.Output.report)
    print("Done")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Fetch the resource at Param.URI over HTTP and save it to Output.O.
from specparser import get_settings_from_file
import os,urllib2
from pprint import pprint

if __name__ == "__main__":
    settings = get_settings_from_file("spec.json")
    print(settings)
    with open(settings.Output.O, "w") as f:
        # urllib2 responses are not context managers on Python 2, so
        # close the response explicitly (the original leaked it).
        page = urllib2.urlopen(settings.Param.URI)
        try:
            f.write(page.read())
        finally:
            page.close()
    print("Done")
def main():
    """Invoke the handler with the URI and method taken from the spec."""
    cfg = get_settings_from_file("spec.json")
    print(cfg)
    handler(cfg.Param.uri, cfg.Param.method)
    print("Done")
def main():
    """Invoke the handler with the URI and method taken from the spec."""
    cfg = get_settings_from_file("spec.json")
    print(cfg)
    handler(cfg.Param.uri, cfg.Param.method)
    print("Done")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Fetch the resource at Param.URI over HTTP and save it to Output.O.
from specparser import get_settings_from_file
import os, urllib2
from pprint import pprint

if __name__ == "__main__":
    settings = get_settings_from_file("spec.json")
    print(settings)
    with open(settings.Output.O, "w") as f:
        # urllib2 responses are not context managers on Python 2, so
        # close the response explicitly (the original leaked it).
        page = urllib2.urlopen(settings.Param.URI)
        try:
            f.write(page.read())
        finally:
            page.close()
    print("Done")