def handle_job_item_(self, job_syntax_item):
    if job_syntax_item and QUERY_KEY in job_syntax_item:
        query = job_syntax_item[QUERY_KEY].strip()
        is_csv = CSV_KEY in job_syntax_item
        is_var = VAR_KEY in job_syntax_item
        is_cache = CACHE_KEY in job_syntax_item
        if not query:
            return
        getLogger(LOG).info(job_syntax_item)
        # skip the query when a valid cached csv already exists for it
        if not is_var and is_csv and is_cache:
            csv_key_val = var_replaced(self.variables, job_syntax_item,
                                       CSV_KEY)
            csv_name = SqlExecutor.csv_name(csv_key_val)
            csv_size = SqlExecutor.csv_size(csv_key_val)
            if csv_size and SqlExecutor.valid_csv_exist(csv_key_val):
                getLogger(LOG).info("SKIP query: '%s', csvfile exist: %s",
                                    query, csv_name)
                return
        # run the query through a per-item executor
        sql_exec = self.create_executor(job_syntax_item)
        try:
            sql_exec.execute()
        except:
            print job_syntax_item
            raise
        sql_exec.saved_csv()
        self.post_operation(job_syntax_item)
        self.variables = sql_exec.variables
        del sql_exec
def handle_transmitter_merge(self, job_syntax_item, endpoint):
    opname = job_syntax_item[OP_KEY]
    csv_key_val = var_replaced(self.variables, job_syntax_item, CSV_KEY)
    csv_filename = SqlExecutor.csv_name(csv_key_val)
    csv_data = self.csvdata(csv_filename)
    num_lines = len(csv_data)
    # do nothing for empty data set
    if num_lines <= 1:
        getLogger(LOG).info('skip empty csv')
        from mriya.sf_merge_wrapper import HEADER
        result_ids = BulkData(HEADER, [])
    else:
        objname = job_syntax_item[endpoint]
        conn = self.endpoints.endpoint(endpoint)
        max_batch_size = int(job_syntax_item[BATCH_SIZE_KEY])
        getLogger(STDOUT).info('EXECUTE: %s %s, lines count=%d',
                               opname, objname, num_lines - 1)
        t_before = time.time()
        if len(csv_data):
            result_ids = conn.soap_merge(objname, csv_data, max_batch_size)
        t_after = time.time()
        getLogger(STDOUT).info('SF %s Took time: %.2f' \
                               % (opname, t_after-t_before))
    if NEW_IDS_TABLE in job_syntax_item:
        results_file_name = \
            SqlExecutor.csv_name(job_syntax_item[NEW_IDS_TABLE])
        with open(results_file_name, 'w') as result_ids_file:
            csv_data = csv_from_bulk_data(result_ids)
            result_ids_file.write(csv_data)
        getLogger(LOG).info('Saved result ids: %s', results_file_name)
    getLogger(LOG).info('Done: %s operation', opname)
def fix_empty_res_table(self, table_name):
    size = SqlExecutor.csv_size(table_name)
    if size == 0:
        getLogger(LOG).debug("Fix empty csv for table %s", table_name)
        cols = SqlExecutor.get_query_columns(self.get_query())
        header = ','.join(cols) + '\n'
        with open(self.csv_name(table_name), 'w') as f:
            f.write(header)
def handle_transmitter_op(self, job_syntax_item, endpoint):
    opname = job_syntax_item[OP_KEY]
    # run batches sequentially / parallel
    if BATCH_TYPE_KEY in job_syntax_item:
        if job_syntax_item[BATCH_TYPE_KEY] == BATCH_TYPE_PARALLEL_KEY:
            batch_seq = False
        elif job_syntax_item[BATCH_TYPE_KEY] == BATCH_TYPE_SEQUENTIAL_KEY:
            batch_seq = True
        else:
            getLogger(STDERR).error('Unknown batch type: %s',
                                    job_syntax_item[BATCH_TYPE_KEY])
            exit(1)
    else:
        batch_seq = False  # parallel by default
    csv_key_val = var_replaced(self.variables, job_syntax_item, CSV_KEY)
    csv_filename = SqlExecutor.csv_name(csv_key_val)
    csv_data = self.csvdata(csv_filename)
    num_lines = len(csv_data)
    # do nothing for empty data set
    if num_lines <= 1:
        getLogger(LOG).info('skip empty csv')
        stub = ['"Id","Success","Created","Error"\n']
        result_ids = parse_batch_res_data(stub)
    else:
        objname = job_syntax_item[endpoint]
        conn = self.endpoints.endpoint(endpoint)
        max_batch_size = int(job_syntax_item[BATCH_SIZE_KEY])
        getLogger(STDOUT).info('EXECUTE: %s %s, lines count=%d',
                               opname, objname, num_lines - 1)
        t_before = time.time()
        if len(csv_data):
            if opname == OP_UPDATE:
                res = conn.bulk_update(objname, csv_data,
                                       max_batch_size, batch_seq)
            elif opname == OP_DELETE:
                res = conn.bulk_delete(objname, csv_data,
                                       max_batch_size, batch_seq)
            elif opname == OP_INSERT:
                res = conn.bulk_insert(objname, csv_data,
                                       max_batch_size, batch_seq)
            else:
                getLogger(STDERR).error("Operation '%s' isn't supported"
                                        % opname)
                exit(1)
            result_ids = parse_batch_res_data(res)
        t_after = time.time()
        getLogger(STDOUT).info('SF %s Took time: %.2f' \
                               % (opname, t_after-t_before))
    if NEW_IDS_TABLE in job_syntax_item:
        results_file_name = \
            SqlExecutor.csv_name(job_syntax_item[NEW_IDS_TABLE])
        with open(results_file_name, 'w') as result_ids_file:
            csv_data = csv_from_bulk_data(result_ids)
            result_ids_file.write(csv_data)
        getLogger(LOG).info('Saved result ids: %s', results_file_name)
    getLogger(LOG).info('Done: %s operation', opname)
def _create_script(self, variables):
    imports = ''
    if CSVLIST_KEY in self.job_syntax_item:
        for table_name in self.job_syntax_item[CSVLIST_KEY]:
            # check if the table name contains vars
            table_name = SqlExecutor.prepare_query_put_vars(
                table_name, self.variables)
            imports += ".import {csv} {name}\n"\
                .format(csv=self.csv_name(table_name), name=table_name)
    output = ''
    if CSV_KEY in self.job_syntax_item:
        table_name = var_replaced(variables, self.job_syntax_item, CSV_KEY)
        output += ".headers on\n"
        output += ".output {csv}\n"\
            .format(csv=self.csv_name(table_name))
        getLogger(LOG).info('working on table=%s', table_name)
    elif VAR_KEY in self.job_syntax_item:
        output += ".headers off\n"
        output += ".output stdout\n"
    elif BATCH_BEGIN_KEY in self.job_syntax_item:
        output += ".headers on\n"
        output += ".output stdout\n"
    getLogger(MOREINFO).info('EXECUTE [CSV]: %s', self.get_query())
    input_data = SQLITE_SCRIPT_FMT.format(imports=imports,
                                          output=output,
                                          query=self.get_query())
    return input_data
def get_query(self):
    if not self.query:
        self.query = SqlExecutor.prepare_query_put_vars(
            self.job_syntax_item[QUERY_KEY], self.variables)
        # get csvlist after variables substitution
        values = JobSyntax.parse_query_params(self.get_query(), {})
        if CSVLIST_KEY in values:
            getLogger(LOG).debug('Fix csvlist: %s', values[CSVLIST_KEY])
            if CSVLIST_KEY in self.job_syntax_item:
                tmp = self.job_syntax_item[CSVLIST_KEY]
                tmp.extend(values[CSVLIST_KEY])
                # filter out duplicate items
                self.job_syntax_item[CSVLIST_KEY] = list(Set(tmp))
            else:
                self.job_syntax_item[CSVLIST_KEY] = values[CSVLIST_KEY]
        if not CONST_KEY in self.job_syntax_item:
            self.query = self.query.replace(CSV_KEY + '.', '')
            self.query = self.query.replace(DST_KEY + '.', '')
            self.query = self.query.replace(SRC_KEY + '.', '')
        # terminate the query with ';' if it is missing
        if self.query and self.query[-1] != ';':
            self.query += ';'
    return self.query
def test_job_controller(mock_docall, m):
    # mock setup
    sf_bulk_connector.JOB_CHECK_TIMER = 0
    mockers.mock_job_controller(mock_docall, m)
    # test itself
    setdatadir(tempfile.mkdtemp())
    # test debug coverage
    mriya.log.INITIALIZED_LOGGERS = {}
    mriya.log.LOGGING_LEVEL = logging.DEBUG
    loginit(__name__)
    print "test_job_controller"
    # prepare test_csv.csv
    test_csv = ['"Alexa__c"', '"hello\n\n2"']
    with open(SqlExecutor.csv_name('test_csv'), "w") as test_csv_f:
        test_csv_f.write('"Alexa__c"\n"hello\n\n2"\n')
    notch = 'test1234567'
    with open('tests/sql/complicated.sql') as sql_f:
        lines = sql_f.readlines()
    job_syntax = JobSyntaxExtended(lines)
    with open(config_filename) as conf_file:
        job_controller = JobController(conf_file.name,
                                       endpoint_names,
                                       job_syntax,
                                       {}, False)
        job_controller.run_job()
        del job_controller
    # check resulted data
    with open(SqlExecutor.csv_name('some_data_staging')) as resulted_file:
        csv_data = get_bulk_data_from_csv_stream(resulted_file)
        name_idx = csv_data.fields.index('Name')
        assert 1 == len(csv_data.rows)
        assert csv_data.rows[0][name_idx] == notch
    with open(SqlExecutor.csv_name('newids')) as newids_file:
        csv_data = get_bulk_data_from_csv_stream(newids_file)
        id_idx = csv_data.fields.index('Id')
        try:
            assert 1 == len(csv_data.rows)
        except:
            print "len(csv_data.rows)", len(csv_data.rows)
            raise
        for row in csv_data.rows:
            assert len(row[id_idx]) >= 15
    assert open(SqlExecutor.csv_name('test_csv')).read() \
        == open(SqlExecutor.csv_name('test_csv_2')).read()
    # run another test using created data
    run_test_graph(datadir(), "tests/sql/complicated.sql")
def handle_result(self, bulk_res):
    # handle empty result - fix it by adding column names
    if bulk_res and bulk_res[0] == EMPTY_SF_RESPONSE:
        cols = SqlExecutor.get_query_columns(self.get_query())
        header = ','.join(cols)
        bulk_res = [header]
    if len(bulk_res) > 1:
        # ignore last empty results
        bulk_res = bulk_res[:-1]
    # handle result
    if CSV_KEY in self.job_syntax_item:
        csv_key_val = var_replaced(self.variables, self.job_syntax_item,
                                   CSV_KEY)
        csvfname = SqlExecutor.csv_name(csv_key_val)
        bulk_data.save_escape_csv_lines_as_csv_file(csvfname, bulk_res)
    elif VAR_KEY in self.job_syntax_item:
        res = bulk_data.parse_batch_res_data(bulk_res)
        if res.rows:
            self.save_var(self.job_syntax_item[VAR_KEY], res.rows[0][0])
def get_query(self):
    if not self.query:
        self.query = SqlExecutor.prepare_query_put_vars(
            self.job_syntax_item[QUERY_KEY], self.variables)
        if not CONST_KEY in self.job_syntax_item:
            self.query = self.query.replace(CSV_KEY + '.', '')
            self.query = self.query.replace(DST_KEY + '.', '')
            self.query = self.query.replace(SRC_KEY + '.', '')
        # get rid of trailing ';' in query
        if self.query and self.query[-1] == ';':
            self.query = self.query[:-1]
    return self.query
def test_columns():
    loginit(__name__)
    query = "SELECT foo, (SELECT 1 WHERE 0=1) as foo2, foo as foo3, a.foo4 FROM "
    res = SqlExecutor.get_query_columns(query)
    try:
        assert (res == ['foo', 'foo2', 'foo3', 'foo4'])
    except:
        print res
        raise
    query = " SELECT Id as id2, \
b.bomba, (SELECT a.Id FROM foo a WHERE a.nope = b.Id) as id__c, \
(SELECT a.Id from foo2 a WHERE a.ggg = b.nnn) as super \
FROM sometable b WHERE ..."
    res = SqlExecutor.get_query_columns(query)
    try:
        assert (res == ['id2', 'bomba', 'id__c', 'super'])
    except:
        print res
        raise
def run_job(self):
    batch_items = None
    batch = None
    for job_syntax_item in self.job_syntax:
        if self.debug_steps:
            print "NEXT SQL:", SqlExecutor.prepare_query_put_vars(
                job_syntax_item['line'], self.variables)
            print "continue execution? y/n"
            if not self.step_by_step():
                exit(0)
        if BATCH_KEY in job_syntax_item:
            self.run_in_loop(job_syntax_item)
        else:
            self.handle_job_item_(job_syntax_item)
def __init__(self, config_filename, endpoint_names, job_syntax,
             variables, debug_steps):
    #loginit(__name__)
    self.config = None
    if config_filename:
        self.config_file = open(config_filename)
        self.config = ConfigParser()
        self.config.read_file(self.config_file)
    self.job_syntax = job_syntax
    self.endpoints = Endpoints(self.config, endpoint_names)
    self.variables = variables
    self.debug_steps = debug_steps
    # create csv file for an internal batch purpose
    ints1000_csv = SqlExecutor.csv_name(INTS_TABLE)
    with open(ints1000_csv, 'w') as ints:
        ints.write('i\n')
        for i in xrange(10001):
            ints.write('%d\n' % i)
        ints.flush()
def add_item_to_graph(item_x, idx, graph_nodes, csvdir, aggregated_csvs):
    edges = []
    # get csv relations
    if CSVLIST_KEY in item_x:
        edges.extend(item_x[CSVLIST_KEY])
    elif OBJNAME_KEY in item_x:
        node_name = item_x[FROM_KEY] + '.' + item_x[OBJNAME_KEY]
        edges.append(node_name)
        graph_nodes[node_name] = GraphNodeData(id=idx, edges=[],
                                               shape=SHAPE_BOX,
                                               color=COLOR_GREEN, style='',
                                               info=EXTERNAL_OBJECT_READ,
                                               href='')
        idx = idx + 1
    # get var relations
    if QUERY_KEY in item_x:
        edges.extend(SqlExecutor.get_query_var_names(item_x[QUERY_KEY]))
    csvhref, nodeinfo = ('', '')
    # var nodes
    if VAR_KEY in item_x:
        node_name = item_x[VAR_KEY]
        color = COLOR_GREEN
        csvhref, nodeinfo = get_href_info(item_x, node_name, csvdir,
                                          aggregated_csvs)
        nodeinfo += 'query: %s' % item_x[LINE_KEY]
        if PUBLISH_KEY in item_x:
            color = COLOR_RED
        if node_name in graph_nodes:
            edges.extend(graph_nodes[node_name].edges)
        graph_nodes[node_name] = GraphNodeData(id=idx,
                                               edges=list(Set(edges)),
                                               shape=SHAPE_ELLIPSE,
                                               style=STYLE_DASHED,
                                               color=color, info=nodeinfo,
                                               href=csvhref)
        print '%s : "%s"\n' % (item_x[VAR_KEY], item_x[LINE_KEY])
        idx = idx + 1
    # csv nodes
    elif CSV_KEY in item_x:
        node_name = item_x[CSV_KEY]
        csvhref, nodeinfo = get_href_info(item_x, node_name, csvdir,
                                          aggregated_csvs)
        nodeinfo += 'query: %s' % item_x[LINE_KEY]
        if node_name in graph_nodes:
            edges.extend(graph_nodes[node_name].edges)
        graph_nodes[node_name] = GraphNodeData(id=idx,
                                               edges=list(Set(edges)),
                                               shape=SHAPE_ELLIPSE,
                                               color='', style='',
                                               info=nodeinfo, href=csvhref)
        print '%s : "%s"\n' % (item_x[CSV_KEY], item_x[LINE_KEY])
        if OP_KEY in item_x:
            idx = idx + 1
            if DST_KEY in item_x:
                node2_name = 'dst:%s:%s' % (item_x[OP_KEY], item_x[DST_KEY])
            elif SRC_KEY in item_x:
                node2_name = 'src:%s:%s' % (item_x[OP_KEY], item_x[SRC_KEY])
            while node2_name in graph_nodes:
                node2_name += '.'
            graph_nodes[node2_name] \
                = GraphNodeData(id=idx, edges=[node_name],
                                shape=SHAPE_BOX, color=COLOR_RED, style='',
                                info=EXTERNAL_OBJECT_WRITE, href='')
            idx = idx + 1
            # add node as result of operation
            node3_name = item_x[NEW_IDS_TABLE]
            csvhref, nodeinfo = get_href_info(item_x, node3_name, csvdir,
                                              aggregated_csvs)
            nodeinfo += EXTERNAL_OBJECT_RESULT
            graph_nodes[node3_name] \
                = GraphNodeData(id=idx, edges=[node2_name],
                                shape=SHAPE_ELLIPSE, color='', style='',
                                info=nodeinfo, href=csvhref)
        idx = idx + 1
    return (idx, graph_nodes)
def test_var_csv(mock_docall, m):
    # mock setup
    sf_bulk_connector.JOB_CHECK_TIMER = 0
    mockers.mock_var_csv(mock_docall, m)
    # test itself
    setdatadir(tempfile.mkdtemp())
    loginit(__name__)
    print "test_var_csv"
    macro_lines = [
        'SELECT i from csv.ints10000 WHERE i>=CAST(10 as INTEGER) LIMIT 2; => batch_begin:i:NESTED',
        'SELECT {NESTED}; => var:foo2',
        '=> batch_end:NESTED',
        "SELECT '{STATIC_VAR}'; => var:static_var"
    ]
    lines = [
        'SELECT 1; => var:one',
        "SELECT 'csv.ints10000'; => var:CSV_INTS10000 => const:",
        "SELECT * FROM {CSV_INTS10000} LIMIT 1; => var:VAR_0",
        'SELECT "9+0"; => var:nine',
        'SELECT Id FROM src.Account LIMIT 1 => csv:sfvar',
        'SELECT * FROM csv.sfvar => var:sfvar',
        'SELECT {one} as f1, {nine}+1 as f2; => csv:one_ten',
        "SELECT 'one_nine_ten' => var:ONENINETEN",
        'SELECT f1, {nine} as f9, (SELECT f2 FROM csv.one_ten) as f10 FROM csv.one_ten; => csv:{ONENINETEN}',
        'SELECT i from csv.ints10000 WHERE i>=2 LIMIT 2; => batch_begin:i:PARAM',
        'SELECT {PARAM}; => var:foo',
        '=> macro:macro_test_batch:STATIC_VAR:something',
        '=> batch_end:PARAM',
        'SELECT {PARAM}; => var:final_test'
    ]
    expected = [
        {'from': 'csv',
         'query': 'SELECT 1;',
         'var': 'one'},
        {'from': 'csv',
         'query': "SELECT 'csv.ints10000';",
         'var': 'CSV_INTS10000',
         'const': ''},
        {'from': 'csv',
         'query': "SELECT * FROM {CSV_INTS10000} LIMIT 1;",
         'var': 'VAR_0'},
        {'from': 'csv',
         'query': 'SELECT "9+0";',
         'var': 'nine'},
        {'objname': 'Account',
         'query': 'SELECT Id FROM Account LIMIT 1',
         'csv': 'sfvar',
         'from': 'src'},
        {'query': 'SELECT * FROM sfvar',
         'var': 'sfvar',
         'csvlist': ['sfvar'],
         'from': 'csv'},
        {'query': 'SELECT {one} as f1, {nine}+1 as f2;',
         'csv': 'one_ten'},
        {'from': 'csv',
         'query': "SELECT 'one_nine_ten'",
         'var': 'ONENINETEN'},
        {'query': 'SELECT f1, {nine} as f9, (SELECT f2 FROM one_ten) as f10 FROM one_ten;',
         'csvlist': ['one_ten'],
         'csv': '{ONENINETEN}',
         'from': 'csv'},
        {'batch': [
            {'line': 'SELECT {PARAM}; => var:foo',
             'query': 'SELECT {PARAM};',
             'var': 'foo',
             'from': 'csv'},
            {'batch': [
                {'line': 'SELECT {NESTED}; => var:foo2',
                 'query': 'SELECT {NESTED};',
                 'var': 'foo2',
                 'from': 'csv'}],
             'batch_begin': ('i', 'NESTED'),
             'csvlist': ['ints10000'],
             'from': 'csv',
             'line': 'SELECT i from csv.ints10000 WHERE i>=CAST(10 as INTEGER) LIMIT 2; => batch_begin:i:NESTED',
             'query': 'SELECT i from ints10000 WHERE i>=CAST(10 as INTEGER) LIMIT 2;'},
            {'line': "SELECT 'something'; => var:static_var",
             'query': "SELECT 'something';",
             'var': 'static_var',
             'from': 'csv'}],
         'batch_begin': ('i', 'PARAM'),
         'csvlist': ['ints10000'],
         'from': 'csv',
         'query': 'SELECT i from ints10000 WHERE i>=2 LIMIT 2;'},
        {'from': 'csv',
         'query': 'SELECT {PARAM};',
         'var': 'final_test'}
    ]
    job_syntax_extended = JobSyntaxExtended(lines,
                                            {'macro_test_batch': macro_lines})
    assert_job_syntax_lines(job_syntax_extended.items(), expected)
    try:
        os.remove(SqlExecutor.csv_name('one_nine_ten'))
    except:
        pass
    with open(config_filename) as conf_file:
        job_controller = JobController(conf_file.name,
                                       endpoint_names,
                                       job_syntax_extended,
                                       {}, False)
        job_controller.run_job()
        res_batch_params = job_controller.variables[BATCH_PARAMS_KEY]
        assert res_batch_params == ['2', '3']
        sfvar = job_controller.variables['sfvar']
        assert len(sfvar) >= 15
        final_param = job_controller.variables['final_test']
        assert final_param == '3'
        del job_controller
    with open(SqlExecutor.csv_name('one_nine_ten')) as resulted_file:
        assert resulted_file.read() == 'f1,f9,f10\n1,9,10\n'