def run(): """ """ if len(sys.argv) == 1: show_help() return else: # logic set process_jobs = [] for i in range(len(sys.argv[0:])): if sys.argv[i].startswith("--"): try: option = sys.argv[i][2:] except: show_help() return # show version if option == "version": show_version() return # show some help elif option == "help": show_help() return # get the raw data elif option == "data": # Add info to jobs process_jobs.append({"data": sys.argv[i + 1]}) # get the bulk URL elif option == "bulk": # Add info to jobs process_jobs.append({"bulk": sys.argv[i + 1]}) # get the bulk index elif option == "index": # Add info to jobs process_jobs.append({"index": sys.argv[i + 1]}) # get the bulk type elif option == "type": # Add info to jobs process_jobs.append({"type": sys.argv[i + 1]}) # check raw JSON elif option == "check": # Add info to jobs process_jobs.append("check") # check if bulk API is valid elif option == "import": # Add info to jobs process_jobs.append("import") # add multi-threads support elif option == "thread": # Add info to jobs process_jobs.append({"thread_amount": sys.argv[i + 1]}) process_jobs.append("thread") data = "" bulk = "" index = "" doc_type = "" thread_amount = 1 # Get info from process_jobs # fix the syntax bug after upgrading support from Python2.7 to Python3.6, thanks to @tdracz for job in process_jobs: if type(job) == dict: if 'data' in job: data = job['data'] if 'bulk' in job: bulk = job['bulk'] if 'index' in job: index = job['index'] if 'type' in job: doc_type = job['type'] if 'thread' in job: thread_amount = int(job['thread_amount']) #### 1) Only check not importing if ("check" in process_jobs) and ("import" not in process_jobs): # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") return # Process the jobs in process_jobs # 2) Only import without checking #### 2.1) import, check , no multi-threads if ("check" in process_jobs) and ("import" in process_jobs) and ( "thread" not in process_jobs): # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index( index=index, doc_type=doc_type, #id=2, body=json.loads(line)) print("Successfully data imported!") return #### 2.2) import, no check, no multi-threads if ("check" not in process_jobs) and ("import" in process_jobs) and ( "thread" not in process_jobs): es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index( index=index, doc_type=doc_type, #id=2, body=json.loads(line)) print("Successfully data imported!") return #### 2.3) import, no check, multi-threads if ("import" in process_jobs) and ("check" not in process_jobs) and ( "thread" in process_jobs): # check file lines lines = c_file_lines(json_file=data) # if lines < 1024, it will only use 1 thread to finish this job, no matter how many you want if lines < 1024: #if lines < 4: # Only for debugging es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index( index=index, doc_type=doc_type, #id=2, body=json.loads(line)) else: # calculate each thread reads how many lines start_stop_line_list = new_return_start_stop_for_multi_thread_in_list( lines=lines, thread_amount=thread_amount) threads = [] for i in start_stop_line_list: #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) t = threading.Thread( 
target=worker_import_to_es_for_threading, args=( data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type, )) threads.append(t) t.start() t.join() # stop all threads if interrupts try: while len(threading.enumerate()) > 1: pass print("Successfully data imported!") return except KeyboardInterrupt: # for i in threads: # i.stop() print("Data importing interrupted!") exit(0) return print("Successfully data imported!") return #### 2.4) import, check, multi-threads if ("import" in process_jobs) and ("check" in process_jobs) and ( "thread" in process_jobs): # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") # check file lines lines = c_file_lines(json_file=data) # if lines < 1024, it will only use 1 thread to finish this job, no matter how many you want if lines < 1024: #if lines < 4: # Only for debugging es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index( index=index, doc_type=doc_type, #id=2, body=json.loads(line)) print("Successfully data imported!") exit(0) return else: # calculate each thread reads how many lines start_stop_line_list = new_return_start_stop_for_multi_thread_in_list( lines=lines, thread_amount=thread_amount) threads = [] for i in start_stop_line_list: #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) t = threading.Thread( target=worker_import_to_es_for_threading, args=( data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type, )) threads.append(t) t.start() t.join() # stop all threads if interrupts try: # there is at least one main threading for all threadings while len(threading.enumerate()) > 1: pass print("Successfully data imported!") exit(0) return except KeyboardInterrupt: print(len(threading.enumerate())) # for i in threads: # i.stop() print("Data importing interrupted!") exit(0) return else: show_help() return
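# `worker_import_to_es_for_threading` is referenced above but defined
# elsewhere in this project. The function below is a hypothetical sketch,
# inferred only from the call sites in run(): it assumes `start` and `stop`
# are 0-based line offsets into the data file and that the worker indexes
# one JSON document per line. The real worker may differ.
def sketch_worker_import_to_es(data, start, stop, es, index, doc_type):
    """Hypothetical worker: index lines [start, stop) of `data` into `es`."""
    with open(data, 'r') as f:
        for line_number, line in enumerate(f):
            if line_number >= stop:
                break
            if line_number >= start:
                es.index(index=index,
                         doc_type=doc_type,
                         body=json.loads(line))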
def run(): """ """ if len(sys.argv) == 1: show_help() return else: # logic set process_jobs = [] for i in range(len(sys.argv[0:])): if sys.argv[i].startswith("--"): try: option = sys.argv[i][2:] except: show_help() return # show version if option == "version": show_version() return # show some help elif option == "help": show_help() return # get the raw data elif option == "data": # Add info to jobs process_jobs.append( {"data": sys.argv[i+1]} ) # get the bulk URL elif option == "bulk": # Add info to jobs process_jobs.append( {"bulk": sys.argv[i+1]} ) # get the bulk index elif option == "index": # Add info to jobs process_jobs.append( {"index": sys.argv[i+1]} ) # get the bulk type elif option == "type": # Add info to jobs process_jobs.append( {"type": sys.argv[i+1]} ) # check raw JSON elif option == "check": # Add info to jobs process_jobs.append( "check" ) # check if bulk API is valid elif option == "import": # Add info to jobs process_jobs.append( "import" ) # add multi-threads support elif option == "thread": # Add info to jobs process_jobs.append( {"thread_amount": sys.argv[i+1]} ) process_jobs.append( "thread" ) data = "" bulk = "" index = "" doc_type = "" thread_amount = 1 # Get info from process_jobs for job in process_jobs: if type(job) == dict: if job.has_key('data'): data = job['data'] if job.has_key('bulk'): bulk = job['bulk'] if job.has_key('index'): index = job['index'] if job.has_key('type'): doc_type = job['type'] if job.has_key('thread'): thread_amount = int(job['thread_amount']) #### 1) Only check not importing if ("check" in process_jobs) and ("import" not in process_jobs) : # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") return # Process the jobs in process_jobs # 2) Only import without checking #### 2.1) import, check , no multi-threads if ("check" in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs): # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index(index=index, doc_type=doc_type, #id=2, body=json.loads(line) ) print("Successfully data imported!") return #### 2.2) import, no check, no multi-threads if ("check" not in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs): es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index(index=index, doc_type=doc_type, #id=2, body=json.loads(line) ) print("Successfully data imported!") return #### 2.3) import, no check, multi-threads if ("import" in process_jobs) and ("check" not in process_jobs) and ("thread" in process_jobs): # check file lines lines = c_file_lines(json_file=data) # if lines < 1024, it will only use 1 thread to finish this job, no matter how many you want if lines < 1024: #if lines < 4: # Only for debugging es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index(index=index, doc_type=doc_type, #id=2, body=json.loads(line) ) else: # calculate each thread reads how many lines start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount) threads = [] for i in start_stop_line_list: #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) t = threading.Thread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'], 
Elasticsearch([bulk], verify_certs=True), index, doc_type, ) ) threads.append(t) t.start() t.join() # stop all threads if interrupts try: while len(threading.enumerate()) > 1: pass print("Successfully data imported!") return except KeyboardInterrupt: # for i in threads: # i.stop() print("Data importing interrupted!") exit(0) return print("Successfully data imported!") return #### 2.4) import, check, multi-threads if ("import" in process_jobs) and ("check" in process_jobs) and ("thread" in process_jobs): # check JSON flag = validate_json_data(json_file=data) if flag == True: print("All raw JSON data valid!") # check file lines lines = c_file_lines(json_file=data) # if lines < 1024, it will only use 1 thread to finish this job, no matter how many you want if lines < 1024: #if lines < 4: # Only for debugging es = Elasticsearch([bulk], verify_certs=True) # read JSON data with open(data, 'r') as f: for line in f: es.index(index=index, doc_type=doc_type, #id=2, body=json.loads(line) ) print("Successfully data imported!") exit(0) return else: # calculate each thread reads how many lines start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount) threads = [] for i in start_stop_line_list: #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) t = threading.Thread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type, ) ) threads.append(t) t.start() t.join() # stop all threads if interrupts try: # there is at least one main threading for all threadings while len(threading.enumerate()) > 1: pass print("Successfully data imported!") exit(0) return except KeyboardInterrupt: print(len(threading.enumerate())) # for i in threads: # i.stop() print("Data importing interrupted!") exit(0) return else: show_help() return
def count_file_lines(json_file=""):
    # fix: the parameter was named `json_files` while the body referenced
    # `json_file`, which raised a NameError on every call
    return c_file_lines(json_file=json_file)
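# `c_file_lines` and `new_return_start_stop_for_multi_thread_in_list` are
# likewise defined elsewhere. As an illustration only, a minimal sketch of
# the splitting the threaded paths rely on: divide `lines` into
# `thread_amount` contiguous {'start': ..., 'stop': ...} ranges, with the
# last range absorbing the remainder. The real helper may split differently.
def sketch_start_stop_ranges(lines, thread_amount):
    """Hypothetical splitter: e.g. (10, 3) -> [0, 3), [3, 6), [6, 10)."""
    chunk = lines // thread_amount
    ranges = []
    for n in range(thread_amount):
        start = n * chunk
        # the final range runs to the end of the file
        stop = lines if n == thread_amount - 1 else start + chunk
        ranges.append({'start': start, 'stop': stop})
    return ranges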