Example #1
# Requires the elasticsearch client: pip install elasticsearch
# Helper functions (show_help, show_version, validate_json_data, c_file_lines,
# worker_import_to_es_for_threading, new_return_start_stop_for_multi_thread_in_list)
# are assumed to be defined elsewhere in the module.
import json
import sys
import threading

from elasticsearch import Elasticsearch


def run():
    """Parse command-line options and run the requested check/import jobs."""

    if len(sys.argv) == 1:
        show_help()
        return
    else:
        # logic set
        process_jobs = []

        for i in range(1, len(sys.argv)):  # skip the script name at argv[0]
            if sys.argv[i].startswith("--"):
                option = sys.argv[i][2:]  # strip the leading "--"

                # show version
                if option == "version":
                    show_version()
                    return

                # show some help
                elif option == "help":
                    show_help()
                    return

                # get the raw data
                elif option == "data":
                    # Add info to jobs
                    process_jobs.append({"data": sys.argv[i + 1]})

                # get the bulk URL
                elif option == "bulk":
                    # Add info to jobs
                    process_jobs.append({"bulk": sys.argv[i + 1]})

                # get the bulk index
                elif option == "index":
                    # Add info to jobs
                    process_jobs.append({"index": sys.argv[i + 1]})

                # get the bulk type
                elif option == "type":
                    # Add info to jobs
                    process_jobs.append({"type": sys.argv[i + 1]})

                # check raw JSON
                elif option == "check":
                    # Add info to jobs
                    process_jobs.append("check")

                # import the data into Elasticsearch
                elif option == "import":
                    # Add info to jobs
                    process_jobs.append("import")

                # add multi-threads support
                elif option == "thread":
                    # Add info to jobs
                    process_jobs.append({"thread_amount": sys.argv[i + 1]})
                    process_jobs.append("thread")

        data = ""
        bulk = ""
        index = ""
        doc_type = ""
        thread_amount = 1
        # Get info from process_jobs
        # Uses `in` rather than dict.has_key(), fixing the syntax error that appeared
        # after upgrading from Python 2.7 to Python 3.6 (thanks to @tdracz)
        for job in process_jobs:
            if isinstance(job, dict):
                if 'data' in job:
                    data = job['data']
                if 'bulk' in job:
                    bulk = job['bulk']
                if 'index' in job:
                    index = job['index']
                if 'type' in job:
                    doc_type = job['type']
                if 'thread_amount' in job:
                    thread_amount = int(job['thread_amount'])

        #### 1) Check only, no import
        if ("check" in process_jobs) and ("import" not in process_jobs):
            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")
            return

        # Process the jobs in process_jobs
        # 2) Import (with or without checking)
        #### 2.1) import, check, no multi-threads
        if ("check" in process_jobs) and ("import" in process_jobs) and (
                "thread" not in process_jobs):

            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")

            es = Elasticsearch([bulk], verify_certs=True)
            # read JSON data
            with open(data, 'r') as f:
                for line in f:
                    es.index(
                        index=index,
                        doc_type=doc_type,
                        #id=2,
                        body=json.loads(line))

            print("Successfully data imported!")
            return

        #### 2.2) import, no check, no multi-threads
        if ("check" not in process_jobs) and ("import" in process_jobs) and (
                "thread" not in process_jobs):
            es = Elasticsearch([bulk], verify_certs=True)
            # read JSON data
            with open(data, 'r') as f:
                for line in f:
                    es.index(
                        index=index,
                        doc_type=doc_type,
                        #id=2,
                        body=json.loads(line))

            print("Successfully data imported!")
            return

        #### 2.3) import, no check, multi-threads
        if ("import" in process_jobs) and ("check" not in process_jobs) and (
                "thread" in process_jobs):

            # check file lines
            lines = c_file_lines(json_file=data)
            # files with fewer than 1024 lines are imported in a single
            # thread, regardless of the requested thread count
            if lines < 1024:
                # if lines < 4:                                            # Only for debugging
                es = Elasticsearch([bulk], verify_certs=True)
                # read JSON data
                with open(data, 'r') as f:
                    for line in f:
                        es.index(
                            index=index,
                            doc_type=doc_type,
                            #id=2,
                            body=json.loads(line))
            else:
                # calculate how many lines each thread reads
                start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(
                    lines=lines, thread_amount=thread_amount)

                threads = []
                for i in start_stop_line_list:
                    t = threading.Thread(
                        target=worker_import_to_es_for_threading,
                        args=(
                            data,
                            i['start'],
                            i['stop'],
                            Elasticsearch([bulk], verify_certs=True),
                            index,
                            doc_type,
                        ))
                    threads.append(t)
                    t.start()  # start all threads first; joining each one here would serialize the import

                # wait for every worker; stop cleanly on Ctrl-C
                try:
                    for t in threads:
                        t.join()
                except KeyboardInterrupt:
                    print("Data import interrupted!")
                    sys.exit(0)

            print("Data imported successfully!")
            return

        #### 2.4) import, check, multi-threads

        if ("import" in process_jobs) and ("check" in process_jobs) and (
                "thread" in process_jobs):

            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")

            # check file lines
            lines = c_file_lines(json_file=data)
            # files with fewer than 1024 lines are imported in a single
            # thread, regardless of the requested thread count
            if lines < 1024:
                # if lines < 4:                                            # Only for debugging
                es = Elasticsearch([bulk], verify_certs=True)
                # read JSON data
                with open(data, 'r') as f:
                    for line in f:
                        es.index(
                            index=index,
                            doc_type=doc_type,
                            #id=2,
                            body=json.loads(line))
                print("Successfully data imported!")
                exit(0)
                return
            else:
                # calculate how many lines each thread reads
                start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(
                    lines=lines, thread_amount=thread_amount)

                threads = []
                for i in start_stop_line_list:
                    t = threading.Thread(
                        target=worker_import_to_es_for_threading,
                        args=(
                            data,
                            i['start'],
                            i['stop'],
                            Elasticsearch([bulk], verify_certs=True),
                            index,
                            doc_type,
                        ))
                    threads.append(t)
                    t.start()  # start all threads first; joining each one here would serialize the import

                # wait for every worker; stop cleanly on Ctrl-C
                try:
                    for t in threads:
                        t.join()
                    print("Data imported successfully!")
                    return
                except KeyboardInterrupt:
                    print("Data import interrupted!")
                    sys.exit(0)

        else:
            show_help()
            return
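
For reference, here is a hypothetical invocation of this script (the file name json2es.py and all option values below are illustrative assumptions, not taken from the example):

    python json2es.py --data dump.json --bulk localhost:9200 --index my_index --type my_type --check --import --thread 4

Note that --check on its own only validates the JSON file, and --thread N is honored only when the file has at least 1024 lines.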
Example #2
# Same imports and helper functions as Example #1.
def run():
    """Parse command-line options and run the requested check/import jobs."""

    if len(sys.argv) == 1:
        show_help()
        return
    else:
        # logic set
        process_jobs = []
        
        for i in range(1, len(sys.argv)):  # skip the script name at argv[0]
            if sys.argv[i].startswith("--"):
                option = sys.argv[i][2:]  # strip the leading "--"

                # show version
                if option == "version":
                    show_version()
                    return

                # show some help
                elif option == "help":
                    show_help()
                    return

                # get the raw data
                elif option == "data":
                    # Add info to jobs
                    process_jobs.append(
                        {"data": sys.argv[i+1]}
                    )

                # get the bulk URL
                elif option == "bulk":
                    # Add info to jobs
                    process_jobs.append(
                        {"bulk": sys.argv[i+1]}
                    ) 
                
                # get the bulk index
                elif option == "index":
                    # Add info to jobs
                    process_jobs.append(
                        {"index": sys.argv[i+1]}
                    ) 
                
                # get the bulk type
                elif option == "type":
                    # Add info to jobs
                    process_jobs.append(
                        {"type": sys.argv[i+1]}
                    ) 
                
                
                # check raw JSON
                elif option == "check":
                    # Add info to jobs
                    process_jobs.append(
                        "check"
                    )

                    
                # import the data into Elasticsearch
                elif option == "import":
                    # Add info to jobs
                    process_jobs.append(
                        "import"
                    )

                # add multi-threads support
                elif option == "thread":
                    # Add info to jobs
                    process_jobs.append(
                        {"thread_amount": sys.argv[i+1]}
                    ) 
                    process_jobs.append(
                        "thread"
                    )
                

        data = ""
        bulk = ""
        index = ""
        doc_type = ""
        thread_amount = 1
        # Get info from process_jobs
        # dict.has_key() was removed in Python 3; use the `in` operator instead
        for job in process_jobs:
            if isinstance(job, dict):
                if 'data' in job:
                    data = job['data']
                if 'bulk' in job:
                    bulk = job['bulk']
                if 'index' in job:
                    index = job['index']
                if 'type' in job:
                    doc_type = job['type']
                if 'thread_amount' in job:
                    thread_amount = int(job['thread_amount'])

        #### 1) Check only, no import
        if ("check" in process_jobs) and ("import" not in process_jobs):
            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")
            return
                
        # Process the jobs in process_jobs
        # 2) Import (with or without checking)
        #### 2.1) import, check, no multi-threads
        if ("check" in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs):

            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")
                
            es = Elasticsearch([bulk], verify_certs=True)
            # read JSON data
            with open(data, 'r') as f:
                for line in f:
                    es.index(index=index, doc_type=doc_type, 
                        #id=2, 
                        body=json.loads(line)
                    )
            
            print("Successfully data imported!")
            return


        #### 2.2) import, no check, no multi-threads
        if ("check" not in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs):
            es = Elasticsearch([bulk], verify_certs=True)
            # read JSON data
            with open(data, 'r') as f:
                for line in f:
                    es.index(index=index, doc_type=doc_type, 
                        #id=2, 
                        body=json.loads(line)
                    )
            
            print("Successfully data imported!")
            return


        #### 2.3) import, no check, multi-threads
        if ("import" in process_jobs) and ("check" not in process_jobs) and ("thread" in process_jobs):

            # check file lines
            lines = c_file_lines(json_file=data)
            # files with fewer than 1024 lines are imported in a single
            # thread, regardless of the requested thread count
            if lines < 1024:
                # if lines < 4:                                            # Only for debugging
                es = Elasticsearch([bulk], verify_certs=True)
                # read JSON data
                with open(data, 'r') as f:
                    for line in f:
                        es.index(index=index, doc_type=doc_type, 
                            #id=2, 
                            body=json.loads(line)
                        )
            else:
                # calculate how many lines each thread reads
                start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount)

                threads = []
                for i in start_stop_line_list:
                    t = threading.Thread(target=worker_import_to_es_for_threading,
                                         args=(data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type))
                    threads.append(t)
                    t.start()  # start all threads first; joining each one here would serialize the import

                # wait for every worker; stop cleanly on Ctrl-C
                try:
                    for t in threads:
                        t.join()
                except KeyboardInterrupt:
                    print("Data import interrupted!")
                    sys.exit(0)

            print("Data imported successfully!")
            return
 
 
        #### 2.4) import, check, multi-threads
        if ("import" in process_jobs) and ("check" in process_jobs) and ("thread" in process_jobs):

            # check JSON
            flag = validate_json_data(json_file=data)
            if flag:
                print("All raw JSON data is valid!")

            # check file lines
            lines = c_file_lines(json_file=data)
            # files with fewer than 1024 lines are imported in a single
            # thread, regardless of the requested thread count
            if lines < 1024:
                # if lines < 4:                                            # Only for debugging
                es = Elasticsearch([bulk], verify_certs=True)
                # read JSON data
                with open(data, 'r') as f:
                    for line in f:
                        es.index(index=index, doc_type=doc_type, 
                            #id=2, 
                            body=json.loads(line)
                        )
                print("Successfully data imported!")
                exit(0)
                return
            else:
                # calculate how many lines each thread reads
                start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount)

                threads = []
                for i in start_stop_line_list:
                    t = threading.Thread(target=worker_import_to_es_for_threading,
                                         args=(data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type))
                    threads.append(t)
                    t.start()  # start all threads first; joining each one here would serialize the import

                # wait for every worker; stop cleanly on Ctrl-C
                try:
                    for t in threads:
                        t.join()
                    print("Data imported successfully!")
                    return
                except KeyboardInterrupt:
                    print("Data import interrupted!")
                    sys.exit(0)

 
        else:
            show_help()
            return
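
Both examples delegate the per-thread line ranges to new_return_start_stop_for_multi_thread_in_list, whose implementation is not shown. Here is a minimal sketch, assuming it splits `lines` into `thread_amount` contiguous ranges and lets the last range absorb the remainder (the dict keys 'start' and 'stop' match how the workers consume them above):

    def new_return_start_stop_for_multi_thread_in_list(lines=0, thread_amount=1):
        # Hypothetical sketch: each dict marks a contiguous [start, stop) slice
        # of line numbers; the last slice takes any leftover lines.
        chunk = lines // thread_amount
        ranges = []
        for n in range(thread_amount):
            stop = lines if n == thread_amount - 1 else (n + 1) * chunk
            ranges.append({"start": n * chunk, "stop": stop})
        return ranges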
Example #3
def count_file_lines(json_file=""):
    # delegate to c_file_lines, which does the actual line counting
    return c_file_lines(json_file=json_file)
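
c_file_lines itself is not shown in these examples; the longer ones use its result to decide between the single-threaded and multi-threaded import paths. A minimal sketch, assuming it simply counts the newline-delimited records in the file:

    def c_file_lines(json_file=""):
        # Hypothetical sketch: one JSON document per line, so the record
        # count is just the number of lines in the file.
        with open(json_file, 'r') as f:
            return sum(1 for _ in f)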