def backup_field(self):
     """Back up this project's field data to a dated pickle file.

     In 'development' the method re-reads a local backup fixture and re-dumps
     it under test_data/; otherwise it pulls documents from MongoDB and
     pickles them under the project backup directory.  Always returns the
     run report (status_code PASSED/FAILED, description on failure).
     """
     report = report_func.initial_report('backup_field', self.project_id, self.schedule_type,
                                         self.schedule_date_key, self.schedule_time_key)
     start_run = time.time()
     try:
         if self.environment == 'development':
             # BUGFIX: the handle from pickle.load(open(...)) was never
             # closed; use context managers so both files are released.
             with open('dags/data_warehouse_prod/backup/' + self.project_backup_dir
                       + self.project_field_dir + self.project_id + self.backup_file_type, 'rb') as source:
                 data_objects = list(pickle.load(source))
             with open('dags/data_warehouse_prod/backup/test_data/field_'
                       + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb') as handle:
                 pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
         else:
             # NOTE(review): this branch queries the *performance* collection
             # and writes into the performance backup dir even though the
             # method is named backup_field — looks copy/pasted from a
             # performance backup; confirm intended before relying on it.
             client = MongoClient(self.uri)
             data_query = client[self.database_name][self.performance_collection_name].find(self.query_field)
             data_objects = list(data_query)
             client.close()
             with open(self.backup_dir + self.project_backup_dir + self.project_performance_dir
                       + str(self.start.strftime("%Y-%m-%d")) + self.backup_file_type, 'wb') as handle:
                 pickle.dump(data_objects, handle, protocol=pickle.HIGHEST_PROTOCOL)
         report.status_code = 'PASSED'
     except Exception as e:
         report.status_code = 'FAILED'
         report.description = str(e)
     finally:
         report.total_time_run_second = time.time() - start_run
         self.reports.append(report)
         return report
 def clean(self):
     """Delete yesterday's backup files (docs, trans, performance, field).

     Only runs for the 'development' and 'production' environments.  Paths
     that do not exist are listed in report.description instead of raising.
     Always returns the run report.
     """
     report = report_func.initial_report('clean', self.project_id, self.schedule_type,
                                         self.schedule_date_key, self.schedule_time_key)
     start_run = time.time()
     try:
         if self.environment in ('development', 'production'):
             yesterday = self.start - timedelta(days=1)
             file_name = str(yesterday.strftime("%Y-%m-%d"))
             list_path = [
                 self.backup_dir + self.project_backup_dir + self.project_docs_dir + file_name + self.backup_file_type,
                 self.backup_dir + self.project_backup_dir + self.project_trans_dir + file_name + self.backup_file_type,
                 self.backup_dir + self.project_backup_dir + self.project_performance_dir + file_name + self.backup_file_type,
                 self.backup_dir + self.project_backup_dir + self.project_field_dir + file_name + self.backup_file_type,
             ]
             missing = []
             for path in list_path:
                 if os.path.exists(path):
                     os.remove(path)
                 else:
                     missing.append("The {path} does not exist".format(path=path))
             # BUGFIX: messages used to be concatenated with a stray leading ", ".
             report.description = ', '.join(missing)
         report.status_code = 'PASSED'
     except Exception as e:
         report.status_code = 'FAILED'
         report.description = str(e)
     finally:
         report.total_time_run_second = time.time() - start_run
         self.reports.append(report)
         return report
# --- Example #3 ---
    def fact_document(self):
        """Build FactDocumentModel rows from the project's docs/trans data.

        Joins each transaction to its document's created_date, shifts both
        timestamps to UTC+7, and assigns sequential document_key values
        continuing from the current max in the warehouse table.  Persistence
        (self.db.create) is currently disabled upstream.  Always returns the
        run report.
        """
        report = report_func.initial_report('fact_document', self.project_id,
                                            self.schedule_type,
                                            self.schedule_date_key,
                                            self.schedule_time_key)
        start_run = time.time()
        try:
            datas = []
            data_docs, data_trans = self.get_docs_and_trans()
            list_created = [{'doc_id': func.bson_object_to_string(data['_id']),
                             'created_date': data['created_date']}
                            for data in data_docs]
            _id = self.db.get_max_id_table(schema=self.schema,
                                           table=self.fact_document_table,
                                           col='document_key')
            # Continue numbering after the existing max key (or start at 1).
            _id = 1 if _id is None else _id + 1
            for data in data_trans:
                if len(data['records']) == 0:
                    continue
                # All warehouse timestamps/keys are stored in UTC+7.
                created_date_utc_7 = func.created_date_of_docs_by_id(
                    func.bson_object_to_string(data['doc_id']),
                    list_created) + datetime.timedelta(hours=7)
                last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours=7)
                import_date_key_utc_7, import_time_key_utc_7 = func.handle_date_to_date_and_time_id(
                    created_date_utc_7)
                export_date_key_utc_7, export_time_key_utc_7 = func.handle_date_to_date_and_time_id(
                    last_modified_utc_7)
                document_id = func.bson_object_to_string(data['doc_id'])
                # BUGFIX: a trailing comma here used to turn doc_set_id into a
                # one-element tuple instead of a string.
                doc_set_id = func.bson_object_to_string(data['doc_set_id'])
                _obj = FactDocumentModel(
                    document_key=_id,
                    ori_document_id=func.bson_object_to_string(data['_id']),
                    project_id=self.project_id,
                    document_id=document_id,
                    doc_set_id=doc_set_id,
                    remark_code=None,
                    remark_description=None,
                    import_date_key=import_date_key_utc_7,
                    import_time_key=import_time_key_utc_7,
                    export_date_key=export_date_key_utc_7,
                    export_time_key=export_time_key_utc_7,
                    import_timestamp=created_date_utc_7,
                    export_timestamp=last_modified_utc_7,
                )
                self.document_key_checks.append({
                    'document_key': _id,
                    'document_id': document_id,
                    'doc_set_id': doc_set_id
                })
                datas.append(_obj)
                _id += 1
            # NOTE(review): persistence intentionally disabled upstream —
            # re-enable once the table load is verified.
            # self.db.create([item.__dict__ for item in datas], self.schema, self.fact_document_table)
            report.status_code = 'PASSED'
        except Exception as e:
            report.status_code = 'FAILED'
            report.description = str(e)
        finally:
            report.total_time_run_second = time.time() - start_run
            self.reports.append(report)
            return report
# --- Example #4 ---
 def dim_field(self):
     """Refresh the dim_field dimension table from the project's field list.

     Builds one DimFieldModel per field returned by self.get_field() and
     upserts them via self.db.update.  The counted_character window values
     are hard-coded test data.  Always returns the run report.
     """
     report = report_func.initial_report('dim_field', self.project_id,
                                         self.schedule_type,
                                         self.schedule_date_key,
                                         self.schedule_time_key)
     start_run = time.time()
     try:
         check_date = self.start
         # NOTE(review): this chained comparison evaluates as
         # (check_date == func.check_dim_field_run(self.start)) and (... == True),
         # i.e. it compares a date-like value against the helper's return and
         # that return against True — likely intended as
         # `func.check_dim_field_run(self.start) == True`; confirm before
         # relying on the non-development path.
         if check_date == func.check_dim_field_run(
                 self.start) == True or self.environment == 'development':
             data_fields = self.get_field()
             results = []
             for data in data_fields:
                 _obj = DimFieldModel(
                     field_key=func.bson_object_to_string(data['_id']),
                     project_id=self.project_id,
                     name=data['name'],
                     control_type=data['control_type'],
                     default_value=data['default_value'],
                     counted_character=data['counted_character'],
                     counted_character_date_from_key=
                     20210101,  # hard-coded test value — needs change
                     counted_character_time_from_key=
                     0,  # hard-coded test value — needs change
                     counted_character_date_to_key=
                     20210131,  # hard-coded test value — needs change
                     counted_character_time_to_key=
                     235959,  # hard-coded test value — needs change
                     counted_character_from_timestamp=
                     '2021-01-01 00:00:00',  # hard-coded test value — needs change
                     counted_character_to_timestamp=
                     '2021-01-31 23:59:59',  # hard-coded test value — needs change
                     is_sub_field=False,
                 )
                 results.append(_obj)
             report.status_code = 'PASSED'
             self.db.update([item.__dict__ for item in results],
                            self.schema, self.dim_field_table)
         else:
             # Nothing to refresh today; still report success.
             report.status_code = 'PASSED'
     except Exception as e:
         report.status_code = 'FAILED'
         report.description = str(e)
     finally:
         report.total_time_run_second = time.time() - start_run
         self.reports.append(report)
         return report
 def fact_performance(self):
     """Build FactPerformanceModel rows from raw performance records.

     Timestamps are shifted to UTC+7 before deriving date/time keys; rows
     are written via self.db.update with keys numbered from 1.  Always
     returns the run report.
     """
     report = report_func.initial_report('fact_performance', self.project_id, self.schedule_type,
                                         self.schedule_date_key, self.schedule_time_key)
     start_run = time.time()
     try:
         datas = []
         data_performance = self.get_performance()
         _id = 1
         for performance in data_performance:
             # FIX: process_key used to be computed here and then redundantly
             # recomputed inline in the model call — compute once and reuse.
             process_key = func.get_process_key_performance_gda(performance['type'],
                                                                performance['task_def_key'])
             captured_date_timestamp_utc_7 = performance['time'] + datetime.timedelta(hours=7)
             # document_key is not resolved in this variant (see the max-id
             # variant of fact_performance for the lookup).
             document_key = None
             obj_ = FactPerformanceModel(
                     performance_key=_id,
                     ori_performance_id=func.bson_object_to_string(performance['_id']),
                     document_key=document_key,
                     project_id=self.project_id,
                     group_id=performance['group_id'],
                     document_id=performance['doc_id'],
                     reworked=func.int_to_bool(performance['rework_count']),
                     work_type_key=func.get_working_type_id_by_name(performance['work_type']),
                     process_key=process_key,
                     number_of_record=performance['records'],
                     number_of_item=performance['items'],
                     number_of_field=performance['fields'],
                     number_of_character=performance['chars'],
                     user_name=performance['username'],
                     ip=performance['ip'],
                     captured_date_timestamp=captured_date_timestamp_utc_7,
                     captured_date_key=func.time_to_date_key(captured_date_timestamp_utc_7),
                     captured_time_key=func.time_to_time_key(captured_date_timestamp_utc_7),
                     total_time_second=performance['total_time']
             )
             datas.append(obj_)
             _id += 1
         self.db.update([item.__dict__ for item in datas], self.schema, self.fact_performance_table)
         report.status_code = 'PASSED'
     except Exception as e:
         report.status_code = 'FAILED'
         report.description = str(e)
     finally:
         report.total_time_run_second = time.time() - start_run
         self.reports.append(report)
         return report
 def check_connect(self):
     """Verify MongoDB connectivity (skipped in the development environment).

     server_info() raises if the server is unreachable within
     self.maxSevSelDelay milliseconds, which marks the report FAILED.
     Always returns the run report.
     """
     report = report_func.initial_report('check_connect', self.project_id, self.schedule_type,
                                         self.schedule_date_key, self.schedule_time_key)
     started_at = time.time()
     try:
         if self.environment != 'development':
             # Ping the server; an unreachable host raises inside server_info().
             connection = MongoClient(self.uri, serverSelectionTimeoutMS=self.maxSevSelDelay)
             connection.server_info()
             connection.close()
         report.status_code = 'PASSED'
     except Exception as exc:
         report.status_code = 'FAILED'
         report.description = str(exc)
     finally:
         report.total_time_run_second = time.time() - started_at
         self.reports.append(report)
         return report
    def fact_data_extract(self):
        """Flatten extracted field values into FactDataExtractionModel rows.

        Walks every document record and, per data kind (keyed_data,
        final_data, qc_ed_data), emits one row per non-ignored field; then
        does the same for transaction records (process_key 11).  Timestamps
        are shifted to UTC+7.  Rows are written via self.db.update.  Always
        returns the run report.
        """
        report = report_func.initial_report('fact_data_extraction', self.project_id, self.schedule_type,
                                            self.schedule_date_key, self.schedule_time_key)
        start_run = time.time()
        try:
            data_docs, data_trans = self.get_docs_and_trans()
            col_ignores = ['ImagePath']
            trans_ignore = ['doc_id', 'doc_uri', 'fileName', 'fileName_Bad', 'filter_control', 'getBatchName',
                            'keyer', 'keyer_proof', 'keyer_type', 'FolderOutput', 'Image']
            results = []
            for data in data_docs:
                records = data['records']
                document_id = func.bson_object_to_string(data['_id'])
                document_key = None
                doc_set_id = func.bson_object_to_string(data['doc_set_id'])
                for i in range(len(records)):
                    record_id = i + 1
                    record = records[i]
                    for key, value in record.items():
                        if key == 'keyed_data':
                            for keyed_data in value:
                                source = keyed_data['source']
                                task_def_key = keyed_data['task_def_key']
                                data_needed = keyed_data['data'][0].items()
                                last_modified_utc_7 = keyed_data['createdtime'] + datetime.timedelta(hours=7)
                                user_name = keyed_data['keyer']
                                performance_key = None
                                # BUGFIX: these branches were two separate
                                # if-chains, so an unmatched source/task
                                # combination silently reused the previous
                                # entry's process_key (or raised NameError on
                                # the first one).  The conditions are mutually
                                # exclusive, so a single chain with an explicit
                                # None default is behavior-safe.
                                process_key = None
                                if source != 'queue_transform' and task_def_key.startswith('Type'):
                                    process_key = 3   # human input keyed_data (kpi)
                                elif source != 'queue_transform' and task_def_key == 'Verify_Hold_Type':
                                    process_key = 12  # human check bad_image keyed_data (not kpi)
                                elif source == 'queue_transform' and task_def_key.startswith('Type'):
                                    process_key = 4   # machine save input keyed_data
                                elif source != 'queue_transform' and task_def_key.startswith('Proof'):
                                    process_key = 5   # human qc input keyed_data (kpi)
                                elif source == 'queue_transform' and task_def_key.startswith('Proof'):
                                    process_key = 6   # machine save qc keyed_data
                                for field_name, field_value_dict in data_needed:
                                    if field_name in col_ignores:
                                        continue
                                    results.append(FactDataExtractionModel(
                                        document_key=document_key,
                                        performance_key=performance_key,
                                        ori_document_id=document_id,
                                        project_id=self.project_id,
                                        document_id=document_id,
                                        doc_set_id=doc_set_id,
                                        record_id=record_id,
                                        last_modified_date_key=func.time_to_date_key(last_modified_utc_7),
                                        last_modified_time_key=func.time_to_time_key(last_modified_utc_7),
                                        last_modified_timestamp=last_modified_utc_7,
                                        user_name=user_name,
                                        process_key=process_key,
                                        field_name=field_name,
                                        field_value=field_value_dict['text']
                                    ))

                        elif key == 'final_data':
                            final_data = value[0]
                            data_needed = final_data['data'][0].items()
                            last_modified_utc_7 = final_data['createdtime'] + datetime.timedelta(hours=7)
                            user_name = final_data['keyer']
                            process_key = 10  # final approved data
                            for field_name, field_value_dict in data_needed:
                                if field_name in col_ignores:
                                    continue
                                results.append(FactDataExtractionModel(
                                    document_key=document_key,
                                    performance_key=None,
                                    ori_document_id=document_id,
                                    project_id=self.project_id,
                                    document_id=document_id,
                                    doc_set_id=doc_set_id,
                                    record_id=record_id,
                                    last_modified_date_key=func.time_to_date_key(last_modified_utc_7),
                                    last_modified_time_key=func.time_to_time_key(last_modified_utc_7),
                                    last_modified_timestamp=last_modified_utc_7,
                                    user_name=user_name,
                                    process_key=process_key,
                                    field_name=field_name,
                                    field_value=field_value_dict['text']
                                ))

                        elif key == 'qc_ed_data':
                            qc_ed_data = value[0][0]
                            # Only records flagged with qc field errors produce rows.
                            if 'qc_fields_err' not in qc_ed_data.keys():
                                continue
                            qc_ed_data_err = qc_ed_data['qc_fields_err']
                            data_needed = qc_ed_data_err[0].items()
                            last_modified_utc_7 = qc_ed_data['createdtime'] + datetime.timedelta(hours=7)
                            user_name = qc_ed_data['keyer']
                            process_key = 8  # human qc error correction
                            performance_key = None
                            for field_name, field_value_dict in data_needed:
                                if field_name in col_ignores:
                                    continue
                                results.append(FactDataExtractionModel(
                                    document_key=document_key,
                                    performance_key=performance_key,
                                    ori_document_id=document_id,
                                    project_id=self.project_id,
                                    document_id=document_id,
                                    doc_set_id=doc_set_id,
                                    record_id=record_id,
                                    last_modified_date_key=func.time_to_date_key(last_modified_utc_7),
                                    last_modified_time_key=func.time_to_time_key(last_modified_utc_7),
                                    last_modified_timestamp=last_modified_utc_7,
                                    user_name=user_name,
                                    process_key=process_key,
                                    field_name=field_name,
                                    field_value=field_value_dict['text']
                                ))

                        elif key == 'apr_ed_data':
                            # FIX: removed a leftover profanity debug print here;
                            # the report description already records the gap.
                            report.description = 'Not handle aprove qc data because not have sample data'

            for data in data_trans:
                document_id = func.bson_object_to_string(data['doc_id'])
                document_key = None
                ori_document_id = func.bson_object_to_string(data['_id'])
                doc_set_id = func.bson_object_to_string(data['doc_set_id'])
                performance_key = None
                records = data['records']
                last_modified_utc_7 = data['last_modified'] + datetime.timedelta(hours=7)
                for i in range(len(records)):
                    record = records[i]
                    record_id = i + 1
                    for field_name, field_value in record.items():
                        if field_name in trans_ignore:
                            continue
                        results.append(FactDataExtractionModel(
                            document_key=document_key,
                            performance_key=performance_key,
                            ori_document_id=ori_document_id,
                            project_id=self.project_id,
                            document_id=document_id,
                            doc_set_id=doc_set_id,
                            record_id=record_id,
                            last_modified_date_key=func.time_to_date_key(last_modified_utc_7),
                            last_modified_time_key=func.time_to_time_key(last_modified_utc_7),
                            last_modified_timestamp=last_modified_utc_7,
                            user_name=None,
                            process_key=11,  # raw transaction data
                            field_name=field_name,
                            field_value=field_value
                        ))
            self.db.update([item.__dict__ for item in results], self.schema, self.fact_data_extraction_table)
            report.status_code = 'PASSED'
        except Exception as e:
            report.status_code = 'FAILED'
            report.description = str(e)
        finally:
            report.total_time_run_second = time.time() - start_run
            self.reports.append(report)
            return report
# --- Example #8 ---
    def fact_performance(self):
        """Build and persist FactPerformanceModel rows (max-id variant).

        Resolves each row's document_key via get_document_key_by_document_id,
        continues performance_key numbering from the warehouse max, records
        key-check entries in self.performance_key_checks, and creates the
        rows via self.db.create.  The counters and prints are temporary
        debug instrumentation.  Always returns the run report.
        """
        report = report_func.initial_report('fact_performance', self.project_id, self.schedule_type,
                                            self.schedule_date_key, self.schedule_time_key)
        start_run = time.time()
        try:
            datas = []
            data_performance = self.get_performance()
            _id = self.db.get_max_id_table(schema=self.schema,
                                           table=self.fact_performance_table,
                                           col='performance_key')
            # Continue numbering after the existing max key (or start at 1).
            _id = 1 if _id is None else _id + 1
            # Debug counters for the three human process kinds (3/5/8).
            test_3 = 0
            test_5 = 0
            test_8 = 0
            types = []
            tasks = []
            for performance in data_performance:
                process_key = func.get_process_key_performance_gda(
                    performance['type'], performance['task_def_key'])
                if performance['type'] not in types:
                    types.append(performance['type'])
                # BUGFIX: unseen task_def_keys were appended to `types`,
                # leaving `tasks` permanently empty in the debug output.
                if performance['task_def_key'] not in tasks:
                    tasks.append(performance['task_def_key'])
                if process_key == 3:
                    test_3 += 1
                if process_key == 5:
                    test_5 += 1
                if process_key == 8:
                    test_8 += 1
                # Warehouse timestamps/keys are stored in UTC+7.
                captured_date_timestamp_utc_7 = performance['time'] + datetime.timedelta(hours=7)
                document_key = self.get_document_key_by_document_id(performance['doc_id'])
                obj_ = FactPerformanceModel(
                    performance_key=_id,
                    ori_performance_id=func.bson_object_to_string(performance['_id']),
                    document_key=document_key,
                    project_id=self.project_id,
                    group_id=performance['group_id'],
                    document_id=performance['doc_id'],
                    reworked=func.int_to_bool(performance['rework_count']),
                    work_type_key=func.get_working_type_id_by_name(performance['work_type']),
                    process_key=process_key,
                    number_of_record=performance['records'],
                    number_of_item=performance['items'],
                    number_of_field=performance['fields'],
                    number_of_character=performance['chars'],
                    user_name=performance['username'],
                    ip=performance['ip'],
                    captured_date_timestamp=captured_date_timestamp_utc_7,
                    captured_date_key=func.time_to_date_key(captured_date_timestamp_utc_7),
                    captured_time_key=func.time_to_time_key(captured_date_timestamp_utc_7),
                    total_time_second=performance['total_time'])
                datas.append(obj_)
                self.performance_key_checks.append({
                    'performance_key': _id,
                    'user_name': performance['username'],
                    'document_key': document_key,
                    'module_type': performance['type'],
                    'task_def_key': performance['task_def_key']
                })
                _id += 1
            # Temporary debug output — remove once the load is verified.
            print('types', types)
            print('tasks', tasks)
            print('human_input', test_3)
            print('human_verify_input', test_5)
            print('human_qc_data', test_8)

            self.db.create([item.__dict__ for item in datas], self.schema,
                           self.fact_performance_table)
            report.status_code = 'PASSED'
        except Exception as e:
            report.status_code = 'FAILED'
            report.description = str(e)
        finally:
            report.total_time_run_second = time.time() - start_run
            self.reports.append(report)
            return report