def fetch_max_num_of_middle_steps(physical_quantity='isotope'):
    """
    Return the largest number of middle-step columns over all files for the
    selected physical quantity.

    Every file returned by ``fetch_files_by_name(filenames='all')`` is read
    with all steps enabled, and the widest resulting table decides the
    result.

    Parameters
    ----------
    physical_quantity : str or PhysicalQuantity, default = 'isotope'
        Physical quantity, either as a name or as a PhysicalQuantity object.
        A name is resolved with ``fetch_physical_quantities_by_name``.

    Returns
    -------
    int
        Column count of the widest table minus the four fixed columns
        selected by ``fetch_data_by_filename_and_physical_quantity``
        (nuc_ix, name, first_step, last_step). Never negative: 0 is returned
        when there are no files or no file holds data.
    """
    files = fetch_files_by_name(filenames='all')
    if type_checker(physical_quantity, PhysicalQuantity) == 'str':
        physical_quantity = fetch_physical_quantities_by_name(
            physical_quantity).pop()

    # nuc_ix, name, first_step and last_step are not middle steps.
    fixed_column_count = 4

    widest = max(
        (len(
            fetch_data_by_filename_and_physical_quantity(
                file, physical_quantity, True).columns) for file in files),
        default=0)

    # A file without data yields only the two key columns, which used to
    # produce a negative "count"; clamp so callers always get a valid size.
    return max(widest - fixed_column_count, 0)
def fetch_data_by_filename_and_physical_quantities(filename,
                                                   physical_quantities,
                                                   is_all_step=False):
    """
    Fetch the data of one file for several physical quantities.

    Each physical quantity is read with
    ``fetch_data_by_filename_and_physical_quantity`` and the resulting
    tables are collected in a dictionary.

    Parameters
    ----------
    filename : File or str
        File object, or a file name that is resolved with
        ``fetch_files_by_name``.
    physical_quantities : list[str] or str or list[PhysicalQuantity] or PhysicalQuantity
        Physical quantities, given by name or as PhysicalQuantity objects,
        either as a list or as a single value.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well.

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps each physical quantity name to the data of that quantity.
    """
    if type_checker(filename, File) == 'str':
        filename = fetch_files_by_name(filename).pop()
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)
    # The documented interface allows a single PhysicalQuantity, which cannot
    # be iterated directly; wrap it so the loop below works for it too.
    if isinstance(physical_quantities, PhysicalQuantity):
        physical_quantities = [physical_quantities]

    dict_df_data = {}
    physical_quantity: PhysicalQuantity
    for physical_quantity in physical_quantities:
        dict_df_data[physical_quantity.name] = (
            fetch_data_by_filename_and_physical_quantity(
                filename, physical_quantity, is_all_step))
    return dict_df_data
def save_extracted_data_to_exel(nuc_data_id,
                                filenames=None,
                                physical_quantities=None,
                                is_all_step=False,
                                result_path=Path('.'),
                                merge=True):
    """
    Save extracted data to Excel workbook(s).

    For every physical quantity the extracted data of each file is fetched
    with ``fetch_extracted_data_by_filename_and_physical_quantity`` and
    handed to ``save_to_excel`` keyed by the physical quantity name.
    Workbooks that are about to be written are deleted first.

    NOTE(review): the original docstring stated that ``filenames=None``
    means "all files"; this function contains no explicit handling for
    None - confirm whether ``type_checker`` / ``fetch_files_by_name``
    provide it.

    Parameters
    ----------
    nuc_data_id : list[int]
        Ids of the NucData records to export.
    filenames : list[File or str] or File or str
        Files to export, as File objects or file names.
    physical_quantities : list[str or PhysicalQuantity] or str or PhysicalQuantity
        Physical quantities, given by name or as PhysicalQuantity objects.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. Output names get an ``all_steps_`` prefix when enabled.
    result_path : Path or str
        Directory the workbooks are written to.
    merge : bool, default = True
        If True, the data of all files is outer-merged on
        ``['nuc_ix', 'name']`` and written to one workbook (``final.xlsx``).
        Otherwise each file is written to its own workbook named after the
        file.

    Returns
    -------
    None
    """
    if type_checker(filenames, File) == 'str':
        filenames = fetch_files_by_name(filenames)
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)

    def _single_workbook_name(file):
        # Name of the per-file workbook used when merge is False.
        if is_all_step:
            return f'all_steps_{file.name}.xlsx'
        return f'{file.name}.xlsx'

    file_name = 'final.xlsx'
    if is_all_step:
        file_name = f'all_steps_{file_name}'

    # Remove workbooks left over from a previous run before writing.
    # The former ``del filename`` after this loop raised UnboundLocalError
    # for an empty ``filenames``; a dedicated loop variable makes it
    # unnecessary.
    if merge:
        Path(result_path).joinpath(file_name).unlink(missing_ok=True)
    else:
        for stale_file in filenames:
            Path(result_path).joinpath(
                _single_workbook_name(stale_file)).unlink(missing_ok=True)

    physical_quantity: PhysicalQuantity
    for physical_quantity in physical_quantities:
        df_left = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])
        filename: File
        for filename in filenames:
            df_right = fetch_extracted_data_by_filename_and_physical_quantity(
                nuc_data_id, filename, physical_quantity, is_all_step)
            if not df_right.empty:
                df_left = pd.merge(df_left,
                                   df_right,
                                   how='outer',
                                   on=['nuc_ix', 'name'])
            if not merge:
                # One workbook per file: write it and start over with an
                # empty table for the next file.
                save_to_excel({physical_quantity.name: df_left},
                              _single_workbook_name(filename), result_path)
                df_left = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])
        if merge:
            save_to_excel({physical_quantity.name: df_left}, file_name,
                          result_path)
def prediction(filenames,
               physical_quantity='isotope',
               is_all_step=False,
               model_type='iforest',
               model=None,
               fraction=0.01):
    """
    Run anomaly detection on the data of the given files and return the
    rows flagged as anomalous.

    The data of every file is read for one physical quantity, its value
    columns are prefixed with the file name and cast to float64, and all
    files are outer-merged on ``['nuc_ix', 'name']``. The merged table is
    passed to ``predict_model``; rows whose ``Anomaly`` column equals 1 are
    returned.

    Parameters
    ----------
    filenames : list[File or str] or File or str
        Files to analyse, as File objects or file names.
    physical_quantity : str or PhysicalQuantity, default = 'isotope'
        Physical quantity, given by name or as a PhysicalQuantity object.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well.
    model_type : str, default = 'iforest'
        When not None, a new model of this type is trained with
        ``train_model`` on the merged data and replaces ``model``.
    model
        Model used for the prediction when ``model_type`` is None.
    fraction : float, default = 0.01
        Forwarded to ``train_model``.

    Returns
    -------
    pd.DataFrame
        Rows with ``Anomaly == 1``, without the ``Anomaly`` column.
    """
    if type_checker(filenames, File) == 'str':
        filenames = fetch_files_by_name(filenames)
    if type_checker(physical_quantity, PhysicalQuantity) == 'str':
        physical_quantity = fetch_physical_quantities_by_name(
            physical_quantity).pop()

    merged = pd.DataFrame(columns=['nuc_ix', 'name'])
    for filename in filenames:
        file_data = fetch_data_by_filename_and_physical_quantity(
            filename, physical_quantity, is_all_step)
        if file_data.empty:
            continue

        tag = filename.name
        # First the two fixed step columns, then every middle step column,
        # get the file name as prefix so columns of different files stay
        # distinguishable after merging.
        file_data = file_data.rename(columns={
            'first_step': f'{tag}_first_step',
            'last_step': f'{tag}_last_step'
        })
        middle_renames = {}
        for col in file_data.columns.tolist():
            if 'middle_step' in col:
                middle_renames[col] = f'{tag}_{col}'
        file_data = file_data.rename(columns=middle_renames)

        value_columns = [
            col for col in file_data.columns.tolist()
            if col not in ['nuc_ix', 'name']
        ]
        file_data[value_columns] = file_data[value_columns].astype(
            'float64', copy=False)

        merged = pd.merge(merged,
                          file_data,
                          how='outer',
                          on=['nuc_ix', 'name'])

    if model_type is not None:
        model = train_model(nuc_data=merged,
                            model_type=model_type,
                            fraction=fraction)

    scored = predict_model(model, data=merged)
    anomalies = scored[scored['Anomaly'] == 1]
    return anomalies.drop(columns='Anomaly')
def save_prediction_to_exel(filenames,
                            result_path,
                            physical_quantities='isotope',
                            is_all_step=False,
                            merge=True,
                            model_type=None,
                            model_name=None,
                            fraction=0.001):
    """
    Run anomaly detection and save the flagged rows to Excel workbook(s).

    Results are written below ``result_path / 'anomaly_detection_result'``
    through ``save_to_excel``, keyed by the physical quantity name.
    Workbooks that are about to be written are deleted first.

    Parameters
    ----------
    filenames : list[File or str] or File or str
        Files to analyse, as File objects or file names.
    result_path : Path or str
        Base output directory.
    physical_quantities : list[str or PhysicalQuantity] or str or PhysicalQuantity
        Physical quantities, given by name or as PhysicalQuantity objects.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. Output names get an ``all_steps_`` part when enabled.
    merge : bool, default = True
        If True, one prediction over all files is written to a single
        workbook. Otherwise one workbook per file is written.
    model_type : str, optional
        Model type forwarded to ``prediction``; also used as the file name
        prefix. When None, a stored model is loaded with
        ``load_model(model_name)`` instead.
    model_name : str, optional
        Name of the stored model to load when ``model_type`` is None.
    fraction : float, default = 0.001
        Forwarded to ``prediction``.

    Returns
    -------
    None
    """
    if type_checker(filenames, File) == 'str':
        filenames = fetch_files_by_name(filenames)
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)

    if model_type is None:
        model = load_model(model_name)
    else:
        model = None

    result_path = Path(result_path).joinpath('anomaly_detection_result')

    # NOTE(review): with model_type=None the prefix renders as 'None' in the
    # file names - confirm whether model_name was meant to be used here.
    prefix = model_type

    def _single_workbook_name(file):
        # Name of the per-file workbook used when merge is False.
        if is_all_step:
            return f'{prefix}_all_steps_{file.name}.xlsx'
        return f'{prefix}_{file.name}.xlsx'

    file_name = 'final.xlsx'
    if is_all_step:
        file_name = f'all_steps_{file_name}'
    file_name = f'{prefix}_{file_name}'

    # Remove workbooks left over from a previous run before writing.
    # The former ``del filename`` after this loop raised UnboundLocalError
    # for an empty ``filenames``; a dedicated loop variable makes it
    # unnecessary.
    if merge:
        Path(result_path).joinpath(file_name).unlink(missing_ok=True)
    else:
        for stale_file in filenames:
            Path(result_path).joinpath(
                _single_workbook_name(stale_file)).unlink(missing_ok=True)

    for physical_quantity in physical_quantities:
        if merge:
            df_result = prediction(filenames=filenames,
                                   physical_quantity=physical_quantity,
                                   is_all_step=is_all_step,
                                   model_type=model_type,
                                   model=model,
                                   fraction=fraction)
            # Drop columns that carry no value at all for the flagged rows.
            df_result.dropna(axis=1, how='all', inplace=True)
            save_to_excel({physical_quantity.name: df_result}, file_name,
                          result_path)
        else:
            df_left = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])
            for filename in filenames:
                # NOTE(review): the complete ``filenames`` list is passed
                # for every file, so each per-file workbook is built from a
                # prediction over all files. ``[filename]`` may have been
                # intended - confirm before changing, since it alters the
                # data the model is trained on.
                df_right = prediction(filenames=filenames,
                                      physical_quantity=physical_quantity,
                                      is_all_step=is_all_step,
                                      model_type=model_type,
                                      model=model,
                                      fraction=fraction)
                if not df_right.empty:
                    df_right.rename(columns={
                        'Anomaly_Score': f'{filename.name}_Anomaly_Score'
                    },
                                    inplace=True)
                    df_right.dropna(axis=1, how='all', inplace=True)
                    df_left = pd.merge(df_left,
                                       df_right,
                                       how='outer',
                                       on=['nuc_ix', 'name'])
                save_to_excel({physical_quantity.name: df_left},
                              _single_workbook_name(filename), result_path)
                df_left = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])
def save_comparison_result_to_excel(nuc_data_id,
                                    reference_file,
                                    comparison_files,
                                    result_path,
                                    physical_quantities='isotope',
                                    deviation_mode='relative',
                                    threshold=Decimal('1.0E-12'),
                                    is_all_step=False):
    """
    Compare one reference file against each comparison file and save every
    comparison to its own workbook (xlsx).

    Each pair is evaluated with ``calculate_comparative_result`` and written
    to ``result_path / 'comparative_result'`` through ``save_to_excel``. An
    existing workbook of the same name is deleted first. The pair of file
    names is printed before each comparison.

    Parameters
    ----------
    nuc_data_id : list[int]
        Ids of the NucData records to compare.
    reference_file : File or str
        Reference file, as a File object or a file name.
    comparison_files : list[str or File] or File or str
        Files compared against the reference, one by one.
    result_path : Path or str
        Base output directory.
    physical_quantities : list[str or PhysicalQuantity] or str or PhysicalQuantity, default = 'isotope'
        Physical quantities to compare, given by name or as PhysicalQuantity
        objects.
    deviation_mode : str, default = 'relative'
        Deviation mode, 'absolute' or 'relative'.
    threshold : Decimal, default = Decimal('1.0E-12')
        Deviation threshold.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. Output names get an ``all_step_`` prefix when enabled.

    Returns
    -------
    None
    """
    if type_checker(reference_file, File) == 'str':
        # fetch_files_by_name returns a collection, but exactly one File is
        # needed: ``.name`` is read from it below. Take the single entry,
        # as calculate_comparative_result does.
        reference_file = fetch_files_by_name(reference_file).pop()
    if type_checker(comparison_files, File) == 'str':
        comparison_files = fetch_files_by_name(comparison_files)
    elif isinstance(comparison_files, File):
        # A single File is documented as valid input but is not iterable.
        comparison_files = [comparison_files]

    output_dir = Path(result_path).joinpath('comparative_result')

    for comparison_file in comparison_files:
        print((reference_file.name, comparison_file.name))
        dict_df_all = calculate_comparative_result(
            nuc_data_id=nuc_data_id,
            reference_file=reference_file,
            comparison_file=comparison_file,
            physical_quantities=physical_quantities,
            deviation_mode=deviation_mode,
            threshold=threshold,
            is_all_step=is_all_step)

        file_name = f'{deviation_mode}_{threshold}_{reference_file.name}_vs_{comparison_file.name}.xlsx'
        if is_all_step:
            file_name = f'all_step_{file_name}'

        # Remove the workbook of a previous run before writing the new one.
        output_dir.joinpath(file_name).unlink(missing_ok=True)
        save_to_excel(dict_df_all, file_name, output_dir)
def calculate_comparative_result(nuc_data_id,
                                 reference_file,
                                 comparison_file,
                                 physical_quantities='isotope',
                                 deviation_mode='relative',
                                 threshold=Decimal('1.0E-12'),
                                 is_all_step=False):
    """
    Compare one reference file with one comparison file and return the
    comparison result per physical quantity.

    Parameters
    ----------
    nuc_data_id : list[int]
        Ids of the NucData records to compare.
    reference_file : File or str
        Reference file, as a File object or a file name.
    comparison_file : File or str
        Comparison file, as a File object or a file name.
    physical_quantities : list[str or PhysicalQuantity] or str or PhysicalQuantity, default = 'isotope'
        Physical quantities to compare, given by name or as PhysicalQuantity
        objects.
    deviation_mode : str, default = 'relative'
        Deviation mode, 'absolute' or 'relative'.
    threshold : Decimal, default = Decimal('1.0E-12')
        Deviation threshold; converted with ``Decimal(threshold)`` before
        use.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well.

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps each physical quantity name to its merged reference /
        comparison / deviation table. Quantities for which either file has
        no data are left out.
    """
    if type_checker([reference_file, comparison_file], File) == 'str':
        reference_file = fetch_files_by_name(reference_file).pop()
        comparison_file = fetch_files_by_name(comparison_file).pop()
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)

    results = {}
    quantity: PhysicalQuantity
    for quantity in physical_quantities:
        ref_table = fetch_extracted_data_by_filename_and_physical_quantity(
            nuc_data_id, reference_file, quantity, is_all_step)
        cmp_table = fetch_extracted_data_by_filename_and_physical_quantity(
            nuc_data_id, comparison_file, quantity, is_all_step)

        # Only quantities present in both files can be compared.
        if not (ref_table.empty or cmp_table.empty):
            ref_table, cmp_table = _complement_columns(
                ref_table, cmp_table, reference_file.name,
                comparison_file.name)
            deviation, kept_index = _calculate_deviation(
                ref_table, cmp_table, deviation_mode, Decimal(threshold))
            results[quantity.name] = (
                _merge_reference_comparison_and_deviation(
                    ref_table, cmp_table, deviation, kept_index))
    return results
def fetch_transposed_data_by_filename_and_physical_quantity(
        filename, physical_quantity, is_all_step=False):
    """
    Fetch the data of one file and one physical quantity from the Nuc,
    NucData and PhysicalQuantity tables and return it transposed, i.e. with
    one row per step and one column per nuclide name.

    Parameters
    ----------
    filename : File or str
        File object, or a file name that is resolved with
        ``fetch_files_by_name``.
    physical_quantity : str or PhysicalQuantity
        Physical quantity, given by name or as a PhysicalQuantity object.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. By default only the first and the last step are read.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(nuc_ix, data)``: ``nuc_ix`` holds the ``nuc_ix`` and ``name``
        columns sorted by ``nuc_ix``; ``data`` is the transposed float64
        table with an additional ``time_interval`` column.
    """
    if type_checker(filename, File) == 'str':
        filename = fetch_files_by_name(filename).pop()
    if type_checker(physical_quantity, PhysicalQuantity) == 'str':
        physical_quantity = fetch_physical_quantities_by_name(
            physical_quantity).pop()
    df_left = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])
    file_id = filename.id
    physical_quantity_id = physical_quantity.id
    if not is_all_step:
        # Intermediate results are not wanted, so NucData.middle_steps is
        # left out of the selection; the other branch includes it.
        stmt = lambda_stmt(lambda: select(Nuc.nuc_ix, Nuc.name, NucData.
                                          first_step, NucData.last_step))
    else:
        stmt = lambda_stmt(
            lambda: select(Nuc.nuc_ix, Nuc.name, NucData.first_step, NucData.
                           last_step, NucData.middle_steps))
    stmt += lambda s: s.join(Nuc, Nuc.id == NucData.nuc_id)
    stmt += lambda s: s.join(
        PhysicalQuantity, PhysicalQuantity.id == NucData.physical_quantity_id)
    stmt += lambda s: s.where(NucData.file_id == file_id,
                              PhysicalQuantity.id == physical_quantity_id)
    with Session() as session:
        column_names = [column.name for column in list(stmt.selected_columns)]
        df_right = pd.DataFrame(data=session.execute(stmt).all(),
                                columns=column_names)
        if is_all_step:
            # Split off the raw middle_steps column and replace it by the
            # columns produced by middle_steps_line_parsing.
            exclude_middle_steps = df_right.drop(columns='middle_steps',
                                                 axis=1)
            del column_names[-1]
            exclude_middle_steps.columns = column_names
            # NOTE(review): records whose middle_steps is None are skipped,
            # so the parsed frame can have fewer rows than
            # exclude_middle_steps; the concat below aligns on the default
            # index - confirm None cannot occur for only some records.
            middle_steps = pd.DataFrame([
                middle_steps_line_parsing(middle_steps)
                for middle_steps in df_right['middle_steps']
                if middle_steps is not None
            ])
            df_right = pd.concat([exclude_middle_steps, middle_steps],
                                 axis=1,
                                 copy=False)
        if not df_right.empty:
            df_left = pd.merge(df_left,
                               df_right,
                               how='outer',
                               on=['nuc_ix', 'name'])
    df_left.sort_values(by=['nuc_ix'], inplace=True)
    # Keep the identifying columns before they are removed by the transpose
    # steps below.
    nuc_ix = df_left.loc[:, ['nuc_ix', 'name']]
    df_left = df_left.T
    # Use the nuclide names as column labels, then drop the two key rows so
    # only step rows remain.
    df_left.columns = df_left.loc['name']
    df_left.drop(['nuc_ix', 'name'], inplace=True)
    df_left = df_left.astype('float64', copy=False)
    # The second row (last_step, given the selection order above) is moved
    # to the end so it follows the middle step rows.
    # NOTE(review): pop(1) raises IndexError when fewer than two step rows
    # exist, e.g. when no data was found.
    reindex = df_left.index.tolist()
    reindex.append(reindex.pop(1))
    df_left = df_left.reindex(reindex, copy=False)
    # One value per step: filename.time_interval multiplied by the step
    # number 0..repeat_times, attached to the rows in their current order.
    time_interval = pd.Series(
        (filename.time_interval * i for i in range(filename.repeat_times + 1)),
        name='time_interval',
        index=df_left.index)
    # Dividing by a one-day timedelta - presumably converts the interval to
    # a number of days; confirm against the type of File.time_interval.
    time_interval = time_interval / pd.to_timedelta(1, unit='D')
    df_left = pd.concat([df_left, time_interval], axis=1)
    return nuc_ix, df_left
def fetch_extracted_data_by_filename_and_physical_quantity(
        nuc_data_id, filename, physical_quantity, is_all_step=False):
    """
    Fetch the extracted data of one file and one physical quantity,
    restricted to the given NucData record ids.

    Value columns are prefixed with the file name
    (``<file name>_last_step``, and ``<file name>_<parsed column>`` for the
    middle steps).

    Parameters
    ----------
    nuc_data_id : list[int]
        Ids of the NucData records to read.
    filename : str or File
        File object, or a file name that is resolved with
        ``fetch_files_by_name``.
    physical_quantity : str or PhysicalQuantity
        Physical quantity, given by name or as a PhysicalQuantity object.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. By default only the last step is read.

    Returns
    -------
    pd.DataFrame
        Table with ``nuc_ix``, ``name`` and the value columns, sorted by
        ``nuc_ix``. Only the two key columns are present when no data was
        found.
    """
    if type_checker(filename, File) == 'str':
        filename = fetch_files_by_name(filename).pop()
    if type_checker(physical_quantity, PhysicalQuantity) == 'str':
        physical_quantity = fetch_physical_quantities_by_name(
            physical_quantity).pop()

    physical_quantity_id = physical_quantity.id
    filename: File
    file_id = filename.id

    if is_all_step:
        stmt = lambda_stmt(
            lambda: select(Nuc.nuc_ix, Nuc.name, NucData.last_step, NucData.
                           middle_steps).where(NucData.id.in_(nuc_data_id)))
    else:
        # Intermediate results are not wanted, so NucData.middle_steps is
        # left out of the selection.
        stmt = lambda_stmt(
            lambda: select(Nuc.nuc_ix, Nuc.name, NucData.last_step).where(
                NucData.id.in_(nuc_data_id)))
    stmt += lambda s: s.join(Nuc, Nuc.id == NucData.nuc_id)
    stmt += lambda s: s.join(
        PhysicalQuantity, PhysicalQuantity.id == NucData.physical_quantity_id)
    stmt += lambda s: s.where(NucData.file_id == file_id,
                              PhysicalQuantity.id == physical_quantity_id)

    base_columns = ['nuc_ix', 'name', f'{filename.name}_last_step']
    result = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])

    with Session() as session:
        if is_all_step:
            raw = pd.DataFrame(data=session.execute(stmt).all(),
                               columns=base_columns + ['middle_steps'])
            # Replace the raw middle_steps column by the parsed columns,
            # each prefixed with the file name.
            fixed_part = raw.drop(columns='middle_steps')
            fixed_part.columns = base_columns
            parsed = pd.DataFrame([
                middle_steps_line_parsing(entry)
                for entry in raw['middle_steps'] if entry is not None
            ])
            parsed.columns = [
                f'{filename.name}_{label}'
                for label in parsed.columns.tolist()
            ]
            fetched = pd.concat([fixed_part, parsed], axis=1, copy=False)
        else:
            fetched = pd.DataFrame(data=session.execute(stmt).all(),
                                   columns=base_columns)

        if not fetched.empty:
            result = pd.merge(result,
                              fetched,
                              how='outer',
                              on=['nuc_ix', 'name'])

    result.sort_values(by=['nuc_ix'], inplace=True)
    return result
def fetch_extracted_data_id(filenames=None,
                            physical_quantities='all',
                            nuclide_list=None):
    """
    Collect the ids of the NucData records selected by files, physical
    quantities and an optional nuclide filter.

    Parameters
    ----------
    filenames : list[File or str] or File or str
        Files to search, as File objects or file names. A single value is
        wrapped in a list.
    physical_quantities : list[str or PhysicalQuantity] or str or PhysicalQuantity, default = 'all'
        Physical quantities, given by name or as PhysicalQuantity objects.
        A single PhysicalQuantity is wrapped in a list.
    nuclide_list : list[str] or 'all' or None
        * None: only records whose first_step or last_step is non-zero.
        * 'all': every record of the selected files and quantities.
        * list of names: records whose Nuc.name is in the list; in addition
          every record of a selected quantity named 'gamma_spectra' is
          included regardless of the list.

    Returns
    -------
    list[int]
        Matching NucData ids, collected file by file.
    """
    if type_checker(filenames, File) == 'str':
        filenames = fetch_files_by_name(filenames)
    if not isinstance(filenames, list):
        filenames = [filenames]
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)
    # Same normalisation as for filenames: a single PhysicalQuantity is
    # documented as valid input but cannot be iterated.
    if isinstance(physical_quantities, PhysicalQuantity):
        physical_quantities = [physical_quantities]

    # Independent of the file, so computed once instead of once per file.
    physical_quantities_id = [
        physical_quantity.id for physical_quantity in physical_quantities
    ]

    nuc_data_id = []
    with Session() as session:
        for filename in filenames:
            file_id = filename.id
            if nuclide_list is None:
                # No nuclide list: drop records whose first_step and
                # last_step are both 0.
                stmt = (select(NucData.id).where(
                    NucData.file_id == file_id,
                    NucData.physical_quantity_id.in_(
                        physical_quantities_id)).where(
                            or_(NucData.first_step != 0,
                                NucData.last_step != 0)))
            elif nuclide_list == 'all':
                stmt = (select(NucData.id).where(
                    NucData.file_id == file_id,
                    NucData.physical_quantity_id.in_(physical_quantities_id)))
            else:
                # The nuclide list does not apply to gamma spectra: their
                # records are taken unfiltered, all other quantities are
                # filtered by nuclide name.
                for physical_quantity in physical_quantities:
                    if physical_quantity.name == 'gamma_spectra':
                        gamma_physical_quantity_id = physical_quantity.id
                        gamma_stmt = (select(NucData.id).where(
                            NucData.file_id == file_id,
                            NucData.physical_quantity_id ==
                            gamma_physical_quantity_id))
                        nuc_data_id.extend(
                            session.execute(gamma_stmt).scalars().all())
                stmt = (select(NucData.id).join(
                    Nuc, Nuc.id == NucData.nuc_id).where(
                        NucData.file_id == file_id,
                        NucData.physical_quantity_id.in_(
                            physical_quantities_id)).where(
                                Nuc.name.in_(nuclide_list)))
            nuc_data_id.extend(session.execute(stmt).scalars().all())
    return nuc_data_id
def fetch_data_by_filename_and_nuclide_list(filename,
                                            physical_quantities,
                                            nuclide_list,
                                            is_all_step=False):
    """
    Fetch the data of one file from the Nuc, NucData and PhysicalQuantity
    tables for several physical quantities, optionally filtered by a list
    of nuclide names.

    Parameters
    ----------
    filename : File
        File object. Unlike the sibling fetch functions, a file name (str)
        is not resolved here; ``filename.id`` is read directly.
    physical_quantities : list[str] or str or list[PhysicalQuantity] or PhysicalQuantity
        Physical quantities, given by name or as PhysicalQuantity objects.
    nuclide_list : list[str] or None
        Nuclide names to keep. When None, only records whose first_step or
        last_step is non-zero are returned. The list is not applied to the
        physical quantity named 'gamma_spectra'.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well.

    Returns
    -------
    dict[str, pd.DataFrame]
        Maps each physical quantity name to the data of that quantity,
        sorted by ``nuc_ix``.
    """
    dict_df_data = {}
    if type_checker(physical_quantities, PhysicalQuantity) == 'str':
        physical_quantities = fetch_physical_quantities_by_name(
            physical_quantities)
    with Session() as session:
        physical_quantity: PhysicalQuantity
        for physical_quantity in physical_quantities:
            file_id = filename.id
            physical_quantity_id = physical_quantity.id
            if not is_all_step:
                # Intermediate results are not wanted, so
                # NucData.middle_steps is left out of the selection; the
                # other branch includes it.
                stmt = lambda_stmt(
                    lambda: select(Nuc.nuc_ix, Nuc.name, NucData.first_step,
                                   NucData.last_step))
            else:
                stmt = lambda_stmt(
                    lambda: select(Nuc.nuc_ix, Nuc.name, NucData.first_step,
                                   NucData.last_step, NucData.middle_steps))
            stmt += lambda s: s.join(Nuc, Nuc.id == NucData.nuc_id)
            stmt += lambda s: s.join(
                PhysicalQuantity, PhysicalQuantity.id == NucData.
                physical_quantity_id)
            stmt += lambda s: s.where(
                NucData.file_id == file_id,
                PhysicalQuantity.id == physical_quantity_id)
            if nuclide_list is None:
                # No nuclide list: drop records whose first_step and
                # last_step are both 0.
                stmt += lambda s: s.where(
                    or_(NucData.first_step != 0, NucData.last_step != 0))
            else:
                if physical_quantity.name != 'gamma_spectra':
                    # Every quantity except gamma spectra is filtered by
                    # the nuclide list; gamma spectra are returned
                    # unfiltered.
                    stmt += lambda s: s.where(Nuc.name.in_(nuclide_list))
            nuc_data = pd.DataFrame(
                data=session.execute(stmt).all(),
                columns=tuple(column.name
                              for column in list(stmt.selected_columns)))
            if is_all_step:
                # Replace the raw middle_steps column by the columns
                # produced by middle_steps_line_parsing.
                nuc_data_exclude_middle_steps = nuc_data.drop(
                    columns='middle_steps', axis=1)
                # NOTE(review): records whose middle_steps is None are
                # skipped, so the parsed frame can have fewer rows than
                # nuc_data_exclude_middle_steps; the concat below aligns on
                # the default index - confirm None cannot occur for only
                # some records.
                middle_steps = pd.DataFrame([
                    middle_steps_line_parsing(middle_steps)
                    for middle_steps in nuc_data['middle_steps']
                    if middle_steps is not None
                ])
                del nuc_data
                nuc_data = pd.concat(
                    [nuc_data_exclude_middle_steps, middle_steps],
                    axis=1,
                    copy=False)
            nuc_data.sort_values(by=['nuc_ix'], inplace=True)
            dict_df_data[physical_quantity.name] = nuc_data
    return dict_df_data
def fetch_data_by_filename_and_physical_quantity(filename,
                                                 physical_quantity,
                                                 is_all_step=False):
    """
    Fetch the data of one file and one physical quantity from the Nuc,
    NucData and PhysicalQuantity tables.

    Parameters
    ----------
    filename : File or str
        File object, or a file name that is resolved with
        ``fetch_files_by_name``.
    physical_quantity : str or PhysicalQuantity
        Physical quantity, given by name or as a PhysicalQuantity object.
    is_all_step : bool, default = False
        Whether to read the intermediate (middle step) result columns as
        well. By default only first_step and last_step are read.

    Returns
    -------
    pd.DataFrame
        Table with ``nuc_ix``, ``name``, ``first_step``, ``last_step`` and,
        if requested, the parsed middle step columns, sorted by ``nuc_ix``.
        Only the two key columns are present when no data was found.
    """
    if type_checker(filename, File) == 'str':
        filename = fetch_files_by_name(filename).pop()
    if type_checker(physical_quantity, PhysicalQuantity) == 'str':
        physical_quantity = fetch_physical_quantities_by_name(
            physical_quantity).pop()

    file_id = filename.id
    physical_quantity_id = physical_quantity.id

    if is_all_step:
        stmt = lambda_stmt(
            lambda: select(Nuc.nuc_ix, Nuc.name, NucData.first_step, NucData.
                           last_step, NucData.middle_steps))
    else:
        # Intermediate results are not wanted, so NucData.middle_steps is
        # left out of the selection.
        stmt = lambda_stmt(lambda: select(Nuc.nuc_ix, Nuc.name, NucData.
                                          first_step, NucData.last_step))
    stmt += lambda s: s.join(Nuc, Nuc.id == NucData.nuc_id)
    stmt += lambda s: s.join(
        PhysicalQuantity, PhysicalQuantity.id == NucData.physical_quantity_id)
    stmt += lambda s: s.where(NucData.file_id == file_id,
                              PhysicalQuantity.id == physical_quantity_id)

    result = pd.DataFrame(data=None, columns=['nuc_ix', 'name'])

    with Session() as session:
        selected_names = [column.name for column in stmt.selected_columns]
        fetched = pd.DataFrame(data=session.execute(stmt).all(),
                               columns=selected_names)

        if is_all_step:
            # Replace the raw middle_steps column (the last selected one) by
            # the columns produced by middle_steps_line_parsing.
            fixed_part = fetched.drop(columns='middle_steps')
            fixed_part.columns = selected_names[:-1]
            parsed_rows = [
                middle_steps_line_parsing(entry)
                for entry in fetched['middle_steps'] if entry is not None
            ]
            fetched = pd.concat([fixed_part, pd.DataFrame(parsed_rows)],
                                axis=1,
                                copy=False)

        if not fetched.empty:
            result = pd.merge(result,
                              fetched,
                              how='outer',
                              on=['nuc_ix', 'name'])

    result.sort_values(by=['nuc_ix'], inplace=True)
    return result