def attribute_process(grainger_df, uom_df, lov_df, lov_list, gamut_dict: Dict, att_process_node): attribute_df = pd.DataFrame() grainger_att_vals = pd.DataFrame() # remove "-" values from attribute value field grainger_df.loc[grainger_df.Grainger_Attribute_Value == '-', 'Grainger_Attribute_Value'] = np.NaN grainger_att_vals = q.grainger_values(grainger_df) temp_df, gamut_dict = grainger_process(grainger_df, grainger_att_vals, uom_df, lov_df, lov_list, gamut_dict, att_process_node) attribute_df = pd.concat([attribute_df, temp_df], axis=0, sort=False) attribute_df = attribute_df.drop_duplicates(subset=['Grainger_Attr_ID']) if 'Grainger_Attribute_Name' in attribute_df.columns: # drop files that have an exact match for "Item" and "Series" and a contains match for green attributes attribute_df = attribute_df[~attribute_df['Grainger_Attribute_Name'].str.contains(r'^Item$', regex=True)] attribute_df = attribute_df[~attribute_df['Grainger_Attribute_Name'].str.contains(r'^Series$', regex=True)] attribute_df = attribute_df[~attribute_df['Grainger_Attribute_Name'].str.contains('Green Certification')] attribute_df = attribute_df[~attribute_df['Grainger_Attribute_Name'].str.contains('Green Environmental')] attribute_df['STEP Attribute Name'] = attribute_df['Grainger_Attribute_Name'] attribute_df = attribute_df.rename(columns={'Segment_ID':'Segment ID', 'Segment_Name':'Segment Name', \ 'Family_ID':'Family ID', 'Family_Name':'Family Name', 'Category_ID':'STEP Category ID', \ 'Category_Name':'Category Name', 'Grainger_Attr_ID':'STEP Attribute ID', 'Percent_Numeric':'%_Numeric', \ 'Potential_UOMs':'Potential UOMs', 'Grainger_Attribute_Name':'Attribute Name', \ 'Recommended_Data_Type':'Recommended Data Type', 'Sample_Values':'Sample Values', \ 'Restricted_Attribute_Value_Domain':'Restricted Attribute Value Domain'}) return attribute_df, gamut_dict
def gamut_process(gnode, gamut_dict): """if gamut node has not been preiously process (in gamut_dict), process and add it to the dictionary""" gamut_df = q.gamut_definition(gnode, 'tax_att."categoryId"') if gamut_df.empty==False: gamut_df['alt_gamut_name'] = process.process_att(gamut_df['Gamut_Attribute_Name']) #prep att name for merge gamut_dict[gnode] = gamut_df #store the processed df in dict for future reference else: print('{} EMPTY Gamut DATAFRAME'.format(gnode)) return gamut_dict, gamut_df
def gws_process(gws_node, gws_dict: Dict): """if gws node has not been previously processed (in gws_dict), process and add it to the dictionary""" gws_df = q.gws_atts(ws_attr_query, gws_node, 'tax.id') #tprod."categoryId"') #get gws attribute values for each gamut_l3 node\ if gws_df.empty==False: gws_df = gws_df.drop_duplicates(subset='GWS_Attr_ID') #gws attribute IDs are unique, so no need to group by pim node before getting unique gws_df['alt_gws_name'] = process.process_att(gws_df['GWS_Attribute_Name']) #prep att name for merge gws_dict[gws_node] = gws_df #store the processed df in dict for future reference else: print('{} EMPTY GWS DATAFRAME'.format(gws_node)) return gws_dict, gws_df
def grainger_process(grainger_df, grainger_all, uom_df, lov_df, lov_list, gamut_dict, grainger_node): """create a list of grainger skus, run through through the gws_skus query and pull gws attribute data if skus are present concat both dataframs and join them on matching attribute names""" df = pd.DataFrame() grainger_new = pd.DataFrame() cat_name = grainger_df['Category_Name'].unique() cat_name = list(cat_name) cat_name = cat_name.pop() print('cat name = {} {}'.format(grainger_node, cat_name)) grainger_skus = grainger_df.drop_duplicates( subset='Grainger_SKU' ) #create list of unique grainger skus that feed into gws query grainger_sku_count = len(grainger_skus) print('grainger sku count = ', grainger_sku_count) if len(grainger_skus) > 7000: num_split = round(len(grainger_skus) / 7000, 0) num_split = int(num_split) if num_split == 1: num_split = 2 print('splitting grainger_df into {} batches'.format(num_split)) grainger_split = np.array_split(grainger_df, num_split) for i in range(0, num_split): loop_time = time.time() print('batch no: ', i + 1) temp_df = analyze(grainger_split[i], uom_df, lov_df, lov_list) grainger_new = pd.concat([grainger_new, temp_df], axis=0, sort=False) print("--- grainger loop time = {} minutes ---".format( round((time.time() - loop_time) / 60, 2))) grainger_df = grainger_new else: grainger_df = analyze(grainger_df, uom_df, lov_df, lov_list) grainger_df = grainger_df.drop_duplicates(subset=[ 'Category_ID', 'Grainger_Attr_ID' ]) #group by Category_ID and attribute name and keep unique grainger_df['STEP Blue Path'] = grainger_df['Segment_Name'] + ' > ' + grainger_df['Family_Name'] + \ ' > ' + grainger_df['Category_Name'] grainger_df = grainger_df.drop( ['Grainger_SKU', 'Grainger_Attribute_Value'], axis=1) #remove unneeded columns if grainger_all.empty == False: # for non-text rows, clean up UOMs in sample value column for row in grainger_df.itertuples(): potential_uoms = str(row.Potential_UOMs) dt = str(row.Recommended_Data_Type) if dt != 'text': grainger_df = process_sample_vals(grainger_df, row, potential_uoms) grainger_df['alt_grainger_name'] = process.process_att( grainger_df['Grainger_Attribute_Name']) #prep att name for merge gamut_skus = q.gws_skus( grainger_skus) #get gamut sku list to determine pim nodes to pull # if gws skus are present, go get the gamut attribute definition for the node if gamut_skus.empty == False: gamut_l3 = gamut_skus['Gamut_Node_ID'].unique() print('gamut L3s ', gamut_l3) for gamut_node in gamut_l3: if gamut_node in gamut_dict: gamut_df = gamut_dict[gamut_node] print('node {} in Gamut dict'.format(gamut_node)) else: gamut_dict, gamut_df = gamut_process(gamut_node, gamut_dict) if gamut_df.empty == False: node_name = gamut_df['Gamut_Node_Name'].unique() node_name = list(node_name) node_name = node_name.pop() print('node name = {} {}'.format(gamut_node, node_name)) #add correlating grainger and gamut data to opposite dataframes grainger_df = q.grainger_assign_nodes(grainger_df, gamut_df) gamut_df = q.gamut_assign_nodes(grainger_df, gamut_df) temp_df = pd.merge(grainger_df, gamut_df, left_on=['alt_grainger_name', 'Category_ID', 'Gamut_Node_ID', 'Gamut_Category_ID', \ 'Gamut_Category_Name', 'Gamut_Node_Name', 'Gamut_PIM_Path', 'STEP Blue Path', \ 'Segment_ID', 'Segment_Name', 'Family_ID', 'Family_Name', 'Category_Name'], right_on=['alt_gamut_name', 'Category_ID', 'Gamut_Node_ID', 'Gamut_Category_ID', \ 'Gamut_Category_Name', 'Gamut_Node_Name', 'Gamut_PIM_Path', 'STEP Blue Path', \ 'Segment_ID', 'Segment_Name', 'Family_ID', 'Family_Name', 'Category_Name'], how='outer') temp_df = match_category( temp_df ) #compare grainger and gamut atts and create column to say whether they match df = pd.concat( [df, temp_df], axis=0, sort=False ) #add prepped df for this gamut node to the final df df['Matching'] = df['Matching'].str.replace( 'no', 'Potential Match') df = df[df.Matching != 'GWS only'] # temp_df.to_csv('C:/Users/xcxg109/NonDriveFiles/graingerProcessDF.csv') else: print('Gamut {} EMPTY DATAFRAME'.format(gamut_node)) df = grainger_df df['Gamut_Attribute_Definition'] = '' else: df = grainger_df df['Gamut_Attribute_Definition'] = '' print('No Gamut SKUs for Grainger node {}'.format(grainger_node)) # df = pd.merge(df, gamut_df, left_on=['alt_grainger_name'], \ # right_on=['alt_gamut_name'], how='outer') # df = match_category(df) #compare grainger and gws atts and create column to say whether they match # df['Matching'] = df['Matching'].str.replace('no', 'Potential Match') # if gws_skus.empty==False: #create a dictionary of the unique gws nodes that corresponde to the grainger node # gws_l3 = gws_skus['GWS_Node_ID'].unique() #create list of pim nodes to pull # print('GWS L3s ', gws_l3) # for node in gws_l3: # if node in gws_dict: # gws_df = gws_dict[node] # print ('node {} in GWS dict'.format(node)) # else: # gws_dict, gws_df = gws_process(node, gws_dict) # if gws_df.empty==False: # node_name = gws_df['GWS_Node_Name'].unique() # node_name = list(node_name) # node_name = node_name.pop() # print('node name = {} {}'.format(node, node_name)) #add correlating grainger and gws data to opposite dataframes # grainger_df = q.grainger_assign_nodes(grainger_df, gws_df) # gws_df = q.gws_assign_nodes(grainger_df, gws_df) # temp_df = pd.merge(grainger_df, gws_df, left_on=['alt_grainger_name', 'Category_ID', 'GWS_Node_ID', 'GWS_Category_ID', \ # 'GWS_Category_Name', 'GWS_Node_Name', 'GWS_PIM_Path', 'STEP Blue Path', \ ## 'Segment_ID', 'Segment_Name', 'Family_ID', 'Family_Name', 'Category_Name'], # right_on=['alt_gws_name', 'Category_ID', 'GWS_Node_ID', 'GWS_Category_ID', \ # 'GWS_Category_Name', 'GWS_Node_Name', 'GWS_PIM_Path', 'STEP Blue Path', \ # 'Segment_ID', 'Segment_Name', 'Family_ID', 'Family_Name', 'Category_Name'], how='outer') # temp_df = match_category(temp_df) #compare grainger and gws atts and create column to say whether they match # df = pd.concat([df, temp_df], axis=0, sort=False) #add prepped df for this gws node to the final df # df['Matching'] = df['Matching'].str.replace('no', 'Potential Match') # drop all of the rows that are 'GWS only' in the Match column # df = df[df.Matching != 'GWS only'] # df = df.drop(['alt_grainger_name', 'GWS_Node_ID', 'GWS_Category_ID', 'GWS_Category_Name', \ # 'GWS_Node_Name', 'GWS_PIM_Path'], axis=1) #else: # print('GWS Node {} EMPTY DATAFRAME'.format(node)) # else: # df = grainger_df # print('No GWS SKUs for Grainger node {}'.format(grainger_node)) df.reset_index(drop=True, inplace=True) df = choose_definition(df) return df, gamut_dict #where gamut_att_temp is the list of all normalized values for gamut attributes
# read in uom and LOV files uom_df = pd.DataFrame() #uom_groups_url = 'https://raw.githubusercontent.com/gamut-code/attribute_mapping/master/UOM_data_sheet.csv' # create df of the uom groupings (ID and UOMs for each group) #data_file = requests.get(uom_groups_url).content #uom_df = pd.read_csv(io.StringIO(data_file.decode('utf-8'))) # get uom list filename = 'C:/Users/xcxg109/NonDriveFiles/reference/UOM_data_sheet.csv' uom_df = pd.read_csv(filename) # create df of the lovs and their concat values filename = 'C:/Users/xcxg109/NonDriveFiles/reference/LOV_list.csv' lov_df, lov_list = q.get_LOVs(filename) data_type = 'grainger_query' print('working...') search_level = fd.WS_search_type() if search_level == 'cat.CATEGORY_ID': search_data = fd.data_in(data_type, settings.directory_name) while True: try: query_level = input( "Query Level: \n1. ETL query (L1)\n2. ALL (no restrictions) Query " ) if query_level in ['1']:
elif search_level in ['2', 's', 'S']: search_level = 'single' break elif search_level in ['3', 'sku', 'SKU']: data_type = 'sku' break except ValueError: print('Invalid search type') search_data = fd.data_in(data_type, settings.directory_name) print('working...') start_time = time.time() # read in grainger data allCATS_df = q.get_att_values() if data_type == 'gws_query': for node in search_data: start_time = time.time() if search_level == 'single': init_ws_df = gws.gws_q(gws_values_single, 'tprod."categoryId"', node) elif search_level == 'group': init_ws_df = gws.gws_q(gws_values_group, 'tprod."categoryId"', node) print('k = ', node)
search_string = search_string + '; ' + 'dia.' search_string = search_string[2:] df.at[row.Index, 'Potential_UOMs'] = search_string return df # get uom list filename = 'C:/Users/xcxg109/NonDriveFiles/reference/UOM_data_sheet.csv' uom_df = pd.read_csv(filename) # create df of the lovs and their concat values filename = 'C:/Users/xcxg109/NonDriveFiles/reference/LOV_Categories.csv' lov_df, lov_list = q.get_LOVs(filename) lov_df['Potential_UOMs'] = '' for row in lov_df.itertuples(): lov_df = process_LOV_vals(lov_df, row) lov_df = lov_df.replace(r'^\s*$', np.NaN, regex=True) lov_df = lov_df.sort_values(['Potential_UOMs'], ascending=[True]) outfile = 'C:/Users/xcxg109/NonDriveFiles/reference/LOV_UOMs.xlsx' writer = pd.ExcelWriter(outfile, engine='xlsxwriter') workbook = writer.book lov_df.to_excel(writer, sheet_name="LOVs", startrow=0, startcol=0, index=False)
ws_df = pd.DataFrame() gamut_df = pd.DataFrame() attribute_df = pd.DataFrame() gamut_dict = dict() # read in WS data print('Choose WS PIM file') allCATS_df = q.get_att_values() # read in and clean WS data print('\nChoose Gamut file') Gamut_allCATS_df = q.get_att_values() print('working...') start_time = time.time() node_ids = allCATS_df['WS_Node_ID'].unique().tolist() print('number of nodes = ', len(node_ids)) for k in node_ids: ws_df = allCATS_df.loc[allCATS_df['WS_Node_ID']== k]
# read in WS data print('Choose WS PIM file') allCATS_df = q.get_att_values() # read in and clean WS data print('\nChoose Gamut file') Gamut_allCATS_df = q.get_att_values() print('working...') start_time = time.time() node_ids = allCATS_df['Category_ID'].unique().tolist() print('number of nodes = ', len(node_ids)) for node in node_ids: temp_df = allCATS_df.loc[allCATS_df['Category_ID']== node]