def regulatedfarelookup(lkupfilepath, df):
    """
    This imports a csv file as a source of lookup data for regulated fares.  A merge statement is used to join superfile with the regulated data.  
    Only about 20% of rows will match.
    No match will return NaN in the new fields.

    Parameters:
    lkupfilepath        - a string containing the filepath where the lookup text file is stored
    df                  - a dataframe containing the combined TOC data

    Returns:
    df                  - a dataframe with new fields (orig, dest, route,ticketa, flaga, ticket, flag2) added on the right hand on the dataframe.
    """
    #import fares lookup data
    print("Regulated fares lookup being loaded\n")
    fareslkupdata = pd.read_csv(lkupfilepath + 'regulated fares sort.csv')
    fareslkupvalues = pd.DataFrame(fareslkupdata)


    #data typing for farelookup values
    print("Data typeing fare lookup values\n")
    fareslkupvalues['orig'] = fareslkupvalues['orig'].astype('str')
    fareslkupvalues['dest'] = fareslkupvalues['dest'].astype('str')
    fareslkupvalues['route'] = fareslkupvalues['route'].astype('str')
    fareslkupvalues['ticket'] = fareslkupvalues['ticket'].astype('str')
    
    #set string length by padding fields with leading zeros
    print("Amending lookupvalue keys\n")
    fareslkupvalues['orig'] = fareslkupvalues['orig'].str.zfill(4)
    fareslkupvalues['dest'] = fareslkupvalues['dest'].str.zfill(4)
    fareslkupvalues['route'] = fareslkupvalues['route'].str.zfill(5)
    

    #merging lookup values with superfile
    print("Regulated fares lookup being performed.\n")
    df = pd.merge(left=df,right=fareslkupvalues,how='left',
             left_on=['Origin Code','Destination Code','Route Code', 'Product Code']
             ,right_on=['orig','dest','route','ticket'])

    #drop lookup dataframe no longer needed
    del fareslkupvalues

    # amend datatypes to categorical
    df = applydatatypes(df, ['Product Code','Origin Code','Destination Code','Route Code','orig','dest','route','ticketa','flaga','ticket','flag2'])
    
    return df
def getcategorylookup(df,filepath, filename,destinationpath):
    """
    This procedure loads category lookup data from excel file, conforms the format of fields to be return and joins it to the main file.  Non-matches are categorised as 'Missing' and 
    these rows are exported to a logging file.  Categorical datatyping is applied to new fields.

    Parameters:
    df                  - a dataframe containing the 'superfile'
    filepath            - a string containing the file path for the excel file
    filename            - a string containing the name of the excel file
    destinationpath     - a string containing the file path for the output to be sent

    Returns:
    df                  - a dataframe containing the 'superfile' with category information
    """
    print("Category Lookup Data being loaded \n")
    records = pd.read_excel(filepath + filename,sheet_name='Sheet1',heading=['Product Code','Product Primary Code','Category'])

    savtodf = records

    #define column names
    savtodf.columns = ['Product Code','Product Primary Code','Category']
    
    #force all categories to lower case
    savtodf['Category'] = savtodf['Category'].str.lower()

    print("Category Information being added\n")
    df = pd.merge(df,savtodf,how='left', left_on=['Product Code','Product Primary Code'],right_on=['Product Code','Product Primary Code'])

    #handle missing category information
    df['Category'].fillna('Missing',inplace=True)
    nonmatches = df[df['Category']=='Missing']
    unique_filtered_nonmatches = nonmatches[['Product Code','Product Primary Code']].copy().drop_duplicates()
    exportfile(unique_filtered_nonmatches,destinationpath, 'missing_categories')

    #apply datatyping
    df = applydatatypes(df,['Product Code','Product Primary Code','Category'])
    
    #remove unnecessary column
    del savtodf

    return df
def generatedata(originpath,destinationpath,regulatedfarespath,categorypath):
    """
    This joins a number of CSV source files into a common dataframe; handles the cases of sub profit centres being used in place of profit centres; 
    maps sectors, ticket_types, classes, regulated status and category information to the combined file.  This is then used to calculated advanced and non-advanced data.

    Parameters:
    originpath          - A string specifying the location of the individual TOC files are found
    destinationpath     - A string specifying the location of where output should be exported
    regulatedfarespath  - A string specifying the location of the lookup file for regulated fares file
    categorypath        - A string specifying the location of the lookup file for category

    Returns:
    superfile           - A dataframe consisting of the combined and mapped data.
    """
    list_of_tocs,filecount = getdata(originpath)

    joinedfile = combinefiles(list_of_tocs,filecount)

    if filecount > 50:
        print("As you are processing a large number of files, this may possibly cause the PC to freeze or crash due to memory issues.\n")  
        print("If this happens, restart the computer, then close down IE, Outlook and any other memory/resource hungry applications and try again.\n")


        
        
    superfile = joinedfile.copy()

    #drop where category_code not starting with 1 or 2
    superfile = superfile[superfile['Product Code'].str.contains('1[A-Z][A-Z][A-Z]|2[A-Z][A-Z][A-Z]',regex=True)]

    #fields to convert to categorical data type
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Product Code','Product Primary Code'])

    #mapping of lookups starts here
    #mapping of sectors
    print("mapping sectors within superfile\n")
    sector_mapping = {'EK':'Lon SE','HO':'Lon SE','HQ':'Lon SE','HS':'Lon SE','HT':'Lon SE','HU':'Lon SE','HW':'Lon SE','HY':'Lon SE','HZ':'Lon SE','EX':'Lon SE',
                        'EA':'Regional','EI':'Regional','EJ':'Regional','HA':'Regional','HC':'Regional','HD':'Regional','HE':'Regional','HL':'Regional','ES':'Regional',
                        'EC':'Long D','EH':'Long D','HB':'Long D','HF':'Long D','HI':'Long D','HK':'Long D','HM':'Long D'}
    superfile = assignlookupvalues(superfile,'sectors', sector_mapping, "Carrier TOC / Third Party Code",  'sector' ,destinationpath)

    #mapping of tickettypes
    print("mapping ticket types within superfile\n")
    tickettypemapping = {'PG01':'Full','PG05':'Full','PG02':'Reduced','PG03':'Reduced','PG06':'Reduced','PG07':'Reduced','PG04':'Season','PG08':'Season'}
    superfile = assignlookupvalues(superfile,'ticket_type',tickettypemapping,"Product Primary Code",'ticket_type',destinationpath,'Other')
    
    #mapping of ticketclasses
    print("mapping ticket types within superfile\n")
    classmapping = {'1':'1', '2':'2','9':'2'}
    superfile['classreference'] = superfile['Product Code'].str[0]
    superfile = assignlookupvalues(superfile,'class',classmapping,'classreference','class',destinationpath,'2')
    del superfile['classreference']

    print("mapping regulated status within superfile\n")
    #getting the regulated fares lookup to add flag_2 information for faretypes
    superfile = regulatedfarelookup(regulatedfarespath,superfile )
    
    #setting rows as regulated/unregulated fares here   
    superfile = setregulatedfares(superfile,destinationpath)

    #mapping of categories
    print("mapping categories within superfile\n")
    superfile = getcategorylookup(superfile,categorypath,'Product_category_lookup_2020_v1.xlsx',destinationpath)

    #dropping columns no longer needed
    superfile = superfile.drop(['orig','dest','route'], axis=1)

    #apply final superfile datatyping
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Origin Code','Destination Code','Route Code','Product Code','sector','ticket_type','class','Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC','Regulated_Status','Category'])

    #export full superfile for later testing of regulated status setting, if needed
    exportfile(superfile,destinationpath,"superfile with full regulated data")
   
    #delete the surplus Regulated status columns
    superfile = superfile.drop(['Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC'], axis=1)

    #producing distinct list of product codes with their assigned regulated status
    regulatedcheck = superfile[['Product Code','Product Primary Code','Regulated_Status']].drop_duplicates()
    exportfile(regulatedcheck,destinationpath,"regulated products check")

    return superfile
def setregulatedfares(df,destinationpath):
    """
    This processes the lookups of exceptional products, normal products and profit centres to create the fields needed  to assign the status of regulated/regulated to fares.
    It also chains these three fields into a logic can creates a new regulated_status field
    It also removes various fields that are no longer needed at the end of processing

    Parameters:
    df:             - a dataframe containg the superfile
    destinationpath - a string with the file path for any error file with missing rows to be sent
    
    output:
    df:             - a modified dataframe with a new field "Regulated_status" and 7 fields removed. 
    """
    
    #created new 'Regulated_Status' column with initial value of "Not Assigned"
    df['Regulated_Status_Start'] = 'Not Assigned'

    #assign values based on TOC Codes
    toc = {'SCR':'Regulated_by_TOC','LER':'Regulated_by_TOC','GWR':'Regulated_by_TOC','GXR':'Regulated_by_TOC','IEC':'Regulated_by_TOC','IWC':'Regulated_by_TOC','LTS':'Regulated_by_TOC'
                ,'MML':'Regulated_by_TOC','NCH':'Regulated_by_TOC','NLR':'Regulated_by_TOC','NSC':'Regulated_by_TOC','NSE':'Regulated_by_TOC','SWT':'Regulated_by_TOC','NIW':'Regulated_by_TOC'
                ,'RCL':'Regulated_by_TOC','RMS':'Regulated_by_TOC','RWB':'Regulated_by_TOC','TPE':'Regulated_by_TOC','HUL':'Regulated_by_TOC','IXC':'Regulated_by_TOC','FCC':'Regulated_by_TOC','NTH':'Regulated_by_TOC'
                ,'GWA':'Regulated_by_TOC','GCR':'Regulated_by_TOC','RNE':'Regulated_by_TOC','RNW':'Regulated_by_TOC','PDR':'Regulated_by_TOC','PSV':'Regulated_by_TOC','PWS':'Regulated_by_TOC','PGM':'Regulated_by_TOC'
                ,'PMR':'Regulated_by_TOC','PSC':'Regulated_by_TOC','PSY':'Regulated_by_TOC','PTW':'Regulated_by_TOC','PWM':'Regulated_by_TOC','DBA':'Regulated_by_TOC'
               ,'NFD':'Regulated_by_TOC','LRC':'Regulated_by_TOC','LBR':'Regulated_by_TOC','SMR':'Regulated_by_TOC'
               ,'ANG':'Regulated_by_TOC', 'GER':'Regulated_by_TOC','NTT':'Regulated_by_TOC','RCV':'Regulated_by_TOC','RWW':'Regulated_by_TOC','TLK':'Regulated_by_TOC','WGN':'Regulated_by_TOC'}
    
    df = assignedregulatedlookupvalues(df,'Regulation by TOCs',toc,'flag2','Regulated_Status_toc','Regulated_Status_Start')

    #assign values based on Product Codes
    products = {'2MTA':'Regulated_by_Product','2MTB':'Regulated_by_Product','2MTE':'Regulated_by_Product',
                '2MTJ':'Regulated_by_Product','2MTK':'Regulated_by_Product','2MTN':'Regulated_by_Product','2VQA':'Regulated_by_Product',
                '2MQA':'Regulated_by_Product', '2MSA':'Regulated_by_Product', '2MSH':'Regulated_by_Product', '2MSL':'Regulated_by_Product', '2MSW':'Regulated_by_Product', '2MTH':'Regulated_by_Product',
                '2MTL':'Regulated_by_Product', '2MTW':'Regulated_by_Product', '2CGE':'Regulated_by_Product','2CGJ':'Regulated_by_Product', '2CGK':'Regulated_by_Product', '2CGN':'Regulated_by_Product',
                '2CGO':'Regulated_by_Product', '2CGS':'Regulated_by_Product', '2CGT':'Regulated_by_Product', '2OBC':'Regulated_by_Product', '2OBD':'Regulated_by_Product', '2OBE':'Regulated_by_Product',
                '2OBF':'Regulated_by_Product', '2OCH':'Regulated_by_Product', '2OCI':'Regulated_by_Product', '2OCJ':'Regulated_by_Product', '2OCL':'Regulated_by_Product', '2OCN':'Regulated_by_Product',
                '2OEH':'Regulated_by_Product', '2CIC':'Regulated_by_Product', '2CID':'Regulated_by_Product', '2CIE':'Regulated_by_Product', '2CIF':'Regulated_by_Product'}
    
    df = assignedregulatedlookupvalues(df,'Regulation by Product',products,'Product Code','Regulated_Status_Products','Regulated_Status_toc')

    #assign values based on an exceptional products mapping
    regulatedexception = {'2ADA':'Unregulated_by_Product','2BDY':'Unregulated_by_Product'}
    df = assignedregulatedlookupvalues(df,'Regulation by Exception',regulatedexception,'Product Code','Regulated_Status_exceptions','Regulated_Status_Products')

    #assign values based on class
    unregulatedclass = {'1':'Unregulated_by_Class'}
    df = assignedregulatedlookupvalues(df,'Regulation by Class',unregulatedclass,'class','Regulated_Status_class','Regulated_Status_exceptions')
    
    #assign values based on Profit Centre Code
    regulatedprofitcentrecode = {'EC':'Unregulated_by_PCC','HM':'Unregulated_by_PCC'}
    df = assignedregulatedlookupvalues(df,'Regulation by Profit Centre',regulatedprofitcentrecode,'Carrier TOC / Third Party Code','Regulated_Status_PCC','Regulated_Status_class')

    #create final copy of regulated column
    df['Regulated_Status'] = df['Regulated_Status_PCC']
    
    #convert the final column from intermediate labels to final labels
    df['Regulated_Status'].replace('Not Assigned','Unregulated', inplace=True)
    df['Regulated_Status'].replace('Unregulated_by_Product','Unregulated', inplace=True)
    df['Regulated_Status'].replace('Unregulated_by_Class','Unregulated', inplace=True)
    df['Regulated_Status'].replace('Unregulated_by_PCC','Unregulated', inplace=True)
    df['Regulated_Status'].replace('Regulated_by_TOC','Regulated', inplace=True)
    df['Regulated_Status'].replace('Regulated_by_Product','Regulated', inplace=True)


    #drop fields no longer required
    del df['ticketa']
    del df['flaga']
    del df['ticket']
    del df['flag2']

    # Amend the relevant data types
    df = applydatatypes(df,['Regulated_Status_Start',
                            'Regulated_Status_toc',
                            'Regulated_Status_Products',
                            'Regulated_Status_exceptions',
                            'Regulated_Status_class',
                            
                            'Regulated_Status'])
    return df
Beispiel #5
0
def add_lennon_fares_info(df, lookupdf, year, typeofjoin):
    """
    This merges the LENNON data with the superfile and adds a year reference to the column title

    Parameters:       
    df          - a dataframe containing the combined data which is to be joined against
    lookupdf    - a dataframe containing the relevant LENNON information
    year        - a string representing the year the data relates to
    typeofjoin  - a string representing whether the  join is for advanced or non-advanced data
    
    Returns:
    df          - a dataframe with LENNON data joined
    """
    # apply appropriate merge type based on name of join
    if typeofjoin == 'non-advanced':
        #the non-advanced join
        #exportfile(df,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"non-advanced input")
        df = pd.merge(left=df,
                      right=lookupdf,
                      how='left',
                      left_on=[
                          'Origin Code', 'Destination Code', 'Route Code',
                          'Product Code'
                      ],
                      right_on=[
                          'Origin Code', 'Destination Code', 'Route Code',
                          'Product Code'
                      ],
                      suffixes=('', '_LENNON' + year))

        df = applydatatypes(
            df,
            ['Origin Code', 'Destination Code', 'Route Code', 'Product Code'])

        #exportfile(lookupdf,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"non-advanced lookup")
        #exportfile(df,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"non-advanced output")

    elif typeofjoin == 'advanced':
        # advanced join
        # datatyping of key fields
        print("this is non advanced datatyping")
        df['Origin Code'] = df['Origin Code'].astype(str)
        df['Destination Code'] = df['Destination Code'].astype(str)
        df['Route Code'] = df['Route Code'].astype(str)
        df['class'] = df['class'].astype(str)

        #exportfile(df,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"advanced input")

        df = pd.merge(
            left=df,
            right=lookupdf,
            how='left',
            left_on=['Origin Code', 'Destination Code', 'Route Code', 'class'],
            right_on=[
                'Origin Code', 'Destination Code', 'Route Code', 'class'
            ],
            suffixes=('', '_LENNON' + year))

        df = applydatatypes(
            df, ['Origin Code', 'Destination Code', 'Route Code', 'class'])
        #exportfile(lookupdf,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"advanced lookup")
        #exportfile(df,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',"advanced output")

    else:
        print("Type of join not recognised")
        return None

    return df