Example #1
import pandas as pd

def main():
    #substitute these hard-coded paths with proper parameters in a function
    filelocation = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\advanced_and_non_advanced_output\\'
    outputto = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\advanced_and_non_advanced_output\\adv_non_advanced_and_superfile\\'

    print("getting superfile for weights")
    rawsuperfile = pd.read_csv(
        filelocation + 'superfile without regulated steps_20190425_16-04.csv',
        dtype={
            'Carrier TOC / Third Party Code': 'category',
            'Origin Code': 'category',
            'Destination Code': 'category',
            'Route Code': 'category',
            'Product Code': 'category',
            'Product Primary Code': 'category',
            'class': 'category',
            'sector': 'category',
            'ticket_type': 'category'
        })

    superfilefiltered = rawsuperfile[rawsuperfile['Category'] == 'season']

    print(type(superfilefiltered))
    print(superfilefiltered.head(5))
    print(superfilefiltered.info())

    # group by sector and product code (the second groupby key was blank in the
    # original; 'Product Code' is inferred from the export name below)
    groupedrawsuperfile = superfilefiltered.groupby(
        ['sector', 'Product Code'])['Adjusted Earnings Amount'].agg('sum')

    exportfile(groupedrawsuperfile, outputto,
               "superfile others by product code")
Example #2
def main():
    """
    This was written at Peter Moran's request to perform a comparison between the old and new "regulated fares sort" files.  It was used once and is now kept
    in case a further ad hoc update to this file is needed.

    Parameters:
    None, but the old and new lookup files are imported from disk.
    """
    
    lookupslocation = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\regulated_fares_data\\'
    destination = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\regulated_fares_data\\comparison output\\'
    lookupfileslist, count = getdata(lookupslocation)

    print(f"there are {count} files found.")

    newlookup = lookupfileslist[0]
    oldlookup = lookupfileslist[1]

    #join new to old // old to new
    new_uniquevalues = pd.merge(left=newlookup, right=oldlookup, how='left',
                                on=['orig', 'dest', 'route', 'ticket'])

    old_uniquevalues = pd.merge(left=newlookup, right=oldlookup, how='right',
                                on=['orig', 'dest', 'route', 'ticket'])

    print("These are values unique to new lookup")  
    new_uniquevalues = new_uniquevalues[new_uniquevalues.ticketa.isnull()]
    exportfile(new_uniquevalues,destination,'unique_new_values',1)

    print("These are values unique to old lookup")
    old_uniquevalues = old_uniquevalues[old_uniquevalues.new_flag.isnull()]
    exportfile(old_uniquevalues,destination,'unique_old_values',1)
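# A tidier way to isolate rows unique to one lookup is pandas' merge indicator,
# which avoids relying on a right-hand column such as 'ticketa' or 'new_flag'
# being null.  A minimal sketch, assuming the same join keys as above:
import pandas as pd

def unique_to_left(new_df, old_df, keys=('orig', 'dest', 'route', 'ticket')):
    """Return the rows of new_df whose keys do not appear in old_df."""
    merged = new_df.merge(old_df[list(keys)].drop_duplicates(), on=list(keys),
                          how='left', indicator=True)
    return merged[merged['_merge'] == 'left_only'].drop(columns='_merge')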
def set_template():
    outputgoesto = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\Template_preparation\\'
    yeartocalculate = 'January 2019'
    JanuaryRPI = 2.5

    #get max load_id; -1 to allow for replication of last year's test data
    lastyearsloadid = getmaxloadid(
        'NETL', 'factt_205_annual_Fares_Index_stat_release') - 1

    #get last year's data
    fares_index_sector_template = getDWdata(
        'NETL', 'factt_205_annual_Fares_Index_stat_release_test',
        lastyearsloadid)
    fares_index_tt_template = getDWdata(
        'NETL', 'factt_205_annual_Fares_Index_tt_stat_release_test',
        lastyearsloadid)

    #populate the template with blank entries for this year's data
    sector_template = set_blank_template(fares_index_sector_template,
                                         'ticket_category', JanuaryRPI)
    tt_template = set_blank_template(fares_index_tt_template, 'ticket_type',
                                     JanuaryRPI)

    #exportfile(sector_template,outputgoesto,"sector_template")
    #exportfile(tt_template,outputgoesto,"tt_template")

    #populate the template with new data, apart from passenger revenue
    sector_prep = populatetemplate(sector_template, 'ticket_category',
                                   outputgoesto, JanuaryRPI, yeartocalculate)
    tt_prep = populatetemplate(tt_template, 'ticket_type', outputgoesto,
                               JanuaryRPI, yeartocalculate)

    #get the data required for calculating the percentage change for passenger revenue
    revjourneyraw = get_journey_by_revenue()

    #exportfile(revjourneyraw,outputgoesto,"raw rev journey")
    #insert the index values into the template
    sector_prep = insertrevjourneydata(sector_prep, revjourneyraw)

    #get latest year on year change here

    exportfile(sector_prep, outputgoesto, "sector_template_populated")
    exportfile(tt_prep, outputgoesto, "tt_template_populated")

    #drop comparison column no longer needed
    del sector_prep['passrev_variance_from_last_year']

    #import data to warehouse
    importdatatoDW(sector_prep, 'NETL',
                   'factt_205_annual_Fares_Index_stat_release_test')
    importdatatoDW(tt_prep, 'NETL',
                   'factt_205_annual_Fares_Index_tt_stat_release_test')
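# set_blank_template is not shown in this listing.  A minimal sketch of what it
# might do, inferred only from its call sites above (template dataframe, the
# name of the category column, and the January RPI figure); the 'rpi' column
# is hypothetical, not confirmed by the source:
import pandas as pd

def set_blank_template(template, category_col, rpi):
    """Blank last year's numeric measures, keeping the category_col labels."""
    blank = template.copy()
    numeric_cols = blank.select_dtypes('number').columns
    blank[numeric_cols] = float('nan')  # clear measures for this year's data
    blank['rpi'] = rpi                  # hypothetical column holding the RPI
    return blank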
def parse_rdg_data(data1, data2,year):
    """
    This function splits two lists of lists by character indices and converts them to data frames.
   
    Parameters:
    - data1: A list of lists containing raw flow data
    - data2: a list of lists containing raw fare data
    - year: an integer giving the year the RDG data relates to.
    
    Returns:
    - flow: a dataframe containing parsed flow data
    - fare_record: a dataframe containing parsed fare data
    """
    flow_data = list()
    fare_record_data = list()

    for line in data1:
        line = [line[2:6], line[6:10], line[10:15], line[15:18], line[18],
                line[19], line[36:39], line[20:28], line[28:36], line[42:49]]
        flow_data.append(line)

    flow = pd.DataFrame(flow_data,
                        columns=["ORIGIN_CODE", "DESTINATION_CODE", "ROUTE_CODE",
                                 "STATUS_CODE", "USAGE_CODE", "DIRECTION", "TOC",
                                 "VALID_UNTIL", "VALID_FROM", "FLOW_ID"])
    flow['ROUTE_CODE'] = flow['ROUTE_CODE'].astype(object)
    flow.index.name = "flow_idx"

    for line in data2:
        line = [line[2:9], line[9:12], line[12:20]]
        fare_record_data.append(line)

    fare_record = pd.DataFrame(fare_record_data,
                               columns=["FLOW_ID", "TICKET_CODE", "FARE"])

    #placeholder: where FARE info is missing its final zero, a zero is added
    if year == 2019:
        #if ticket code = x, then append 0 to the fare value
        pass

    #temporary export for debug
    print("Flow described/n")
    print(flow.info())

    print("Fare described/n")
    print(fare_record.info())

    exportfile(flow,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\',f"flow_info_{year}")
    exportfile(fare_record,'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\', f"fare_info_{year}")
    #end of temporary export for debug

    fare_record.index.name = "fare_record_idx"

    return flow,fare_record
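# How the slicing in parse_rdg_data maps onto a record: a self-contained
# illustration using a synthetic flow line (field positions follow the slices
# above; the values themselves are made up):
record = ("XX" + "1072" + "3087" + "00000" + "000" + "A" + "R"
          + "31122999" + "01012019" + "ABC" + "XXX" + "0123456")
print(record[2:6])    # ORIGIN_CODE      -> '1072'
print(record[6:10])   # DESTINATION_CODE -> '3087'
print(record[20:28])  # VALID_UNTIL     -> '31122999'
print(record[42:49])  # FLOW_ID         -> '0123456'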
def main():
    """
    This is the main entry point for the creation of the superfile, advanced and non-advanced files from the raw data and lookups.
    This calls various other functions to export the prepared files to be used in main_2 - get_results

    Parameters:
    NONE

    Returns:
    NONE, but exports the following CSV files:
    'superfile without regulated steps': The superfile, not including intermediate columns of the regulated fares process.  To be used in the 2nd stage.
    'advancedfile': The advanced file.  To be used in the 2nd stage.
    'nonadvancedfile': The non-advanced file.  To be used in the 2nd stage.

    """

    #parameters to be edited depending on users' file set up
    root = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\'
    originpath = root + 'TOC_files\\'
    regulatedfarespath = root + 'regulated_fares_data\\'
    RDGfarespath = root + 'RDG_Fares_information\\'  
    LENNONnonadvancedfarespath = root + 'LENNON_Fares_information\\non_advanced_data\\' 
    LENNONadvancedfarepath = root + 'LENNON_Fares_information\\advanced_data\\'
    categorypath = root + 'Product_Category_information\\ProdCatLookup\\'
    destinationpath = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\'

    # the calculation of the root 'superfile'
    superfile = generatedata(originpath,destinationpath,regulatedfarespath,categorypath)
    

    ##copies of superfile, so superfile remains unamended for the various other functions
    superfileforadvanced = superfile.copy()
    superfilefornonadvanced = superfile.copy()
    superfileforrailfinance = superfile.copy()
    
    #export superfile
    print("the superfile is coming....\n")
    exportfile(superfile,destinationpath,'superfile without regulated steps')
    
    #extraction of summed earnings and journeys for check of initial TOC extraction
    totalscheck = superfile.groupby(['Carrier TOC / Third Party Code'])[['Adjusted Earnings Amount', 'Operating Journeys']].agg('sum')
    exportfile(totalscheck,destinationpath,'sum_of_earnings_and_journies_by_toc')

    print("the advanced data is coming.... main\n")
    advanced = get_advanced_data(superfileforadvanced,destinationpath,LENNONadvancedfarepath)
    exportfile(advanced,destinationpath,'advancedfile')
     
    ##calculation of non-advanced data prior to manual validation and advantix data
    print ("The non-advanced data is coming....")
    nonadvanced = get_non_advanced_data(superfilefornonadvanced,destinationpath,RDGfarespath,LENNONnonadvancedfarespath)
    exportfile(nonadvanced,destinationpath,'nonadvancedfile')
    
    print("The data for rail financials is being prepared")
    getrailfinancial(superfileforrailfinance,destinationpath)
def getcategorylookup(df,filepath, filename,destinationpath):
    """
    This procedure loads category lookup data from an Excel file, conforms the format of the fields to be returned, and joins it to the main file.  Non-matches
    are categorised as 'Missing' and those rows are exported to a logging file.  Categorical datatyping is applied to the new fields.

    Parameters:
    df                  - a dataframe containing the 'superfile'
    filepath            - a string containing the file path for the excel file
    filename            - a string containing the name of the excel file
    destinationpath     - a string containing the file path for the output to be sent

    Returns:
    df                  - a dataframe containing the 'superfile' with category information
    """
    print("Category Lookup Data being loaded \n")
    records = pd.read_excel(filepath + filename, sheet_name='Sheet1')

    savtodf = records

    #define column names
    savtodf.columns = ['Product Code','Product Primary Code','Category']
    
    #force all categories to lower case
    savtodf['Category'] = savtodf['Category'].str.lower()

    print("Category Information being added\n")
    df = pd.merge(df,savtodf,how='left', left_on=['Product Code','Product Primary Code'],right_on=['Product Code','Product Primary Code'])

    #handle missing category information
    df['Category'].fillna('Missing',inplace=True)
    nonmatches = df[df['Category']=='Missing']
    unique_filtered_nonmatches = nonmatches[['Product Code','Product Primary Code']].copy().drop_duplicates()
    exportfile(unique_filtered_nonmatches,destinationpath, 'missing_categories')

    #apply datatyping
    df = applydatatypes(df,['Product Code','Product Primary Code','Category'])
    
    #remove the reference to the lookup dataframe
    del savtodf

    return df
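# applydatatypes is not shown in this listing.  Judging from how it is called
# throughout these examples (a dataframe plus a list of column names), a
# minimal sketch might look like this -- an assumption, not the confirmed
# implementation:
import pandas as pd

def applydatatypes(df, columns):
    """Convert the named columns to the memory-efficient category dtype."""
    for col in columns:
        df[col] = df[col].astype('category')
    return df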
Example #7
def lastminutechanges(df):
    """
    This is the location to make any last-minute changes to the advanced/non-advanced dataset prior to the calculations.
    These changes tend to be diagnostic in character, usually around the assignment of product codes to regulated/unregulated status.
    All origin and destination codes that contain an alphabetical character are removed.
    Selected product codes are removed (refunds to season tickets?).

    Parameters
    df:     A dataframe containing the merged advanced and non advanced data

    Returns:
    df:     An amended dataframe
    Exports various cuts and summed groups of advanced and non advanced data
    
    
    """
    fileoutputpath = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\advanced_and_non_advanced_output\\adv_non_advanced_and_superfile\\'

    #keep only origin and destination codes containing four consecutive digits (drops codes with alphabetic characters)
    df = df[df['Origin Code'].str.contains(r'\d{4}')]
    df = df[df['Destination Code'].str.contains(r'\d{4}')]

    #remove these specific product codes: defined by Peter Moran by unknown process
    productcodestoremove = ['2MTC', '2MTD', '2MTF', '2MTG']
    df = df[~df['Product Code'].isin(productcodestoremove)]

    #export for Nisha to test if reassignment has worked
    filtereddf = df[df.Category == 'season']
    groupedadvandnonadvanced = filtereddf.groupby(
        ['Category', 'Product Code', 'sector', 'class',
         'Regulated_Status'])['Weightings'].agg('sum')
    exportfile(groupedadvandnonadvanced, fileoutputpath,
               "advandnonadv grouped")

    #export for Nisha to see whether season unregulated reassignment has worked
    subcutofdataseasonUnregulated = df[(df['Category'] == 'season') & (
        df['Regulated_Status'] == 'Unregulated')]
    exportfile(subcutofdataseasonUnregulated, fileoutputpath,
               'seasonunregulated')

    return df
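# A note on the digit filter above: str.contains(r'\d{4}') keeps any code with
# four consecutive digits anywhere in it, so a value like 'A1234' survives.  If
# the intent is strictly four-digit numeric codes, str.fullmatch is stricter.
# A self-contained comparison on synthetic codes:
import pandas as pd

codes = pd.Series(['1234', 'A123', '12345', 'A1234'])
print(codes[codes.str.contains(r'\d{4}')])   # '1234', '12345', 'A1234'
print(codes[codes.str.fullmatch(r'\d{4}')])  # '1234' only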
def main():
    # This is intended as an independent module from the rest of the Fares Index process.
    # The intent is to identify duplicate flow IDs, which are then deleted or retained in the wider RDG file used by the rest of the process.

    #parameters to be edited depending on users' file set up
    root = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\'
    originpath = root + 'TOC_files\\'
    regulatedfarespath = root + 'regulated_fares_data\\'
    RDGfarespath = root + 'RDG_Fares_information\\'
    LENNONnonadvancedfarespath = root + 'LENNON_Fares_information\\non_advanced_data\\'
    LENNONadvancedfarepath = root + 'LENNON_Fares_information\\advanced_data\\'
    categorypath = root + 'Product_Category_information\\ProdCatLookup\\'
    destinationpath = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\'

    RDGprices2019 = get_rdg_prices_info(RDGfarespath, '2019 fares extract.txt',
                                        destinationpath, 'prices2019.csv',
                                        '2019', True)

    RDGprices2020 = get_rdg_prices_info(RDGfarespath, '2020 fares extract.txt',
                                        destinationpath, 'prices2020.csv',
                                        '2020', True)

    exportfile(RDGprices2019, destinationpath, "final RDG for 2019")
    exportfile(RDGprices2020, destinationpath, "final RDG for 2020")
def main():
    """
    This was written as a 'quick and dirty' product code check for Peter Moran

    """
    df = pd.read_csv(
        'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\rawsuperfile_20190305_12-57.csv'
    )

    #for each product code of interest, sum regulated adjusted earnings by flow
    outputpath = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexOutput\\'
    for code in ['2AAA', '2BAF', '2BFP']:
        regulated = df[(df['Product Code'] == code)
                       & (df['Regulated_Status'] == 'Regulated')]
        summed = regulated.groupby(
            ['Product Code', 'Origin Code',
             'Destination Code'])['Adjusted Earnings Amount'].agg('sum')
        exportfile(summed, outputpath, f'{code.lower()} - pm.csv')
def get_rdg_prices_info(infilepath,infilename,outfilepath,outfilename,year,excludeflowid = False):
    """
    A zip file is downloaded from the RDG website.  Once extracted, the file named "RJFAF174.FFL" should be opened in Notepad++ and saved as a .txt file;
    the only relevant file is the one with the .FFL extension.
    This procedure gets the RDG .txt file, splits it into flow and fare_price information dataframes, and combines them into
    a joined csv file, which has a lookup to add LENNON ticket codes for later use in the LENNON-based superfile.

    Parameters:
    infilepath      - a string containing the filepath location of the RDG file
    infilename      - a string containing the filename of the RDG file
    outfilepath     - a string containing the destination of the combined file
    outfilename     - a string containing the file name of the combined file
    year            - a string representing the year the prices info relates to
    excludeflowid   - a boolean representing whether duplicated flow ids should be excluded or not

    Returns:
    combined_data   - a dataframe containing the confirmed prices data.
    """
    
    print(f"getting RDG prices data for {year} \n ")
    flow_list, fare_list = extract_rdg_data(infilepath,infilename)
    
    print("splitting the data into flow and fares \n")
    flow_df, fares_df = parse_rdg_data(flow_list, fare_list,year)

    print("replacing the outofbounds date values \n ")
    #replacing the outofbounds date value 31122999 with 31122100
    flow_df['VALID_UNTIL'].replace(['31122999'],['31122100'],inplace=True)
    
    print("converting the valid_until into date format \n")
    #formatting the valid_until field as a date format
    flow_df['DATE_VALID_UNTIL'] = flow_df['VALID_UNTIL'].apply(lambda x: pd.to_datetime(str(x),format='%d%m%Y'))

    #remove rows where the Valid_Until date !=  the max value of Valid_Until
    idx = flow_df.groupby(['ORIGIN_CODE','DESTINATION_CODE','ROUTE_CODE'])['DATE_VALID_UNTIL'].transform(max) == flow_df['DATE_VALID_UNTIL']
    flow_df = flow_df[idx]

    print("exporting the flow and fares with separate info\n")
    #exportfile(flow_df,outfilepath,'flow_info_'+ year)

    #joining the flow and fares information
    print("joining flow and fares information\n")
    combined_data = pd.merge(flow_df,fares_df, on='FLOW_ID')
    combined_data.reset_index(drop=True, inplace=True)
    combined_data.index.name="FLOW_AND_FARES_INDEX"

    #temporary export of combined 
    exportfile(combined_data,outfilepath,f"flow_and_fares_combined_no_lennon_{year}")

    #superseded by the VALID_UNTIL de-duplication above
    #add the filter for given year for flow_id to remove duplicate flow id information
    #combined_data_no_duplicates = removeRDGduplicates(combined_data, year,excludeflowid)

    #reading in the lookup value for the LENNON codes lookup
    lookupinfo = pd.read_excel(infilepath +'Lennon_product_codes_and_Fares_ticket_types_2017.xlsx','Fares to Lennon coding')
    
    ##join lookupinfo with Lennon keys
    combined_data_with_lennon = pd.merge(combined_data,lookupinfo,'left',left_on=['TICKET_CODE'],right_on=['Fares ticket type code'])

    # remove duplicates where fares are the same
    combined_data_with_lennon.drop_duplicates(subset=['ORIGIN_CODE','DESTINATION_CODE','ROUTE_CODE','TICKET_CODE','FARE'],keep='first',inplace=True)
    
    #flag up duplicates where fares are different
    flowandfaresduplicateflag = combined_data_with_lennon.duplicated(subset=['ORIGIN_CODE','DESTINATION_CODE','ROUTE_CODE','TICKET_CODE'],keep=False)
    duplicateswithdifferentfares = combined_data_with_lennon[flowandfaresduplicateflag]
    exportfile(duplicateswithdifferentfares,outfilepath,"Duplicates with different fares in flow and fares file for_" + year)

    ##temp export of combined_data_with_Lennon
    exportfile(combined_data_with_lennon,outfilepath,"combined_flow_and_fares_with_lennon_"+year)

    ##return the completed file

    return combined_data_with_lennon
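# The de-duplication step above keeps, for each (origin, destination, route)
# group, only rows whose VALID_UNTIL equals the group's maximum.  A
# self-contained illustration of the groupby/transform pattern on toy data:
import pandas as pd

toy = pd.DataFrame({'orig': ['A', 'A', 'B'],
                    'dest': ['B', 'B', 'C'],
                    'valid_until': pd.to_datetime(
                        ['2020-01-01', '2021-01-01', '2020-06-01'])})
latest = toy.groupby(['orig', 'dest'])['valid_until'].transform('max')
print(toy[toy['valid_until'] == latest])  # the 2020-01-01 A->B row is dropped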
def generatedata(originpath,destinationpath,regulatedfarespath,categorypath):
    """
    This joins a number of CSV source files into a common dataframe; handles the cases of sub profit centres being used in place of profit centres;
    and maps sector, ticket_type, class, regulated status and category information onto the combined file.  This is then used to calculate the advanced and non-advanced data.

    Parameters:
    originpath          - A string specifying the location where the individual TOC files are found
    destinationpath     - A string specifying the location of where output should be exported
    regulatedfarespath  - A string specifying the location of the lookup file for regulated fares file
    categorypath        - A string specifying the location of the lookup file for category

    Returns:
    superfile           - A dataframe consisting of the combined and mapped data.
    """
    list_of_tocs,filecount = getdata(originpath)

    joinedfile = combinefiles(list_of_tocs,filecount)

    if filecount > 50:
        print("As you are processing a large number of files, this may cause the PC to freeze or crash due to memory issues.\n")
        print("If this happens, restart the computer, then close down IE, Outlook and any other memory/resource-hungry applications and try again.\n")

    superfile = joinedfile.copy()

    #keep only product codes of the form 1XXX or 2XXX (a digit then three capital letters)
    superfile = superfile[superfile['Product Code'].str.contains(r'[12][A-Z]{3}', regex=True)]

    #fields to convert to categorical data type
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Product Code','Product Primary Code'])

    #mapping of lookups starts here
    #mapping of sectors
    print("mapping sectors within superfile\n")
    sector_mapping = {'EK':'Lon SE','HO':'Lon SE','HQ':'Lon SE','HS':'Lon SE','HT':'Lon SE','HU':'Lon SE','HW':'Lon SE','HY':'Lon SE','HZ':'Lon SE','EX':'Lon SE',
                        'EA':'Regional','EI':'Regional','EJ':'Regional','HA':'Regional','HC':'Regional','HD':'Regional','HE':'Regional','HL':'Regional','ES':'Regional',
                        'EC':'Long D','EH':'Long D','HB':'Long D','HF':'Long D','HI':'Long D','HK':'Long D','HM':'Long D'}
    superfile = assignlookupvalues(superfile,'sectors', sector_mapping, "Carrier TOC / Third Party Code",  'sector' ,destinationpath)

    #mapping of tickettypes
    print("mapping ticket types within superfile\n")
    tickettypemapping = {'PG01':'Full','PG05':'Full','PG02':'Reduced','PG03':'Reduced','PG06':'Reduced','PG07':'Reduced','PG04':'Season','PG08':'Season'}
    superfile = assignlookupvalues(superfile,'ticket_type',tickettypemapping,"Product Primary Code",'ticket_type',destinationpath,'Other')
    
    #mapping of ticket classes
    print("mapping ticket classes within superfile\n")
    classmapping = {'1':'1', '2':'2','9':'2'}
    superfile['classreference'] = superfile['Product Code'].str[0]
    superfile = assignlookupvalues(superfile,'class',classmapping,'classreference','class',destinationpath,'2')
    del superfile['classreference']

    print("mapping regulated status within superfile\n")
    #getting the regulated fares lookup to add flag_2 information for faretypes
    superfile = regulatedfarelookup(regulatedfarespath,superfile )
    
    #setting rows as regulated/unregulated fares here   
    superfile = setregulatedfares(superfile,destinationpath)

    #mapping of categories
    print("mapping categories within superfile\n")
    superfile = getcategorylookup(superfile,categorypath,'Product_category_lookup_2020_v1.xlsx',destinationpath)

    #dropping columns no longer needed
    superfile = superfile.drop(['orig','dest','route'], axis=1)

    #apply final superfile datatyping
    superfile = applydatatypes(superfile,['Carrier TOC / Third Party Code','Origin Code','Destination Code','Route Code','Product Code','sector','ticket_type','class','Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC','Regulated_Status','Category'])

    #export full superfile for later testing of regulated status setting, if needed
    exportfile(superfile,destinationpath,"superfile with full regulated data")
   
    #delete the surplus Regulated status columns
    superfile = superfile.drop(['Regulated_Status_Start','Regulated_Status_toc','Regulated_Status_Products','Regulated_Status_exceptions','Regulated_Status_class','Regulated_Status_PCC'], axis=1)

    #producing distinct list of product codes with their assigned regulated status
    regulatedcheck = superfile[['Product Code','Product Primary Code','Regulated_Status']].drop_duplicates()
    exportfile(regulatedcheck,destinationpath,"regulated products check")

    return superfile
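# assignlookupvalues is not shown in this listing.  From its call sites (a
# dataframe, a lookup name, a mapping dict, a source column, a target column,
# an output path and an optional default) a minimal sketch could be -- an
# assumption, not the confirmed implementation:
import pandas as pd

def assignlookupvalues(df, lookupname, mapping, sourcecol, targetcol,
                       destinationpath, default=None):
    """Map sourcecol through the dict into targetcol, filling non-matches."""
    df[targetcol] = df[sourcecol].map(mapping)
    if default is not None:
        df[targetcol] = df[targetcol].fillna(default)
    # non-matching rows could be exported to destinationpath for logging
    return df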
Example #12
def main():
    """
    This is the second stage of the fares index process, which imports the advanced, non-advanced and superfile CSV files.  The advanced and non-advanced files are combined and then joined in turn to the superfile.
    One function mimics the SUMPRODUCT function of Excel; a second function runs over the dataframe from the previous function and then combines the answers to produce the final answerset.

    Parameters
    None:       But it does import adv, nonadv and superfile from file locations as CSV and converts them to dataframes

    Returns:
    None:       But it does export a csv file containing the final answerset.
    """

    #define where the files are and where they will go
    filelocation = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\advanced_and_non_advanced_output\\'
    outputto = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\advanced_and_non_advanced_output\\adv_non_advanced_and_superfile\\'
    final_answerto = 'C:\\Users\\gwilliams\\Desktop\\Python Experiments\\work projects\\FaresIndexSourceData\\Template_preparation\\'

    print("getting advanced data\n")
    advanceddata = pd.read_csv(
        filelocation + 'advancedfile_20200409_10-15.csv',
        dtype={
            'Carrier TOC / Third Party Code': 'category',
            'Origin Code': 'category',
            'Destination Code': 'category',
            'Route Code': 'category',
            'Product Code': 'category',
            'Product Primary Code': 'category',
            'class': 'category',
            'sector': 'category'
        })

    print("Getting non-advanced data\n")
    nonadvanceddata = pd.read_csv(
        filelocation + 'nonadvancedfile_20200409_10-41.csv',
        dtype={
            'Carrier TOC / Third Party Code': 'category',
            'Origin Code': 'category',
            'Destination Code': 'category',
            'Route Code': 'category',
            'Product Code': 'category',
            'Product Primary Code': 'category',
            'class': 'category',
            'sector': 'category'
        })

    print("getting superfile for weights")
    rawsuperfile = pd.read_csv(
        filelocation + 'superfile without regulated steps_20200409_10-10.csv',
        dtype={
            'Carrier TOC / Third Party Code': 'category',
            'Origin Code': 'category',
            'Destination Code': 'category',
            'Route Code': 'category',
            'Product Code': 'category',
            'Product Primary Code': 'category',
            'class': 'category',
            'sector': 'category',
            'ticket_type': 'category'
        })

    print("preparing the superfile...\n")
    preparedsuperfile = preparesuperfile(rawsuperfile)

    #join the advanced and nonadvanced data
    print("data joined.  showing metadata\n")
    advandnonadv = joinadvandnonadv([advanceddata, nonadvanceddata])

    #sort the advandnonadv and superfile by common fields so they match up when paired later
    advandnonadv.sort_values(
        by=['sector', 'class', 'Category', 'Regulated_Status'],
        ascending=True,
        inplace=True)
    preparedsuperfile.sort_values(
        by=['sector', 'class', 'Category', 'Regulated_Status'],
        ascending=True,
        inplace=True)

    #calculate the weighted averages by sector, class, category and regulated_status
    answergrid = calc_weighted_average_price_change(
        advandnonadv, preparedsuperfile,
        ['sector', 'class', 'Category', 'Regulated_Status'])

    #change name of weighted_price_change
    answergrid.rename(columns={answergrid.columns[0]: 'weighted_price_change'},
                      inplace=True)

    #remove the group superweights where the sum is zero
    answergrid = answergrid.drop(
        answergrid[answergrid['Weightings_super'] == 0].index)

    #flatten the answergrid
    answergrid.columns = [
        ''.join(col).strip() for col in answergrid.columns.values
    ]
    answergrid = answergrid.reset_index()

    #wpc * superweightings
    answergrid['wpc_and_weights'] = answergrid[
        'weighted_price_change'] * answergrid['Weightings_super']
    print("this is the answergrid\n")

    exportfile(answergrid, outputto, "answerfile")

    #calculate the final set of group splits from the answer file as separate dataframes
    sectorsplit1 = calc_final(answergrid, ['sector'], 'sector1')
    sectorsplit2 = calc_final(answergrid, ['sector'], 'sector2')
    classsplit = calc_final(answergrid, ['class'], 'class')
    sectorclasssplit = calc_final(answergrid, ['sector', 'class'],
                                  'sector and class')
    regulatedstatussplit = calc_final(answergrid, ['Regulated_Status'],
                                      'regulation')
    categorysplit = calc_final(answergrid, ['Category'], 'category')
    sectorcategorysplit = calc_final(answergrid, ['sector', 'Category'],
                                     'sector and category')
    sectorclassregulatedstatus = calc_final(
        answergrid, ['sector', 'class', 'Regulated_Status'],
        'sector, class and regulation')
    classregulatedstatus = calc_final(answergrid,
                                      ['class', 'Regulated_Status'],
                                      'class and regulation')

    #create a nosplit calcfinal and add to the list of final answer subsets below
    listoffinalanswersubsetnames = [
        'sectorsplit1', 'sectorsplit2', 'classsplit', 'sectorclasssplit',
        'regulatedstatussplit', 'categorysplit', 'sectorcategorysplit',
        'sectorclassregulatedstatus', 'classregulatedstatus'
    ]
    listoffinalanswersubsets = [
        sectorsplit1, sectorsplit2, classsplit, sectorclasssplit,
        regulatedstatussplit, categorysplit, sectorcategorysplit,
        sectorclassregulatedstatus, classregulatedstatus
    ]

    dictoffinalanswersubset = dict(
        zip(listoffinalanswersubsetnames, listoffinalanswersubsets))

    #combine the group splits as one dataframe
    combined_answers_data = pd.concat([
        sectorsplit1, sectorsplit2, classsplit, sectorclasssplit,
        regulatedstatussplit, categorysplit, sectorcategorysplit,
        sectorclassregulatedstatus, classregulatedstatus
    ])

    #for names,subsets in dictoffinalanswersubset.items():
    #    print(names + "\n")
    #    print(subsets)
    #    print("\n")

    #rename column headers
    combined_answers_data.index.rename("parts_of_the_grouping", inplace=True)
    combined_answers_data.columns = [
        'grouping_name', 'average_price_change', 'superweights',
        'percentage_share_of_superweights_in_grouping'
    ]

    #end the process by exporting the final answer

    exportfile(combined_answers_data, final_answerto, "final answerset")
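# calc_weighted_average_price_change is the Excel-SUMPRODUCT analogue that the
# docstring mentions.  Its core idea -- a weighted mean of price changes within
# each group -- can be sketched as below (names are illustrative, not the
# confirmed implementation):
import pandas as pd

def weighted_average(df, groupcols, valuecol, weightcol):
    """SUMPRODUCT(values, weights) / SUM(weights) within each group."""
    def wavg(group):
        return (group[valuecol] * group[weightcol]).sum() / group[weightcol].sum()
    return df.groupby(groupcols).apply(wavg)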
def get_rdg_prices_info(infilepath,
                        infilename,
                        outfilepath,
                        outfilename,
                        year,
                        excludeflowid=False):
    """
    This procedure gets the RDG .txt file and splits it into flow and fare_price information dataframes.
    The flow dataset has a VALID_UNTIL field whose out-of-range date value (31122999) is replaced with an in-range one (31122100).
    VALID_UNTIL is then used to de-duplicate the flow information by removing rows where it is not the highest value for the flow.
    It then combines the fares and flow into a joined csv file, which has a lookup to add LENNON ticket codes for later use in the LENNON-based superfile.

    Parameters:
    infilepath      - a string containing the filepath location of the RDG file
    infilename      - a string containing the filename of the RDG file
    outfilepath     - a string containing the destination of the combined file
    outfilename     - a string containing the file name of the combined file
    year            - a string representing the year the prices info relates to
    excludeflowid   - a boolean representing whether duplicated flow ids should be excluded or not

    Returns:
    combined_data   - a dataframe containing the confirmed prices data.
    """

    print(f"getting RDG prices data for {year} \n ")
    flow_list, fare_list = getdata(infilepath, infilename)

    print("splitting the data into flow and fares\n")
    flow_df, fares_df = splitter(flow_list, fare_list)

    print("replacing the outofbounds date values \n ")
    #replacing the outofbounds date value 31122999 with 31122100
    flow_df['VALID_UNTIL'].replace(['31122999'], ['31122100'], inplace=True)

    print("converting the valid_until into date format \n")
    #formatting the date valid until
    flow_df['DATE_VALID_UNTIL'] = flow_df['VALID_UNTIL'].apply(
        lambda x: pd.to_datetime(str(x), format='%d%m%Y'))

    #remove rows where the Valid_Until date !=  the max value of Valid_Until
    idx = flow_df.groupby([
        'ORIGIN_CODE', 'DESTINATION_CODE', 'ROUTE_CODE'
    ])['DATE_VALID_UNTIL'].transform(max) == flow_df['DATE_VALID_UNTIL']
    flow_df = flow_df[idx]

    print("exporting the flow and fares with separate info\n")
    exportfile(flow_df, outfilepath, 'flow_info_' + year)
    exportfile(fares_df, outfilepath, 'fares_info_' + year)

    #joining the flow and fares information
    print("joining flow and fares information\n")
    combined_data = pd.merge(flow_df, fares_df, on='FLOW_ID')
    combined_data.reset_index(drop=True, inplace=True)
    combined_data.index.name = "FLOW_AND_FARES_INDEX"

    #superseded by the VALID_UNTIL de-duplication above
    #combined_data_no_duplicates = removeRDGduplicates(combined_data, year,excludeflowid)

    #reading in the lookup value for the LENNON codes lookup
    lookupinfo = pd.read_excel(
        infilepath + 'Lennon_product_codes_and_Fares_ticket_types_2017.xlsx',
        'Fares to Lennon coding')

    ##join lookupinfo with Lennon keys
    combined_data_with_lennon = pd.merge(combined_data,
                                         lookupinfo,
                                         'left',
                                         left_on=['TICKET_CODE'],
                                         right_on=['Fares ticket type code'])

    # remove duplicates where fares are the same
    combined_data_with_lennon.drop_duplicates(subset=[
        'ORIGIN_CODE', 'DESTINATION_CODE', 'ROUTE_CODE', 'TICKET_CODE', 'FARE'
    ],
                                              keep='first',
                                              inplace=True)

    #flag up duplicates where fares are different
    flowandfaresduplicateflag = combined_data_with_lennon.duplicated(
        subset=[
            'ORIGIN_CODE', 'DESTINATION_CODE', 'ROUTE_CODE', 'TICKET_CODE'
        ],
        keep=False)
    duplicateswithdifferentfares = combined_data_with_lennon[
        flowandfaresduplicateflag]
    exportfile(
        duplicateswithdifferentfares, outfilepath,
        "Duplicates with different fares in flow and fares file for_" + year)

    ##return the completed file
    return combined_data_with_lennon
def get_non_advanced_data(df, destinationpath, RDGfarespath, LENNONfarespath):
    """
    This function takes the combined file as a data frame, adds prepared RDG price data, then adds LENNON price data to fill gaps in the RDG data.
    Rows with no price information for either year are then removed from the combined file.
    The 'Adjusted Earnings Amount' column is renamed 'Weightings'.
    NULL and zero information is removed before a percentage change calculation is made.
    Rows where the percentage change is less than -20% and Weightings < £500,000 are extracted into a separate data frame, exported as 'little changes' for manual data validation.
    Rows where the percentage change is more than 20% and Weightings < £500,000 are extracted into a separate data frame, exported as 'big changes' for manual validation.
    Rows where Weightings are greater than £500,000 are extracted into 'big earners', exported for avantix data to be added.
    The remaining rows of the populated file are exported as populated data.
    
    Parameters:
    df                  - A dataframe containing a combined file of TOC information with dimension information
    destinationpath     - A string containing the location where output should be sent.
    RDGfarespath        - A string containing the location of the RDG lookup information
    LENNONfarespath     - A string containing the location of the LENNON lookup information

    Returns:
    coredata            - a dataframe of the populated non-advanced data.
    It also exports a file of "little changes" for manual data validation,
    a file of "big changes" for manual data validation,
    a file of "big earners" for manual addition of avantix data,
    and a file of populated data.
    """

    #remove all advanced fares
    df = df[df['Category'] != 'advance']

    print("Starting to get RDG data\n")
    RDGprices2019 = get_rdg_prices_info(RDGfarespath, '2019 fares extract.txt',
                                        destinationpath, 'prices2019.csv',
                                        '2019', False)

    RDGprices2020 = get_rdg_prices_info(RDGfarespath, '2020 fares extract.txt',
                                        destinationpath, 'prices2020.csv',
                                        '2020', False)

    print("about to merge RDG info into main dataset.\n")

    #merging RDG fares information
    df = addRDGfaresinfo(df, RDGprices2019, '_2019')
    df = addRDGfaresinfo(df, RDGprices2020, '_2020')

    #exportfile(df,destinationpath,'non_advanced_data_after_RDG')

    print("datatyping of key columns\n")
    #datatyping
    #df['Origin Code'] = df['Origin Code'].str.zfill(4)
    #df['Destination Code'] = df['Destination Code'].str.zfill(4)
    #df['Route Code'] = df['Route Code'].str.zfill(5)

    #converting RDG fares to numeric
    print("convert rdg fares to numeric\n")
    df[['RDG_FARES_2019',
        'RDG_FARES_2020']] = df[['RDG_FARES_2019',
                                 'RDG_FARES_2020']].apply(pd.to_numeric)

    #getting LENNON fare information
    print("getting non-advanced LENNON information\n")
    LENNONprices2019 = get_lennon_price_info('2019', LENNONfarespath,
                                             'pricefile_nonadvanced_2019.csv',
                                             'non-advanced')
    LENNONprices2020 = get_lennon_price_info('2020', LENNONfarespath,
                                             'pricefile_nonadvanced_2020.csv',
                                             'non-advanced')

    #merging LENNON fares information
    print("merging non-advanced LENNON information with non-advanced file\n")
    df = add_lennon_fares_info(df, LENNONprices2019, '_2019', 'non-advanced')
    df = add_lennon_fares_info(df, LENNONprices2020, '_2020', 'non-advanced')

    #replace empty RDG data with LENNON data
    df['RDG_FARES_2019'].fillna(df['LENNON_PRICE_2019'], inplace=True)
    df['RDG_FARES_2020'].fillna(df['LENNON_PRICE_2020'], inplace=True)

    #drop unnecessary columns: these column headers are derived from the source file
    del df['LENNON_PRICE_2019']
    del df['LENNON_PRICE_2020']

    # rename the RDG Fares Columns, Earnings and axis
    df.rename(columns={
        'RDG_FARES_2019': 'FARES_2019',
        'RDG_FARES_2020': 'FARES_2020',
        'Adjusted Earnings Amount': 'Weightings'
    },
              inplace=True)
    df = df.rename_axis('index')

    #filters non-advanced where earnings are over £500,000
    bigearners = df.query('Weightings > 500000')

    #drop rows where FARES are NaN or 0
    populated2019and2020 = handlezeroandnulls(df)

    #add percentagechange to populated file
    populated2019and2020 = percentagechange(populated2019and2020, 'FARES_2020',
                                            'FARES_2019')

    #this is for validation of large percentage changes; amended data is added back to coredata later
    bigchange = populated2019and2020.query(
        'percentage_change > 20.0 and Weightings < 500000')
    littlechange = populated2019and2020.query(
        'percentage_change < -20.0 and Weightings < 500000')

    #not filtering at this stage anymore
    coredata = populated2019and2020.copy()

    #export diagnostic files
    exportfile(bigchange.sort_values('Weightings', ascending=False),
               destinationpath, 'big_change_file')
    exportfile(littlechange.sort_values('Weightings', ascending=False),
               destinationpath, 'little_change_file')
    exportfile(bigearners.sort_values('Weightings', ascending=False),
               destinationpath, 'big_earners_file')

    return coredata
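# percentagechange and handlezeroandnulls are not shown in this listing.  From
# their call sites above, minimal sketches might be -- assumptions, not the
# confirmed implementations:
import pandas as pd

def handlezeroandnulls(df):
    """Drop rows where either year's fare is missing or zero."""
    df = df.dropna(subset=['FARES_2019', 'FARES_2020'])
    return df[(df['FARES_2019'] != 0) & (df['FARES_2020'] != 0)]

def percentagechange(df, newcol, oldcol):
    """Add a percentage_change column: (new - old) / old * 100."""
    df['percentage_change'] = (df[newcol] - df[oldcol]) / df[oldcol] * 100
    return df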