Ejemplo n.º 1
0
def create_retailer_laptop_train_data():
    '''
    Build the retailer laptop training set from the Amazon, Walmart and
    Newegg title CSVs and save it to data/train/retailer_laptop_data.csv.
    The work is skipped entirely when that CSV already exists.
    '''

    file_path = 'data/train/retailer_laptop_data.csv'

    if os.path.exists(file_path):
        print('Already have Retailer Laptop train data. Moving on . . .')
        return

    print('Generating Retailer Laptop train data . . .')

    # Pull the laptop titles from each retailer source
    sources = [
        pd.read_csv('data/base/amazon_laptop_titles.csv'),
        pd.read_csv('data/base/walmart_laptop_titles.csv'),
        pd.read_csv('data/base/newegg_laptop_titles.csv'),
    ]

    # Merge the sources, clean the titles and drop exact duplicates
    laptops = remove_misc(pd.concat(sources))
    laptops['title'] = laptops['title'].apply(
        lambda t: remove_stop_words(t, omit_punctuation=['.']))
    laptops = laptops.drop_duplicates(subset=['title'])

    # Positive (matching) title pairs, de-duplicated
    pos_titles = create_pos_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])

    # Negative (non-matching) title pairs, de-duplicated
    neg_titles = create_neg_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])

    # Combine both label classes and persist the result
    create_final_data(pos_titles, neg_titles).to_csv(file_path)
Ejemplo n.º 2
0
def create_data():
    '''
    Runs the necessary functions to create the data for training.
    '''

    # Don't show the copy warnings
    pd.set_option('mode.chained_assignment', None)

    # Run the functions
    populate_spec()
    create_pcpartpicker_data()
    create_general_cpu_data()
    create_final_drive_data()
    create_pseudo_laptop_data()
    gb_df = create_final_data(gen_gb_pos_data(), gen_neg_gb_data())
    gb_df.reset_index(inplace=True)
    randomize_units(gb_df, units=['gb'])
    create_laptop_test_data()
    create_neg_laptop_test_data()
    create_retailer_laptop_train_data()
    create_computer_gs_data()

    print('Generating gigabyte data (as in just examples that use GB)')

    # Load every training CSV produced above
    computers_df = pd.read_csv('data/train/wdc_computers.csv')
    pseudo_laptops_df = pd.read_csv('data/train/spec_train_data_new.csv')
    pcpartpicker_df = pd.read_csv(
        'data/train/final_pcpartpicker_data.csv').sample(frac=1)
    cpu_df = pd.read_csv('data/train/more_cpu_data.csv')
    drive_df = pd.read_csv('data/train/more_drive_data.csv')
    retailer_laptops_df = pd.read_csv('data/train/retailer_laptop_data.csv')
    # NOTE: pcpartpicker_df is loaded and reported below but, as in the
    # original pipeline, is not part of the concatenated training set.
    frames = [
        computers_df, pseudo_laptops_df, cpu_df,
        gb_df, drive_df, retailer_laptops_df
    ]

    # Report the size of each piece
    print('Computer df size: {}'.format(len(computers_df)))
    print('Pseudo-Laptop df size: {}'.format(len(pseudo_laptops_df)))
    print('PCPartPicker df size: {}'.format(len(pcpartpicker_df)))
    print('More Drive Data df size: {}'.format(len(drive_df)))
    print('More CPU Data df size: {}'.format(len(cpu_df)))
    print('Final GB Data: {}'.format(len(gb_df)))
    print('Retailer Laptop Data: {}'.format(len(retailer_laptops_df)))

    # Concatenate everything, shuffle, and strip the misc columns
    total_data = pd.concat(frames)
    total_data = total_data.sample(frac=1)
    total_data = remove_misc(total_data)

    # Get the max length of the data for padding in BERT
    Common.MAX_LEN = get_max_len(total_data)

    print('Total data size: {}'.format(len(total_data)))

    # Save the data
    total_data.to_csv('data/train/total_data.csv', index=False)
Ejemplo n.º 3
0
def split_test_data(df):
    '''
    Split test data into the data and the labels
    '''

    # Drop the misc columns and work on the raw numpy matrix
    arr = remove_misc(df).to_numpy()
    # Columns 0-1 hold the title pair; column 2 holds the match label
    label_col = arr[:, 2].astype('float32')
    pair_cols = arr[:, :2]
    return pair_cols, label_col
Ejemplo n.º 4
0
def create_pcpartpicker_data():
    '''
    Creates data for CPU, RAM, and drive data.
    Saves the data to final_pcpartpicker_data.csv
    '''

    file_path = 'data/train/final_pcpartpicker_data.csv'
    if os.path.exists(file_path):
        print('Already have PCPartPicker data. Moving on . . .')
        return

    print('Generating PCPartPicker data . . .')

    # Load the source titles for each hardware category
    ram_titles = remove_misc(pd.read_csv('data/base/pos_ram_titles.csv'))
    cpu_titles = remove_misc(pd.read_csv('data/base/pos_cpu_titles.csv'))
    drive_titles = remove_misc(
        pd.read_csv('data/base/pos_hard_drive_titles.csv'))

    # Positive pairs for every category
    # (generation order kept as-is in case the helpers consume shared RNG state)
    pos_ram = generate_pos_pcpartpicker_data(ram_titles)
    pos_cpu = generate_pos_pcpartpicker_data(cpu_titles)
    pos_drives = generate_pos_pcpartpicker_data(drive_titles)

    # Negative pairs for every category
    neg_ram = generate_neg_pcpartpicker_data(ram_titles)
    neg_cpu = generate_neg_pcpartpicker_data(cpu_titles)
    neg_drives = generate_neg_pcpartpicker_data(drive_titles)

    # Merge the positive and negative pairs per category
    final_ram = create_final_data(pos_ram, neg_ram)
    final_cpu = create_final_data(pos_cpu, neg_cpu)
    final_drives = create_final_data(pos_drives, neg_drives)

    print('Amount of data for the CPU data, RAM data and drive data',
          len(final_cpu), len(final_ram),
          len(final_drives))

    # Concatenate the data and save it
    combined = pd.concat([final_ram, final_cpu, final_drives])
    combined.reset_index(inplace=True)
    randomize_units(combined, units=['gb'])
    combined.to_csv(file_path)
Ejemplo n.º 5
0
    create_data()

# Convert class labels to one-hot arrays
def convert_to_one_hot(Y, C):
    """
    Convert an array of class labels into a one-hot matrix.

    Args:
        Y: array-like of class labels with values in [0, C). Float-typed
           label arrays (as the surrounding code produces) are truncated
           to ints; the original crashed on those because np.eye indexing
           requires integer indices. Plain lists are accepted too.
        C: total number of classes (width of each one-hot row).

    Returns:
        A 2-D numpy float array of shape (Y.size, C) where row i has a 1
        in column Y[i] and 0 elsewhere.
    """
    # np.eye(C) is the CxC identity; fancy-indexing it with the flattened
    # label vector selects the matching one-hot row for every label.
    indices = np.asarray(Y).reshape(-1).astype(int)
    return np.eye(C)[indices]

# Get the data from the file: the combined training CSV written by
# create_data(), then record the longest example so titles can be padded
# to a fixed length for BERT (same MAX_LEN convention as create_data()).
total_data = pd.read_csv('data/train/total_data.csv')
Common.MAX_LEN = get_max_len(total_data)

# Drop the Unnamed column left over from CSV round-tripping
total_data = remove_misc(total_data)

# Organize the data into separate parallel lists:
# first titles, second titles, and the match labels.
train_data1 = []
train_data2 = []
labels = []
# Hoist the iloc accessor out of the row loop below so each lookup is
# a single indexed call.
total_iloc = total_data.iloc()
for idx in range(len(total_data)):
    title_one_base = [' '] * Common.MAX_LEN
    title_two_base = [' '] * Common.MAX_LEN
    row = total_iloc[idx]
    
    for row_idx, x in enumerate(row.title_one.split(' ')):
        title_one_base[row_idx] = x
    
    for row_idx, x in enumerate(row.title_two.split(' ')):