import os

import pandas as pd

# Project-local helpers used below (create_final_data, remove_misc, remove_stop_words,
# randomize_units, get_max_len, the various create_*/generate_* functions, Common, etc.)
# are assumed to be imported from elsewhere in this repository.


def create_laptop_test_data():
    '''
    Creates positive and negative test laptop data and saves it to final_laptop_test_data.csv
    '''
    file_path = 'data/test/final_laptop_test_data.csv'

    # Load the test laptop data
    laptop_df = pd.read_csv('data/base/retailer_test.csv')
    laptop_df['index'] = laptop_df.index
    laptop_df['index'] = laptop_df['index'].astype('object')

    if not os.path.exists(file_path):
        print('Generating test laptop data . . . ')

        # Create the negative and positive dataframes
        neg_df = create_neg_laptop_test_data(laptop_df)
        pos_df = create_pos_laptop_test_data(laptop_df)

        # Concatenate the data, shuffle it, and save it
        final_laptop_test_df = create_final_data(pos_df, neg_df)
        final_laptop_test_df = final_laptop_test_df.sample(frac=1)
        final_laptop_test_df.to_csv(file_path)

    else:
        print('Already have test laptop data. Moving on . . .')

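# create_final_data is defined elsewhere in this repository. For reference only,
# a minimal sketch of what it is assumed to do here: stack the positive and
# negative pair DataFrames and shuffle the result. The name and body below are
# illustrative assumptions, not the project's actual implementation.
def _create_final_data_sketch(pos_df, neg_df):
    combined = pd.concat([pos_df, neg_df])                   # stack positive and negative pairs
    return combined.sample(frac=1).reset_index(drop=True)    # shuffle the rows
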
def create_retailer_laptop_train_data():
    '''
    Creates positive and negative laptop training data from the Amazon, Walmart,
    and Newegg titles and saves it to retailer_laptop_data.csv
    '''
    file_path = 'data/train/retailer_laptop_data.csv'

    if not os.path.exists(file_path):
        print('Generating Retailer Laptop train data . . .')

        # Get the laptop data from the different sources
        amazon_laptops = pd.read_csv('data/base/amazon_laptop_titles.csv')
        walmart_laptops = pd.read_csv('data/base/walmart_laptop_titles.csv')
        newegg_laptops = pd.read_csv('data/base/newegg_laptop_titles.csv')

        # Concatenate the data
        laptops = remove_misc(pd.concat([amazon_laptops, walmart_laptops, newegg_laptops]))
        laptops['title'] = laptops['title'].apply(lambda x: remove_stop_words(x, omit_punctuation=['.']))
        laptops = laptops.drop_duplicates(subset=['title'])

        # Create positive titles
        pos_titles = create_pos_laptop_data(laptops)
        pos_titles = pos_titles.drop_duplicates(subset=['title_one', 'title_two'])

        # Create negative titles
        neg_titles = create_neg_laptop_data(laptops)
        neg_titles = neg_titles.drop_duplicates(subset=['title_one', 'title_two'])

        # Combine the positive and negative DataFrames and put them in a CSV
        retailer_laptop_df = create_final_data(pos_titles, neg_titles)
        retailer_laptop_df.to_csv(file_path)

    else:
        print('Already have Retailer Laptop train data. Moving on . . .')

def create_spec_laptop_data():
    '''
    Creates positive and negative laptop spec data and saves it to spec_train_data.csv
    '''
    file_path = 'data/train/spec_train_data.csv'

    if not os.path.exists(file_path):
        print('Generating general spec data for laptops . . . ')
        populate_spec()

        # Build the spec combinations first if they do not exist yet
        if not os.path.exists('data/train/spec_data.csv'):
            print('Generating spec data combinations. WARNING: THIS WILL CONSUME RESOURCES AND TAKE A LONG TIME.')
            gen_spec_combos()

        spec_df = pd.read_csv('data/train/spec_data.csv')

        # Create the positive and negative examples
        pos_df = create_pos_spec_data(spec_df,
                                      rm_attrs=[['company'], ['product'], ['screen'],
                                                ['product', 'screen'], ['company', 'screen']],
                                      add_attrs=[])
        neg_df = create_neg_spec_laptop(spec_df, ['cpu', 'ram', 'hard_drive', 'product', 'inches', 'screen'])

        # Concatenate the data and save it
        final_spec_df = create_final_data(pos_df, neg_df)
        final_spec_df.to_csv(file_path)

    else:
        print('Already have spec data. Moving on . . .')

def create_computer_gs_data():
    '''
    Creates the Gold Standard computer data from the WDC Product Corpus and saves it to wdc_computers.csv
    '''
    file_path = 'data/train/wdc_computers.csv'

    if not os.path.exists(file_path):
        print('Generating Gold Standard Computer data . . .')

        # Get the titles from the WDC Product Corpus
        if not os.path.exists('data/base/computer_wdc_whole_no_duplicates.csv'):
            computer_df = generate_computer_data()
            computer_df = computer_df.drop_duplicates('title')
            computer_df.to_csv('data/base/computer_wdc_whole_no_duplicates.csv')
        else:
            computer_df = pd.read_csv('data/base/computer_wdc_whole_no_duplicates.csv')

        # Get "good" clusters from the data
        valid_clusters = list(get_valid_clusters(computer_df))

        # Positive data creation
        computer_train_wdc_pos = pd.concat(
            [create_pos_from_cluster(computer_df, cluster) for cluster in valid_clusters],
            ignore_index=True)

        # Negative data creation
        computer_train_wdc_neg = pd.concat(
            [create_neg_from_cluster(computer_df, cluster, valid_clusters) for cluster in valid_clusters],
            ignore_index=True)

        # Concatenate the data and save it
        computer_train_wdc = create_final_data(computer_train_wdc_pos, computer_train_wdc_neg)
        computer_train_wdc.to_csv(file_path)

    else:
        print('Already have Gold Standard Computer Data. Moving on . . .')

def create_data():
    '''
    Runs the necessary functions to create the data for training.
    '''
    # Don't show the copy warnings
    pd.set_option('mode.chained_assignment', None)

    # Run the functions
    populate_spec()
    create_pcpartpicker_data()
    create_general_cpu_data()
    create_final_drive_data()
    create_pseudo_laptop_data()

    print('Generating gigabyte data (as in just examples that use GB)')
    final_gb_data = create_final_data(gen_gb_pos_data(), gen_neg_gb_data())
    final_gb_data.reset_index(inplace=True)
    randomize_units(final_gb_data, units=['gb'])

    create_laptop_test_data()
    create_retailer_laptop_train_data()
    create_computer_gs_data()

    # Load all the data
    final_computer_df = pd.read_csv('data/train/wdc_computers.csv')
    final_pseudo_laptop_df = pd.read_csv('data/train/spec_train_data_new.csv')
    final_pcpartpicker_data = pd.read_csv('data/train/final_pcpartpicker_data.csv').sample(frac=1)
    more_cpu_data = pd.read_csv('data/train/more_cpu_data.csv')
    more_drive_data = pd.read_csv('data/train/more_drive_data.csv')
    retailer_laptop_df = pd.read_csv('data/train/retailer_laptop_data.csv')

    all_data = [final_computer_df, final_pseudo_laptop_df, more_cpu_data,
                final_gb_data, more_drive_data, retailer_laptop_df]

    # Print the sizes of the data
    print('Computer df size: {}'.format(len(final_computer_df)))
    print('Pseudo-Laptop df size: {}'.format(len(final_pseudo_laptop_df)))
    print('PCPartPicker df size: {}'.format(len(final_pcpartpicker_data)))
    print('More Drive Data df size: {}'.format(len(more_drive_data)))
    print('More CPU Data df size: {}'.format(len(more_cpu_data)))
    print('Final GB Data: {}'.format(len(final_gb_data)))
    print('Retailer Laptop Data: {}'.format(len(retailer_laptop_df)))

    # Concatenate everything and shuffle it
    total_data = pd.concat(all_data)
    total_data = total_data.sample(frac=1)
    total_data = remove_misc(total_data)

    # Get the max length of the data for padding in BERT
    Common.MAX_LEN = get_max_len(total_data)

    print('Total data size: {}'.format(len(total_data)))

    # Save the data
    total_data.to_csv('data/train/total_data.csv', index=False)

def create_pcpartpicker_data():
    '''
    Creates data for CPU, RAM, and drive data. Saves the data to final_pcpartpicker_data.csv
    '''
    file_path = 'data/train/final_pcpartpicker_data.csv'

    if not os.path.exists(file_path):
        print('Generating PCPartPicker data . . .')

        ram_df = remove_misc(pd.read_csv('data/base/pos_ram_titles.csv'))
        cpu_df = remove_misc(pd.read_csv('data/base/pos_cpu_titles.csv'))
        hard_drive_df = remove_misc(pd.read_csv('data/base/pos_hard_drive_titles.csv'))

        # Generate all the positive data for the categories
        pos_ram_data = generate_pos_pcpartpicker_data(ram_df)
        pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)
        pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)

        # Generate all the negative data for the categories
        neg_ram_data = generate_neg_pcpartpicker_data(ram_df)
        neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)
        neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

        # Generate the final data
        final_ram_data = create_final_data(pos_ram_data, neg_ram_data)
        final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)
        final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

        print('Amount of data for the CPU data, RAM data and drive data',
              len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))

        # Concatenate the data and save it
        final_pcpartpicker_df = pd.concat([final_ram_data, final_cpu_data, final_hard_drive_data])
        final_pcpartpicker_df.reset_index(inplace=True)
        randomize_units(final_pcpartpicker_df, units=['gb'])
        final_pcpartpicker_df.to_csv(file_path)

    else:
        print('Already have PCPartPicker data. Moving on . . .')

def create_pcpartpicker_data():
    '''
    Creates positive and negative CPU, RAM, and drive data and returns the final DataFrames
    '''
    ram_df = remove_misc(pd.read_csv('data/train/pos_ram_titles.csv'))
    cpu_df = remove_misc(pd.read_csv('data/train/pos_cpu_titles.csv'))
    hard_drive_df = remove_misc(pd.read_csv('data/train/pos_hard_drive_titles.csv'))

    # Generate all the positive data for the categories
    pos_ram_data = generate_pos_pcpartpicker_data(ram_df)
    pos_cpu_data = generate_pos_pcpartpicker_data(cpu_df)
    pos_hard_drive_data = generate_pos_pcpartpicker_data(hard_drive_df)

    # Generate all the negative data for the categories
    neg_ram_data = generate_neg_pcpartpicker_data(ram_df)
    neg_cpu_data = generate_neg_pcpartpicker_data(cpu_df)
    neg_hard_drive_data = generate_neg_pcpartpicker_data(hard_drive_df)

    # Generate the final data
    final_ram_data = create_final_data(pos_ram_data, neg_ram_data)
    final_cpu_data = create_final_data(pos_cpu_data, neg_cpu_data)
    final_hard_drive_data = create_final_data(pos_hard_drive_data, neg_hard_drive_data)

    print('Amount of data for the CPU data, RAM data and hard drive data',
          len(final_cpu_data), len(final_ram_data), len(final_hard_drive_data))

    return final_cpu_data, final_ram_data, final_hard_drive_data

def create_final_drive_data():
    file_path = 'data/train/more_drive_data.csv'

    if not os.path.exists(file_path):
        print('Generating general drive data . . . ')

        # Generate the data
        pos_df = generate_pos_hard_drive_data()
        neg_df = generate_neg_hard_drive_data()

        # Concatenate the data and save it
        final_df = create_final_data(pos_df, neg_df)
        final_df.to_csv(file_path)

    else:
        print('Already have general drive data. Moving on . . .')

def create_general_cpu_data():
    '''
    Creates positive and negative CPU data and saves it to more_cpu_data.csv
    '''
    file_path = 'data/train/more_cpu_data.csv'

    if not os.path.exists(file_path):
        print('Generating general cpu data . . . ')

        # Create the positive and negative examples
        pos_df = generate_pos_cpu_data()
        neg_df = generate_neg_cpu_data()

        # Concatenate the data and save it
        final_cpu_df = create_final_data(pos_df, neg_df)
        final_cpu_df.to_csv(file_path)

    else:
        print('Already have general cpu data. Moving on . . .')

def create_laptop_data():
    # Load the laptop data
    laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')
    create_attribute_sets(laptop_df)

    # Create the negative and positive dataframes
    neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])
    pos_df = create_pos_laptop_data(laptop_df,
                                    rm_attrs=[['Company'], ['TypeName'], ['ScreenResolution'],
                                              ['Product'], ['TypeName', 'ScreenResolution']],
                                    add_attrs=[])

    # Concatenate the data, shuffle it, and return it
    final_laptop_df = create_final_data(pos_df, neg_df)
    final_laptop_df = final_laptop_df.sample(frac=1)
    return final_laptop_df

def create_final_drive_data():
    '''
    Creates positive and negative drive data and saves it to more_drive_data.csv
    '''
    file_path = 'data/train/more_drive_data.csv'

    if not os.path.exists(file_path):
        print('Generating general drive data . . . ')

        # Generate the data
        pos_df = generate_pos_hard_drive_data()
        neg_df = generate_neg_hard_drive_data()

        # Concatenate the data and save it
        final_df = create_final_data(pos_df, neg_df)
        final_df.reset_index(inplace=True)
        randomize_units(final_df, ['gb'])
        final_df.to_csv(file_path)

    else:
        print('Already have general drive data. Moving on . . .')

def create_laptop_data():
    '''
    Creates positive and negative laptop data and saves it to final_laptop_data.csv
    '''
    file_path = 'data/train/final_laptop_data.csv'

    # Load the laptop data
    laptop_df = pd.read_csv('data/train/laptops.csv', encoding='latin-1')

    # Create the attribute sets for the LaptopAttributes
    create_attribute_sets(laptop_df)

    if not os.path.exists(file_path):
        print('Generating laptop data . . . ')

        # Create the negative and positive dataframes
        neg_df = create_neg_laptop_data(laptop_df, attributes=['Cpu', 'Memory', 'Ram', 'Inches', 'Product'])
        pos_df = create_pos_laptop_data(laptop_df,
                                        rm_attrs=[['Company'], ['TypeName'], ['ScreenResolution'],
                                                  ['Product'], ['TypeName', 'ScreenResolution']],
                                        add_attrs=[])

        # Concatenate the data, shuffle it, and save it
        final_laptop_df = create_final_data(pos_df, neg_df)
        final_laptop_df = final_laptop_df.sample(frac=1)
        final_laptop_df.to_csv(file_path)

    else:
        print('Already have laptop data. Moving on . . .')

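# A minimal sketch of how this module might be run as a script. The __main__
# guard below is an assumption for illustration; the repository may instead
# call create_data() from another entry point.
if __name__ == '__main__':
    create_data()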