def create_pos_neg_data(df, neg_attrs):
    '''
    Builds training pairs from the first 4% of df: for each row, one positive
    pair (two identical formatted titles, label 1) and one negative pair
    (same title vs. a copy with one attribute swapped, label 0).

    neg_attrs is cycled so successive rows corrupt different attribute classes.
    Returns a DataFrame with Common.COLUMN_NAMES columns.
    '''
    pairs = []
    for idx in tqdm(range(0, int(len(df) * 0.04))):
        base_row = df.iloc[idx]
        # Cycle through the attribute classes to decide which one to corrupt
        neg_attr = neg_attrs[idx % len(neg_attrs)]
        # Randomly draw attributes that are not already in the row
        brand = random.choice(LaptopAttributes.laptop_brands)
        inches = random.choice(list(LaptopAttributes.inches))
        screen = random.choice(list(LaptopAttributes.screen))
        drive_type = random.choice(['ssd', 'hdd'])
        pos = format_laptop_row(base_row.copy(), brand, inches, screen, drive_type)
        # Resample until the replacement value actually differs from the original
        new_attr = pos[neg_attr]
        while new_attr == pos[neg_attr]:
            new_attr = random.sample(
                LaptopAttributes.get_all_data()[neg_attr.lower()], 1)[0]
        neg = pos.copy()
        neg[neg_attr] = new_attr
        # Positive pair: the same formatted title twice
        pairs.append([
            remove_stop_words(concatenate_row(pos.copy())),
            remove_stop_words(concatenate_row(pos.copy())),
            1,
        ])
        # Negative pair: original vs. single-attribute corruption
        pairs.append([
            remove_stop_words(concatenate_row(pos.copy())),
            remove_stop_words(concatenate_row(neg.copy())),
            0,
        ])
    return pd.DataFrame(pairs, columns=Common.COLUMN_NAMES)
def generate_neg_pcpartpicker_data(df):
    '''
    Creates negative data from any of the PCPartPicker datasets.

    For every non-null title in the frame, picks a random title from a
    DIFFERENT row (retrying until the drawn cell is non-null) and emits the
    pair with label 0.

    Fixes: the original grew the result with DataFrame.append inside the loop,
    which is O(n^2) and was removed in pandas 2.0 — rows are now accumulated
    in a list and materialized once. Also uses `is None` instead of `== None`.
    '''
    columns = list(df.columns)
    rows = []
    df_list = df.iloc()
    for idx in tqdm(range(len(df))):
        row = df_list[idx]
        for col in columns:
            if not pd.isnull(row[col]):
                # Draw a row index other than the current one
                neg_idx = None
                while neg_idx is None or neg_idx == idx:
                    neg_idx = random.randint(0, len(df) - 1)
                # Keep drawing cells from that row until one is non-null
                neg_title = None
                while neg_title is None or pd.isnull(neg_title):
                    neg_title = df_list[neg_idx][random.choice(columns)]
                rows.append([
                    remove_stop_words(row[col]),
                    remove_stop_words(neg_title),
                    0,
                ])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_neg_laptop_data(laptop_df, attributes):
    '''
    Creates negative laptop pairs: for each row and each attribute class,
    emits (original title, title with that ONE attribute replaced, 0).

    Fixes: the original fetched neg_row once before the attribute loop and
    without .copy(), so corruptions accumulated across attribute classes
    (later negatives differed in several attributes, not one) and the
    chained assignment could write back into laptop_df. neg_row is now a
    fresh copy per attribute class.
    '''
    new_column_names = ['title_one', 'title_two', 'label']
    temp = []
    for row in tqdm(range(len(laptop_df))):
        # Get the row in the laptop_data once per row
        orig_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Fresh copy so each negative differs in exactly one attribute
            # and the source frame is never mutated
            neg_row = orig_row.copy()
            # The attribute value we are trying to change
            attribute_val = orig_row[attribute_class]
            # Resample until we really get a different attribute
            new_val = attribute_val
            while new_val == attribute_val:
                new_val = random.sample(
                    LaptopAttributes.get_all_data()[attribute_class.lower()], 1)[0]
            neg_row[attribute_class] = new_val
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            temp.append([title_one, title_two, 0])
    return pd.DataFrame(temp, columns=new_column_names)
def generate_neg_hard_drive_data():
    '''
    Builds negative drive pairs: every phrasing of one capacity against every
    phrasing of a different, randomly-drawn capacity (label 0).
    '''
    neg_rows = []
    sizes = ['{} GB'.format(n) for n in range(8, 1001, 8)]
    sizes += ['{} TB'.format(n) for n in range(1, 20)]
    for size in sizes:
        # Draw a capacity that is guaranteed to differ
        other = size
        while other == size:
            other = random.choice(sizes)
        # Phrase both capacities with every hard-drive and ssd modifier
        modifiers = [*hard_drive_types, *ssd_types]
        originals = ['{} {}'.format(size, m) for m in modifiers]
        alternates = ['{} {}'.format(other, m) for m in modifiers]
        # Cross product of the two phrasing sets, all labeled negative
        for left in originals:
            for right in alternates:
                neg_rows.append(
                    [remove_stop_words(left), remove_stop_words(right), 0])
    return pd.DataFrame(neg_rows, columns=COLUMN_NAMES)
def generate_pos_hard_drive_data():
    '''
    Creates positive data with the same drive size, but different modifiers.
    Ex: 10 gb internal hard drive vs 10 gb hdd.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(1, 3193)]
    sizes += ['{} TB'.format(n) for n in range(1, 101)]
    for size in sizes:
        # One pair drawn from the hard-drive phrasings, one from the ssd ones
        for pool in (Common.HARD_DRIVE_TYPES, Common.SSD_TYPES):
            left = remove_stop_words('{} {}'.format(size, random.choice(pool)))
            right = remove_stop_words('{} {}'.format(size, random.choice(pool)))
            rows.append([left, right, 1])
    return pd.DataFrame(rows, columns=Common.COLUMN_NAMES)
def extract_key_features(cluster):
    '''
    Simplifies the DataFrames extracted from the WDC Product Corpus.

    Keeps only id, description and title (both normalized), and adds a
    titleDesc column: the title tokens followed by the first 6 description
    tokens.
    '''
    simplified = cluster.loc[:, ("id", "description", "title")]
    simplified["title"] = simplified["title"].map(remove_stop_words)
    # description may contain NaN, hence the str() coercion before cleaning
    simplified["description"] = simplified["description"].map(
        lambda d: remove_stop_words(str(d)))
    title_tokens = simplified["title"].map(lambda t: t.split(" "))
    desc_tokens = simplified["description"].map(
        lambda d: d.split(" ")).map(lambda toks: toks[0:6])
    simplified["titleDesc"] = title_tokens + desc_tokens
    return simplified
def create_pos_spec_data(df, rm_attrs, add_attrs):
    '''
    Creates positive spec pairs from a tiny sample of df (~0.023% of rows).

    For each sampled row, both copies get the same randomly-drawn inches,
    screen and hard-drive attributes; then, for every attribute list in
    rm_attrs, a pair is emitted where the second title has those attributes
    blanked out (label 1).

    NOTE(review): add_attrs is accepted but never used — confirm whether it
    was meant to drive extra variations.
    NOTE(review): assumes every 'brand' value contains a space
    ("Company Product"); split(' ', 1)[1] raises IndexError otherwise.
    '''
    temp = []
    df_iloc = df.iloc()
    COLUMN_NAMES = ['title_one', 'title_two', 'label']
    for row in tqdm(range(int(len(df) * 2.3e-4))):
        # Set the new row to the same as the original to begin changing it.
        # NOTE(review): each df_iloc[row] indexing presumably yields an
        # independent Series — verify, since both are mutated below.
        new_row = df_iloc[row]
        # Get the row in the df and add the inch attribute
        orig_row = df_iloc[row]
        # Derive company (first word of brand) and product (the rest) on both
        orig_row['company'] = orig_row['brand'].split(' ', 1)[0]
        orig_row['product'] = orig_row['brand'].split(' ', 1)[1]
        new_row['company'] = orig_row['brand'].split(' ', 1)[0]
        new_row['product'] = orig_row['brand'].split(' ', 1)[1]
        # Get a random inch attribute
        inch_attr = random.choice(list(LaptopAttributes.inches))
        # Get random screen attribute
        screen_attr = random.choice(list(LaptopAttributes.screen))
        # Get random hard drive attribute and type
        hard_drive_attr = random.choice(list(SpecAttributes.hard_drive))
        # Get whether it will be an ssd or a hard drive (a pool of phrasings)
        drive_type = random.choice([hard_drive_types, ssd_types])
        # Apply the same drawn attributes to both rows; the drive suffix is
        # drawn separately for each, so phrasing may differ between the pair
        orig_row['inches'] = inch_attr
        orig_row['screen'] = screen_attr
        orig_row['hard_drive'] = '{} {}'.format(hard_drive_attr,
                                                random.choice(drive_type))
        new_row['inches'] = inch_attr
        new_row['screen'] = screen_attr
        new_row['hard_drive'] = '{} {}'.format(hard_drive_attr,
                                               random.choice(drive_type))
        for attr_list in rm_attrs:
            # Copy new_row so each attr_list starts from the same baseline
            pos_row = new_row.copy()
            for attr in attr_list:
                pos_row[attr] = ''
            title_one = remove_stop_words(
                concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(
                concatenate_spec_data(pos_row).lower())
            temp.append([title_one, title_two, 1])
    return pd.DataFrame(temp, columns=COLUMN_NAMES)
def change_unit_retailer_data(df, units, space=True):
    """
    Replaces units like 8 gb with 8gb to have a better distribution across
    the dataset; emits each normalized title against its unit-respaced
    variant as a negative pair (label 0).
    """
    rows = []
    retailer_cols = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    for unit in units:
        # One compiled matcher per unit, reused across all rows
        matcher = unit_matcher(unit)
        for idx in range(len(df)):
            for col in retailer_cols:
                cell = df.at[idx, col]
                if type(cell) is not str:
                    continue
                title = remove_stop_words(df.at[idx, col])
                found = matcher.findall(title)
                if len(found) == 0:
                    continue
                # Build the respaced negative, then normalize spacing on both
                neg_title = replace_units(title, found, unit, space)
                title = replace_space(title, found, unit, space)
                neg_found = matcher.findall(neg_title)
                neg_title = replace_space(neg_title, neg_found, unit, space)
                rows.append([title, neg_title, 0])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_retailer_laptop_train_data():
    '''
    Builds the retailer laptop training CSV from the Amazon, Walmart and
    Newegg title dumps, if it does not already exist on disk.
    '''
    file_path = 'data/train/retailer_laptop_data.csv'
    if os.path.exists(file_path):
        print('Already have Retailer Laptop train data. Moving on . . .')
        return
    print('Generating Retailer Laptop train data . . .')
    # Load and concatenate the three retailer sources
    sources = [
        pd.read_csv('data/base/amazon_laptop_titles.csv'),
        pd.read_csv('data/base/walmart_laptop_titles.csv'),
        pd.read_csv('data/base/newegg_laptop_titles.csv'),
    ]
    laptops = remove_misc(pd.concat(sources))
    # Normalize titles (keeping '.') and drop exact duplicates
    laptops['title'] = laptops['title'].apply(
        lambda t: remove_stop_words(t, omit_punctuation=['.']))
    laptops = laptops.drop_duplicates(subset=['title'])
    # Positive and negative pairs, each deduplicated on the title pair
    pos_titles = create_pos_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])
    neg_titles = create_neg_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])
    # Combine the positive and negative DataFrames and write the CSV
    create_final_data(pos_titles, neg_titles).to_csv(file_path)
def inference():
    '''
    Test the model using your own titles: prompts for two product titles,
    normalizes them, runs the network and prints the predicted class plus
    per-class probabilities.

    Fixes: the original printed the network's raw outputs labeled as
    "Softmax ... %" — it never applied softmax nor scaled to percent. The
    logits are now passed through softmax and multiplied by 100.
    '''
    title1 = input('First title: ')
    title2 = input('Second title: ')
    title1 = remove_stop_words(title1)
    title2 = remove_stop_words(title2)
    data = np.array([title1, title2]).reshape(1, 2)
    forward = net(*character_bert_preprocess_batch(data))
    # Convert raw logits to probabilities before reporting percentages
    probs = torch.softmax(forward, dim=1).detach().numpy()[0]
    print('Output: {}'.format(torch.argmax(forward)))
    print('Softmax: Negative {:.4f}%, Positive {:.4f}%'.format(
        probs[0] * 100, probs[1] * 100))
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    '''
    Creates positive laptop pairs: for each row and each attribute list in
    rm_attrs, emits (full title, title with those attributes blanked, 1).

    Fixes: the original took new_row from laptop_df.iloc without .copy(),
    so blanking attributes risked writing back into laptop_df via chained
    assignment; it also refetched orig_row on every inner iteration.
    new_row is now an explicit copy and orig_row is fetched once per row.

    NOTE(review): add_attrs is accepted but never used — kept for interface
    compatibility; confirm intended behavior.
    '''
    new_column_names = ['title_one', 'title_two', 'label']
    temp = []
    for row in tqdm(range(len(laptop_df))):
        orig_row = laptop_df.iloc[row]
        for attr_list in rm_attrs:
            # Copy so blanking attributes cannot mutate laptop_df
            new_row = orig_row.copy()
            # Remove the attributes from the new title
            for attr in attr_list:
                new_row[attr] = ''
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())
            temp.append([title_one, title_two, 1])
    return pd.DataFrame(temp, columns=new_column_names)
def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation in both
    title columns, returning a simplified (title_left, title_right, label)
    DataFrame.
    """
    # Row-by-row iteration is slow but keeps the logic straightforward
    rows = orig_data.iloc
    cleaned = []
    for idx in tqdm(range(len(orig_data))):
        record = rows[idx]
        cleaned.append([
            remove_stop_words(record.title_left),
            remove_stop_words(record.title_right),
            record.label,
        ])
    return pd.DataFrame(cleaned, columns=COLUMN_NAMES)
def cpu_variations(cpu):
    '''
    Creates different forms of a cpu title.

    Ex: amd ryzen 5 3600, amd ryzen 5 3600 6 core processor,
    amd ryzen 5 3600 3.6 ghz processor and
    amd ryzen 5 3600 6 core 3.6 ghz processor.
    '''
    name = cpu['name']
    cores = cpu['cores']
    clock = cpu['core_clock']
    # Full form, bare name, cores-only, and clock-only phrasings (in order)
    phrasings = [
        '{} {} core {} processor'.format(name, cores, clock),
        name,
        '{} {} core processor'.format(name, cores),
        '{} {} processor'.format(name, clock),
    ]
    return [remove_stop_words(p) for p in phrasings]
def generate_pos_hard_drive_data():
    '''
    Pairs each drive capacity with two randomly-phrased variants of itself
    (label 1), once using hard-drive phrasings and once using ssd phrasings.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(1, 3193)]
    sizes += ['{} TB'.format(n) for n in range(1, 101)]
    for size in sizes:
        for pool in (hard_drive_types, ssd_types):
            rows.append([
                remove_stop_words('{} {}'.format(size, random.choice(pool))),
                remove_stop_words('{} {}'.format(size, random.choice(pool))),
                1,
            ])
    return pd.DataFrame(rows, columns=COLUMN_NAMES)
def cpu_variations(cpu):
    '''
    Creates different forms of a cpu title: the full
    "<name> <cores> core <clock> processor" form, the bare name, a
    cores-only form, and a clock-only form.
    '''
    # (format string, cpu keys to substitute) — order matters to callers
    templates = [
        ('{} {} core {} processor', ('name', 'cores', 'core_clock')),
        ('{}', ('name',)),
        ('{} {} core processor', ('name', 'cores')),
        ('{} {} processor', ('name', 'core_clock')),
    ]
    return [
        remove_stop_words(fmt.format(*(cpu[key] for key in keys)))
        for fmt, keys in templates
    ]
def create_pos_laptop_test_data(laptop_df):
    '''
    Creates the positive test laptop data: every pairwise combination of a
    row's retailer titles, lower-cased and normalized, labeled 1.
    '''
    retailers = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    pos_data = []
    for row in laptop_df.iloc:
        # Collect whichever retailer titles are present (non-NaN strings)
        titles = [row[r] for r in retailers if type(row[r]) is str]
        for left, right in combinations(titles, 2):
            pos_data.append([
                remove_stop_words(left).lower(),
                remove_stop_words(right).lower(),
                1,
            ])
    return pd.DataFrame(pos_data, columns=['title_one', 'title_two', 'label'])
def create_neg_laptop_test_data(laptop_df):
    '''
    Creates the negative test laptop data: for each retailer title, pairs it
    with a title drawn from a DIFFERENT laptop of the same Company (label 0).

    NOTE(review): the inner while-loop retries retailers on one randomly
    chosen row only — if that row has no usable title it loops forever;
    likewise random.randint fails if a company has a single laptop. Confirm
    the input data guarantees these cases cannot occur.
    '''
    retailers = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    neg_data = []
    for row in laptop_df.iloc:
        temp = []
        for retailer in retailers:
            if type(row[retailer]) is str:
                orig_product = row[retailer]
                neg_product = ''
                # Subset of laptops with the same company but a different
                # 'index' value, so the negative is similar yet distinct.
                # NOTE(review): the second mask is built from laptop_df but
                # applied to comp_df — this relies on .loc index alignment.
                comp_df = laptop_df.loc[laptop_df['Company'] == row['Company']]
                comp_df = comp_df.loc[laptop_df['index'] != row['index']]
                idx = random.randint(0, len(comp_df) - 1)
                neg_row = comp_df.iloc[idx]
                # Keep drawing retailers until that row yields a string title
                # different from the original
                while True:
                    rand_retailer = random.sample(retailers, 1)[0]
                    neg_product = neg_row[rand_retailer]
                    if type(neg_product
                            ) is str and neg_product != orig_product:
                        temp = [
                            remove_stop_words(orig_product).lower(),
                            remove_stop_words(neg_product).lower(), 0
                        ]
                        neg_data.append(temp)
                        break
                    else:
                        continue
    return pd.DataFrame(neg_data, columns=['title_one', 'title_two', 'label'])
def generate_neg_hard_drive_data():
    '''
    Creates negative data with different drive sizes.
    Ex: 10 gb ssd vs 20 gb ssd.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(8, 1001, 8)]
    sizes += ['{} TB'.format(n) for n in range(1, 20)]
    for size in sizes:
        # Keep sampling until we land on a genuinely different capacity
        mismatch = size
        while mismatch == size:
            mismatch = random.choice(sizes)
        # Every hard-drive and ssd phrasing of both capacities
        suffixes = [*Common.HARD_DRIVE_TYPES, *Common.SSD_TYPES]
        left_titles = ['{} {}'.format(size, s) for s in suffixes]
        right_titles = ['{} {}'.format(mismatch, s) for s in suffixes]
        # Cross product: every original phrasing against every mismatched one
        for left in left_titles:
            for right in right_titles:
                rows.append(
                    [remove_stop_words(left), remove_stop_words(right), 0])
    return pd.DataFrame(rows, columns=Common.COLUMN_NAMES)
def generate_pos_pcpartpicker_data(df):
    '''
    Creates positive data from any of the PCPartPicker datasets: all pairwise
    combinations of a row's non-null titles, labeled 1.

    Fixes: the original grew the result with DataFrame.append inside the
    loop, which is O(n^2) and was removed in pandas 2.0 — rows are now
    accumulated in a list and materialized once at the end.
    '''
    columns = list(df.columns)
    rows = []
    for idx in tqdm(range(len(df))):
        row = df.iloc()[idx]
        # Normalize every non-null title in this row
        titles = [
            remove_stop_words(row[col]) for col in columns
            if not pd.isnull(row[col])
        ]
        # Need at least two titles to form a pair
        if len(titles) > 1:
            for comb in combinations(titles, 2):
                rows.append(list(comb) + [1])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_neg_spec_laptop(df, attributes):
    '''
    Creates negative spec pairs from a tiny sample of df (~0.019% of rows).

    For each sampled row and each attribute class, both copies get the same
    randomly-drawn inches/screen attributes, then exactly that one attribute
    class is corrupted on the negative copy; the pair is emitted with label 0.

    NOTE(review): assumes every 'brand' value contains a space
    ("Company Product"); split(' ', 1)[1] raises IndexError otherwise.
    NOTE(review): in the 'screen' branch orig_row's screen is ALSO replaced
    with an independently drawn value — presumably intentional so the two
    titles differ in screen wording; confirm.
    '''
    df_iloc = df.iloc()
    temp = []
    for row in tqdm(range(int(len(df) * 1.91e-4))):
        # Create a copy of the row for the negative example
        for attribute_class in attributes:
            # NOTE(review): each df_iloc[row] indexing presumably yields an
            # independent Series — verify, since both are mutated below.
            neg_row = df_iloc[row]
            # Get the row in the laptop_data and add the inch attribute
            orig_row = df_iloc[row]
            # Set product and company derived from the brand string
            orig_row['company'] = orig_row['brand'].split(' ', 1)[0]
            orig_row['product'] = orig_row['brand'].split(' ', 1)[1]
            neg_row['company'] = orig_row['brand'].split(' ', 1)[0]
            neg_row['product'] = orig_row['brand'].split(' ', 1)[1]
            # Get a random inch attribute
            inch_attr = random.choice(list(LaptopAttributes.inches))
            # Get random screen attribute
            screen_attr = random.choice(list(LaptopAttributes.screen))
            # Both rows start out agreeing on inches and screen
            orig_row['inches'] = inch_attr
            neg_row['inches'] = inch_attr
            orig_row['screen'] = screen_attr
            neg_row['screen'] = screen_attr
            if attribute_class == 'inches':
                # Resample inches until the negative's value differs
                new_inch_attr = inch_attr
                while inch_attr == new_inch_attr:
                    new_inch_attr = random.choice(list(
                        LaptopAttributes.inches))
                neg_row['inches'] = new_inch_attr
            elif attribute_class == 'screen':
                # Draw an independent screen value for the original row
                orig_screen_attr = random.choice(list(LaptopAttributes.screen))
                # Resample until the negative's screen differs from it
                new_screen_attr = screen_attr
                while orig_screen_attr == new_screen_attr:
                    new_screen_attr = random.choice(
                        list(LaptopAttributes.screen))
                neg_row['screen'] = new_screen_attr
                orig_row['screen'] = orig_screen_attr
            elif attribute_class == 'product':
                # Resample product (tail of a random brand) until it differs
                new_product_attr = orig_row['product']
                while orig_row['product'] == new_product_attr:
                    new_product_attr = random.choice(
                        SpecAttributes.laptop_brands).split(' ', 1)[1]
                neg_row['product'] = new_product_attr
            elif attribute_class == 'hard_drive':
                # Resample drive capacity until it differs
                new_drive_attr = orig_row['hard_drive']
                while orig_row['hard_drive'] == new_drive_attr:
                    new_drive_attr = random.choice(SpecAttributes.hard_drive)
                # Each side gets its own independently drawn drive phrasing
                neg_row['hard_drive'] = '{} {}'.format(
                    new_drive_attr,
                    random.choice([
                        random.choice(hard_drive_types),
                        random.choice(ssd_types)
                    ]))
                orig_row['hard_drive'] = '{} {}'.format(
                    orig_row['hard_drive'],
                    random.choice([
                        random.choice(hard_drive_types),
                        random.choice(ssd_types)
                    ]))
            else:
                # Generic attribute: resample from the spec data until the
                # value really changes
                attribute_val = orig_row[attribute_class]
                new_val = attribute_val
                while new_val == attribute_val:
                    new_val = random.sample(
                        SpecAttributes.get_all_data()[attribute_class.lower()],
                        1)[0]
                # Change the value in the neg_row to the new value
                neg_row[attribute_class] = new_val
            # We still need to add the phrasing to the hard drive attribute
            # if it is not the current attribute class (same suffix for both)
            if attribute_class != 'hard_drive':
                drive_type = random.choice([
                    random.choice(hard_drive_types),
                    random.choice(ssd_types)
                ])
                neg_row['hard_drive'] = '{} {}'.format(neg_row['hard_drive'],
                                                       drive_type)
                orig_row['hard_drive'] = '{} {}'.format(
                    orig_row['hard_drive'], drive_type)
            # Concatenate and normalize the data
            title_one = remove_stop_words(
                concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(
                concatenate_spec_data(neg_row).lower())
            # Append the data to the temp list
            temp.append([title_one, title_two, 0])
    # Return the DataFrame created from temp
    return pd.DataFrame(temp, columns=COLUMN_NAMES)
class LaptopRetailerRegEx:
    '''
    Namespace of compiled regexes for matching laptop attributes (brands,
    product lines, cpus, ram, drives, screen sizes) in retailer titles.

    All work happens at class-definition time: it seeds the vocabularies,
    reads the intel/amd cpu CSVs to extend the cpu vocabulary, then compiles
    the matchers and deletes the intermediate containers.

    NOTE(review): executes populate_spec() and reads two CSV files at import
    time — importing this module requires those files to exist.
    '''
    populate_spec()
    # Seed vocabularies with brands/lines not present in LaptopAttributes
    laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
    product_attrs = {'vivobook'}
    cpu_attributes = {'intel', 'm 2', '2 core', '4 core', '6 core', '8 core'}
    # First word of each known brand -> brand vocabulary;
    # the remainder -> product-line vocabulary
    for brand in LaptopAttributes.laptop_brands:
        laptop_brands.add(brand.split(' ')[0].lower())
        product_attrs.add(' '.join(brand.split(' ')[1:]).lower())
    # Every token of every normalized intel cpu title joins the cpu vocabulary
    intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
    intel_cpu_df = intel_cpu_df['title'].map(
        lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(intel_cpu_df)):
        cpu_attributes.update(intel_cpu_df.iloc[i])
    # Same for amd cpu titles
    amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
    amd_cpu_df = amd_cpu_df['title'].map(
        lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(amd_cpu_df)):
        cpu_attributes.update(amd_cpu_df.iloc[i])
    # Longest-first ordering so alternation prefers the longest match
    laptop_brands = list(laptop_brands)
    laptop_brands.sort(key=len, reverse=True)
    product_attrs = list(product_attrs)
    product_attrs.sort(key=len, reverse=True)
    cpu_attributes = list(cpu_attributes)
    cpu_attributes.sort(key=len, reverse=True)
    ram_modifiers = ['memory', 'ram', 'ddr4', 'ddr4 ram', 'ddr4 memory']
    ram_modifiers.sort()
    hard_drive_modifiers = [
        'hdd', 'hard drive', 'disk drive', 'storage', 'hard drive storage',
        'hdd storage'
    ]
    hard_drive_modifiers.sort(key=len, reverse=True)
    ssd_modifiers = [
        'ssd', 'solid state drive', 'solid state disk', 'pcie', 'pcie ssd',
        'ssd storage'
    ]
    ssd_modifiers.sort(key=len, reverse=True)
    # Noise phrases to strip from titles before matching
    annoying_words = [
        'windows 10', 'win 10', 'windows 10 in s mode', 'windows', '3.0',
        '3.1', '3.2', 'optical drive', 'cd drive', 'dvd drive'
    ]
    annoying_words.sort(key=len, reverse=True)
    # Word-boundary alternations: \b<term>(?!\S) per vocabulary entry
    ram_modifier_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(ram_modifiers) + "(?!\S)", re.IGNORECASE)
    random_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(annoying_words) + "(?!\S)", re.IGNORECASE)
    cpu_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
    brand_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
    product_attr_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
    # Quantity matchers: a number, a gb/tb unit, then a modifier phrase
    ram_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + 'gb ?' + '(?:' +
        '|'.join([x for x in ram_modifiers]) + ')(?!\S)', re.IGNORECASE)
    hard_drive_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' +
        '|'.join([x for x in hard_drive_modifiers]) + ')(?!\S)',
        re.IGNORECASE)
    ssd_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' +
        '|'.join([x for x in ssd_modifiers]) + ')(?!\S)', re.IGNORECASE)
    gbtb_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb)' + '(?!\S)',
                              re.IGNORECASE)
    # Screen sizes like 15.6" / 15 inch (10-19 inch range)
    inch_matcher = re.compile('[1][0-9]\"?\"?.?[0-9]?\"?\"? ?(?:inch)?(?!\S)',
                              re.IGNORECASE)
    # Drop the build-time containers; only the compiled matchers remain
    del laptop_brands, product_attrs, cpu_attributes, intel_cpu_df, amd_cpu_df
model = siamese_network((MAX_LEN)) else: print('Using the exponential distance softmax.') from src.model_architectures.exp_distance_softmax import siamese_network model = siamese_network((MAX_LEN)) model.summary() # Load the model using the weights model.load_weights('models/DistanceSigmoid_40epoch_84%_val.h5') title_one = 'True Wireless Earbuds VANKYO X200 Bluetooth 5 0 Earbuds in Ear TWS Stereo Headphones Smart LED Display Charging Case IPX8 Waterproof 120H Playtime Built Mic Deep Bass Sports Work' title_two = 'TOZO T10 Bluetooth 5 0 Wireless Earbuds Wireless Charging Case IPX8 Waterproof TWS Stereo Headphones Ear Built Mic Headset Premium Sound Deep Bass Sport Black' title_one_arr = [' '] * MAX_LEN title_two_arr = [' '] * MAX_LEN title_one = remove_stop_words(title_one.lower()) title_two = remove_stop_words(title_two.lower()) for idx, x in enumerate(title_one.split(' ')): title_one_arr[idx] = x for idx, x in enumerate(title_two.split(' ')): title_two_arr[idx] = x title_one_arr = np.array(title_one_arr).reshape(1, MAX_LEN).astype('<U22') title_two_arr = np.array(title_two_arr).reshape(1, MAX_LEN).astype('<U22') print(model.predict([title_one_arr, title_two_arr]))