def create_pos_neg_data(df, neg_attrs):
    '''
    Builds training pairs from the first 4% of df: for each row, one positive
    pair (two identical formatted titles, label 1) and one negative pair
    (same title vs. a copy with one attribute swapped, label 0).

    neg_attrs is cycled so successive rows corrupt different attribute classes.
    Returns a DataFrame with Common.COLUMN_NAMES columns.
    '''
    pairs = []
    for idx in tqdm(range(0, int(len(df) * 0.04))):
        base_row = df.iloc[idx]
        # Cycle through the attribute classes to decide which one to corrupt
        neg_attr = neg_attrs[idx % len(neg_attrs)]
        # Randomly draw attributes that are not already in the row
        brand = random.choice(LaptopAttributes.laptop_brands)
        inches = random.choice(list(LaptopAttributes.inches))
        screen = random.choice(list(LaptopAttributes.screen))
        drive_type = random.choice(['ssd', 'hdd'])
        pos = format_laptop_row(base_row.copy(), brand, inches, screen, drive_type)
        # Resample until the replacement value actually differs from the original
        new_attr = pos[neg_attr]
        while new_attr == pos[neg_attr]:
            new_attr = random.sample(
                LaptopAttributes.get_all_data()[neg_attr.lower()], 1)[0]
        neg = pos.copy()
        neg[neg_attr] = new_attr
        # Positive pair: the same formatted title twice
        pairs.append([
            remove_stop_words(concatenate_row(pos.copy())),
            remove_stop_words(concatenate_row(pos.copy())),
            1,
        ])
        # Negative pair: original vs. single-attribute corruption
        pairs.append([
            remove_stop_words(concatenate_row(pos.copy())),
            remove_stop_words(concatenate_row(neg.copy())),
            0,
        ])
    return pd.DataFrame(pairs, columns=Common.COLUMN_NAMES)
def generate_neg_pcpartpicker_data(df):
    '''
    Creates negative data from any of the PCPartPicker datasets.

    For every non-null title in the frame, picks a random title from a
    DIFFERENT row (retrying until the drawn cell is non-null) and emits the
    pair with label 0.

    Fixes: the original grew the result with DataFrame.append inside the loop,
    which is O(n^2) and was removed in pandas 2.0 — rows are now accumulated
    in a list and materialized once. Also uses `is None` instead of `== None`.
    '''
    columns = list(df.columns)
    rows = []
    df_list = df.iloc()
    for idx in tqdm(range(len(df))):
        row = df_list[idx]
        for col in columns:
            if not pd.isnull(row[col]):
                # Draw a row index other than the current one
                neg_idx = None
                while neg_idx is None or neg_idx == idx:
                    neg_idx = random.randint(0, len(df) - 1)
                # Keep drawing cells from that row until one is non-null
                neg_title = None
                while neg_title is None or pd.isnull(neg_title):
                    neg_title = df_list[neg_idx][random.choice(columns)]
                rows.append([
                    remove_stop_words(row[col]),
                    remove_stop_words(neg_title),
                    0,
                ])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_neg_laptop_data(laptop_df, attributes):
    '''
    Creates negative laptop pairs: for each row and each attribute class,
    emits (original title, title with that ONE attribute replaced, 0).

    Fixes: the original fetched neg_row once before the attribute loop and
    without .copy(), so corruptions accumulated across attribute classes
    (later negatives differed in several attributes, not one) and the
    chained assignment could write back into laptop_df. neg_row is now a
    fresh copy per attribute class.
    '''
    new_column_names = ['title_one', 'title_two', 'label']
    temp = []
    for row in tqdm(range(len(laptop_df))):
        # Get the row in the laptop_data once per row
        orig_row = laptop_df.iloc[row]
        for attribute_class in attributes:
            # Fresh copy so each negative differs in exactly one attribute
            # and the source frame is never mutated
            neg_row = orig_row.copy()
            # The attribute value we are trying to change
            attribute_val = orig_row[attribute_class]
            # Resample until we really get a different attribute
            new_val = attribute_val
            while new_val == attribute_val:
                new_val = random.sample(
                    LaptopAttributes.get_all_data()[attribute_class.lower()], 1)[0]
            neg_row[attribute_class] = new_val
            # Concatenate and normalize the data
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(neg_row).lower())
            temp.append([title_one, title_two, 0])
    return pd.DataFrame(temp, columns=new_column_names)
def generate_neg_hard_drive_data():
    '''
    Builds negative drive pairs: every phrasing of one capacity against every
    phrasing of a different, randomly-drawn capacity (label 0).
    '''
    neg_rows = []
    sizes = ['{} GB'.format(n) for n in range(8, 1001, 8)]
    sizes += ['{} TB'.format(n) for n in range(1, 20)]
    for size in sizes:
        # Draw a capacity that is guaranteed to differ
        other = size
        while other == size:
            other = random.choice(sizes)
        # Phrase both capacities with every hard-drive and ssd modifier
        modifiers = [*hard_drive_types, *ssd_types]
        originals = ['{} {}'.format(size, m) for m in modifiers]
        alternates = ['{} {}'.format(other, m) for m in modifiers]
        # Cross product of the two phrasing sets, all labeled negative
        for left in originals:
            for right in alternates:
                neg_rows.append(
                    [remove_stop_words(left), remove_stop_words(right), 0])
    return pd.DataFrame(neg_rows, columns=COLUMN_NAMES)
def generate_pos_hard_drive_data():
    '''
    Creates positive data with the same drive size, but different modifiers.
    Ex: 10 gb internal hard drive vs 10 gb hdd.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(1, 3193)]
    sizes += ['{} TB'.format(n) for n in range(1, 101)]
    for size in sizes:
        # One pair drawn from the hard-drive phrasings, one from the ssd ones
        for pool in (Common.HARD_DRIVE_TYPES, Common.SSD_TYPES):
            left = remove_stop_words('{} {}'.format(size, random.choice(pool)))
            right = remove_stop_words('{} {}'.format(size, random.choice(pool)))
            rows.append([left, right, 1])
    return pd.DataFrame(rows, columns=Common.COLUMN_NAMES)
def extract_key_features(cluster):
    '''
    Simplifies the DataFrames extracted from the WDC Product Corpus.

    Keeps only id, description and title (both normalized), and adds a
    titleDesc column: the title tokens followed by the first 6 description
    tokens.
    '''
    simplified = cluster.loc[:, ("id", "description", "title")]
    simplified["title"] = simplified["title"].map(remove_stop_words)
    # description may contain NaN, hence the str() coercion before cleaning
    simplified["description"] = simplified["description"].map(
        lambda d: remove_stop_words(str(d)))
    title_tokens = simplified["title"].map(lambda t: t.split(" "))
    desc_tokens = simplified["description"].map(
        lambda d: d.split(" ")).map(lambda toks: toks[0:6])
    simplified["titleDesc"] = title_tokens + desc_tokens
    return simplified
def create_pos_spec_data(df, rm_attrs, add_attrs):
    '''
    Creates positive spec pairs from a tiny sample of df (~0.023% of rows).

    For each sampled row, both copies get the same randomly-drawn inches,
    screen and hard-drive attributes; then, for every attribute list in
    rm_attrs, a pair is emitted where the second title has those attributes
    blanked out (label 1).

    NOTE(review): add_attrs is accepted but never used — confirm whether it
    was meant to drive extra variations.
    NOTE(review): assumes every 'brand' value contains a space
    ("Company Product"); split(' ', 1)[1] raises IndexError otherwise.
    '''
    temp = []
    df_iloc = df.iloc()
    COLUMN_NAMES = ['title_one', 'title_two', 'label']
    for row in tqdm(range(int(len(df) * 2.3e-4))):
        # Set the new row to the same as the original to begin changing it.
        # NOTE(review): each df_iloc[row] indexing presumably yields an
        # independent Series — verify, since both are mutated below.
        new_row = df_iloc[row]
        # Get the row in the df and add the inch attribute
        orig_row = df_iloc[row]
        # Derive company (first word of brand) and product (the rest) on both
        orig_row['company'] = orig_row['brand'].split(' ', 1)[0]
        orig_row['product'] = orig_row['brand'].split(' ', 1)[1]
        new_row['company'] = orig_row['brand'].split(' ', 1)[0]
        new_row['product'] = orig_row['brand'].split(' ', 1)[1]
        # Get a random inch attribute
        inch_attr = random.choice(list(LaptopAttributes.inches))
        # Get random screen attribute
        screen_attr = random.choice(list(LaptopAttributes.screen))
        # Get random hard drive attribute and type
        hard_drive_attr = random.choice(list(SpecAttributes.hard_drive))
        # Get whether it will be an ssd or a hard drive (a pool of phrasings)
        drive_type = random.choice([hard_drive_types, ssd_types])
        # Apply the same drawn attributes to both rows; the drive suffix is
        # drawn separately for each, so phrasing may differ between the pair
        orig_row['inches'] = inch_attr
        orig_row['screen'] = screen_attr
        orig_row['hard_drive'] = '{} {}'.format(hard_drive_attr,
                                                random.choice(drive_type))
        new_row['inches'] = inch_attr
        new_row['screen'] = screen_attr
        new_row['hard_drive'] = '{} {}'.format(hard_drive_attr,
                                               random.choice(drive_type))
        for attr_list in rm_attrs:
            # Copy new_row so each attr_list starts from the same baseline
            pos_row = new_row.copy()
            for attr in attr_list:
                pos_row[attr] = ''
            title_one = remove_stop_words(
                concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(
                concatenate_spec_data(pos_row).lower())
            temp.append([title_one, title_two, 1])
    return pd.DataFrame(temp, columns=COLUMN_NAMES)
def change_unit_retailer_data(df, units, space=True):
    """
    Replaces units like 8 gb with 8gb to have a better distribution across
    the dataset; emits each normalized title against its unit-respaced
    variant as a negative pair (label 0).
    """
    rows = []
    retailer_cols = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    for unit in units:
        # One compiled matcher per unit, reused across all rows
        matcher = unit_matcher(unit)
        for idx in range(len(df)):
            for col in retailer_cols:
                cell = df.at[idx, col]
                if type(cell) is not str:
                    continue
                title = remove_stop_words(df.at[idx, col])
                found = matcher.findall(title)
                if len(found) == 0:
                    continue
                # Build the respaced negative, then normalize spacing on both
                neg_title = replace_units(title, found, unit, space)
                title = replace_space(title, found, unit, space)
                neg_found = matcher.findall(neg_title)
                neg_title = replace_space(neg_title, neg_found, unit, space)
                rows.append([title, neg_title, 0])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_retailer_laptop_train_data():
    '''
    Builds the retailer laptop training CSV from the Amazon, Walmart and
    Newegg title dumps, if it does not already exist on disk.
    '''
    file_path = 'data/train/retailer_laptop_data.csv'
    if os.path.exists(file_path):
        print('Already have Retailer Laptop train data. Moving on . . .')
        return
    print('Generating Retailer Laptop train data . . .')
    # Load and concatenate the three retailer sources
    sources = [
        pd.read_csv('data/base/amazon_laptop_titles.csv'),
        pd.read_csv('data/base/walmart_laptop_titles.csv'),
        pd.read_csv('data/base/newegg_laptop_titles.csv'),
    ]
    laptops = remove_misc(pd.concat(sources))
    # Normalize titles (keeping '.') and drop exact duplicates
    laptops['title'] = laptops['title'].apply(
        lambda t: remove_stop_words(t, omit_punctuation=['.']))
    laptops = laptops.drop_duplicates(subset=['title'])
    # Positive and negative pairs, each deduplicated on the title pair
    pos_titles = create_pos_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])
    neg_titles = create_neg_laptop_data(laptops).drop_duplicates(
        subset=['title_one', 'title_two'])
    # Combine the positive and negative DataFrames and write the CSV
    create_final_data(pos_titles, neg_titles).to_csv(file_path)
def inference():
    '''
    Test the model using your own titles: prompts for two product titles,
    normalizes them, runs the network and prints the predicted class plus
    per-class probabilities.

    Fixes: the original printed the network's raw outputs labeled as
    "Softmax ... %" — it never applied softmax nor scaled to percent. The
    logits are now passed through softmax and multiplied by 100.
    '''
    title1 = input('First title: ')
    title2 = input('Second title: ')
    title1 = remove_stop_words(title1)
    title2 = remove_stop_words(title2)
    data = np.array([title1, title2]).reshape(1, 2)
    forward = net(*character_bert_preprocess_batch(data))
    # Convert raw logits to probabilities before reporting percentages
    probs = torch.softmax(forward, dim=1).detach().numpy()[0]
    print('Output: {}'.format(torch.argmax(forward)))
    print('Softmax: Negative {:.4f}%, Positive {:.4f}%'.format(
        probs[0] * 100, probs[1] * 100))
def create_pos_laptop_data(laptop_df, rm_attrs, add_attrs):
    '''
    Creates positive laptop pairs: for each row and each attribute list in
    rm_attrs, emits (full title, title with those attributes blanked, 1).

    Fixes: the original took new_row from laptop_df.iloc without .copy(),
    so blanking attributes risked writing back into laptop_df via chained
    assignment; it also refetched orig_row on every inner iteration.
    new_row is now an explicit copy and orig_row is fetched once per row.

    NOTE(review): add_attrs is accepted but never used — kept for interface
    compatibility; confirm intended behavior.
    '''
    new_column_names = ['title_one', 'title_two', 'label']
    temp = []
    for row in tqdm(range(len(laptop_df))):
        orig_row = laptop_df.iloc[row]
        for attr_list in rm_attrs:
            # Copy so blanking attributes cannot mutate laptop_df
            new_row = orig_row.copy()
            # Remove the attributes from the new title
            for attr in attr_list:
                new_row[attr] = ''
            title_one = remove_stop_words(concatenate_row(orig_row).lower())
            title_two = remove_stop_words(concatenate_row(new_row).lower())
            temp.append([title_one, title_two, 1])
    return pd.DataFrame(temp, columns=new_column_names)
def preprocessing(orig_data):
    """
    Normalizes the data by getting rid of stopwords and punctuation in both
    title columns, returning a simplified (title_left, title_right, label)
    DataFrame.
    """
    # Row-by-row iteration is slow but keeps the logic straightforward
    rows = orig_data.iloc
    cleaned = []
    for idx in tqdm(range(len(orig_data))):
        record = rows[idx]
        cleaned.append([
            remove_stop_words(record.title_left),
            remove_stop_words(record.title_right),
            record.label,
        ])
    return pd.DataFrame(cleaned, columns=COLUMN_NAMES)
def cpu_variations(cpu):
    '''
    Creates different forms of a cpu title.

    Ex: amd ryzen 5 3600, amd ryzen 5 3600 6 core processor,
    amd ryzen 5 3600 3.6 ghz processor and
    amd ryzen 5 3600 6 core 3.6 ghz processor.
    '''
    name = cpu['name']
    cores = cpu['cores']
    clock = cpu['core_clock']
    # Full form, bare name, cores-only, and clock-only phrasings (in order)
    phrasings = [
        '{} {} core {} processor'.format(name, cores, clock),
        name,
        '{} {} core processor'.format(name, cores),
        '{} {} processor'.format(name, clock),
    ]
    return [remove_stop_words(p) for p in phrasings]
def generate_pos_hard_drive_data():
    '''
    Pairs each drive capacity with two randomly-phrased variants of itself
    (label 1), once using hard-drive phrasings and once using ssd phrasings.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(1, 3193)]
    sizes += ['{} TB'.format(n) for n in range(1, 101)]
    for size in sizes:
        for pool in (hard_drive_types, ssd_types):
            rows.append([
                remove_stop_words('{} {}'.format(size, random.choice(pool))),
                remove_stop_words('{} {}'.format(size, random.choice(pool))),
                1,
            ])
    return pd.DataFrame(rows, columns=COLUMN_NAMES)
def cpu_variations(cpu):
    '''
    Creates different forms of a cpu title: the full
    "<name> <cores> core <clock> processor" form, the bare name, a
    cores-only form, and a clock-only form.
    '''
    # (format string, cpu keys to substitute) — order matters to callers
    templates = [
        ('{} {} core {} processor', ('name', 'cores', 'core_clock')),
        ('{}', ('name',)),
        ('{} {} core processor', ('name', 'cores')),
        ('{} {} processor', ('name', 'core_clock')),
    ]
    return [
        remove_stop_words(fmt.format(*(cpu[key] for key in keys)))
        for fmt, keys in templates
    ]
def create_pos_laptop_test_data(laptop_df):
    '''
    Creates the positive test laptop data: every pairwise combination of a
    row's retailer titles, lower-cased and normalized, labeled 1.
    '''
    retailers = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    pos_data = []
    for row in laptop_df.iloc:
        # Collect whichever retailer titles are present (non-NaN strings)
        titles = [row[r] for r in retailers if type(row[r]) is str]
        for left, right in combinations(titles, 2):
            pos_data.append([
                remove_stop_words(left).lower(),
                remove_stop_words(right).lower(),
                1,
            ])
    return pd.DataFrame(pos_data, columns=['title_one', 'title_two', 'label'])
def create_neg_laptop_test_data(laptop_df):
    '''
    Creates the negative test laptop data: for each retailer title, pairs it
    with a title drawn from a DIFFERENT laptop of the same Company (label 0).

    NOTE(review): the inner while-loop retries retailers on one randomly
    chosen row only — if that row has no usable title it loops forever;
    likewise random.randint fails if a company has a single laptop. Confirm
    the input data guarantees these cases cannot occur.
    '''
    retailers = ['Amazon', 'Newegg', 'Walmart', 'BestBuy']
    neg_data = []
    for row in laptop_df.iloc:
        temp = []
        for retailer in retailers:
            if type(row[retailer]) is str:
                orig_product = row[retailer]
                neg_product = ''
                # Subset of laptops with the same company but a different
                # 'index' value, so the negative is similar yet distinct.
                # NOTE(review): the second mask is built from laptop_df but
                # applied to comp_df — this relies on .loc index alignment.
                comp_df = laptop_df.loc[laptop_df['Company'] == row['Company']]
                comp_df = comp_df.loc[laptop_df['index'] != row['index']]
                idx = random.randint(0, len(comp_df) - 1)
                neg_row = comp_df.iloc[idx]
                # Keep drawing retailers until that row yields a string title
                # different from the original
                while True:
                    rand_retailer = random.sample(retailers, 1)[0]
                    neg_product = neg_row[rand_retailer]
                    if type(neg_product
                            ) is str and neg_product != orig_product:
                        temp = [
                            remove_stop_words(orig_product).lower(),
                            remove_stop_words(neg_product).lower(), 0
                        ]
                        neg_data.append(temp)
                        break
                    else:
                        continue
    return pd.DataFrame(neg_data, columns=['title_one', 'title_two', 'label'])
def generate_neg_hard_drive_data():
    '''
    Creates negative data with different drive sizes.
    Ex: 10 gb ssd vs 20 gb ssd.
    '''
    rows = []
    sizes = ['{} GB'.format(n) for n in range(8, 1001, 8)]
    sizes += ['{} TB'.format(n) for n in range(1, 20)]
    for size in sizes:
        # Keep sampling until we land on a genuinely different capacity
        mismatch = size
        while mismatch == size:
            mismatch = random.choice(sizes)
        # Every hard-drive and ssd phrasing of both capacities
        suffixes = [*Common.HARD_DRIVE_TYPES, *Common.SSD_TYPES]
        left_titles = ['{} {}'.format(size, s) for s in suffixes]
        right_titles = ['{} {}'.format(mismatch, s) for s in suffixes]
        # Cross product: every original phrasing against every mismatched one
        for left in left_titles:
            for right in right_titles:
                rows.append(
                    [remove_stop_words(left), remove_stop_words(right), 0])
    return pd.DataFrame(rows, columns=Common.COLUMN_NAMES)
def generate_pos_pcpartpicker_data(df):
    '''
    Creates positive data from any of the PCPartPicker datasets: all pairwise
    combinations of a row's non-null titles, labeled 1.

    Fixes: the original grew the result with DataFrame.append inside the
    loop, which is O(n^2) and was removed in pandas 2.0 — rows are now
    accumulated in a list and materialized once at the end.
    '''
    columns = list(df.columns)
    rows = []
    for idx in tqdm(range(len(df))):
        row = df.iloc()[idx]
        # Normalize every non-null title in this row
        titles = [
            remove_stop_words(row[col]) for col in columns
            if not pd.isnull(row[col])
        ]
        # Need at least two titles to form a pair
        if len(titles) > 1:
            for comb in combinations(titles, 2):
                rows.append(list(comb) + [1])
    return pd.DataFrame(rows, columns=['title_one', 'title_two', 'label'])
def create_neg_spec_laptop(df, attributes):
    '''
    Creates negative spec pairs from a tiny sample of df (~0.019% of rows).

    For each sampled row and each attribute class, both copies get the same
    randomly-drawn inches/screen attributes, then exactly that one attribute
    class is corrupted on the negative copy; the pair is emitted with label 0.

    NOTE(review): assumes every 'brand' value contains a space
    ("Company Product"); split(' ', 1)[1] raises IndexError otherwise.
    NOTE(review): in the 'screen' branch orig_row's screen is ALSO replaced
    with an independently drawn value — presumably intentional so the two
    titles differ in screen wording; confirm.
    '''
    df_iloc = df.iloc()
    temp = []
    for row in tqdm(range(int(len(df) * 1.91e-4))):
        # Create a copy of the row for the negative example
        for attribute_class in attributes:
            # NOTE(review): each df_iloc[row] indexing presumably yields an
            # independent Series — verify, since both are mutated below.
            neg_row = df_iloc[row]
            # Get the row in the laptop_data and add the inch attribute
            orig_row = df_iloc[row]
            # Set product and company derived from the brand string
            orig_row['company'] = orig_row['brand'].split(' ', 1)[0]
            orig_row['product'] = orig_row['brand'].split(' ', 1)[1]
            neg_row['company'] = orig_row['brand'].split(' ', 1)[0]
            neg_row['product'] = orig_row['brand'].split(' ', 1)[1]
            # Get a random inch attribute
            inch_attr = random.choice(list(LaptopAttributes.inches))
            # Get random screen attribute
            screen_attr = random.choice(list(LaptopAttributes.screen))
            # Both rows start out agreeing on inches and screen
            orig_row['inches'] = inch_attr
            neg_row['inches'] = inch_attr
            orig_row['screen'] = screen_attr
            neg_row['screen'] = screen_attr
            if attribute_class == 'inches':
                # Resample inches until the negative's value differs
                new_inch_attr = inch_attr
                while inch_attr == new_inch_attr:
                    new_inch_attr = random.choice(list(
                        LaptopAttributes.inches))
                neg_row['inches'] = new_inch_attr
            elif attribute_class == 'screen':
                # Draw an independent screen value for the original row
                orig_screen_attr = random.choice(list(LaptopAttributes.screen))
                # Resample until the negative's screen differs from it
                new_screen_attr = screen_attr
                while orig_screen_attr == new_screen_attr:
                    new_screen_attr = random.choice(
                        list(LaptopAttributes.screen))
                neg_row['screen'] = new_screen_attr
                orig_row['screen'] = orig_screen_attr
            elif attribute_class == 'product':
                # Resample product (tail of a random brand) until it differs
                new_product_attr = orig_row['product']
                while orig_row['product'] == new_product_attr:
                    new_product_attr = random.choice(
                        SpecAttributes.laptop_brands).split(' ', 1)[1]
                neg_row['product'] = new_product_attr
            elif attribute_class == 'hard_drive':
                # Resample drive capacity until it differs
                new_drive_attr = orig_row['hard_drive']
                while orig_row['hard_drive'] == new_drive_attr:
                    new_drive_attr = random.choice(SpecAttributes.hard_drive)
                # Each side gets its own independently drawn drive phrasing
                neg_row['hard_drive'] = '{} {}'.format(
                    new_drive_attr,
                    random.choice([
                        random.choice(hard_drive_types),
                        random.choice(ssd_types)
                    ]))
                orig_row['hard_drive'] = '{} {}'.format(
                    orig_row['hard_drive'],
                    random.choice([
                        random.choice(hard_drive_types),
                        random.choice(ssd_types)
                    ]))
            else:
                # Generic attribute: resample from the spec data until the
                # value really changes
                attribute_val = orig_row[attribute_class]
                new_val = attribute_val
                while new_val == attribute_val:
                    new_val = random.sample(
                        SpecAttributes.get_all_data()[attribute_class.lower()],
                        1)[0]
                # Change the value in the neg_row to the new value
                neg_row[attribute_class] = new_val
            # We still need to add the phrasing to the hard drive attribute
            # if it is not the current attribute class (same suffix for both)
            if attribute_class != 'hard_drive':
                drive_type = random.choice([
                    random.choice(hard_drive_types),
                    random.choice(ssd_types)
                ])
                neg_row['hard_drive'] = '{} {}'.format(neg_row['hard_drive'],
                                                       drive_type)
                orig_row['hard_drive'] = '{} {}'.format(
                    orig_row['hard_drive'], drive_type)
            # Concatenate and normalize the data
            title_one = remove_stop_words(
                concatenate_spec_data(orig_row).lower())
            title_two = remove_stop_words(
                concatenate_spec_data(neg_row).lower())
            # Append the data to the temp list
            temp.append([title_one, title_two, 0])
    # Return the DataFrame created from temp
    return pd.DataFrame(temp, columns=COLUMN_NAMES)
class LaptopRetailerRegEx:
    '''
    Namespace of compiled regexes for matching laptop attributes (brands,
    product lines, cpus, ram, drives, screen sizes) in retailer titles.

    All work happens at class-definition time: it seeds the vocabularies,
    reads the intel/amd cpu CSVs to extend the cpu vocabulary, then compiles
    the matchers and deletes the intermediate containers.

    NOTE(review): executes populate_spec() and reads two CSV files at import
    time — importing this module requires those files to exist.
    '''
    populate_spec()
    # Seed vocabularies with brands/lines not present in LaptopAttributes
    laptop_brands = {'gateway', 'panasonic', 'toughbook', 'msi'}
    product_attrs = {'vivobook'}
    cpu_attributes = {'intel', 'm 2', '2 core', '4 core', '6 core', '8 core'}
    # First word of each known brand -> brand vocabulary;
    # the remainder -> product-line vocabulary
    for brand in LaptopAttributes.laptop_brands:
        laptop_brands.add(brand.split(' ')[0].lower())
        product_attrs.add(' '.join(brand.split(' ')[1:]).lower())
    # Every token of every normalized intel cpu title joins the cpu vocabulary
    intel_cpu_df = pd.read_csv('data/base/intel_cpus.csv')
    intel_cpu_df = intel_cpu_df['title'].map(
        lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(intel_cpu_df)):
        cpu_attributes.update(intel_cpu_df.iloc[i])
    # Same for amd cpu titles
    amd_cpu_df = pd.read_csv('data/base/amd_cpus.csv')
    amd_cpu_df = amd_cpu_df['title'].map(
        lambda x: remove_stop_words(x, omit_punctuation=['.']).split(' '))
    for i in range(len(amd_cpu_df)):
        cpu_attributes.update(amd_cpu_df.iloc[i])
    # Longest-first ordering so alternation prefers the longest match
    laptop_brands = list(laptop_brands)
    laptop_brands.sort(key=len, reverse=True)
    product_attrs = list(product_attrs)
    product_attrs.sort(key=len, reverse=True)
    cpu_attributes = list(cpu_attributes)
    cpu_attributes.sort(key=len, reverse=True)
    ram_modifiers = ['memory', 'ram', 'ddr4', 'ddr4 ram', 'ddr4 memory']
    ram_modifiers.sort()
    hard_drive_modifiers = [
        'hdd', 'hard drive', 'disk drive', 'storage', 'hard drive storage',
        'hdd storage'
    ]
    hard_drive_modifiers.sort(key=len, reverse=True)
    ssd_modifiers = [
        'ssd', 'solid state drive', 'solid state disk', 'pcie', 'pcie ssd',
        'ssd storage'
    ]
    ssd_modifiers.sort(key=len, reverse=True)
    # Noise phrases to strip from titles before matching
    annoying_words = [
        'windows 10', 'win 10', 'windows 10 in s mode', 'windows', '3.0',
        '3.1', '3.2', 'optical drive', 'cd drive', 'dvd drive'
    ]
    annoying_words.sort(key=len, reverse=True)
    # Word-boundary alternations: \b<term>(?!\S) per vocabulary entry
    ram_modifier_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(ram_modifiers) + "(?!\S)", re.IGNORECASE)
    random_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(annoying_words) + "(?!\S)", re.IGNORECASE)
    cpu_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(cpu_attributes) + "(?!\S)", re.IGNORECASE)
    brand_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(laptop_brands) + "(?!\S)", re.IGNORECASE)
    product_attr_matcher = re.compile(
        "\\b" + "(?!\S)|\\b".join(product_attrs) + "(?!\S)", re.IGNORECASE)
    # Quantity matchers: a number, a gb/tb unit, then a modifier phrase
    ram_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + 'gb ?' + '(?:' +
        '|'.join([x for x in ram_modifiers]) + ')(?!\S)', re.IGNORECASE)
    hard_drive_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' +
        '|'.join([x for x in hard_drive_modifiers]) + ')(?!\S)',
        re.IGNORECASE)
    ssd_matcher = re.compile(
        ' ?[0-9]+.{0,1}' + '(?:gb|tb) ?' + '(?:' +
        '|'.join([x for x in ssd_modifiers]) + ')(?!\S)', re.IGNORECASE)
    gbtb_matcher = re.compile(' ?[0-9]+.{0,1}' + '(?:gb|tb)' + '(?!\S)',
                              re.IGNORECASE)
    # Screen sizes like 15.6" / 15 inch (10-19 inch range)
    inch_matcher = re.compile('[1][0-9]\"?\"?.?[0-9]?\"?\"? ?(?:inch)?(?!\S)',
                              re.IGNORECASE)
    # Drop the build-time containers; only the compiled matchers remain
    del laptop_brands, product_attrs, cpu_attributes, intel_cpu_df, amd_cpu_df
model = siamese_network((MAX_LEN)) else: print('Using the exponential distance softmax.') from src.model_architectures.exp_distance_softmax import siamese_network model = siamese_network((MAX_LEN)) model.summary() # Load the model using the weights model.load_weights('models/DistanceSigmoid_40epoch_84%_val.h5') title_one = 'True Wireless Earbuds VANKYO X200 Bluetooth 5 0 Earbuds in Ear TWS Stereo Headphones Smart LED Display Charging Case IPX8 Waterproof 120H Playtime Built Mic Deep Bass Sports Work' title_two = 'TOZO T10 Bluetooth 5 0 Wireless Earbuds Wireless Charging Case IPX8 Waterproof TWS Stereo Headphones Ear Built Mic Headset Premium Sound Deep Bass Sport Black' title_one_arr = [' '] * MAX_LEN title_two_arr = [' '] * MAX_LEN title_one = remove_stop_words(title_one.lower()) title_two = remove_stop_words(title_two.lower()) for idx, x in enumerate(title_one.split(' ')): title_one_arr[idx] = x for idx, x in enumerate(title_two.split(' ')): title_two_arr[idx] = x title_one_arr = np.array(title_one_arr).reshape(1, MAX_LEN).astype('<U22') title_two_arr = np.array(title_two_arr).reshape(1, MAX_LEN).astype('<U22') print(model.predict([title_one_arr, title_two_arr]))