def isDifferentModel_2(pair, feature):
    """Score how similar the numeric content of `feature` is across the two
    products in `pair`, as the overlap coefficient of their 3-gram sets.

    Returns `noneValue` when either product lacks the feature.
    """
    first, second = pair[0], pair[1]
    if feature not in products[first] or feature not in products[second]:
        return noneValue
    grams_a = tokenizers.qgram(delNonNumber(first, feature), 3)
    grams_b = tokenizers.qgram(delNonNumber(second, feature), 3)
    return simfunctions.overlap_coefficient(grams_a, grams_b)
# Esempio n. 2 (Example no. 2 — snippet-dump separator)
# 0
def setMatchNumKeyField(pair, fields):
    """Return 1 if a numeric key token of products[pair[1]] occurs in one of
    the given `fields` of products[pair[0]], otherwise 0.

    Tokens are the whitespace-separated runs left after `nonNum_re` strips
    non-numeric characters from the whole record; only tokens of at least
    MIN_KEY_LEN characters are considered.  With PARTIAL_KEY set, any
    MIN_KEY_LEN-gram of a token may match; otherwise the full token must.
    """
    data1 = nonNum_re.sub(' ', str(products[pair[1]])).strip().split()
    if data1:
        for d1 in data1:
            if len(d1) >= MIN_KEY_LEN:
                if PARTIAL_KEY:
                    data2 = tokenizers.qgram(d1, MIN_KEY_LEN)
                else:
                    # A q-gram of the token's full length is the token itself.
                    data2 = tokenizers.qgram(d1, len(d1))
                for d2 in data2:
                    for field in fields:
                        try:
                            if d2 in products[pair[0]][field][0]:
                                return 1
                        # Only the lookup can legitimately fail (missing
                        # field, empty value, non-indexable value).  The
                        # original bare `except:` also swallowed
                        # KeyboardInterrupt/SystemExit and hid real bugs.
                        except (KeyError, IndexError, TypeError):
                            continue
    return 0
 def test_qgrams_valid(self):
     """qgram yields the expected grams for empty, short, and normal input."""
     cases = [
         (('',), []),
         (('a',), []),
         (('aa',), ['aa']),
         (('database',), ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']),
         (('d', 1), ['d']),
         (('database', 3), ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']),
     ]
     for args, expected in cases:
         self.assertEqual(qgram(*args), expected)
 def test_qgrams_valid(self):
     """Spot-check qgram on valid inputs (default bigrams plus explicit q)."""
     # Strings shorter than the default q=2 produce no grams.
     self.assertEqual(qgram(''), [])
     self.assertEqual(qgram('a'), [])
     # Exactly q characters produce a single gram.
     self.assertEqual(qgram('aa'), ['aa'])
     expected_bigrams = ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
     self.assertEqual(qgram('database'), expected_bigrams)
     # Explicit q: unigrams, then trigrams.
     self.assertEqual(qgram('d', 1), ['d'])
     expected_trigrams = ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
     self.assertEqual(qgram('database', 3), expected_trigrams)
# Esempio n. 5 (Example no. 5 — snippet-dump separator)
# 0
def generate_feature(file_name):
    """Build string-similarity feature vectors and labels for product pairs.

    Each line of `file_name` encodes a pair of product JSONs plus a match
    label.  For every pair, similarity scores (jaccard, jaro/jaro-winkler,
    cosine, overlap coefficient, monge-elkan, tf-idf) and length statistics
    are computed over several attributes (product type, name, segment, long
    description, brand), plus model-string and whole-record features.

    Returns:
        tuple: (features, labels, lines) -- list of per-pair feature vectors,
        list of 0/1 labels, and the raw input lines.
    """
    lines = stage3_helper.read_file(file_name)

    features = []
    labels = []

    # NOTE(review): all_names is built but never read below; kept so the
    # per-line parsing still runs in the same order as before.
    all_names = []
    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        all_names.append(tokenizers.whitespace(string1))
        all_names.append(tokenizers.whitespace(string2))

    for line in lines:
        json1, json2, label = stage3_helper.read_jsons_label_from_line(line)

        feature = []

        # TODO: Add more features and optimize features.

        # product_type
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_type)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(string1),
                                 tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        # NOTE(review): [0] raises IndexError when the attribute is blank --
        # assumes product_type is always non-empty; confirm against the data.
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # product_name
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(string1),
                                 tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        # Hamming distance is only defined for equal lengths; 5 is the
        # sentinel the model was trained with for unequal lengths.
        if len(string1) == len(string2):
            feature.append(simfunctions.hamming_distance(string1, string2))
        else:
            feature.append(5)
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # product_segment
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_segment)
        string1 = string1.lower()
        string2 = string2.lower()
        feature.append(
            simfunctions.jaccard(tokenizers.qgram(string1, 3),
                                 tokenizers.qgram(string2, 3)))
        feature.append(
            simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
        feature.append(
            simfunctions.cosine(tokenizers.whitespace(string1),
                                tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.overlap_coefficient(tokenizers.whitespace(string1),
                                             tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.tfidf(tokenizers.whitespace(string1),
                               tokenizers.whitespace(string2)))
        feature.append(
            simfunctions.jaro(
                tokenizers.whitespace(string1)[0],
                tokenizers.whitespace(string2)[0]))
        feature.append(len(string1))
        feature.append(len(string2))
        feature.append(len(string1) - len(string2))
        feature.append(len(tokenizers.whitespace(string1)))
        feature.append(len(tokenizers.whitespace(string2)))

        # product_long_description
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_long_description)

        if string1 is None or string2 is None:
            # Placeholders, one per feature computed in the else-branch.
            feature.append(0.5)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
        else:
            string1 = string1.lower()
            string2 = string2.lower()
            string1 = stage3_helper.cleanhtml(string1)
            string2 = stage3_helper.cleanhtml(string2)
            string1 = stage3_helper.clean_stop_word(string1)
            string2 = stage3_helper.clean_stop_word(string2)
            feature.append(
                simfunctions.jaccard(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.overlap_coefficient(
                    tokenizers.whitespace(string1),
                    tokenizers.whitespace(string2)))
            feature.append(len(string1))
            feature.append(len(string2))
            feature.append(len(string1) - len(string2))

        # product_brand (fall back to a brand predicted from the name)
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_brand)
        string1_name, string2_name = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)

        if string1 is None or string1 == '':
            string1 = get_predict_brand(string1_name)

        if string2 is None or string2 == '':
            string2 = get_predict_brand(string2_name)

        if string1 is None or string2 is None:
            # Placeholders, one per feature computed in the else-branch.
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
            feature.append(0)
        else:
            feature.append(
                simfunctions.jaccard(tokenizers.whitespace(string1),
                                     tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.jaro_winkler(string1, string2, prefix_weight=0.1))
            feature.append(
                simfunctions.overlap_coefficient(
                    tokenizers.whitespace(string1),
                    tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.monge_elkan(tokenizers.whitespace(string1),
                                         tokenizers.whitespace(string2)))
            feature.append(
                simfunctions.tfidf(tokenizers.whitespace(string1),
                                   tokenizers.whitespace(string2)))
            feature.append(len(string1))
            feature.append(len(string2))
            feature.append(len(string1) - len(string2))

        # Contains similar model names.
        string1, string2 = stage3_helper.get_attribute_from_jsons(
            json1, json2, product_name)
        string1 = string1.lower()
        string2 = string2.lower()
        model_strs1 = stage3_helper.find_model_str(string1)
        model_strs2 = stage3_helper.find_model_str(string2)
        if len(model_strs1) > 0 and len(model_strs2) > 0:
            feature.append(simfunctions.jaccard(model_strs1, model_strs2))
        else:
            feature.append(0.5)
        feature.append(len(model_strs1))
        feature.append(len(model_strs2))
        # BUG FIX: the original subtracted len(string2) (raw name length)
        # instead of len(model_strs2); compare the two model-string counts.
        feature.append(len(model_strs1) - len(model_strs2))

        # Other features over the whole records.
        common = 0
        common_score = 0.0
        for item in json1:
            if item in json2:
                common += 1
                common_score += simfunctions.jaccard(
                    tokenizers.whitespace(json1[item][0]),
                    tokenizers.whitespace(json2[item][0]))
        # BUG FIX: guard against ZeroDivisionError when the two records
        # share no attributes at all.
        common_score = common_score / common if common else 0.0
        feature.append(len(json1))
        feature.append(len(json2))
        feature.append(len(json1) - len(json2))
        feature.append(common)
        feature.append(common_score)
        feature.append(len(json.dumps(json1)))
        feature.append(len(json.dumps(json2)))
        feature.append(len(json.dumps(json1)) - len(json.dumps(json2)))
        feature.append(
            simfunctions.jaccard(tokenizers.whitespace(json.dumps(json1)),
                                 tokenizers.whitespace(json.dumps(json2))))

        # Add one feature vector and label.
        features.append(feature)
        labels.append(stage3_helper.get_01_from_label(label))

    return features, labels, lines
# Esempio n. 6 (Example no. 6 — snippet-dump separator)
# 0
def generate_feature(filename):
    productName_courpus = []
    brand_courpus = []
    with open(filename, 'r') as f:
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding='latin-1')
            attribute_id2 = json.loads(list_line[4], encoding='latin-1')

            if "Product Name" in attribute_id1:
                productName_courpus.append(
                    tokenizers.delimiter(attribute_id1["Product Name"][0]))
            if "Product Name" in attribute_id2:
                productName_courpus.append(
                    tokenizers.delimiter(attribute_id2["Product Name"][0]))

            if "Brand" in attribute_id1:
                brand_courpus.append(
                    tokenizers.delimiter(attribute_id1["Brand"][0]))
            if "Brand" in attribute_id2:
                brand_courpus.append(
                    tokenizers.delimiter(attribute_id2["Brand"][0]))

    feature_matrix = []
    with open(filename, 'r') as f:
        i = 1
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding='latin-1')
            attribute_id2 = json.loads(list_line[4], encoding='latin-1')

            print 'Generate features for pair', i
            i = i + 1

            instance = []

            #Product Name 4
            if ("Product Name" in attribute_id1
                    and "Product Name" in attribute_id2):
                jaccard_productName = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Product Name"][0]),
                    tokenizers.delimiter(attribute_id2["Product Name"][0]))
                jaccard3gram_productName = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Product Name"][0], 3),
                    tokenizers.qgram(attribute_id2["Product Name"][0], 3))
                tfidf_productName = simfunctions.tfidf(
                    tokenizers.delimiter(attribute_id1["Product Name"][0]),
                    tokenizers.delimiter(attribute_id2["Product Name"][0]),
                    productName_courpus)
                edit_productName = simfunctions.levenshtein(
                    attribute_id1["Product Name"][0],
                    attribute_id2["Product Name"][0])
                edit_productName = 1 - edit_productName / max(
                    len(attribute_id1["Product Name"][0]),
                    len(attribute_id2["Product Name"][0]))
            else:
                jaccard_productName = 0
                jaccard3gram_productName = 0
                tfidf_productName = 0
                edit_productName = 0

            instance += [
                jaccard_productName, jaccard3gram_productName,
                tfidf_productName, edit_productName
            ]

            #Manufacturer 3
            if ("Manufacturer" in attribute_id1
                    and "Manufacturer" in attribute_id2):
                jaccard_manufacturer = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Manufacturer"][0]),
                    tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
                jaccard3gram_manufacturer = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Manufacturer"][0], 3),
                    tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
                tfidf_manufacturer = simfunctions.tfidf(
                    tokenizers.delimiter(attribute_id1["Manufacturer"][0]),
                    tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
            else:
                jaccard_manufacturer = 0
                jaccard3gram_manufacturer = 0
                tfidf_manufacturer = 0

            instance += [
                jaccard_manufacturer, jaccard3gram_manufacturer,
                tfidf_manufacturer
            ]

            #Color 3
            if ("Color" in attribute_id1 and "Color" in attribute_id2):
                jaccard_color = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Color"][0]),
                    tokenizers.delimiter(attribute_id2["Color"][0]))
                jaccard3gram_color = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Color"][0], 3),
                    tokenizers.qgram(attribute_id2["Color"][0], 3))
                tfidf_color = simfunctions.tfidf(
                    tokenizers.delimiter(attribute_id1["Color"][0]),
                    tokenizers.delimiter(attribute_id2["Color"][0]))
            else:
                jaccard_color = 0
                jaccard3gram_color = 0
                tfidf_color = 0

            instance += [jaccard_color, jaccard3gram_color, tfidf_color]

            #Product Type 3
            if ("Product Type" in attribute_id1
                    and "Product Type" in attribute_id2):
                jaccard_productType = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Product Type"][0]),
                    tokenizers.delimiter(attribute_id2["Product Type"][0]))
                jaccard3gram_productType = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Product Type"][0], 3),
                    tokenizers.qgram(attribute_id2["Product Type"][0], 3))
                tfidf_productType = simfunctions.tfidf(
                    tokenizers.delimiter(attribute_id1["Product Type"][0]),
                    tokenizers.delimiter(attribute_id2["Product Type"][0]))
            else:
                jaccard_productType = 0
                jaccard3gram_productType = 0
                tfidf_productType = 0

            instance += [
                jaccard_productType, jaccard3gram_productType,
                tfidf_productType
            ]

            #Product Segment 3
            if "Product Segment" in attribute_id1 and "Product Segment" in attribute_id2:
                jaccard_productSegment = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Product Segment"][0]),
                    tokenizers.delimiter(attribute_id2["Product Segment"][0]))
                jaccard3gram_productSegment = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Product Segment"][0], 3),
                    tokenizers.qgram(attribute_id2["Product Segment"][0], 3))
                if (attribute_id1["Product Segment"][0] ==
                        attribute_id2["Product Segment"][0]):
                    exactMatch_productSegment = 1
                else:
                    exactMatch_productSegment = 0
            else:
                exactMatch_productSegment = 0
                jaccard_productSegment = 0
                jaccard3gram_productSegment = 0

            instance += [
                exactMatch_productSegment, jaccard_productSegment,
                jaccard3gram_productSegment
            ]

            #Brand 4
            if ("Brand" in attribute_id1 and "Brand" in attribute_id2):
                jaccard_brand = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Brand"][0]),
                    tokenizers.delimiter(attribute_id2["Brand"][0]))
                jaccard3gram_brand = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Brand"][0], 3),
                    tokenizers.qgram(attribute_id2["Brand"][0], 3))
                edit_brand = simfunctions.levenshtein(
                    attribute_id1["Brand"][0], attribute_id2["Brand"][0])
                edit_brand = 1 - edit_brand / max(
                    len(attribute_id1["Brand"][0]),
                    len(attribute_id2["Brand"][0]))
                tfidf_brand = simfunctions.tfidf(
                    tokenizers.delimiter(attribute_id1["Brand"][0]),
                    tokenizers.delimiter(attribute_id2["Brand"][0]),
                    brand_courpus)
            else:
                jaccard3gram_brand = 0
                jaccard_brand = 0
                edit_brand = 0
                tfidf_brand = 0

            instance += [
                jaccard_brand, jaccard3gram_brand, edit_brand, tfidf_brand
            ]

            #Category 2
            if ("Category" in attribute_id1 and "Category" in attribute_id2):
                jaccard_category = simfunctions.jaccard(
                    tokenizers.delimiter(attribute_id1["Category"][0]),
                    tokenizers.delimiter(attribute_id2["Category"][0]))
                jaccard3gram_category = simfunctions.jaccard(
                    tokenizers.qgram(attribute_id1["Category"][0], 3),
                    tokenizers.qgram(attribute_id2["Category"][0], 3))
            else:
                jaccard_category = 0
                jaccard3gram_category = 0

            instance += [jaccard_category, jaccard3gram_category]

            #Long Description 3
            if ("Product Long Description" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                tfidf_long_description = simfunctions.tfidf(
                    tokenizers.delimiter(
                        attribute_id1["Product Long Description"][0]),
                    tokenizers.delimiter(
                        attribute_id2["Product Long Description"][0]))
                jaccard_long_description = simfunctions.jaccard(
                    tokenizers.delimiter(
                        attribute_id1["Product Long Description"][0]),
                    tokenizers.delimiter(
                        attribute_id2["Product Long Description"][0]))
                jaccard3_long_description = simfunctions.jaccard(
                    tokenizers.qgram(
                        attribute_id1["Product Long Description"][0], 3),
                    tokenizers.qgram(
                        attribute_id2["Product Long Description"][0], 3))
            else:
                tfidf_long_description = 0
                jaccard_long_description = 0
                jaccard3_long_description = 0

            instance += [
                tfidf_long_description, jaccard_long_description,
                jaccard3_long_description
            ]

            #Short Description 3
            if ("Product Short Description" in attribute_id1
                    and "Product Short Description" in attribute_id2):
                jaccard_short_description = simfunctions.jaccard(
                    tokenizers.delimiter(
                        attribute_id1["Product Short Description"][0]),
                    tokenizers.delimiter(
                        attribute_id2["Product Short Description"][0]))
                jaccard3_short_description = simfunctions.jaccard(
                    tokenizers.qgram(
                        attribute_id1["Product Short Description"][0], 3),
                    tokenizers.qgram(
                        attribute_id2["Product Short Description"][0], 3))
                tfidf_short_description = simfunctions.tfidf(
                    tokenizers.delimiter(
                        attribute_id1["Product Short Description"][0]),
                    tokenizers.delimiter(
                        attribute_id2["Product Short Description"][0]))
            else:
                jaccard_short_description = 0
                jaccard3_short_description = 0
                tfidf_short_description = 0

            instance += [
                jaccard_short_description, jaccard3_short_description,
                tfidf_short_description
            ]

            #Other in long 8
            if ("Product Name" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                name_set = tokenizers.delimiter(
                    attribute_id1["Product Name"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count + 1
                name1_in_des2 = count / len(name_set)
            else:
                name1_in_des2 = 0

            instance += [name1_in_des2]

            if ("Product Long Description" in attribute_id1
                    and "Product Name" in attribute_id2):
                name_set = tokenizers.delimiter(
                    attribute_id2["Product Name"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count + 1
                name2_in_des1 = count / len(name_set)
            else:
                name2_in_des1 = 0

            instance += [name2_in_des1]

            if ("Brand" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                brand_set = tokenizers.delimiter(attribute_id1["Brand"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count + 1
                brand1_in_des2 = count / len(brand_set)
            else:
                brand1_in_des2 = 0

            instance += [brand1_in_des2]

            if ("Brand" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                brand_set = tokenizers.delimiter(attribute_id2["Brand"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count + 1
                brand2_in_des1 = count / len(brand_set)
            else:
                brand2_in_des1 = 0

            instance += [brand2_in_des1]

            if ("Manufacturer" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                manufacturer_set = tokenizers.delimiter(
                    attribute_id1["Manufacturer"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count + 1
                manufacturer1_in_des2 = count / len(manufacturer_set)
            else:
                manufacturer1_in_des2 = 0

            instance += [manufacturer1_in_des2]

            if ("Manufacturer" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                manufacturer_set = tokenizers.delimiter(
                    attribute_id2["Manufacturer"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count + 1
                manufacturer2_in_des1 = count / len(manufacturer_set)
            else:
                manufacturer2_in_des1 = 0

            instance += [manufacturer2_in_des1]

            if ("Product Short Description" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                short_des_set = tokenizers.delimiter(
                    attribute_id1["Product Short Description"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for short in short_des_set:
                    if short in des:
                        count = count + 1
                short1_in_des2 = count / len(short_des_set)
            else:
                short1_in_des2 = 0

            instance += [short1_in_des2]

            if ("Product Short Description" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                short_des_set = tokenizers.delimiter(
                    attribute_id2["Product Short Description"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for short in short_des_set:
                    if short in des:
                        count = count + 1
                short2_in_des1 = count / len(short_des_set)
            else:
                short2_in_des1 = 0

            instance += [short2_in_des1]

            #Other in short 6
            if ("Product Name" in attribute_id1
                    and "Product Short Description" in attribute_id2):
                name_set = tokenizers.delimiter(
                    attribute_id1["Product Name"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count + 1
                name1_in_short2 = count / len(name_set)
            else:
                name1_in_short2 = 0

            instance += [name1_in_short2]

            if ("Product Short Description" in attribute_id1
                    and "Product Name" in attribute_id2):
                name_set = tokenizers.delimiter(
                    attribute_id2["Product Name"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count + 1
                name2_in_short1 = count / len(name_set)
            else:
                name2_in_short1 = 0

            instance += [name2_in_short1]

            if ("Brand" in attribute_id1
                    and "Product Short Description" in attribute_id2):
                brand_set = tokenizers.delimiter(attribute_id1["Brand"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count + 1
                brand1_in_short2 = count / len(brand_set)
            else:
                brand1_in_short2 = 0

            instance += [brand1_in_short2]

            if ("Brand" in attribute_id2
                    and "Product Short Description" in attribute_id1):
                brand_set = tokenizers.delimiter(attribute_id2["Brand"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count + 1
                brand2_in_short1 = count / len(brand_set)
            else:
                brand2_in_short1 = 0

            instance += [brand2_in_short1]

            if ("Manufacturer" in attribute_id1
                    and "Product Short Description" in attribute_id2):
                manufacturer_set = tokenizers.delimiter(
                    attribute_id1["Manufacturer"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count + 1
                manufacturer1_in_short2 = count / len(manufacturer_set)
            else:
                manufacturer1_in_short2 = 0

            instance += [manufacturer1_in_short2]

            if ("Manufacturer" in attribute_id2
                    and "Product Short Description" in attribute_id1):
                manufacturer_set = tokenizers.delimiter(
                    attribute_id2["Manufacturer"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count + 1
                manufacturer2_in_short1 = count / len(manufacturer_set)
            else:
                manufacturer2_in_short1 = 0

            instance += [manufacturer2_in_short1]

            #new 15
            if ("Manufacturer Part Number" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                manu_part_number_set = tokenizers.delimiter(
                    attribute_id1["Manufacturer Part Number"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for manu_part in manu_part_number_set:
                    if manu_part in des_set:
                        count = count + 1
                manu_part1_in_des2 = count / len(manu_part_number_set)
            else:
                manu_part1_in_des2 = 0

            instance += [manu_part1_in_des2]

            if ("Manufacturer Part Number" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                manu_part_number_set = tokenizers.delimiter(
                    attribute_id2["Manufacturer Part Number"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for manu_part in manu_part_number_set:
                    if manu_part in des_set:
                        count = count + 1
                manu_part2_in_des1 = count / len(manu_part_number_set)
            else:
                manu_part2_in_des1 = 0

            instance += [manu_part2_in_des1]

            if ("Assembled Product Length" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                length_set = tokenizers.delimiter(
                    attribute_id1["Assembled Product Length"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for length in length_set:
                    if length in des_set:
                        count = count + 1
                length1_in_des2 = count / len(length_set)
            else:
                length1_in_des2 = 0

            instance += [length1_in_des2]

            if ("Assembled Product Length" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                length_set = tokenizers.delimiter(
                    attribute_id2["Assembled Product Length"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for length in length_set:
                    if length in des_set:
                        count = count + 1
                length2_in_des1 = count / len(length_set)
            else:
                length2_in_des1 = 0

            instance += [length2_in_des1]

            if ("Assembled Product Width" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                width_set = tokenizers.delimiter(
                    attribute_id1["Assembled Product Width"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for width in width_set:
                    if width in des_set:
                        count = count + 1
                width1_in_des2 = count / len(width_set)
            else:
                width1_in_des2 = 0

            instance += [width1_in_des2]

            if ("Assembled Product Width" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                width_set = tokenizers.delimiter(
                    attribute_id2["Assembled Product Width"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for width in width_set:
                    if width in des_set:
                        count = count + 1
                width2_in_des1 = count / len(width_set)
            else:
                width2_in_des1 = 0

            instance += [width2_in_des1]

            if ("Assembled Product Height" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                height_set = tokenizers.delimiter(
                    attribute_id1["Assembled Product Height"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for height in height_set:
                    if height in des_set:
                        count = count + 1
                height1_in_des2 = count / len(height_set)
            else:
                height1_in_des2 = 0

            instance += [height1_in_des2]

            if ("Assembled Product Height" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                height_set = tokenizers.delimiter(
                    attribute_id2["Assembled Product Height"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for height in height_set:
                    if height in des_set:
                        count = count + 1
                height2_in_des1 = count / len(height_set)
            else:
                height2_in_des1 = 0

            instance += [height2_in_des1]

            if ("Type" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                type_set = tokenizers.delimiter(attribute_id1["Type"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for type in type_set:
                    if type in des_set:
                        count = count + 1
                type1_in_des2 = count / len(type_set)
            else:
                type1_in_des2 = 0

            instance += [type1_in_des2]

            if ("Type" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                type_set = tokenizers.delimiter(attribute_id2["Type"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for type in type_set:
                    if type in des_set:
                        count = count + 1
                type2_in_des1 = count / len(type_set)
            else:
                type2_in_des1 = 0

            instance += [type2_in_des1]

            # Feature: fraction of record-1 "Operating System" tokens that
            # appear in record-2's long description.
            if ("Operating System" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                op_set = tokenizers.delimiter(
                    attribute_id1["Operating System"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for op in op_set:
                    # BUG FIX: was `if op in op_set`, which is trivially true
                    # (membership in its own token set), so the feature was a
                    # constant 1.0 and `des_set` was never used. Compare
                    # against the description text, as the sibling features do.
                    if op in des_set:
                        count = count + 1
                op1_in_des2 = count / len(op_set)
            else:
                op1_in_des2 = 0

            instance += [op1_in_des2]

            # Feature: fraction of record-2 "Operating System" tokens that
            # appear in record-1's long description.
            if ("Operating System" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                op_set = tokenizers.delimiter(
                    attribute_id2["Operating System"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for op in op_set:
                    # BUG FIX: was `if op in op_set` (always true), which made
                    # the feature a constant 1.0; test against the description.
                    if op in des_set:
                        count = count + 1
                op2_in_des1 = count / len(op_set)
            else:
                op2_in_des1 = 0

            instance += [op2_in_des1]

            # Feature: fraction of record-1 "Screen Size" tokens that appear
            # in record-2's long description.
            if ("Screen Size" in attribute_id1
                    and "Product Long Description" in attribute_id2):
                ss_set = tokenizers.delimiter(attribute_id1["Screen Size"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for ss in ss_set:
                    # BUG FIX: was `if ss in ss_set` (always true), so the
                    # feature was a constant 1.0; test against the description.
                    if ss in des_set:
                        count = count + 1
                ss1_in_des2 = count / len(ss_set)
            else:
                ss1_in_des2 = 0

            instance += [ss1_in_des2]

            # Feature: fraction of record-2 "Screen Size" tokens that appear
            # in record-1's long description.
            if ("Screen Size" in attribute_id2
                    and "Product Long Description" in attribute_id1):
                ss_set = tokenizers.delimiter(attribute_id2["Screen Size"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for ss in ss_set:
                    # BUG FIX: was `if ss in ss_set` (always true), so the
                    # feature was a constant 1.0; test against the description.
                    if ss in des_set:
                        count = count + 1
                ss2_in_des1 = count / len(ss_set)
            else:
                ss2_in_des1 = 0

            instance += [ss2_in_des1]

            # Feature: fraction of ALL record-1 attribute tokens (except the
            # long description itself) that appear in record-2's long
            # description.
            if "Product Long Description" in attribute_id2:
                all_set = []
                for key in attribute_id1:
                    # BUG FIX: `is not` compares object identity, not value;
                    # whether two equal strings share identity is an interning
                    # accident, so the exclusion silently mis-fired. Use `!=`.
                    if key != "Product Long Description":
                        value_list = tokenizers.delimiter(
                            attribute_id1[key][0])
                        for v in value_list:
                            all_set.append(v)
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for a in all_set:
                    if a in des:
                        count += 1
                # Guard against ZeroDivisionError when record 1 carries no
                # attribute other than its long description.
                all1_in_des2 = count / len(all_set) if all_set else 0
            else:
                all1_in_des2 = 0

            instance += [all1_in_des2]

            feature_matrix.append(instance)

    return feature_matrix
Esempio n. 7
0
	id2 = pair[1]
	attribute_id1 = product_dict[id1]
	attribute_id2 = product_dict[id2]
	id.append([id1,id2])

	# class label
	if (match_dict[pair] == 'MATCH'):
		classlabels.append(1)
	else:
		classlabels.append(0)


	####feature: Product Name ---- Jaccard Score (word boudary, 3-gram), edit distance, tf/idf
	if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
		jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
		jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
		tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
		edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
		edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
	else:
		jaccard_productName = 0
		jaccard3gram_productName = 0
		tfidf_productName = 0
		edit_productName = 0

	####feature: Manufacturer
	if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
		jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
		jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
		tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
	else:
Esempio n. 8
0
def generate_feature(filename):
    productName_courpus = []
    brand_courpus = []
    with open(filename, 'r') as f:
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
            attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')

            if "Product Name" in attribute_id1:
                # FIX: the original append line was tab-indented at the wrong
                # level (mixed tabs/spaces), which raises TabError in Python 3
                # and misaligned the block; re-indented with spaces to match
                # the surrounding code.
                productName_courpus.append(tokenizers.delimiter(attribute_id1["Product Name"][0]))
            if "Product Name" in attribute_id2:
                productName_courpus.append(tokenizers.delimiter(attribute_id2["Product Name"][0]))

            if "Brand" in attribute_id1:
                brand_courpus.append(tokenizers.delimiter(attribute_id1["Brand"][0]))
            if "Brand" in attribute_id2:
                brand_courpus.append(tokenizers.delimiter(attribute_id2["Brand"][0]))

    feature_matrix = []
    with open(filename, 'r') as f:
        i = 1
        for line in f:
            list_line = line.split('?')
            attribute_id1 = json.loads(list_line[2], encoding = 'latin-1')
            attribute_id2 = json.loads(list_line[4], encoding = 'latin-1')

            print 'Generate features for pair', i
            i = i+1

            instance = []

            #Product Name 4
            if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
                jaccard_productName = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]))
                jaccard3gram_productName = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Name"][0], 3), tokenizers.qgram(attribute_id2["Product Name"][0], 3))
                tfidf_productName = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Name"][0]), tokenizers.delimiter(attribute_id2["Product Name"][0]), productName_courpus)
                edit_productName = simfunctions.levenshtein(attribute_id1["Product Name"][0], attribute_id2["Product Name"][0])
                edit_productName = 1 - edit_productName/max(len(attribute_id1["Product Name"][0]), len(attribute_id2["Product Name"][0]))
            else:
                jaccard_productName = 0
                jaccard3gram_productName = 0
                tfidf_productName = 0
                edit_productName = 0

            instance += [jaccard_productName, jaccard3gram_productName, tfidf_productName, edit_productName]

            #Manufacturer 3
            if ("Manufacturer" in attribute_id1 and "Manufacturer" in attribute_id2):
                jaccard_manufacturer = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
                jaccard3gram_manufacturer = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Manufacturer"][0], 3), tokenizers.qgram(attribute_id2["Manufacturer"][0], 3))
                tfidf_manufacturer = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Manufacturer"][0]), tokenizers.delimiter(attribute_id2["Manufacturer"][0]))
            else:
                jaccard_manufacturer = 0
                jaccard3gram_manufacturer = 0
                tfidf_manufacturer = 0

            instance += [jaccard_manufacturer, jaccard3gram_manufacturer, tfidf_manufacturer]

            #Color 3
            if ("Color" in attribute_id1 and "Color" in attribute_id2):
                jaccard_color = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
                jaccard3gram_color = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Color"][0], 3), tokenizers.qgram(attribute_id2["Color"][0], 3))
                tfidf_color = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Color"][0]), tokenizers.delimiter(attribute_id2["Color"][0]))
            else:
                jaccard_color = 0
                jaccard3gram_color = 0
                tfidf_color = 0

            instance += [jaccard_color, jaccard3gram_color, tfidf_color]

            #Product Type 3
            if ("Product Type" in attribute_id1 and "Product Type" in attribute_id2):
                jaccard_productType = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
                jaccard3gram_productType = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Type"][0], 3),tokenizers.qgram(attribute_id2["Product Type"][0], 3))
                tfidf_productType = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Type"][0]),tokenizers.delimiter(attribute_id2["Product Type"][0]))
            else:
                jaccard_productType = 0
                jaccard3gram_productType = 0
                tfidf_productType = 0

            instance += [jaccard_productType, jaccard3gram_productType, tfidf_productType]

            #Product Segment 3
            if "Product Segment" in attribute_id1 and "Product Segment" in attribute_id2:
                jaccard_productSegment = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Segment"][0]),tokenizers.delimiter(attribute_id2["Product Segment"][0]))
                jaccard3gram_productSegment= simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Segment"][0], 3),tokenizers.qgram(attribute_id2["Product Segment"][0], 3))
                if (attribute_id1["Product Segment"][0] == attribute_id2["Product Segment"][0]):
                    exactMatch_productSegment = 1
                else:
                    exactMatch_productSegment = 0
            else:
                exactMatch_productSegment = 0
                jaccard_productSegment = 0
                jaccard3gram_productSegment = 0

            instance += [exactMatch_productSegment, jaccard_productSegment, jaccard3gram_productSegment]

            #Brand 4
            if ("Brand" in attribute_id1 and "Brand" in attribute_id2):
                jaccard_brand = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Brand"][0]),tokenizers.delimiter(attribute_id2["Brand"][0]))
                jaccard3gram_brand = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Brand"][0], 3),
                                                          tokenizers.qgram(attribute_id2["Brand"][0], 3))
                edit_brand = simfunctions.levenshtein(attribute_id1["Brand"][0], attribute_id2["Brand"][0])
                edit_brand = 1 - edit_brand/max(len(attribute_id1["Brand"][0]), len(attribute_id2["Brand"][0]))
                tfidf_brand = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Brand"][0]), tokenizers.delimiter(attribute_id2["Brand"][0]), brand_courpus)
            else:
                jaccard3gram_brand = 0
                jaccard_brand = 0
                edit_brand = 0
                tfidf_brand = 0

            instance += [jaccard_brand, jaccard3gram_brand, edit_brand, tfidf_brand]

            #Category 2
            if ("Category" in attribute_id1 and "Category" in attribute_id2):
                jaccard_category = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Category"][0]),
                                                        tokenizers.delimiter(attribute_id2["Category"][0]))
                jaccard3gram_category = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Category"][0], 3),
                                                          tokenizers.qgram(attribute_id2["Category"][0], 3))
            else:
                jaccard_category = 0
                jaccard3gram_category = 0

            instance += [jaccard_category, jaccard3gram_category]

            #Long Description 3
            if ("Product Long Description" in attribute_id1 and "Product Long Description" in attribute_id2):
                tfidf_long_description = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Long Description"][0]), tokenizers.delimiter(attribute_id2["Product Long Description"][0]))
                jaccard_long_description = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Long Description"][0]), tokenizers.delimiter(attribute_id2["Product Long Description"][0]))
                jaccard3_long_description = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Long Description"][0], 3), tokenizers.qgram(attribute_id2["Product Long Description"][0], 3))
            else:
                tfidf_long_description = 0
                jaccard_long_description = 0
                jaccard3_long_description = 0

            instance += [tfidf_long_description, jaccard_long_description, jaccard3_long_description]

            #Short Description 3
            if ("Product Short Description" in attribute_id1 and "Product Short Description" in attribute_id2):
                jaccard_short_description = simfunctions.jaccard(tokenizers.delimiter(attribute_id1["Product Short Description"][0]), tokenizers.delimiter(attribute_id2["Product Short Description"][0]))
                jaccard3_short_description = simfunctions.jaccard(tokenizers.qgram(attribute_id1["Product Short Description"][0], 3), tokenizers.qgram(attribute_id2["Product Short Description"][0], 3))
                tfidf_short_description = simfunctions.tfidf(tokenizers.delimiter(attribute_id1["Product Short Description"][0]), tokenizers.delimiter(attribute_id2["Product Short Description"][0]))
            else:
                jaccard_short_description = 0
                jaccard3_short_description = 0
                tfidf_short_description = 0

            instance += [jaccard_short_description, jaccard3_short_description, tfidf_short_description]

            #Other in long 8
            if ("Product Name" in attribute_id1 and "Product Long Description" in attribute_id2):
                name_set = tokenizers.delimiter(attribute_id1["Product Name"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count+1
                name1_in_des2 = count/len(name_set)
            else:
                name1_in_des2 = 0

            instance += [name1_in_des2]

            if ("Product Long Description" in attribute_id1 and "Product Name" in attribute_id2):
                name_set = tokenizers.delimiter(attribute_id2["Product Name"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count+1
                name2_in_des1 = count/len(name_set)
            else:
                name2_in_des1 = 0

            instance += [name2_in_des1]

            if ("Brand" in attribute_id1 and "Product Long Description" in attribute_id2):
                brand_set = tokenizers.delimiter(attribute_id1["Brand"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count+1
                brand1_in_des2 = count/len(brand_set)
            else:
                brand1_in_des2 = 0

            instance += [brand1_in_des2]

            if ("Brand" in attribute_id2 and "Product Long Description" in attribute_id1):
                brand_set = tokenizers.delimiter(attribute_id2["Brand"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count+1
                brand2_in_des1 = count/len(brand_set)
            else:
                brand2_in_des1 = 0

            instance += [brand2_in_des1]

            if ("Manufacturer" in attribute_id1 and "Product Long Description" in attribute_id2):
                manufacturer_set = tokenizers.delimiter(attribute_id1["Manufacturer"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count+1
                manufacturer1_in_des2 = count/len(manufacturer_set)
            else:
                manufacturer1_in_des2 = 0

            instance += [manufacturer1_in_des2]

            if ("Manufacturer" in attribute_id2 and "Product Long Description" in attribute_id1):
                manufacturer_set = tokenizers.delimiter(attribute_id2["Manufacturer"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count+1
                manufacturer2_in_des1 = count/len(manufacturer_set)
            else:
                manufacturer2_in_des1 = 0

            instance += [manufacturer2_in_des1]

            if ("Product Short Description" in attribute_id1 and "Product Long Description" in attribute_id2):
                short_des_set = tokenizers.delimiter(attribute_id1["Product Short Description"][0])
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for short in short_des_set:
                    if short in des:
                        count = count+1
                short1_in_des2 = count/len(short_des_set)
            else:
                short1_in_des2 = 0

            instance += [short1_in_des2]

            if ("Product Short Description" in attribute_id2 and "Product Long Description" in attribute_id1):
                short_des_set = tokenizers.delimiter(attribute_id2["Product Short Description"][0])
                des = attribute_id1["Product Long Description"][0]
                count = 0
                for short in short_des_set:
                    if short in des:
                        count = count+1
                short2_in_des1 = count/len(short_des_set)
            else:
                short2_in_des1 = 0

            instance += [short2_in_des1]

            #Other in short 6
            if ("Product Name" in attribute_id1 and "Product Short Description" in attribute_id2):
                name_set = tokenizers.delimiter(attribute_id1["Product Name"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count+1
                name1_in_short2 = count/len(name_set)
            else:
                name1_in_short2 = 0

            instance += [name1_in_short2]

            if ("Product Short Description" in attribute_id1 and "Product Name" in attribute_id2):
                name_set = tokenizers.delimiter(attribute_id2["Product Name"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for name in name_set:
                    if name in des:
                        count = count+1
                name2_in_short1 = count/len(name_set)
            else:
                name2_in_short1 = 0

            instance += [name2_in_short1]

            if ("Brand" in attribute_id1 and "Product Short Description" in attribute_id2):
                brand_set = tokenizers.delimiter(attribute_id1["Brand"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count+1
                brand1_in_short2 = count/len(brand_set)
            else:
                brand1_in_short2 = 0

            instance += [brand1_in_short2]

            if ("Brand" in attribute_id2 and "Product Short Description" in attribute_id1):
                brand_set = tokenizers.delimiter(attribute_id2["Brand"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for brand in brand_set:
                    if brand in des:
                        count = count+1
                brand2_in_short1 = count/len(brand_set)
            else:
                brand2_in_short1 = 0

            instance += [brand2_in_short1]

            if ("Manufacturer" in attribute_id1 and "Product Short Description" in attribute_id2):
                manufacturer_set = tokenizers.delimiter(attribute_id1["Manufacturer"][0])
                des = attribute_id2["Product Short Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count+1
                manufacturer1_in_short2 = count/len(manufacturer_set)
            else:
                manufacturer1_in_short2 = 0

            instance += [manufacturer1_in_short2]

            if ("Manufacturer" in attribute_id2 and "Product Short Description" in attribute_id1):
                manufacturer_set = tokenizers.delimiter(attribute_id2["Manufacturer"][0])
                des = attribute_id1["Product Short Description"][0]
                count = 0
                for manufacturer in manufacturer_set:
                    if manufacturer in des:
                        count = count+1
                manufacturer2_in_short1 = count/len(manufacturer_set)
            else:
                manufacturer2_in_short1 = 0

            instance += [manufacturer2_in_short1]

            #new 15
            if ("Manufacturer Part Number" in attribute_id1 and "Product Long Description" in attribute_id2):
                manu_part_number_set = tokenizers.delimiter(attribute_id1["Manufacturer Part Number"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for manu_part in manu_part_number_set:
                    if manu_part in des_set:
                        count = count+1
                manu_part1_in_des2 = count/len(manu_part_number_set)
            else:
                manu_part1_in_des2 = 0

            instance += [manu_part1_in_des2]

            if ("Manufacturer Part Number" in attribute_id2 and "Product Long Description" in attribute_id1):
                manu_part_number_set = tokenizers.delimiter(attribute_id2["Manufacturer Part Number"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for manu_part in manu_part_number_set:
                    if manu_part in des_set:
                        count = count+1
                manu_part2_in_des1 = count/len(manu_part_number_set)
            else:
                manu_part2_in_des1 = 0

            instance += [manu_part2_in_des1]

            if ("Assembled Product Length" in attribute_id1 and "Product Long Description" in attribute_id2):
                length_set = tokenizers.delimiter(attribute_id1["Assembled Product Length"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for length in length_set:
                    if length in des_set:
                        count = count+1
                length1_in_des2 = count/len(length_set)
            else:
                length1_in_des2 = 0

            instance += [length1_in_des2]

            if ("Assembled Product Length" in attribute_id2 and "Product Long Description" in attribute_id1):
                length_set = tokenizers.delimiter(attribute_id2["Assembled Product Length"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for length in length_set:
                    if length in des_set:
                        count = count+1
                length2_in_des1 = count/len(length_set)
            else:
                length2_in_des1 = 0

            instance += [length2_in_des1]

            # Fraction of product-1 "Assembled Product Width" tokens that
            # occur in product 2's long description.
            if ("Assembled Product Width" in attribute_id1 and "Product Long Description" in attribute_id2):
                width_tokens = tokenizers.delimiter(attribute_id1["Assembled Product Width"][0])
                description = attribute_id2["Product Long Description"][0]
                hits = 0
                for token in width_tokens:
                    if token in description:
                        hits += 1
                width1_in_des2 = hits/len(width_tokens)
            else:
                width1_in_des2 = 0

            instance += [width1_in_des2]

            # Mirror feature: fraction of product-2 "Assembled Product Width"
            # tokens that occur in product 1's long description.
            if ("Assembled Product Width" in attribute_id2 and "Product Long Description" in attribute_id1):
                width_tokens = tokenizers.delimiter(attribute_id2["Assembled Product Width"][0])
                description = attribute_id1["Product Long Description"][0]
                hits = 0
                for token in width_tokens:
                    if token in description:
                        hits += 1
                width2_in_des1 = hits/len(width_tokens)
            else:
                width2_in_des1 = 0

            instance += [width2_in_des1]

            # Fraction of product-1 "Assembled Product Height" tokens that
            # occur in product 2's long description.
            if ("Assembled Product Height" in attribute_id1 and "Product Long Description" in attribute_id2):
                height_tokens = tokenizers.delimiter(attribute_id1["Assembled Product Height"][0])
                description = attribute_id2["Product Long Description"][0]
                hits = sum(1 for tok in height_tokens if tok in description)
                height1_in_des2 = hits/len(height_tokens)
            else:
                height1_in_des2 = 0

            instance += [height1_in_des2]

            # Mirror feature: fraction of product-2 "Assembled Product Height"
            # tokens that occur in product 1's long description.
            if ("Assembled Product Height" in attribute_id2 and "Product Long Description" in attribute_id1):
                height_tokens = tokenizers.delimiter(attribute_id2["Assembled Product Height"][0])
                description = attribute_id1["Product Long Description"][0]
                hits = sum(1 for tok in height_tokens if tok in description)
                height2_in_des1 = hits/len(height_tokens)
            else:
                height2_in_des1 = 0

            instance += [height2_in_des1]

            # Fraction of product-1 "Type" tokens that occur in product 2's
            # long description.
            if ("Type" in attribute_id1 and "Product Long Description" in attribute_id2):
                type_tokens = tokenizers.delimiter(attribute_id1["Type"][0])
                description = attribute_id2["Product Long Description"][0]
                # loop variable renamed so it no longer shadows builtin `type`
                hits = sum(1 for tok in type_tokens if tok in description)
                type1_in_des2 = hits/len(type_tokens)
            else:
                type1_in_des2 = 0

            instance += [type1_in_des2]

            # Mirror feature: fraction of product-2 "Type" tokens that occur
            # in product 1's long description.
            if ("Type" in attribute_id2 and "Product Long Description" in attribute_id1):
                type_tokens = tokenizers.delimiter(attribute_id2["Type"][0])
                description = attribute_id1["Product Long Description"][0]
                # loop variable renamed so it no longer shadows builtin `type`
                hits = sum(1 for tok in type_tokens if tok in description)
                type2_in_des1 = hits/len(type_tokens)
            else:
                type2_in_des1 = 0

            instance += [type2_in_des1]

            # Fraction of product-1 "Operating System" tokens found in
            # product 2's long description.
            if ("Operating System" in attribute_id1 and "Product Long Description" in attribute_id2):
                op_set = tokenizers.delimiter(attribute_id1["Operating System"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for op in op_set:
                    # BUG FIX: was `if op in op_set`, which is always true and
                    # pinned the ratio to 1 while `des_set` went unused; test
                    # membership in the description, as the sibling features do.
                    if op in des_set:
                        count = count+1
                op1_in_des2 = count/len(op_set)
            else:
                op1_in_des2 = 0

            instance += [op1_in_des2]

            # Mirror feature: fraction of product-2 "Operating System" tokens
            # found in product 1's long description.
            if ("Operating System" in attribute_id2 and "Product Long Description" in attribute_id1):
                op_set = tokenizers.delimiter(attribute_id2["Operating System"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for op in op_set:
                    # BUG FIX: was `if op in op_set` (always true, `des_set`
                    # unused); test membership in the description instead.
                    if op in des_set:
                        count = count+1
                op2_in_des1 = count/len(op_set)
            else:
                op2_in_des1 = 0

            instance += [op2_in_des1]

            # Fraction of product-1 "Screen Size" tokens found in product 2's
            # long description.
            if ("Screen Size" in attribute_id1 and "Product Long Description" in attribute_id2):
                ss_set = tokenizers.delimiter(attribute_id1["Screen Size"][0])
                des_set = attribute_id2["Product Long Description"][0]
                count = 0
                for ss in ss_set:
                    # BUG FIX: was `if ss in ss_set` (always true, ratio pinned
                    # to 1 and `des_set` unused); test the description instead.
                    if ss in des_set:
                        count = count+1
                ss1_in_des2 = count/len(ss_set)
            else:
                ss1_in_des2 = 0

            instance += [ss1_in_des2]

            # Mirror feature: fraction of product-2 "Screen Size" tokens found
            # in product 1's long description.
            if ("Screen Size" in attribute_id2 and "Product Long Description" in attribute_id1):
                ss_set = tokenizers.delimiter(attribute_id2["Screen Size"][0])
                des_set = attribute_id1["Product Long Description"][0]
                count = 0
                for ss in ss_set:
                    # BUG FIX: was `if ss in ss_set` (always true, `des_set`
                    # unused); test membership in the description instead.
                    if ss in des_set:
                        count = count+1
                ss2_in_des1 = count/len(ss_set)
            else:
                ss2_in_des1 = 0

            instance += [ss2_in_des1]

            # Fraction of ALL product-1 attribute tokens (every attribute
            # except the long description itself) that appear in product 2's
            # long description.
            if "Product Long Description" in attribute_id2:
                all_set = []
                for key in attribute_id1:
                    # BUG FIX: was `key is not "Product Long Description"` —
                    # identity comparison against a string literal, which is
                    # implementation-dependent and usually True, so the
                    # description's own tokens leaked into the feature.
                    # Use value inequality.
                    if key != "Product Long Description":
                        value_list = tokenizers.delimiter(attribute_id1[key][0])
                        for v in value_list:
                            all_set.append(v)
                des = attribute_id2["Product Long Description"][0]
                count = 0
                for a in all_set:
                    if a in des:
                        count += 1
                # Guard against an empty token set to avoid ZeroDivisionError.
                all1_in_des2 = count/len(all_set) if all_set else 0
            else:
                all1_in_des2 = 0

            instance += [all1_in_des2]

            # One fully assembled feature vector per product pair.
            feature_matrix.append(instance)

    # Enclosing function header is above this view; it accumulates
    # `feature_matrix` across all pairs and returns it here.
    return feature_matrix
 def test_qgrams_none(self):
     self.assertEqual(qgram(None), [])
Esempio n. 10
0
    # Attribute map for the second product of the pair, keyed by product id.
    attribute_id2 = product_dict[id2]
    # NOTE(review): `id` shadows the builtin; it is defined outside this view,
    # so it is left unchanged here.
    id.append([id1, id2])

    # class label: 1 for a gold-standard MATCH pair, 0 otherwise
    if (match_dict[pair] == 'MATCH'):
        classlabels.append(1)
    else:
        classlabels.append(0)

    #### feature: Product Name ---- Jaccard score (word boundary and 3-gram),
    #### tf/idf similarity, and length-normalized edit distance.
    if ("Product Name" in attribute_id1 and "Product Name" in attribute_id2):
        name1 = attribute_id1["Product Name"][0]
        name2 = attribute_id2["Product Name"][0]
        jaccard_productName = simfunctions.jaccard(
            tokenizers.delimiter(name1), tokenizers.delimiter(name2))
        jaccard3gram_productName = simfunctions.jaccard(
            tokenizers.qgram(name1, 3), tokenizers.qgram(name2, 3))
        # NOTE(review): "courpus" matches the name defined elsewhere in this
        # file; renaming it here alone would break the reference.
        tfidf_productName = simfunctions.tfidf(
            tokenizers.delimiter(name1), tokenizers.delimiter(name2),
            productName_courpus)
        # Levenshtein distance rescaled to a [0, 1] similarity.
        raw_edit = simfunctions.levenshtein(name1, name2)
        edit_productName = 1 - raw_edit / max(len(name1), len(name2))
    else:
        jaccard_productName = 0
        jaccard3gram_productName = 0
        tfidf_productName = 0
        edit_productName = 0
 def test_qgrams_none(self):
     self.assertEqual(qgram(None), [])