Beispiel #1
0
def gen_simple_feature(a, b):
    """Build a dict of pairwise comparison features for two ad records.

    Produces equality indicators for the basic fields, a geographic
    distance, numeric comparisons for price and field lengths, and
    Jaccard similarities over the attrsJSON dicts.
    """
    feats = {}

    # Exact-match indicator per basic field.
    basic_fields = ('title', 'description', 'price', 'categoryID',
                    'locationID', 'metroID')
    for field in basic_fields:
        feats['same_' + field] = a[field] == b[field]

    # Coordinate equality and Euclidean distance in lat/lon space.
    feats['same_lat_lon'] = (a['lat'] == b['lat']) and (a['lon'] == b['lon'])
    d_lat = a['lat'] - b['lat']
    d_lon = a['lon'] - b['lon']
    feats['location_distance'] = (d_lat * d_lat + d_lon * d_lon) ** 0.5

    create_numeric_comparison(feats, a['price'], b['price'], 'price')

    # Length-based comparisons; TypeError (e.g. len(None)) skips the feature,
    # matching the original best-effort behavior.
    length_specs = (
        ('title', 'title_length'),
        ('description', 'description_length'),
        ('images_array', 'images_count'),
    )
    for field, label in length_specs:
        try:
            create_numeric_comparison(feats, len(a[field]), len(b[field]),
                                      label)
        except TypeError:
            pass

    keys_a = a['attrsJSON'].keys()
    keys_b = b['attrsJSON'].keys()
    feats['attrsJSON_key_jaccard'] = jaccard(keys_a, keys_b)
    items_a = a['attrsJSON'].items()
    items_b = b['attrsJSON'].items()
    feats['attrsJSON_item_jaccard'] = jaccard(items_a, items_b)

    return feats
Beispiel #2
0
def gen_text_similarity_feature(sa,
                                sb,
                                prefix='',
                                ngrams_word_jaccard=None,
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=None):
    """Compute text-similarity features between two strings.

    Parameters
    ----------
    sa, sb : str
        The two strings to compare. Non-string inputs (e.g. NaN) yield
        an empty feature dict.
    prefix : str
        Prepended to every feature name.
    ngrams_word_jaccard : iterable of int, optional
        Word n-gram sizes for Jaccard features (default: none).
    use_char_ngram_jaccard : bool
        Whether to also compute character n-gram Jaccard features.
    ngrams_char_jaccard : iterable of int, optional
        Character n-gram sizes (default: (3, 4, 5)).

    Returns
    -------
    dict mapping feature name -> similarity value.
    """
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    # Fix: the original used mutable default arguments ([] and [3, 4, 5]),
    # which are shared across calls; use None sentinels instead.
    if ngrams_word_jaccard is None:
        ngrams_word_jaccard = ()
    if ngrams_char_jaccard is None:
        ngrams_char_jaccard = (3, 4, 5)
    feats = {}

    # Two tokenizations (helpers defined elsewhere in the project).
    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)

    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)

    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(
            wa1, wb1, n)

    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix +
                  'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(
                      sa, sb, n)

    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    # Normalize edit distance by the combined string length.
    feats[prefix +
          'edit_distance_ratio'] = edit_distance(sa, sb) / (len(sa) + len(sb))

    return feats
 def searchtfjabPlusRelevent(self, searchList, releventdocstr, documentList):
     """Rate documents by Jaccard similarity to the query, boosted by
     similarity to a relevant document (relevance feedback, weight 0.5).

     Side effect: stores the document vectors on self.documentVectors,
     matching the original behavior.
     Returns a list with one score per document in documentList.
     """
     queryVector = self.getVectorKeywordIndex(searchList)
     relevenceVector = self.getVectorKeywordIndex(releventdocstr)
     self.documentVectors = self.getVectorKeywordIndexSeprated(documentList)
     # Idiom fix: combine the two ratings in one pass instead of building
     # two parallel lists and mutating one with an index loop.
     return [
         util.jaccard(queryVector, docVector) +
         util.jaccard(relevenceVector, docVector) * 0.5
         for docVector in self.documentVectors
     ]
Beispiel #4
0
def pair_features(hashes1, hashes2):
    """Summarize the relationship between two sets of binary hashes.

    Returns a 7-element list: a Jaccard similarity of the integer-encoded
    hashes, followed by min/max/mean of the row-wise minimum and of the
    column-wise maximum of the pairwise Hamming distance matrix
    (NaN-filled when either side is empty).
    """
    feats = [jaccard(binary_matrix_to_int(hashes1),
                     binary_matrix_to_int(hashes2))]

    dist = pairwise_distances(hashes1, hashes2, metric='hamming')
    # Orient the matrix so the first axis is the shorter one.
    if dist.shape[0] > dist.shape[1]:
        dist = dist.T

    if 0 in dist.shape:
        # No pairs at all: pad with NaN placeholders.
        feats += [np.nan] * 6
    else:
        row_min = dist.min(axis=1)
        col_max = dist.max(axis=0)
        feats += [row_min.min(), row_min.max(), row_min.mean(),
                  col_max.min(), col_max.max(), col_max.mean()]
    return feats
 def searchtfjab(self, searchList, documentList):
     """Rate every document by Jaccard similarity against the query vector.

     Side effect: stores the document vectors on self.documentVectors.
     Returns a list with one score per document in documentList.
     """
     queryVector = self.getVectorKeywordIndex(searchList)
     self.documentVectors = self.getVectorKeywordIndexSeprated(documentList)
     scores = []
     for docVector in self.documentVectors:
         scores.append(util.jaccard(queryVector, docVector))
     return scores
Beispiel #6
0
def char_jaccard_ngram(a, b, n):
    """Jaccard similarity of the binary character n-grams (size n) of a and b."""
    grams_a = char_ngrams(a, (n, n), binary=True)
    grams_b = char_ngrams(b, (n, n), binary=True)
    return jaccard(grams_a, grams_b)
Beispiel #7
0
def word_jaccard_ngram(a, b, n):
    """Jaccard similarity of the binary word n-grams (size n) of a and b."""
    grams_a = word_ngrams(a, (n, n), binary=True)
    grams_b = word_ngrams(b, (n, n), binary=True)
    return jaccard(grams_a, grams_b)
                eval_sentiment_ids_list = []
                eval_selected_texts_list = []
                for _ in range(num_eval_batches):
                    _eval_texts, _eval_predicted_labels, _eval_sentiment_ids, _eval_selected_texts \
                        = sess.run([eval_texts, eval_predicted_labels, eval_sentiment_ids, eval_selected_texts])
                    eval_texts_list.extend(_eval_texts.tolist())
                    predicted_label_list.extend(
                        _eval_predicted_labels.tolist())
                    eval_sentiment_ids_list.extend(
                        _eval_sentiment_ids.tolist())
                    eval_selected_texts_list.extend(
                        _eval_selected_texts.tolist())

                logging.info("eval nums %d " % len(predicted_label_list))

                # calculate the jaccards
                eval_predict = eval_decoded_texts(eval_texts_list,
                                                  predicted_label_list,
                                                  eval_sentiment_ids_list,
                                                  tokenizer)
                jaccards = []
                for i in range(len(eval_predict)):
                    jaccards.append(
                        jaccard(eval_selected_texts_list[i], eval_predict[i]))
                score = np.mean(jaccards)
                logging.info("jaccards: %f" % score)

                logging.info("# fall back to train mode")
                sess.run(train_init_op)
                set_training = True