def gen_simple_feature(a, b):
    """Pairwise features comparing two ads a and b: field equality, geo
    distance, numeric comparisons, and Jaccard overlap of attribute JSON."""
    feats = {}
    for name in ['title', 'description', 'price',
                 'categoryID', 'locationID', 'metroID']:
        feats['same_' + name] = a[name] == b[name]
    feats['same_lat_lon'] = (a['lat'] == b['lat']) and (a['lon'] == b['lon'])
    # Euclidean distance in raw lat/lon coordinates.
    feats['location_distance'] = ((a['lat'] - b['lat']) ** 2 +
                                  (a['lon'] - b['lon']) ** 2) ** 0.5
    create_numeric_comparison(feats, a['price'], b['price'], 'price')
    # Length-based comparisons; a missing (None) field raises TypeError on len().
    try:
        create_numeric_comparison(feats, len(a['title']), len(b['title']),
                                  'title_length')
    except TypeError:
        pass
    try:
        create_numeric_comparison(feats, len(a['description']),
                                  len(b['description']), 'description_length')
    except TypeError:
        pass
    try:
        create_numeric_comparison(feats, len(a['images_array']),
                                  len(b['images_array']), 'images_count')
    except TypeError:
        pass
    feats['attrsJSON_key_jaccard'] = jaccard(a['attrsJSON'].keys(),
                                             b['attrsJSON'].keys())
    feats['attrsJSON_item_jaccard'] = jaccard(a['attrsJSON'].items(),
                                              b['attrsJSON'].items())
    return feats
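create_numeric_comparison is a helper defined elsewhere in the project; a minimal sketch of what it plausibly computes, assuming a difference/ratio pair of features (the feature names and the exact ratio are assumptions, not the project's actual implementation):

def create_numeric_comparison(feats, x, y, name):
    # Hypothetical sketch: absolute difference plus a symmetric ratio in [0, 1],
    # guarding against division by zero.
    feats[name + '_diff'] = abs(x - y)
    denom = max(abs(x), abs(y))
    feats[name + '_ratio'] = (min(abs(x), abs(y)) / denom) if denom else 1.0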
def gen_text_similarity_feature(sa, sb, prefix='',
                                ngrams_word_jaccard=(),
                                use_char_ngram_jaccard=False,
                                ngrams_char_jaccard=(3, 4, 5)):
    """Text-similarity features between strings sa and sb: word/char n-gram
    Jaccard scores, Jaro-Winkler, and a length-normalized edit distance."""
    if not isinstance(sa, str) or not isinstance(sb, str):
        return {}
    feats = {}
    # Two tokenizations of each string; tokenize0/tokenize1 are defined
    # elsewhere in the project.
    wa0 = tokenize0(sa)
    wb0 = tokenize0(sb)
    wa1 = tokenize1(sa)
    wb1 = tokenize1(sb)
    feats[prefix + 'word0_jaccard'] = jaccard(wa0, wb0)
    feats[prefix + 'word1_jaccard'] = jaccard(wa1, wb1)
    for n in ngrams_word_jaccard:
        feats[prefix + 'word0_jaccard_{}gram'.format(n)] = word_jaccard_ngram(wa0, wb0, n)
        feats[prefix + 'word1_jaccard_{}gram'.format(n)] = word_jaccard_ngram(wa1, wb1, n)
    if use_char_ngram_jaccard:
        for n in ngrams_char_jaccard:
            feats[prefix + 'char_jaccard_{}gram'.format(n)] = char_jaccard_ngram(sa, sb, n)
    feats[prefix + 'jw'] = jaro_winkler(sa, sb)
    # Edit distance normalized by combined length (assumes sa + sb is non-empty).
    feats[prefix + 'edit_distance_ratio'] = edit_distance(sa, sb) / (len(sa) + len(sb))
    return feats
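The two tokenizers are not shown in this section; hypothetical stand-ins that would let the function run (the real tokenize0/tokenize1 may normalize differently):

import re

def tokenize0(s):
    # Stand-in: plain whitespace split, case and punctuation preserved.
    return s.split()

def tokenize1(s):
    # Stand-in: lowercased alphanumeric tokens only.
    return re.findall(r'[a-z0-9]+', s.lower())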
def searchtfjabPlusRelevent(self, searchList, releventdocstr, documentList):
    """Jaccard search with relevance feedback: each document's rating against
    the query is boosted by half its similarity to a known-relevant document."""
    queryVector = self.getVectorKeywordIndex(searchList)
    relevenceVector = self.getVectorKeywordIndex(releventdocstr)
    self.documentVectors = self.getVectorKeywordIndexSeprated(documentList)
    ratings = [util.jaccard(queryVector, documentVector)
               for documentVector in self.documentVectors]
    ratingrel = [util.jaccard(relevenceVector, documentVector)
                 for documentVector in self.documentVectors]
    # Blend: final score = query similarity + 0.5 * relevant-doc similarity.
    for i in range(len(ratings)):
        ratings[i] += ratingrel[i] * 0.5
    return ratings
import numpy as np
from sklearn.metrics import pairwise_distances


def pair_features(hashes1, hashes2):
    """Features comparing two collections of binary image hashes: a Jaccard
    score over the packed hashes plus Hamming-distance statistics."""
    feats = [jaccard(binary_matrix_to_int(hashes1),
                     binary_matrix_to_int(hashes2))]
    D = pairwise_distances(hashes1, hashes2, metric='hamming')
    # Orient D so the smaller collection indexes the rows.
    if D.shape[0] > D.shape[1]:
        D = D.T
    if D.shape[0] == 0 or D.shape[1] == 0:
        # One side has no hashes: pad with NaNs so the feature layout is stable.
        feats.extend([np.nan] * 6)
    else:
        s0 = D.min(axis=1)  # closest match for each row hash
        s1 = D.max(axis=0)  # farthest match for each column hash
        feats.extend([s0.min(), s0.max(), s0.mean(),
                      s1.min(), s1.max(), s1.mean()])
    return feats
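binary_matrix_to_int is assumed to pack each binary hash row into a single integer so that jaccard() can compare the two hash collections as sets; a plausible sketch of that assumption:

import numpy as np

def binary_matrix_to_int(hashes):
    # Each row of 0/1 bits becomes one integer, e.g. [1, 0, 1] -> 0b101 == 5.
    return [int(''.join(str(int(b)) for b in row), 2)
            for row in np.atleast_2d(hashes)]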
def searchtfjab(self, searchList, documentList):
    """Rank documents by Jaccard similarity between the query's keyword
    vector and each document's keyword vector."""
    queryVector = self.getVectorKeywordIndex(searchList)
    self.documentVectors = self.getVectorKeywordIndexSeprated(documentList)
    ratings = [util.jaccard(queryVector, documentVector)
               for documentVector in self.documentVectors]
    return ratings
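A self-contained illustration of how the relevance-feedback variant above re-weights scores relative to plain searchtfjab, using hypothetical token sets and a local helper standing in for util.jaccard:

def _jaccard(a, b):
    a, b = set(a), set(b)
    union = a | b
    return len(a & b) / len(union) if union else 0.0

docs = [{'cat', 'sat', 'mat'}, {'dog', 'sat', 'log'}, {'cat', 'dog'}]
query = {'cat'}
relevant = {'dog', 'log'}

plain = [_jaccard(query, d) for d in docs]
boosted = [p + 0.5 * _jaccard(relevant, d) for p, d in zip(plain, docs)]
# Document 1, which the plain query misses entirely (score 0), gains score
# from its overlap with the known-relevant document.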
def char_jaccard_ngram(a, b, n):
    return jaccard(char_ngrams(a, (n, n), binary=True),
                   char_ngrams(b, (n, n), binary=True))
def word_jaccard_ngram(a, b, n):
    return jaccard(word_ngrams(a, (n, n), binary=True),
                   word_ngrams(b, (n, n), binary=True))
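These helpers delegate to a jaccard function used throughout this code; a minimal set-based definition consistent with all of its call sites (an assumption, since the actual implementation is not shown in this section):

def jaccard(a, b):
    # Intersection over union of the two collections, treated as sets;
    # defined as 0 when both are empty.
    a, b = set(a), set(b)
    union = a | b
    return len(a & b) / len(union) if union else 0.0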
eval_texts_list = []
predicted_label_list = []
eval_sentiment_ids_list = []
eval_selected_texts_list = []
# Run the evaluation pipeline and collect predictions batch by batch.
for _ in range(num_eval_batches):
    _eval_texts, _eval_predicted_labels, _eval_sentiment_ids, _eval_selected_texts = \
        sess.run([eval_texts, eval_predicted_labels,
                  eval_sentiment_ids, eval_selected_texts])
    eval_texts_list.extend(_eval_texts.tolist())
    predicted_label_list.extend(_eval_predicted_labels.tolist())
    eval_sentiment_ids_list.extend(_eval_sentiment_ids.tolist())
    eval_selected_texts_list.extend(_eval_selected_texts.tolist())
logging.info("eval nums %d" % len(predicted_label_list))

# Calculate the Jaccard score between predicted and ground-truth selections.
eval_predict = eval_decoded_texts(eval_texts_list, predicted_label_list,
                                  eval_sentiment_ids_list, tokenizer)
jaccards = []
for i in range(len(eval_predict)):
    jaccards.append(jaccard(eval_selected_texts_list[i], eval_predict[i]))
score = np.mean(jaccards)
logging.info("jaccards: %f" % score)

logging.info("# fall back to train mode")
sess.run(train_init_op)
set_training = True
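The jaccard called here compares two strings word-wise; a sketch matching the metric commonly used for this selected-text evaluation (lowercased, whitespace-split word sets; whether this repo's implementation is identical is an assumption):

def jaccard(str1, str2):
    # Word-level Jaccard between two strings, as in the standard
    # tweet-sentiment-extraction evaluation metric.
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))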