def transform_one(self, obs, target, id):
    """Return normalized positions of obs ngrams within the target ngrams."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    return _inter_norm_pos_list(grams_obs, grams_target)
def transform_one(self, obs, target, id):
    """Match count (at index self.idx) normalized by the number of target ngrams."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    matched = self._get_match_count(grams_obs, grams_target, self.idx)
    return np_utils._try_divide(matched, len(grams_target))
def transform_one(self, obs, target, id):
    """Apply self.distance to the ngram lists of obs and target.

    Caches the ngrams of the most recently seen ``obs`` (self.last_obs /
    self.last_obs_ngrams) so consecutive calls with the same observation
    skip re-tokenization.
    """
    # NOTE(review): `unicode` exists only in Python 2 — these asserts fail
    # with NameError on Python 3; confirm the target interpreter.
    assert isinstance(obs, unicode)
    assert isinstance(target, unicode)
    if obs != self.last_obs:
        # cache miss: recompute and remember the obs ngrams
        self.last_obs = obs
        obs_tokens = nlp_utils._tokenize(obs, token_pattern)
        self.last_obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return self.distance(self.last_obs_ngrams, target_ngrams)
def transform_one(self, obs, target, id):
    """Fraction of (obs nterm, target nterm) pairs that fuzzily match."""
    nterms_obs = ngram_utils._nterms(nlp_utils._tokenize(obs, token_pattern), self.nterm)
    nterms_target = ngram_utils._nterms(nlp_utils._tokenize(target, token_pattern), self.nterm)
    hits = sum(
        1. for a in nterms_obs for b in nterms_target
        if dist_utils._is_str_match(a, b, self.str_match_threshold))
    return np_utils._try_divide(hits, len(nterms_obs) * len(nterms_target))
def transform_one(self, obs, target, id):
    """Fraction of (obs ngram, target ngram) pairs that fuzzily match."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    hits = sum(
        1. for a in grams_obs for b in grams_target
        if dist_utils._is_str_match(a, b, self.str_match_threshold))
    return np_utils._try_divide(hits, len(grams_obs) * len(grams_target))
def transform_one(self, obs, target, id):
    """Count of obs ngrams that have at least one fuzzy match among target ngrams."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    matched = 0.
    for g in grams_obs:
        # `any` stops at the first match, mirroring the original `break`
        if any(dist_utils._is_str_match(g, h, self.str_match_threshold)
               for h in grams_target):
            matched += 1.
    return matched
def transform_one(self, obs, target, id):
    """Jaccard coefficients of target against same-relevance training rows sharing obs.

    Rows with the same ``id`` are excluded; falls back to a single
    MISSING_VALUE_NUMERIC when no qualifying rows exist.
    """
    rows = self.dfTrain[self.dfTrain["search_term"] == obs].copy()
    result = [config.MISSING_VALUE_NUMERIC]
    if rows is not None:
        rows = rows[rows["id"] != id].copy()
        rows = rows[rows["relevance"] == self.relevance].copy()
        if rows is not None and rows.shape[0] > 0:
            grams_target = ngram_utils._ngrams(
                nlp_utils._tokenize(target, token_pattern), self.ngram)
            result = []
            for text in rows[self.target_field]:
                grams = ngram_utils._ngrams(
                    nlp_utils._tokenize(text, token_pattern), self.ngram)
                result.append(dist_utils._jaccard_coef(grams, grams_target))
    return result
def transform_one(self, obs, target, id):
    """Per-obs-ngram fuzzy-match ratio against the target ngrams."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    ratios = []
    for g in grams_obs:
        hits = 0.
        for h in grams_target:
            if dist_utils._is_str_match(g, h, self.str_match_threshold):
                hits += 1.
        ratios.append(np_utils._try_divide(hits, len(grams_target)))
    return ratios if ratios else [config.MISSING_VALUE_NUMERIC]
def transform_one(self, obs, target, id):
    """Jaccard coefficients of target vs. other same-relevance rows with this obs."""
    candidates = self.dfTrain[self.dfTrain["search_term"] == obs].copy()
    values = [config.MISSING_VALUE_NUMERIC]
    if candidates is not None:
        candidates = candidates[candidates["id"] != id].copy()
        candidates = candidates[candidates["relevance"] == self.relevance].copy()
        if candidates is not None and candidates.shape[0] > 0:
            grams_target = ngram_utils._ngrams(
                nlp_utils._tokenize(target, token_pattern), self.ngram)
            values = []
            for text in candidates[self.target_field]:
                grams = ngram_utils._ngrams(
                    nlp_utils._tokenize(text, token_pattern), self.ngram)
                values.append(dist_utils._jaccard_coef(grams, grams_target))
    return values
def transform_one(self, obs, target, id):
    """Shannon entropy (natural log) of the token frequency distribution of obs."""
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    counter = Counter(obs_tokens)
    count = np.asarray(list(counter.values()))
    # Cast to float before dividing: `count` is an integer array, and under
    # Python 2 (this file asserts `unicode` elsewhere) int/int floors,
    # turning every probability into 0 and making log(proba) blow up.
    # The sibling entropy feature already uses float(np.sum(count)).
    proba = count / float(np.sum(count))
    entropy = -np.sum(proba * np.log(proba))
    return entropy
def _get_avg_ngram_doc_len(self):
    """Mean number of ngrams per document in the target corpus."""
    lengths = [
        len(ngram_utils._ngrams(nlp_utils._tokenize(doc, token_pattern), self.ngram))
        for doc in self.target_corpus
    ]
    return np.mean(lengths)
def transform_one(self, obs, target, id):
    """Edit-distance matrix: one row per obs ngram, one column per target ngram."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    matrix = []
    for g in grams_obs:
        row = [dist_utils._edit_dist(g, h) for h in grams_target]
        if not row:
            row = [config.MISSING_VALUE_NUMERIC]
        matrix.append(row)
    if not matrix:
        matrix = [[config.MISSING_VALUE_NUMERIC]]
    return matrix
def transform_one(self, obs, target, id):
    """Matrix of max WordNet synset similarities between obs and target tokens."""
    synsets_obs = [wn.synsets(tok) for tok in nlp_utils._tokenize(obs, token_pattern)]
    synsets_target = [wn.synsets(tok) for tok in nlp_utils._tokenize(target, token_pattern)]
    matrix = []
    for syn_o in synsets_obs:
        row = [self._maximum_similarity_for_two_synset_list(syn_o, syn_t)
               for syn_t in synsets_target]
        if not row:
            row = [config.MISSING_VALUE_NUMERIC]
        matrix.append(row)
    if not matrix:
        matrix = [[config.MISSING_VALUE_NUMERIC]]
    return matrix
def transform_one(self, obs, target, id):
    """BM25 score of each obs ngram against the target ngram 'document'."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    # document-length normalization term of BM25
    K = self.k1 * (1 - self.b + self.b * np_utils._try_divide(
        len(grams_target), self.avg_ngram_doc_len))
    scores = []
    for g in grams_obs:
        tf = 0.
        for h in grams_target:
            if dist_utils._is_str_match(g, h, self.str_match_threshold):
                tf += 1.
        scores.append(tf * self._get_idf(g) * np_utils._try_divide(1 + self.k1, tf + K))
    return scores if scores else [config.MISSING_VALUE_NUMERIC]
def __iter__(self):
    """Yield a LabeledSentence per sentence, with a stable label per unique text."""
    for col in self.columns:
        for sent in self.df[col]:
            if sent not in self.sent_label:
                # first sighting: assign the next sequential label
                self.cnt += 1
                self.sent_label[sent] = "SENT_%d" % self.cnt
            words = nlp_utils._tokenize(sent, token_pattern)
            yield LabeledSentence(words=words, tags=[self.sent_label[sent]])
def transform_one(self, obs, target, id):
    """Cosine similarities between embedded obs tokens and embedded target tokens.

    Tokens absent from the model contribute MISSING_VALUE_NUMERIC rows/cells.
    """
    tokens_obs = nlp_utils._tokenize(obs, token_pattern)
    tokens_target = nlp_utils._tokenize(target, token_pattern)
    matrix = []
    for tok_o in tokens_obs:
        row = []
        if tok_o in self.model:
            for tok_t in tokens_target:
                if tok_t in self.model:
                    row.append(dist_utils._cosine_sim(
                        self.model[tok_o], self.model[tok_t]))
        if not row:
            row = [config.MISSING_VALUE_NUMERIC]
        matrix.append(row)
    if not matrix:
        matrix = [[config.MISSING_VALUE_NUMERIC]]
    return matrix
def _get_df_dict(self):
    """Document-frequency table over target-corpus ngrams (add-one smoothed)."""
    df = defaultdict(lambda: 1)  # every ngram starts at 1 for smoothing
    for doc in self.target_corpus:
        tokens = nlp_utils._tokenize(doc, token_pattern)
        for gram in set(ngram_utils._ngrams(tokens, self.ngram)):
            df[gram] += 1
    return df
def __iter__(self):
    """Stream LabeledSentence objects, labeling each distinct sentence once."""
    for column in self.columns:
        for sentence in self.df[column]:
            if sentence not in self.sent_label:
                self.cnt += 1
                self.sent_label[sentence] = "SENT_%d" % self.cnt
            yield LabeledSentence(
                words=nlp_utils._tokenize(sentence, token_pattern),
                tags=[self.sent_label[sentence]])
def transform_one(self, obs, target, id):
    """Dice distance between the obs and target ngram collections."""
    grams_obs = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    grams_target = ngram_utils._ngrams(nlp_utils._tokenize(target, token_pattern), self.ngram)
    return dist_utils._dice_dist(grams_obs, grams_target)
def transform_one(self, obs, target, id):
    """Number of tokens in obs."""
    tokens = nlp_utils._tokenize(obs, token_pattern)
    return len(tokens)
def transform_one(self, obs, target, id):
    """Ratio of digit characters in obs to its token count."""
    token_count = len(nlp_utils._tokenize(obs, token_pattern))
    digit_count = len(re.findall(r"\d", obs))
    return np_utils._try_divide(digit_count, token_count)
def transform_one(self, obs, target, id):
    """Fraction of distinct ngrams among all obs ngrams."""
    grams = ngram_utils._ngrams(nlp_utils._tokenize(obs, token_pattern), self.ngram)
    return np_utils._try_divide(len(set(grams)), len(grams))
def transform_one(self, obs, target, id):
    """Token count of obs."""
    return len(nlp_utils._tokenize(obs, token_pattern))
def transform_one(self, obs, target, id):
    """Ratio of digit characters to token count in obs."""
    n_tokens = len(nlp_utils._tokenize(obs, token_pattern))
    return np_utils._try_divide(len(re.findall(r"\d", obs)), n_tokens)
def transform_one(self, obs, target, id):
    """Entropy of the obs token frequency distribution (via np_utils._entropy)."""
    freq = Counter(nlp_utils._tokenize(obs, token_pattern))
    counts = np.asarray(list(freq.values()))
    proba = counts / float(np.sum(counts))
    return np_utils._entropy(proba)
def __iter__(self):
    """Yield the token list of every sentence in the configured columns."""
    for col in self.columns:
        for sent in self.df[col]:
            yield nlp_utils._tokenize(sent, token_pattern)
def transform_one(self, obs, target, id):
    """Positions within the target ngrams of the single obs ngram at self.idx."""
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    # NOTE(review): raises IndexError when obs yields too few ngrams for
    # self.idx — confirm callers guarantee enough ngrams, or add a guard.
    return _inter_pos_list(target_ngrams, [obs_ngrams[self.idx]])
def transform_one(self, obs, target, id):
    """Match count at self.idx divided by the number of target ngrams."""
    tokens_obs = nlp_utils._tokenize(obs, token_pattern)
    tokens_target = nlp_utils._tokenize(target, token_pattern)
    grams_obs = ngram_utils._ngrams(tokens_obs, self.ngram)
    grams_target = ngram_utils._ngrams(tokens_target, self.ngram)
    matched = self._get_match_count(grams_obs, grams_target, self.idx)
    return np_utils._try_divide(matched, len(grams_target))
def transform_one(self, obs, target, id):
    """Absolute deviation from 1 of the obs/target token-count ratio."""
    n_obs = len(nlp_utils._tokenize(obs, token_pattern))
    n_target = len(nlp_utils._tokenize(target, token_pattern))
    return abs(np_utils._try_divide(n_obs, n_target) - 1)