def word_match_share(row):
    """Count the distinct non-stopword tokens shared by question1 and question2.

    Tokens are whitespace-split as-is (no lowercasing); stopwords are removed
    before the intersection is taken.
    """
    swords = get_stop_words()
    tokens1 = {tok for tok in str(row['question1']).split() if tok not in swords}
    tokens2 = {tok for tok in str(row['question2']).split() if tok not in swords}
    return len(tokens1 & tokens2)
def calculate_distance(q1: str, q2: str, model: gensim.models.KeyedVectors):
    """Symmetric max-of-nearest-neighbour (Hausdorff-style) distance between two questions.

    Both questions are lowercased, stopwords and non-alphabetic tokens dropped,
    and each remaining in-vocabulary word mapped to its embedding. For every
    vector on one side, the Euclidean distance to the closest vector on the
    other side is computed; the returned value is the largest such distance
    taken over both directions. Returns 0 if both sides have no vectors; if
    exactly one side is empty the 1e10 sentinel leaks through (original behavior).
    """
    swords = get_stop_words()
    q1 = [word for word in str(q1).lower().split()
          if word not in swords and word.isalpha()]
    q2 = [word for word in str(q2).lower().split()
          if word not in swords and word.isalpha()]
    wq1 = []
    wq2 = []
    for word in q1:
        try:
            wq1.append(model[word])
        except KeyError:  # out-of-vocabulary word — skip (was a bare except)
            continue
    for word in q2:
        try:
            wq2.append(model[word])
        except KeyError:  # out-of-vocabulary word — skip (was a bare except)
            continue
    maximum = 0
    # Direction 1: for each q1 vector, distance to its nearest q2 vector.
    for w1 in wq1:
        minimum = 1e10
        for w2 in wq2:
            minimum = min(minimum, euclidean(w1, w2))
        maximum = max(maximum, minimum)
    # Direction 2: for each q2 vector, distance to its nearest q1 vector.
    for w2 in wq2:
        minimum = 1e10
        for w1 in wq1:
            minimum = min(minimum, euclidean(w1, w2))
        maximum = max(maximum, minimum)
    return maximum
def __init__(self, options):
    """Configure a trigram count + TF-IDF pipeline and cache the stop-word list."""
    super().__init__(options)
    self.stop_words = get_stop_words()
    # Word trigrams, dropping very common (>50% of docs) and rare (<4 docs) terms.
    self.count_vectorizer = CountVectorizer(
        max_df=0.5,
        min_df=4,
        ngram_range=(3, 3),
    )
    # norm=None keeps raw TF-IDF weights (no length normalization).
    self.tfidf_transformer = TfidfTransformer(norm=None)
def __init__(self, options):
    """Configure a character 5-gram TF-IDF vectorizer and cache the stop-word list."""
    super().__init__(options)
    self.stop_words = get_stop_words()
    # Character-level 5-grams; drop terms in >50% of docs or <8 docs.
    self.vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=8,
        ngram_range=(5, 5),
        analyzer='char',
    )
def calculate_feature(self, data):
    """Compute this feature for every row of the CSV at *data*.

    Returns a one-column DataFrame named self.feature_name.
    """
    frame = pd.read_csv(data)
    swords = get_stop_words()
    # tqdm only adds a progress bar; iterrows yields (index, Series) pairs.
    values = [
        self._calculate_feature(row=row, swords=swords)
        for _, row in tqdm(frame.iterrows())
    ]
    frame[self.feature_name] = values
    return frame[[self.feature_name]]
def __init__(self, options):
    """Configure a binary unigram TF-IDF vectorizer and cache the stop-word list."""
    super().__init__(options)
    self.stop_words = get_stop_words()
    # Binary presence/absence unigrams; only the token "'s" is excluded
    # at the vectorizer level (runtime value preserved exactly).
    self.vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=8,
        ngram_range=(1, 1),
        binary=True,
        stop_words=["'s"],
    )
def __init__(self, options, max_df=0.5, min_df=20, ngram_range=(1, 1), binary=True):
    """Configure a character-level CountVectorizer; vectorizer knobs are parameters."""
    super().__init__(options)
    self.stop_words = get_stop_words()
    self.read_func = pd.read_csv
    self.vectorizer = CountVectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram_range,
        binary=binary,
        analyzer='char',
    )
def _sentence2vec(s: str, model):
    """Embed a sentence as the L2-normalized sum of its word vectors.

    Lowercases and whitespace-splits *s*, drops stopwords and non-alphabetic
    tokens, looks each remaining word up in *model*, sums the vectors and
    divides by the Euclidean norm.

    NOTE(review): if no word is in the model's vocabulary (or the sum is the
    zero vector) the division yields NaN — preserved from the original;
    confirm callers tolerate NaN before guarding.
    """
    swords = get_stop_words()
    words = str(s).lower().split()
    words = [w for w in words if w not in swords]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:  # out-of-vocabulary word — skip (was a bare except)
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v**2).sum())
def nltk_stemming(data_file):
    """Porter-stem both question columns of *data_file*, caching the result.

    Returns the cached DataFrame from '<data_file>.stemmed' when it exists;
    otherwise tokenizes, stems question1/question2 (stopwords handled by
    stemming_words), writes the cache, and returns the DataFrame.
    """
    # Check the cache first — the original built the stemmer and stop-word
    # list before this check, doing pointless work on every cache hit.
    cache_file = data_file + '.stemmed'
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)
    stemmer = nltk.stem.PorterStemmer()
    swords = get_stop_words()
    data = nltk_tokenize(data_file)
    data['question1'] = data['question1'].apply(lambda s: " ".join(
        stemming_words(str(s).split(), stopwords=swords, stemmer=stemmer))
    ).values
    data['question2'] = data['question2'].apply(lambda s: " ".join(
        stemming_words(str(s).split(), stopwords=swords, stemmer=stemmer))
    ).values
    data.to_csv(cache_file, index=False)
    return data
def word_match_share(row):
    """Fraction of non-stopword tokens shared between the two questions.

    Both questions are lowercased and whitespace-split; the score is
    2*|shared| / (|unique q1 words| + |unique q2 words|), or 0 when either
    side has no non-stopword tokens.
    """
    swords = get_stop_words()
    q1words = {w for w in str(row['question1']).lower().split() if w not in swords}
    q2words = {w for w in str(row['question2']).lower().split() if w not in swords}
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    # Each shared word is counted once per side, matching the original's
    # two membership-filtered lists over unique words.
    shared = q1words & q2words
    return (len(shared) + len(shared)) / (len(q1words) + len(q2words))
def calculate_row_feature(self, row_):
    """Max-of-nearest scores between the two questions' word vectors.

    *row_* is an (index, Series) pair as yielded by DataFrame.iterrows().
    Builds the pairwise dot-product matrix between in-vocabulary word vectors
    of question1 and question2, then returns the maximum over both directions
    of the per-word minimum.

    NOTE(review): the matrix named `distance` holds dot products, i.e. a
    similarity, not a distance — taking the min of a similarity is suspicious;
    confirm intent before changing. Behavior preserved. Returns 0 when either
    side has no vectors with a non-empty counterpart loop.
    """
    row = row_[1]
    swords = get_stop_words()
    q1 = [word for word in str(row['question1']).split() if word not in swords]
    q2 = [word for word in str(row['question2']).split() if word not in swords]
    wq1 = []
    wq2 = []
    for word in q1:
        try:
            wq1.append(self.model[word])
        except KeyError:  # out-of-vocabulary word — skip (was a bare except)
            continue
    for word in q2:
        try:
            wq2.append(self.model[word])
        except KeyError:  # out-of-vocabulary word — skip (was a bare except)
            continue
    distance = np.zeros((len(wq1), len(wq2)))
    for i1, w1 in enumerate(wq1):
        for i2, w2 in enumerate(wq2):
            distance[i1, i2] = np.dot(w1, w2)
    maximum = 0
    # Direction 1: for each q1 vector, min over q2; track the overall max.
    for i1 in range(len(wq1)):
        minimum = 1e10
        for i2 in range(len(wq2)):
            minimum = min(minimum, distance[i1, i2])
        maximum = max(maximum, minimum)
    # Direction 2: for each q2 vector, min over q1; track the overall max.
    for i2 in range(len(wq2)):
        minimum = 1e10
        for i1 in range(len(wq1)):
            minimum = min(minimum, distance[i1, i2])
        maximum = max(maximum, minimum)
    return maximum
def __init__(self, options, ngram_range=(1, 1)):
    """Configure a word-level TF-IDF vectorizer; the n-gram range is a parameter."""
    super().__init__(options)
    self.stop_words = get_stop_words()
    # Drop terms appearing in >50% of documents or fewer than 8 documents.
    self.vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=8,
        ngram_range=ngram_range,
    )
def wmd(row, model: gensim.models.KeyedVectors):
    """Word Mover's Distance between question1 and question2, stopwords removed.

    Tokens are whitespace-split without lowercasing; the distance is delegated
    to the gensim model's wmdistance.
    """
    swords = get_stop_words()
    tokens1 = [tok for tok in str(row['question1']).split() if tok not in swords]
    tokens2 = [tok for tok in str(row['question2']).split() if tok not in swords]
    return model.wmdistance(tokens1, tokens2)