def create_word2vec_features(data, col1, col2, pref=''):
    logging.info('Creating Word2Vec features.')
    feature_class = pref + 'word2vec'
    if check_if_exists(feature_class):
        logging.info('Word2Vec features are already created.')
        return

    models = []
    # Create our own model from the corpus itself.
    corpus = list(data[col1]) + list(data[col2])
    models.append(Word2VecModel(corpus=corpus, name='Corpus'))
    # Load pre-trained models.
    for file in os.listdir(MODELS_DIR):
        if file.endswith('.txt') or file.endswith('.bin'):
            models.append(Word2VecModel(path=os.path.join(MODELS_DIR, file),
                                        name=file.split('.', 1)[0]))

    res = pd.DataFrame()
    for model in models:
        estimator = Word2VecEstimator(model)
        res['%s_n_similarity' % model.name] = data.apply(
            lambda x: estimator.get_n_similarity(x[col1], x[col2]), axis=1)
        res['%s_n_similarity_imp' % model.name] = data.apply(
            lambda x: estimator.get_n_similarity_imp(x[col1], x[col2]), axis=1)
        res['%s_centroid_rmse' % model.name] = data.apply(
            lambda x: estimator.get_centroid_rmse(x[col1], x[col2]), axis=1)
        res['%s_centroid_rmse_imp' % model.name] = data.apply(
            lambda x: estimator.get_centroid_rmse_imp(x[col1], x[col2]),
            axis=1)

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Word2Vec features are created and saved to pickle file.')

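
# `Word2VecModel` and `Word2VecEstimator` are project classes that wrap a
# gensim word2vec model; their implementations live elsewhere. A rough,
# hypothetical sketch of the two basic similarity measures used above
# (illustrative assumptions only; the actual classes, including the `*_imp`
# variants, may differ):
import numpy as np


class SimpleWord2VecEstimator(object):  # hypothetical name, for illustration
    def __init__(self, keyed_vectors):
        self.vectors = keyed_vectors  # e.g. a gensim KeyedVectors instance

    def get_n_similarity(self, words1, words2):
        """Cosine similarity between the mean vectors of two word lists."""
        w1 = [w for w in words1 if w in self.vectors]
        w2 = [w for w in words2 if w in self.vectors]
        if not w1 or not w2:
            return 0.0
        return float(self.vectors.n_similarity(w1, w2))

    def get_centroid_rmse(self, words1, words2):
        """RMSE between the centroid vectors of two word lists."""
        w1 = [w for w in words1 if w in self.vectors]
        w2 = [w for w in words2 if w in self.vectors]
        if not w1 or not w2:
            return 0.0
        c1 = np.mean([self.vectors[w] for w in w1], axis=0)
        c2 = np.mean([self.vectors[w] for w in w2], axis=0)
        return float(np.sqrt(np.mean((c1 - c2) ** 2)))
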
def create_most_common_words_features(df_all, col1, col2,
                                      max_features=MAX_FEATURES, pref=''):
    logging.info('Creating most common words features.')
    feature_class = pref + 'most_common_words'
    if check_if_exists(feature_class):
        logging.info('Most common words features already created.')
        return

    count_vectorizer = CountVectorizer(min_df=MIN_DF,
                                       max_df=MAX_DF,
                                       max_features=max_features,
                                       strip_accents='unicode',
                                       analyzer='word',
                                       token_pattern=TOKEN_PATTERN,
                                       ngram_range=NGRAM_RANGE,
                                       stop_words='english',
                                       binary=True,
                                       vocabulary=None)
    # Stack both question columns so they share a single vocabulary.
    documents = pd.concat([df_all[col1], df_all[col2]], axis=0)
    X = count_vectorizer.fit_transform(documents)
    logging.debug(count_vectorizer.get_feature_names())

    X_col1 = X[0:len(df_all)]
    X_col2 = X[len(df_all):2 * len(df_all)]
    res = X_col1 + X_col2

    dump_features(feature_class, res)
    logging.info('Most common words features are created and saved to '
                 'pickle file.')

def create_common_words_count_features(data, pref=''):
    logging.info('Creating common words features.')
    feature_class = pref + 'common_words'
    if check_if_exists(feature_class):
        logging.info('Common words features already created.')
        return

    res = pd.DataFrame()
    res['common_words'] = data.apply(
        lambda x: common_words_count(x['words1'], x['words2']), axis=1)
    res['len1'] = data['words1'].apply(lambda x: len(x))
    res['len2'] = data['words2'].apply(lambda x: len(x))
    res['lenunion'] = data.apply(
        lambda x: union_words_count(x['words1'], x['words2']), axis=1)
    res['distance1'] = res['common_words'] / res['lenunion']
    res['distance2'] = res['common_words'] / (res['len1'] + res['len2'])

    res['common_words_len'] = data.apply(
        lambda x: common_words_len(x['words1'], x['words2']), axis=1)
    res['abs_len1'] = data['words1'].apply(lambda x: words_len(x))
    res['abs_len2'] = data['words2'].apply(lambda x: words_len(x))
    res['abs_lenunion'] = data.apply(
        lambda x: union_words_len(x['words1'], x['words2']), axis=1)
    res['absdistance1'] = res['common_words_len'] / res['abs_lenunion']
    res['absdistance2'] = res['common_words_len'] / (res['abs_len1'] +
                                                     res['abs_len2'])

    features = res.columns.tolist()
    add_features(feature_class, features)
    dump_features(feature_class, res)
    logging.info('Common words features are created and saved to pickle file.')

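
# The helpers used above (common_words_count, union_words_count,
# common_words_len, union_words_len, words_len) are defined elsewhere in the
# project. A minimal sketch of what they are assumed to compute, inferred from
# how they are called (illustrative only, not the project's implementation):
def common_words_count(words1, words2):
    """Number of distinct words shared by both token lists."""
    return len(set(words1) & set(words2))


def union_words_count(words1, words2):
    """Number of distinct words appearing in either token list."""
    return len(set(words1) | set(words2))


def words_len(words):
    """Total number of characters over all words in a token list."""
    return sum(len(w) for w in words)


def common_words_len(words1, words2):
    """Total character length of the words shared by both token lists."""
    return sum(len(w) for w in set(words1) & set(words2))


def union_words_len(words1, words2):
    """Total character length of the union of the two token lists."""
    return sum(len(w) for w in set(words1) | set(words2))
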
def create_common_vocabulary_raw_tfidf_features(df_all, col1, col2, pref=''):
    logging.info('Creating common vocabulary raw tfidf features.')
    feature_class = pref + 'common_vocabulary_raw_tfidf'
    if check_if_exists(feature_class):
        logging.info('Common vocabulary raw tfidf features already created.')
        return

    tfidf_vectorizer = TfidfVectorizer(min_df=MIN_DF,
                                       max_df=MAX_DF,
                                       max_features=None,
                                       strip_accents='unicode',
                                       analyzer='word',
                                       token_pattern=TOKEN_PATTERN,
                                       ngram_range=NGRAM_RANGE,
                                       use_idf=1,
                                       smooth_idf=1,
                                       sublinear_tf=1,
                                       stop_words='english',
                                       norm=NORM,
                                       vocabulary=None)
    # Fit on both columns at once so they share a single vocabulary.
    documents = pd.concat([df_all[col1], df_all[col2]], axis=0)
    X = tfidf_vectorizer.fit_transform(documents)

    X_col1 = X[0:len(df_all)]
    X_col2 = X[len(df_all):2 * len(df_all)]
    res = X_col1.multiply(X_col2)

    dump_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col1), X_col1)
    dump_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col2), X_col2)
    dump_features(feature_class, res)
    logging.info('Common vocabulary raw tfidf features are created and saved '
                 'to pickle file.')

def create_svd_tfidf_features(columns, n_components=N_COMPONENTS, pref=''):
    logging.info('Creating svd tfidf features.')
    feature_class = pref + 'svd_tfidf'
    if check_if_exists(feature_class):
        logging.info('SVD tfidf features already created.')
        return

    data = []
    svd = TruncatedSVD(n_components=n_components, n_iter=15)
    for c in columns:
        X = load_features('%sraw_tfidf_%s' % (pref, c))
        X_transformed = svd.fit_transform(X)
        svd_columns = [
            'tfidf_svd_' + c + '_' + str(i) for i in range(n_components)
        ]
        data.append(pd.DataFrame(X_transformed, columns=svd_columns))

    # Concatenate horizontally, keeping the descriptive column names.
    df = pd.concat(data, axis=1)
    add_features(feature_class, df.columns.tolist())
    dump_features(feature_class, df)
    logging.info('Shape of svd tfidf features is %s' % str(df.shape))
    logging.info('Svd tfidf features are created.')

def create_raw_tfidf_features(df_all, columns, pref=''):
    logging.info('Creating raw tfidf features.')
    feature_class = '%sraw_tfidf_%s' % (pref, columns[0])
    if check_if_exists(feature_class):
        logging.info('Raw tfidf features already created.')
        return

    for c in columns:
        tfidf_vectorizer = TfidfVectorizer(min_df=MIN_DF,
                                           max_df=MAX_DF,
                                           max_features=None,
                                           strip_accents='unicode',
                                           analyzer='word',
                                           token_pattern=TOKEN_PATTERN,
                                           ngram_range=NGRAM_RANGE,
                                           use_idf=1,
                                           smooth_idf=1,
                                           sublinear_tf=1,
                                           stop_words='english',
                                           norm=NORM,
                                           vocabulary=None)
        X = tfidf_vectorizer.fit_transform(df_all[c])
        logging.info('Shape of Tfidf transform matrix is %s' % str(X.shape))

        feature_class = '%sraw_tfidf_%s' % (pref, c)
        dump_features(feature_class, X)
    logging.info('Raw tfidf features are created and saved to pickle file.')

def create_logistic_features(metafeatures_dir=METAFEATURES_DIR,
                             preds_dir=PRED_DIR, pref=''):
    logging.info('Creating logistic features.')
    feature_class = pref + 'logistic'
    if check_if_exists(feature_class):
        logging.info('Logistic features (%s) already created.' % feature_class)
        return

    metafeatures_filenames = filenames_in_dir(metafeatures_dir, '.pickle')
    preds_filenames = filenames_in_dir(preds_dir, '.csv')
    common_filenames = set(metafeatures_filenames).intersection(
        set(preds_filenames))
    common_filenames = sorted(common_filenames)
    # We are only interested in logistic metafeatures.
    common_filenames = [
        f for f in common_filenames if f.startswith('Logistic')
    ]

    # Load train metafeatures.
    train_data = []
    for filename in common_filenames:
        with open(os.path.join(metafeatures_dir, filename + '.pickle'),
                  'rb') as file:
            try:
                metafeature = np.sum(pickle.load(file), axis=1)
            except Exception:
                file.seek(0)
                metafeature = pickle.load(file)
        metafeature = rescale_preds(metafeature, a=B, b=A)
        train_data.append(metafeature)
    train_data = np.stack(train_data, axis=1)
    train_data = pd.DataFrame(train_data, columns=common_filenames)

    # Load test predictions.
    test_data = []
    for filename in common_filenames:
        file = os.path.join(preds_dir, filename + '.csv')
        preds = pd.read_csv(file, usecols=['is_duplicate'])
        # Rescale predictions back to avoid double rescaling.
        # TODO: think about a better way to do it.
        preds = rescale_preds(preds, a=B, b=A)
        test_data.append(preds.values)
    test_data = np.concatenate(test_data, axis=1)
    test_data = pd.DataFrame(test_data, columns=common_filenames)

    data = pd.concat([train_data, test_data])
    add_features(feature_class, common_filenames)
    dump_features(feature_class, data)
    logging.info('Logistic features are created and saved to pickle file.')

def create_magic_features(df_all, pref=''):
    logging.info('Creating magic features.')
    feature_class = pref + 'magic'
    if check_if_exists(feature_class):
        logging.info('Magic features (%s) already created.' % feature_class)
        return

    # 1. Create the questions dictionary: question -> hash value.
    logging.debug('Creating questions dictionary...')
    questions1 = df_all[['question1', 'question2']].copy()
    questions2 = df_all[['question2', 'question1']].copy()
    questions2.rename(columns={'question1': 'question2',
                               'question2': 'question1'}, inplace=True)
    questions = pd.concat([questions1, questions2])
    questions.reset_index(inplace=True, drop=True)
    unique_questions = questions.drop_duplicates(subset=['question1'])
    unique_questions.reset_index(inplace=True, drop=True)
    questions_dict = pd.Series(
        unique_questions.index.values,
        index=unique_questions['question1'].values).to_dict()

    # 2. Map every question to its hash value.
    logging.debug('Creating hash dictionary...')
    questions['q1hash'] = questions['question1'].map(questions_dict)
    questions['q2hash'] = questions['question2'].map(questions_dict)

    # 3. Collect the neighbours (edges) of every question.
    logging.debug('Creating edges.')
    questions['l1hash'] = questions['q1hash'].apply(lambda x: [x])
    questions['l2hash'] = questions['q2hash'].apply(lambda x: [x])
    questions['edges1'] = questions.groupby('q1hash')['l2hash'].transform(sum)
    questions['edges2'] = questions.groupby('q2hash')['l1hash'].transform(sum)

    # 4. Keep only the rows that correspond to the original pairs.
    wanted_cols = ['l1hash', 'l2hash', 'edges1', 'edges2', 'q1hash', 'q2hash']
    res = questions[wanted_cols].copy()[0:len(df_all)]

    # 5. Count the common neighbours of the two questions.
    logging.debug('Creating intersection features...')
    res['common_edges'] = res.apply(
        lambda x: len(set(x.edges1).intersection(set(x.edges2))), axis=1)

    # 6. Does question 2 ever appear in the question1 column?
    logging.debug('Creating q2 in q1 feature...')
    questions1 = set(res['q1hash'].values)
    res['q2inq1'] = res['q2hash'].apply(lambda x: int(x in questions1))

    res.drop(['l1hash', 'l2hash', 'edges1', 'edges2'], axis=1, inplace=True)
    logging.debug(res.head())
    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Magic features are created and saved to pickle file.')

def create_tfidf_features(df_all, columns, qcol, unique=False, pref=''):
    logging.info('Creating tfidf features.')
    feature_class = pref + 'tfidf'
    if check_if_exists(feature_class):
        logging.info('Tfidf features already created.')
        return

    df = pd.DataFrame()
    df['id'] = df_all['id']
    if TFIDF_ANALYSIS:
        for c in columns:
            logging.info('Doing TFIDF analysis, it may take some time.')
            if unique:
                create_idf(df_all[c].unique())
            else:
                create_idf(df_all[c])
            # Different types of tfidf.
            types = ('binary', 'freq', 'log_freq', 'dnorm')
            # Different ways of aggregating term tfidf in a query.
            indexes, prefixes = (0, 1, 2), ('s', 'm', 'a')
            # Two different functions: one uses exact matches, the other
            # common words.
            funcs, suffixes = [tfidf1, tfidf2], ('1', '2')
            for (func, suffix) in zip(funcs, suffixes):
                if (func == tfidf2) and (not TFIDF2):
                    continue
                df['temp'] = df_all.apply(
                    lambda x: func(x[qcol], x[c], type='all'), axis=1)
                ind = 0
                for t in types:
                    for prefix in prefixes:
                        name = qcol + prefix + t + '_tfidf_' + c + '_' + suffix
                        df[name] = df['temp'].map(lambda x: x[ind])
                        ind += 1
                df.drop(['temp'], axis=1, inplace=True)
        logging.info('TFIDF analysis is finished.')

    df.drop('id', axis=1, inplace=True)
    add_features(feature_class, df.columns.tolist())
    dump_features(feature_class, df)
    logging.info('Tfidf features are created and saved to pickle file.')

def create_distance_tfidf_features(col1, col2, pref=''):
    logging.info('Creating distance tfidf features.')
    feature_class = pref + 'distance_tfidf'
    if check_if_exists(feature_class):
        logging.info('Distance tfidf features already created.')
        return

    X_col1 = load_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col1))
    X_col2 = load_features('%scommon_vocabulary_raw_tfidf_%s' % (pref, col2))

    res = pd.DataFrame()
    res['cosine_similarity_%s_%s' % (col1, col2)] = list(
        map(cosine_sim, X_col1, X_col2))
    res['rmse_%s_%s' % (col1, col2)] = list(map(rmse, X_col1, X_col2))

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Distance tfidf features are created.')

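
# `cosine_sim` and `rmse` are assumed to operate on single rows of the sparse
# tfidf matrices (iterating over a scipy CSR matrix yields 1 x n_features
# rows). A minimal illustrative sketch, not the project's actual
# implementation:
import numpy as np
from scipy.sparse.linalg import norm as sparse_norm


def cosine_sim(row1, row2):
    """Cosine similarity between two sparse row vectors."""
    denom = sparse_norm(row1) * sparse_norm(row2)
    if denom == 0:
        return 0.0
    return row1.multiply(row2).sum() / denom


def rmse(row1, row2):
    """Root mean squared error between two sparse row vectors."""
    diff = row1 - row2
    return np.sqrt(diff.multiply(diff).mean())
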
def create_count_features(df_all, pref=''):
    logging.info('Creating count features.')
    feature_class = pref + 'count'
    if check_if_exists(feature_class):
        logging.info('Count features (%s) already created.' % feature_class)
        return

    # Stack the pairs in both directions so every question is counted no
    # matter which column it appears in.
    df_q1_q2 = df_all[['question1', 'question2']].reset_index(drop=True)
    df_q2_q1 = df_all[['question1', 'question2']].reset_index(drop=True)
    df_q2_q1 = df_q2_q1.rename(columns={'question1': 'question2',
                                        'question2': 'question1'})
    df = pd.concat([df_q1_q2, df_q2_q1], axis=0, ignore_index=True)

    # Create count of q1 and q2 features.
    res = pd.DataFrame()
    grouper1 = df.reset_index().groupby('question1')
    grouper2 = df.reset_index().groupby('question2')
    res['q1count'] = grouper1['question2'].transform('count')
    res['q2count'] = grouper2['question1'].transform('count')
    res['q1rank'] = grouper1['question2'].rank()
    res['q2rank'] = grouper2['question1'].rank()
    # res['hash1'] = grouper1['index'].transform(lambda x: x.iloc[0])
    # res['hash2'] = grouper2['index'].transform(lambda x: x.iloc[0])
    res = res[0:len(df_q1_q2)]

    # Number of sentences count.
    res['sent1count'] = df_q1_q2['question1'].apply(
        lambda x: len(create_sentences(x)))
    res['sent2count'] = df_q1_q2['question2'].apply(
        lambda x: len(create_sentences(x)))

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Count features are created and saved to pickle file.')

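
# `create_sentences` is assumed to split a question into sentences. A minimal
# sketch using NLTK's sentence tokenizer (illustrative only; the project's own
# helper may differ):
from nltk.tokenize import sent_tokenize


def create_sentences(text):
    """Split a piece of text into a list of sentences."""
    return sent_tokenize(str(text))
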
def create_wordnet_features(data, pref=''):
    feature_class = pref + 'wordnet'
    logging.info('Creating wordnet (%s) features.' % feature_class)
    if check_if_exists(feature_class):
        logging.info('Wordnet (%s) features already created.' % feature_class)
        return

    res = pd.DataFrame()
    logging.info('Creating synonyms count...')
    res['synonyms_count'] = data.apply(
        lambda x: synonyms_count(x['words1'], x['words2']), axis=1)
    logging.info('Creating antonyms count...')
    res['antonyms_count'] = data.apply(
        lambda x: antonyms_count(x['words1'], x['words2']), axis=1)
    # logging.info('Creating hyponyms count...')
    # res['hyponyms_count'] = data.apply(
    #     lambda x: hyponyms_count(x['words1'], x['words2']), axis=1)
    # logging.info('Creating hypernyms count...')
    # res['hypernyms_count'] = data.apply(
    #     lambda x: hypernyms_count(x['words1'], x['words2']), axis=1)

    logging.info('Calculating synonyms and antonyms distances...')
    len1 = data['words1'].apply(lambda x: len(x))
    len2 = data['words2'].apply(lambda x: len(x))
    lenunion = data.apply(
        lambda x: union_words_count(x['words1'], x['words2']), axis=1)
    res['syn_distance1'] = res['synonyms_count'] / lenunion
    res['syn_distance2'] = res['synonyms_count'] / (len1 + len2)
    res['anton_distance1'] = res['antonyms_count'] / lenunion
    res['anton_distance2'] = res['antonyms_count'] / (len1 + len2)

    features = res.columns.tolist()
    add_features(feature_class, features)
    dump_features(feature_class, res)
    logging.info('Wordnet features are created and saved to pickle file.')

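
# `synonyms_count` and `antonyms_count` are assumed to count, for every word
# in the first token list, how many words of the second list are WordNet
# synonyms (respectively antonyms) of it. A rough illustrative sketch using
# nltk's WordNet corpus; `_wordnet_relatives` is a hypothetical helper and
# this is not the project's actual implementation:
from nltk.corpus import wordnet


def _wordnet_relatives(word, antonyms=False):
    """Set of lemma names that are synonyms (or antonyms) of `word`."""
    related = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            if antonyms:
                related.update(a.name() for a in lemma.antonyms())
            else:
                related.add(lemma.name())
    return related


def synonyms_count(words1, words2):
    """Number of (word1, word2) pairs where word2 is a synonym of word1."""
    words2 = set(words2)
    return sum(len(_wordnet_relatives(w) & words2) for w in words1)


def antonyms_count(words1, words2):
    """Number of (word1, word2) pairs where word2 is an antonym of word1."""
    words2 = set(words2)
    return sum(len(_wordnet_relatives(w, antonyms=True) & words2)
               for w in words1)
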
def create_common_vocabulary_svd_tfidf_features(n_components=2 * N_COMPONENTS,
                                                pref=''):
    logging.info('Creating common vocabulary svd tfidf features.')
    feature_class = pref + 'common_vocabulary_svd_tfidf'
    if check_if_exists(feature_class):
        logging.info('Common vocabulary SVD tfidf features already created.')
        return

    svd = TruncatedSVD(n_components=n_components, n_iter=15)
    X = load_features(pref + 'common_vocabulary_raw_tfidf')
    X_transformed = svd.fit_transform(X)
    svd_columns = [
        'common_vocabulary_tfidf_svd_' + str(i) for i in range(n_components)
    ]
    data = pd.DataFrame(X_transformed, columns=svd_columns)

    add_features(feature_class, data.columns.tolist())
    dump_features(feature_class, data)
    logging.info('Shape of common vocabulary svd tfidf features is %s' %
                 str(data.shape))
    logging.info('Common vocabulary SVD tfidf features are created.')

def create_specific_word_counts(df_all, specific_words=SPECIFIC_WORDS,
                                pref=''):
    logging.info('Creating specific word features.')
    feature_class = pref + 'specific_words'
    if check_if_exists(feature_class):
        logging.info('Specific word features (%s) already created.' %
                     feature_class)
        return

    # Lowercase here so the function does not rely on the data having been
    # preprocessed already.
    df_all['question1'] = df_all['question1'].apply(lambda x: str(x).lower())
    df_all['question2'] = df_all['question2'].apply(lambda x: str(x).lower())

    res = pd.DataFrame()
    for word in specific_words:
        res[word + 'in_q1'] = df_all['question1'].apply(
            lambda x: int(word in x))
        res[word + 'in_q2'] = df_all['question2'].apply(
            lambda x: int(word in x))
        res[word] = res[word + 'in_q1'] + res[word + 'in_q2']

    res['neg_in_q1'], res['neg_in_q2'], res['neg'] = 0, 0, 0
    for word in NEGATION_WORDS:
        res['neg_in_q1'] = res['neg_in_q1'] + df_all['question1'].apply(
            lambda x: int(word in x))
        res['neg_in_q2'] = res['neg_in_q2'] + df_all['question2'].apply(
            lambda x: int(word in x))
    res['neg'] = res['neg_in_q1'] + res['neg_in_q2']

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info(
        'Specific word counts features are created and saved to pickle file.')

def create_grouping_features(df_all, pref=''):
    logging.info('Creating grouping features.')
    feature_class = pref + 'grouping'
    if check_if_exists(feature_class):
        logging.info('Grouping features (%s) already created.' % feature_class)
        return

    columns = ['distance1', 'distance2', 'absdistance1', 'absdistance2']
    common_words = (load_features(pref + 'common_words')[columns]
                    .reset_index(drop=True))

    if check_if_exists(pref + 'distance_tfidf'):
        distance_tfidf_features = (load_features(pref + 'distance_tfidf')
                                   .reset_index(drop=True))
        columns += distance_tfidf_features.columns.tolist()
    else:
        distance_tfidf_features = pd.DataFrame()

    if check_if_exists(pref + 'word2vec'):
        word2vec_features = (load_features(pref + 'word2vec')
                             .reset_index(drop=True))
        columns += word2vec_features.columns.tolist()
    else:
        word2vec_features = pd.DataFrame()

    if check_if_exists(pref + 'wordnet'):
        wordnet_features = (load_features(pref + 'wordnet')
                            .reset_index(drop=True))
        columns += wordnet_features.columns.tolist()
    else:
        wordnet_features = pd.DataFrame()

    # Stack the pairs in both directions so grouping by a question catches it
    # whether it appears in question1 or question2.
    df_q1_q2 = pd.concat([
        common_words, distance_tfidf_features, word2vec_features,
        wordnet_features,
        df_all[['question1', 'question2']].reset_index(drop=True)
    ], axis=1)
    df_q2_q1 = pd.concat([
        common_words, distance_tfidf_features, word2vec_features,
        wordnet_features,
        df_all[['question2', 'question1']].reset_index(drop=True)
    ], axis=1)
    df_q2_q1 = df_q2_q1.rename(columns={'question1': 'question2',
                                        'question2': 'question1'})
    df = pd.concat([df_q1_q2, df_q2_q1], axis=0, ignore_index=True)

    # GroupBy objects.
    groupby_q1 = df.groupby('question1')
    groupby_q2 = df.groupby('question2')
    df['q1count'] = groupby_q1['question2'].transform('count')
    df['q2count'] = groupby_q2['question1'].transform('count')
    inds_q1_gr_q2 = (df['q1count'] > df['q2count'])[0:len(df_q1_q2)]
    inds_q2_gr_q1 = ~inds_q1_gr_q2

    res = pd.DataFrame()
    groupers = ['min', 'max', 'mean']
    for grouper in groupers:
        for col in columns:
            res[grouper + '_by_q1_' + col] = (
                groupby_q1[col].transform(grouper)[0:len(df_q1_q2)])
            res[grouper + '_by_q2_' + col] = (
                groupby_q2[col].transform(grouper)[0:len(df_q1_q2)])
            res[col] = df[col][0:len(df_q1_q2)]
            res['rel_q1_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_q1_' + col]), axis=1)
            res['req_q2_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_q2_' + col]), axis=1)
            # Take the aggregate over the more frequent of the two questions.
            res[grouper + '_by_' + col] = 0
            res.loc[inds_q1_gr_q2, grouper + '_by_' + col] = (
                res[grouper + '_by_q1_' + col])
            res.loc[inds_q2_gr_q1, grouper + '_by_' + col] = (
                res[grouper + '_by_q2_' + col])
            res['rel_' + col] = res.apply(lambda x: np_utils.try_to_divide(
                x[col], x[grouper + '_by_' + col]), axis=1)
            del res[col]

    add_features(feature_class, res.columns.tolist())
    dump_features(feature_class, res)
    logging.info('Grouping features are created and saved to pickle file.')

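
# `np_utils.try_to_divide` is assumed to be a safe-division helper that
# returns a default value instead of failing on a zero or missing denominator.
# A minimal sketch of such a helper (illustrative assumption only):
import numpy as np


def try_to_divide(numerator, denominator, default=0.0):
    """Divide two numbers, falling back to `default` when impossible."""
    try:
        result = numerator / denominator
    except ZeroDivisionError:
        return default
    return result if np.isfinite(result) else default
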