def gen_data_for_clf(wv_url, save_url):
    """ generate averaged word-vector features for classifiers and dump (X, y, X_test) to save_url """
    train_df = load_to_df(TRAIN_URL)
    test_df = load_to_df(TEST_URL)
    X = infer_avg_wvs(wv_url, train_df['word_seg'].apply(str.split))
    y = train_df['class'].values
    X_test = infer_avg_wvs(wv_url, test_df['word_seg'].apply(str.split))
    joblib.dump((X, y, X_test), save_url)
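
# Usage sketch (not part of the original pipeline): the file names below are hypothetical
# examples, the real word-vector file name depends on the args used to train the w2v model.
def _example_gen_data_for_clf():
    wv_url = from_project_root("embedding_model/models/example_wv.txt")       # assumed path
    save_url = from_project_root("embedding_model/processed_data/avg_wv.pk")  # assumed path
    gen_data_for_clf(wv_url, save_url)
    X, y, X_test = joblib.load(save_url)
    print(X.shape, y.shape, X_test.shape)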
def ft_process(data_url=None):
    """ process data into the format the fastText model needs and save it
    into the './processed_data' dir

    Args:
        data_url: url to original .csv data, None to use the default training set

    Returns:
        str: url to saved processed data

    """
    # fall back to the default training set when no data_url is given
    save_filename = basename(data_url if data_url is not None else TRAIN_URL).replace('.csv', '_ft.csv')
    save_url = from_project_root("embedding_model/processed_data/" + save_filename)

    # file specified by data_url is already processed
    if exists(save_url):
        return save_url

    if data_url is not None:
        labels, sentences = load_raw_data(data_url)
    else:
        train_df = load_to_df(TRAIN_URL)
        labels = train_df['class'].values
        # split the space-separated strings so they can be re-joined below
        sentences = train_df['word_seg'].apply(str.split)

    with open(save_url, "w", encoding='utf-8', newline='\n') as ft_file:
        for i in range(len(labels)):
            label = FT_LABEL_PREFIX + str(labels[i])
            sentence = ' '.join(sentences[i])
            ft_file.write('{} {}\n'.format(label, sentence))

    return save_url
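
# Usage sketch: calling ft_process with no data_url falls back to the default training set.
def _example_ft_process():
    ft_train_url = ft_process()
    print("fastText formatted data saved at", ft_train_url)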
def gen_data_for_stacking(args, column='word_seg', n_splits=5, random_state=None):
    """ generate out-of-fold fastText probabilities for stacking

    Args:
        args: kwargs passed to the fastText supervised trainer
        column: column to use as text feature
        n_splits: number of cv folds
        random_state: random seed for StratifiedKFold

    Returns:
        y_pred_proba, y, y_test_pred_proba

    """
    train_df = load_to_df(TRAIN_URL)
    y = train_df['class'].values
    X = train_df[column].values
    X_test = load_to_df(TEST_URL)[column].values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=bool(random_state), random_state=random_state)
    y_pred = np.zeros((X.shape[0],))  # for printing score of each fold
    y_pred_proba = np.zeros((X.shape[0], N_CLASSES))
    y_test_pred_proba = np.zeros((X_test.shape[0], N_CLASSES))

    with tempfile.NamedTemporaryFile() as t_file:
        for ind, (train_index, cv_index) in enumerate(skf.split(X, y)):  # cv split
            X_train, X_cv = X[train_index], X[cv_index]
            y_train, y_cv = y[train_index], y[cv_index]

            # write the training fold in fastText format
            with open(t_file.name, "w", encoding='utf-8', newline='\n') as ft_file:
                for i in range(len(y_train)):
                    label = FT_LABEL_PREFIX + str(y_train[i])
                    ft_file.write('{} {}\n'.format(label, X_train[i]))

            clf = ft.supervised(t_file.name, output="tmp", thread=N_JOBS,
                                label_prefix=FT_LABEL_PREFIX, **args)

            y_pred[cv_index] = [int(label[0]) for label in clf.predict(X_cv)]
            y_pred_proba[cv_index] = [[t[1] for t in sorted(proba, key=lambda x: int(x[0]))]
                                      for proba in clf.predict_proba(X_cv, N_CLASSES)]
            print("%d/%d cv macro f1 :" % (ind + 1, n_splits),
                  f1_score(y_cv, y_pred[cv_index], average='macro'))

            y_test_pred_proba += [[t[1] for t in sorted(proba, key=lambda x: int(x[0]))]
                                  for proba in clf.predict_proba(X_test, N_CLASSES)]

    print("macro f1:", f1_score(y, y_pred, average='macro'))  # overall out-of-fold macro_f1 score
    y_test_pred_proba /= n_splits  # average test probabilities over folds
    return y_pred_proba, y, y_test_pred_proba
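
# Usage sketch: the fastText hyper-parameters below are hypothetical examples,
# not tuned values from this project.
def _example_gen_data_for_stacking():
    ft_args = {'epoch': 10, 'dim': 100, 'lr': 0.1, 'word_ngrams': 2}  # assumed params
    X_proba, y, X_test_proba = gen_data_for_stacking(ft_args, column='word_seg', n_splits=5)
    joblib.dump((X_proba, y, X_test_proba),
                from_project_root("embedding_model/processed_data/ft_stack.pk"))  # assumed path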
def train_w2v_model(data_url=None, kwargs=None):
    """ get an existing w2v_model or train a new one

    Args:
        data_url: url to data file, None to use all train and test text
        kwargs: args for the w2v model

    Returns:
        w2v_model

    """
    model_url = args_to_url(kwargs)
    if exists(model_url):
        return Word2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)
    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        sequences = train_df['word_seg'].append(test_df['word_seg'], ignore_index=True)
        sequences = sequences.apply(str.split)

    print("Word2Vec model is training...\n trained model will be saved at \n ", model_url)
    s_time = time()
    # more info here [https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec]
    model = Word2Vec(sequences, workers=N_JOBS, **kwargs)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    model.save(model_url)

    # save wv of model
    wv_save_url = model_url.replace('.bin', '.txt').replace('w2v', 'wv')
    model.wv.save_word2vec_format(wv_save_url, binary=False)
    return model
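
# Usage sketch: the Word2Vec kwargs are illustrative gensim 3.x style values only;
# args_to_url is assumed to map them to the model file name.
def _example_train_w2v_model():
    w2v_args = {'size': 100, 'window': 5, 'min_count': 5, 'iter': 5}  # assumed params
    model = train_w2v_model(kwargs=w2v_args)
    print(len(model.wv.vocab), "words in vocabulary")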
def tfidf_transform(train_url, test_url, column='word_seg', sublinear_tf=True, max_n=MAX_N,
                    min_df=MIN_DF, max_df=MAX_DF, max_features=MAX_FEATURES):
    """ vectorize data with TfidfVectorizer

    Args:
        train_url: url to train data
        test_url: url to test data, set to None if X_test is not needed
        column: column to use
        sublinear_tf: use 1 + log(tf) instead of tf
        max_n: max_n for ngram_range
        min_df: min_df for TfidfVectorizer
        max_df: max_df for TfidfVectorizer
        max_features: max_features for TfidfVectorizer

    Returns:
        X, y, X_test: vectorized data

    """
    # set token_pattern (default: r'(?u)\b\w\w+\b') to keep single-char tokens
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=max_features,
                                 ngram_range=(1, max_n), sublinear_tf=sublinear_tf,
                                 token_pattern=r'(?u)\w+')
    train_df = load_to_df(train_url)
    X = vectorizer.fit_transform(train_df[column])
    y = np.asarray(train_df['class'])

    X_test = None
    if test_url:
        X_test = vectorizer.transform(load_to_df(test_url)[column])

    return X, y, X_test
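
# Usage sketch: vectorize both sets with the default MAX_N / MIN_DF / MAX_DF / MAX_FEATURES.
def _example_tfidf_transform():
    X, y, X_test = tfidf_transform(TRAIN_URL, TEST_URL, column='word_seg')
    print(X.shape, y.shape, X_test.shape)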
def train_d2v_model(data_url=None, kwargs=None):
    """ get an existing d2v_model or train a new one

    Args:
        data_url: url to data file, None to use all train and test text
        kwargs: args for the d2v model

    Returns:
        d2v_model

    """
    model_url = args_to_url(kwargs)
    if exists(model_url):
        return Doc2Vec.load(model_url)

    if data_url is not None:
        _, sequences = load_raw_data(data_url)
    # use data from all train text and test text
    else:
        train_df = load_to_df(TRAIN_URL)
        test_df = load_to_df(TEST_URL)
        sequences = train_df['word_seg'].append(test_df['word_seg'], ignore_index=True)
        sequences = sequences.apply(str.split)

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sequences)]
    print("Doc2Vec model is training...\n trained model will be saved at \n ", model_url)
    # more info here [https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec]
    s_time = time()
    model = Doc2Vec(documents, workers=N_JOBS, **kwargs)
    e_time = time()
    print("training finished in %.3f seconds" % (e_time - s_time))
    model.save(model_url)
    return model
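
# Usage sketch: the Doc2Vec kwargs are illustrative gensim 3.x style values only.
def _example_train_d2v_model():
    d2v_args = {'vector_size': 100, 'window': 5, 'min_count': 5, 'epochs': 10}  # assumed params
    model = train_d2v_model(kwargs=d2v_args)
    print(len(model.docvecs), "document vectors trained")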
def generate_meta_feature(data_url, normalize=True):
    """ generate meta feature

    Args:
        data_url: url to data
        normalize: normalize result into [0, 1]

    Returns:
        generated meta DataFrame

    """
    save_url = data_url.replace('.csv', '_meta_df.pk')
    if exists(save_url):
        return joblib.load(save_url)

    data_df = load_to_df(data_url)
    meta_df = pd.DataFrame()
    for level in ('word_seg', 'article'):
        # word num
        meta_df[level + '_num'] = data_df[level].apply(lambda x: len(x.split()))
        # different word num
        meta_df[level + '_unique'] = data_df[level].apply(lambda x: len(set(x.split())))
        # most common word num
        meta_df[[level + '_common', level + '_common_num']] = pd.DataFrame(
            data_df[level].apply(lambda x: Counter(x.split()).most_common(1)[0]).tolist()).astype(int)

    # average phrase len
    meta_df['avg_phrase_len'] = meta_df['article_num'] / meta_df['word_seg_num']

    # normalization
    if normalize:
        for col in meta_df:
            meta_df[col] -= meta_df[col].min()
            meta_df[col] /= meta_df[col].max()

    joblib.dump(meta_df, save_url)
    return meta_df
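
# Usage sketch: meta features are dense and can be concatenated with sparse text features,
# e.g. via scipy.sparse.hstack (an assumption, that step is not shown above).
def _example_generate_meta_feature():
    meta_df = generate_meta_feature(TRAIN_URL)
    print(meta_df.describe())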
def generate_vectors(train_url, test_url=None, column='article', trans_type=None, max_n=1, min_df=1,
                     max_df=1.0, max_features=1, sublinear_tf=True, balanced=False, re_weight=0,
                     verbose=False, drop_words=0):
    """ generate X, y, X_test vectors from csv (with header) urls, using pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if X_test is not needed
        column: column to use as feature
        trans_type: specific transformer, {'dc', 'idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for the default TfdcTransformer
        balanced: balanced for the TfdcTransformer; for the idf transformer it is passed as use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: probability of randomly deleting some words from a sentence

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" % (column, train_url))
    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n), min_df=min_df, max_df=max_df,
                          max_features=max_features, token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())

    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" % (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf, balanced=balanced, re_weight=re_weight)

    verbose and print("transformer params:", trans.get_params())
    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" % (s_time - e_time))
    return X, y, X_test
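
# Usage sketch: the parameter values below are illustrative only, not tuned for the real data.
def _example_generate_vectors():
    X, y, X_test = generate_vectors(TRAIN_URL, TEST_URL, column='word_seg', trans_type='dc',
                                    max_n=2, min_df=3, max_df=0.8, max_features=200000,
                                    balanced=True, verbose=True)
    print(X.shape, y.shape, X_test.shape)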
def dict_transform(tw_dict, train_url=TRAIN_URL, test_url=None, column='word_seg', max_n=MAX_N,
                   min_df=MIN_DF, max_df=MAX_DF, max_features=MAX_FEATURES, normalize=True,
                   sublinear_tf=True, re_weight=0):
    """ use an offline term-weighting dict to transform data into vectors

    Args:
        tw_dict: term weighting dict to use
        train_url: url to train data (with header)
        test_url: url to test data (with header)
        column: column to use in the dataframe
        max_n: max_n for CountVectorizer
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        normalize: normalize the vectors or not
        sublinear_tf: use 1 + log(tf) instead of tf
        re_weight: if re_weight > 0, use 1 + re_weight * weights instead of the raw weights

    Returns:
        X, y, X_test: vectorized data

    """
    print("transforming...")
    train_df = load_to_df(train_url)
    vectorizer = CountVectorizer(min_df=min_df, max_df=max_df, ngram_range=(1, max_n),
                                 token_pattern=r'(?u)\w+', max_features=max_features)

    X_train = vectorizer.fit_transform(train_df[column])  # use train data to get vocab
    y_train = np.asarray(train_df['class'])
    X_test = vectorizer.transform(load_to_df(test_url)[column]) if test_url else None

    # get the word each column represents
    words = vectorizer.get_feature_names()
    # get weights of words
    weights = np.array([tw_dict[word] for word in words])
    if re_weight > 0:
        weights = 1 + re_weight * weights

    # weight and normalize both matrices; collect results because X.multiply returns a new matrix
    results = []
    for X in (X_train, X_test):
        if X is None:
            results.append(None)
            continue
        # sublinear_tf like tf-idf
        if sublinear_tf:
            X.data = np.log(X.data) + 1
        X = X.multiply(weights)  # can not use * to multiply
        if normalize:
            norm = sp.sparse.linalg.norm(X, axis=1)
            for i, row in enumerate(X.row):
                X.data[i] /= norm[row]
        results.append(X)
    X_train, X_test = results

    return X_train, y_train, X_test
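
# Usage sketch: the pickled {word: weight} dict below is a hypothetical file, any
# term-weighting dict built offline can be passed in.
def _example_dict_transform():
    tw_dict = joblib.load(from_project_root("processed_data/bdc_dict.pk"))  # assumed file
    X, y, X_test = dict_transform(tw_dict, TRAIN_URL, TEST_URL, column='word_seg')
    print(X.shape, y.shape, X_test.shape)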