def get_X_y():
    """Build the count-vectorized JAMA dataset.

    Calls ``process_file`` on the JAMA article-info CSV and the matched
    articles CSV, then fits a ``CountVectorizer`` on the first value it
    returns.

    Returns:
        A 3-tuple ``(X, y, vectorizer)``: the output of
        ``vectorizer.fit_transform`` on the first ``process_file`` value,
        the second ``process_file`` value wrapped in ``np.array``, and the
        fitted vectorizer itself.
    """
    # process_file yields three values; the third is not needed here.
    documents, targets, _ = process_file(
        "jama/jama_article_info.csv",
        "jama/jama_pmids.txt_matched_articles_filtered.csv",
    )

    # The confidence-interval phrases come first in the alternation so they
    # are matched as single tokens; any other token is two or more characters
    # drawn from letters, digits, "_", "*" and "-" ("/" is also accepted after
    # the first character).
    token_regex = r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+"

    vectorizer = CountVectorizer(
        stop_words="english",
        min_df=2,
        token_pattern=token_regex,
        binary=False,
        max_features=50000,
    )
    features = vectorizer.fit_transform(documents)
    return features, np.array(targets), vectorizer
def get_X_y():
    """Load the JAMA articles and turn them into token-count features.

    The two JAMA CSV paths are handed to ``process_file``; the first value
    it returns is fed to a freshly configured ``CountVectorizer``.

    Returns:
        ``(X, y, vectorizer)`` where ``X`` is what
        ``vectorizer.fit_transform`` produced, ``y`` is the second
        ``process_file`` value converted with ``np.array``, and
        ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    vectorizer_settings = {
        "stop_words": "english",
        "min_df": 2,
        # Alternation order matters: the "95% ..." phrases are tried before
        # the generic token class, so they survive as single tokens.
        "token_pattern": r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
        "binary": False,
        "max_features": 50000,
    }

    # Only the first two values from process_file are used by this function.
    raw_texts, raw_targets, _unused = process_file(
        "jama/jama_article_info.csv",
        "jama/jama_pmids.txt_matched_articles_filtered.csv",
    )

    vectorizer = CountVectorizer(**vectorizer_settings)
    count_matrix = vectorizer.fit_transform(raw_texts)
    target_array = np.array(raw_targets)
    return count_matrix, target_array, vectorizer
def get_X_y():
    """Build the count-vectorized Reuters dataset.

    Calls ``process_file`` on the Reuters article-info CSV and the matched
    articles CSV, then fits a unigram-plus-bigram ``CountVectorizer`` on the
    first value it returns.

    Returns:
        A 3-tuple ``(X, y, vectorizer)``: the output of
        ``vectorizer.fit_transform`` on the first ``process_file`` value,
        the second ``process_file`` value wrapped in ``np.array``, and the
        fitted vectorizer itself.
    """
    article_info_path = "reuters/all_reuters_article_info.csv"
    matched_articles_path = "reuters/all_reuters_matched_articles_filtered.csv"

    # process_file yields three values; the third is not needed here.
    documents, targets, _ = process_file(article_info_path, matched_articles_path)

    # The confidence-interval phrases come first in the alternation so they
    # are matched as single tokens; any other token is two or more characters
    # drawn from letters, digits, "_", "*" and "-" ("/" is also accepted after
    # the first character).
    token_regex = r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+"

    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        min_df=1,
        token_pattern=token_regex,
        binary=False,
        max_features=50000,
    )
    features = vectorizer.fit_transform(documents)
    return features, np.array(targets), vectorizer
def get_X_y():
    """Load the Reuters articles and turn them into token-count features.

    The two Reuters CSV paths are handed to ``process_file``; the first
    value it returns is fed to a freshly configured ``CountVectorizer``
    that counts both single tokens and adjacent token pairs.

    Returns:
        ``(X, y, vectorizer)`` where ``X`` is what
        ``vectorizer.fit_transform`` produced, ``y`` is the second
        ``process_file`` value converted with ``np.array``, and
        ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    vectorizer_settings = {
        "ngram_range": (1, 2),
        "stop_words": "english",
        "min_df": 1,
        # Alternation order matters: the "95% ..." phrases are tried before
        # the generic token class, so they survive as single tokens.
        "token_pattern": r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
        "binary": False,
        "max_features": 50000,
    }

    # Only the first two values from process_file are used by this function.
    raw_texts, raw_targets, _unused = process_file(
        "reuters/all_reuters_article_info.csv",
        "reuters/all_reuters_matched_articles_filtered.csv",
    )

    vectorizer = CountVectorizer(**vectorizer_settings)
    count_matrix = vectorizer.fit_transform(raw_texts)
    target_array = np.array(raw_targets)
    return count_matrix, target_array, vectorizer