# Flat script: fetch the citation-group articles, hold out a test split and
# configure the TF-IDF vectorizer used to extract features from the text.
import json

import numpy as np
import pandas as pd
from sklearn import cross_validation, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB

from predictocite.datasets.citation_groups import fetch_citationgroups

# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20 in favour of sklearn.model_selection. The import is kept
# unchanged because other parts of the file may reference `cross_validation`.

articles = fetch_citationgroups()

# STEP 1: Split data
# 25% of the samples are held out; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    articles.data, articles.target, test_size=0.25, random_state=25)

# STEP 2: Extract features from text using TfidfVectorizer
# max_df has to be the float 1.0 (a proportion: keep terms found in up to 100%
# of the documents). The integer 1 is read as an absolute document count and
# would drop every term that occurs in more than one document.
tfidf_vect = TfidfVectorizer(max_df=1.0, stop_words='english',
                             ngram_range=(1, 2), encoding='utf-8',
                             max_features=50000)

# fit_transform learns the vocabulary dictionary and returns the
# term-document matrix.
def setUp(self):
    """Fetch the 'zero_citations' and 'one_to_five_citations' groups.

    The value returned by ``fetch_citationgroups`` is stored on
    ``self.articles``.
    """
    self.articles = fetch_citationgroups(
        ['zero_citations', 'one_to_five_citations'])
def setUp(self):
    """Fetch the 'one_to_five_citations' group and split it.

    Stores the requested group names on ``self.groups`` and the fetched
    result on ``self.articles``, then builds a ``TextPreprocessor`` from
    those articles and calls its ``split_data()``.
    """
    self.groups = ['one_to_five_citations']
    self.articles = fetch_citationgroups(self.groups)

    # NOTE(review): the split result is only bound to a local and is not used
    # anywhere in this method -- confirm whether it should be kept on `self`.
    split_result = TextPreprocessor(self.articles).split_data()
def test_fetch_all_citationgroups(self):
    """Fetching with no group argument must give a truthy result.

    The fetched value is also stored on ``self.all_articles``.
    """
    fetched = fetch_citationgroups()
    self.all_articles = fetched
    self.assertTrue(fetched)