def create_pkz(target_dir, source_dir):
    """Create a .pkz cache file (a zlib-compressed pickle) so the data can be
    used in offline mode.

    Args:
        source_dir: Directory containing the downloaded .tar.gz file.
        target_dir: Directory in which to save the .pkz file.
    """
    DOWNLOADED_NAME = "20news-bydate.tar.gz"
    CACHE_NAME = "20news-bydate_py3.pkz"
    TRAIN_FOLDER = "20news-bydate-train"
    TEST_FOLDER = "20news-bydate-test"

    cache_path = os.path.join(target_dir, CACHE_NAME)
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    tarfile.open(os.path.join(source_dir, DOWNLOADED_NAME),
                 "r:gz").extractall(path=target_dir)

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)
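# The .pkz cache written above can be read back by reversing the two steps:
# decompress with the 'zlib_codec' codec, then unpickle. A minimal sketch; the
# helper name and the example path are illustrative, not from the original code.
import codecs
import os
import pickle


def load_pkz(cache_path):
    # Read the compressed bytes, decompress, then unpickle the train/test dict
    with open(cache_path, 'rb') as f:
        compressed_content = f.read()
    uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
    return pickle.loads(uncompressed_content)

# Example usage (hypothetical target_dir):
# cache = load_pkz(os.path.join(target_dir, "20news-bydate_py3.pkz"))
# print(cache['train'].target_names)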
def download_classic(target_dir, cache_path):
    """Download the 20 newsgroups data and store it as a zipped pickle."""
    archive_path = os.path.join(target_dir, ARCHIVE_NAME)
    # all_path = os.path.join(target_dir, ALL_FOLDER)
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    if os.path.exists(archive_path):
        # Download was not completed, as the .tar.gz file is normally removed
        # after a successful download.
        logger.warning("Download was incomplete, downloading again.")
        os.remove(archive_path)

    logger.warning("Downloading dataset from %s (1.5 MB)", URL)
    opener = urlopen(URL)
    with open(archive_path, 'wb') as f:
        f.write(opener.read())

    logger.info("Decompressing %s", archive_path)
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    os.remove(archive_path)

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    # cache = dict(all=load_files(all_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    shutil.rmtree(target_dir)
    return cache
def get_cache(self, target_path):
    train_path = os.path.join(target_path, self.train_folder)
    test_path = os.path.join(target_path, self.test_folder)

    if not os.path.exists(target_path):
        os.makedirs(target_path)
    if not os.path.exists(train_path):
        os.makedirs(train_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                 test=base.load_files(test_path, encoding='utf-8'))

    # Turn the raw textual representation of each instance into a WixInstance object
    instances = list()
    for instance in cache['train'].data:
        instances.append(WixInstance(instance))
    cache['train'].data = instances

    instances = list()
    for instance in cache['test'].data:
        instances.append(WixInstance(instance))
    cache['test'].data = instances

    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(self.cache_path, 'wb') as f:
        f.write(compressed_content)

    shutil.rmtree(target_path)
    return cache
def make_cache_sample(self, train_path, test_path, cache_path):
    # Store a zipped pickle
    cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                 test=base.load_files(test_path, encoding='utf-8'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)
def set_category(self):
    news_train = base.load_files("news")
    self.categories = [
        'dunya', 'ekonomi', 'kultur-sanat', 'magazin', 'saglik',
        'siyaset', 'spor', 'teknoloji', 'turkiye', 'yasam'
    ]
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, n_iter=5, random_state=42)),
    ])
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3),
    }
    gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
    gs_clf = gs_clf.fit(news_train.data[:1000], news_train.target[:1000])
    self.category = news_train.target_names[gs_clf.predict([self.data])[0]]
def get_cache(self, train_path, test_path, cache_path):
    cache = dict(train=base.load_files(train_path, encoding='utf-8'),
                 test=base.load_files(test_path, encoding='utf-8'))

    # Turn the raw tweet text into Tweet objects.
    train_tweets = list()
    for tweet in cache['train'].data:
        try:
            tweet = tweet.encode('utf-8')
            if tweet:
                train_tweets.append(Tweet(tweet, self._textOnlyTweets))
        except UnicodeEncodeError as unicode_error:
            print(unicode_error)
    cache['train'].data = train_tweets

    test_tweets = list()
    for tweet in cache['test'].data:
        try:
            tweet = tweet.encode('utf-8')
            if tweet:
                test_tweets.append(Tweet(tweet, self._textOnlyTweets))
        except UnicodeEncodeError as unicode_error:
            print(unicode_error)
    cache['test'].data = test_tweets

    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    if not os.path.exists(cache_path.replace(self.cache_name, '')):
        os.makedirs(cache_path.replace(self.cache_name, ''))
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # TODO: When the cache is ready, delete all generated files
    # shutil.rmtree(cache_path)
    return cache
def _download_20newsgroups(target_dir, cache_path):
    """Download the 20 newsgroups data and store it as a zipped pickle."""
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
    archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)
    archive_path = 'C:/Users/Jackie/scikit_learn_data/20news_home/20news-bydate.tar.gz'

    logger.debug("Decompressing %s", archive_path)
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    # os.remove(archive_path)  # do not remove the downloaded archive

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # shutil.rmtree(target_dir)  # do not remove the extracted source files and directories
    return cache
def _load_document_classification(dataset_path, metadata, set_=None, **kwargs):
    if set_ is not None:
        dataset_path = os.path.join(dataset_path, set_)
    return load_files(dataset_path, metadata.get('description'), **kwargs)
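# A hypothetical call of _load_document_classification; the dataset path and
# metadata dict below are made-up examples, not values from the original code.
# bunch = _load_document_classification('/path/to/dataset',
#                                        {'description': 'toy corpus'},
#                                        set_='train', encoding='utf-8')
# When set_ is given, the corresponding sub-folder (e.g. 'train') is loaded.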
import io
import numpy as np

"""
The training data is read into trainetme_train by passing the path or name of
the "news" folder directly; the category data comes from its sub-folders.
data = open("1.txt").read() loads a sample document to classify, and
print(trainetme_train.target_names[gs_clf.predict([data])[0]]) prints the
predicted category.
"""

# categories = ['spor', 'teknoloji']
trainetme_train = base.load_files("news")

'''
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
'''

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, n_iter=5, random_state=42)),
])
if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

"""
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
"""

data_train = base.load_files("news")
data_test = base.load_files("test")
data = data_train


def fetchdata(data, categories):
    labels = [(data.target_names.index(cat), cat) for cat in categories]
    # Sort the categories to have the ordering of the labels
    labels.sort()
    labels, categories = zip(*labels)
    mask = np.in1d(data.target, labels)
    data.filenames = data.filenames[mask]
    data.target = data.target[mask]
    # searchsorted to have continuous labels
    data.target = np.searchsorted(labels, data.target)
    data.target_names = list(categories)
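# A minimal usage sketch for fetchdata, assuming the "news" and "test" folders
# contain sub-folders named 'spor' and 'teknoloji' (these category names are
# borrowed from the commented-out list in an earlier snippet and may not match
# the real folder layout).
selected = ['spor', 'teknoloji']
fetchdata(data_train, selected)
fetchdata(data_test, selected)
# fetchdata filters in place: target labels are remapped to the contiguous
# range 0..len(selected)-1 and target_names is reduced to the selected categories.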