def biomtable_to_dataframe(biom_table_object):
    _bt = biom_table_object
    data = _bt.matrix_data.todense()
    out = pd.SparseDataFrame(data,
                             index=_bt.ids('observation'),
                             columns=_bt.ids('sample'))
    return out.to_dense()
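# A minimal usage sketch for the converter above, assuming the biom-format
# package is installed; the table contents and ids here are made up.
import numpy as np
from biom.table import Table

toy = Table(np.array([[1.0, 0.0], [0.0, 2.0]]),
            observation_ids=['obs1', 'obs2'],   # hypothetical observation ids
            sample_ids=['s1', 's2'])            # hypothetical sample ids
print(biomtable_to_dataframe(toy))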
def process_chunk_week(chunk: pd.SparseDataFrame, chunk_number: int,
                       week: int, min_usage: int):
    """ Asynchronously process the input data which is a certain week of a
    certain chunk. Save the result to a pickle file. """
    print('[%d] Processing week %d' % (week, week))

    # fix the columns
    print('[%d] Removing week level from columns' % week)
    chunk.columns = chunk.columns.droplevel(0)

    # get the dummy columns for hashtags
    print('[%d] Making dummies for hashtags' % week)
    dummies_hashtags: pd.SparseDataFrame = chunk['hashtags'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_hashtags_values = dummies_hashtags.values
    usage = dummies_hashtags_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_hashtags_values[:, usage < min_usage]
    dummies_hashtags = pd.SparseDataFrame(
        dummies_hashtags_values[:, high_usage],
        dummies_hashtags.index,
        dummies_hashtags.columns[high_usage].map(lambda c: 'hashtag_' + c))
    dummies_hashtags['other_hashtags'] = other.sum(1)
    print('[%d] There are %d hashtag columns' % (week, dummies_hashtags.shape[1]))

    # get the dummy columns for mentions
    print('[%d] Making dummies for mentions' % week)
    dummies_mentions: pd.SparseDataFrame = chunk['mentions'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_mentions_values = dummies_mentions.values
    usage = dummies_mentions_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_mentions_values[:, usage < min_usage]
    dummies_mentions = pd.SparseDataFrame(
        dummies_mentions_values[:, high_usage],
        dummies_mentions.index,
        dummies_mentions.columns[high_usage].map(lambda c: 'mention_' + c))
    dummies_mentions['other_mentions'] = other.sum(1)
    print('[%d] There are %d mention columns' % (week, dummies_mentions.shape[1]))

    # get the dummy columns for urls
    print('[%d] Making dummies for urls' % week)
    dummies_urls: pd.SparseDataFrame = chunk['urls'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_urls_values = dummies_urls.values
    usage = dummies_urls_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_urls_values[:, usage < min_usage]
    dummies_urls = pd.SparseDataFrame(
        dummies_urls_values[:, high_usage],
        dummies_urls.index,
        dummies_urls.columns[high_usage].map(lambda c: 'url_' + c))
    dummies_urls['other_urls'] = other.sum(1)
    print('[%d] There are %d url columns' % (week, dummies_urls.shape[1]))

    # concatenate to one big data frame
    print('[%d] Concatenating dummies and copying tweets' % week)
    dummies: pd.SparseDataFrame = pd.concat(
        [dummies_hashtags, dummies_mentions, dummies_urls], axis=1)
    dummies['tweets'] = chunk['tweets']

    # save to a pickle
    print('[%d] Saving' % week)
    dummies.to_pickle('../data/chunks/chunk_%d_week_%d.pkl' % (chunk_number, week))
amgPd_training = amgPd[temp]
amgPd_training.to_csv(pathwrite + 'training_randomData.csv', header=True, index=False)
amgPd_test = amgPd[~temp]
amgPd_test.to_csv(pathwrite + 'test_randomData.csv', header=True, index=False)

### 1. hid Vs Category_list
user_u = list(sorted(amgPd.hid.unique()))
item_u = list(sorted(amgPd.Category_list.unique()))
row = amgPd.hid.astype('category', categories=user_u).cat.codes
col = amgPd.Category_list.astype('category', categories=item_u).cat.codes
data = np.array([1 for k in range(len(amgPd))])
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))
df_tmp1 = pd.SparseDataFrame(
    [pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0)
     for i in np.arange(sparse_matrix.shape[0])],
    index=user_u, columns=item_u, default_fill_value=0)

finCols = ['hid']
len(finCols)
finCols.extend(df_tmp1.columns)
len(finCols)
dfMtrx = np.empty(shape=(df_tmp1.shape[0] + 1, df_tmp1.shape[1] + 1),
                  dtype=np.ndarray)
dfMtrx[:1, :][0] = finCols
dfMtrx[1:, 0] = user_u
dfMtrx[1:, 1:] = df_tmp1.values
print(dfMtrx.shape)
np.savetxt(pathwrite + 'Final_Data_recom1_fin.csv', dfMtrx, delimiter=",", fmt='%s')

### 1.1 hid Vs Category_list training data
def save(self, path=None, complevel=1, complib='zlib'):
    """
    Save the container as an HDF5 archive.

    Args:
        path (str): Path where to save the container
    """
    if path is None:
        path = self.hexuid + '.hdf5'
    elif os.path.isdir(path):
        path += os.sep + self.hexuid + '.hdf5'
    elif not (path.endswith('.hdf5') or path.endswith('.hdf')):
        raise ValueError('File path must have a ".hdf5" or ".hdf" extension.')
    with pd.HDFStore(path, 'w', complevel=complevel, complib=complib) as store:
        store['kwargs'] = pd.Series()
        store.get_storer('kwargs').attrs.metadata = self._rel()
        fc = 0  # Field counter (see special handling of fields below)
        for name, data in self._data().items():
            if hasattr(data, '_revert_categories'):
                data._revert_categories()
            name = name[1:] if name.startswith('_') else name
            if isinstance(data, Field):
                # Fields are handled separately
                fname = 'FIELD{}_'.format(fc) + name + '/'
                store[fname + 'data'] = pd.DataFrame(data)
                for i, field in enumerate(data.field_values):
                    ffname = fname + 'values' + str(i)
                    if isinstance(field, pd.Series):
                        store[ffname] = pd.Series(field)
                    else:
                        store[ffname] = pd.DataFrame(field)
                fc += 1
            elif isinstance(data, Series):
                s = pd.Series(data)
                if isinstance(data.dtype, pd.types.dtypes.CategoricalDtype):
                    s = s.astype('O')
                store[name] = s
            elif isinstance(data, DataFrame):
                store[name] = pd.DataFrame(data)
            elif isinstance(data, SparseSeries):
                s = pd.SparseSeries(data)
                if isinstance(data.dtype, pd.types.dtypes.CategoricalDtype):
                    s = s.astype('O')
                store[name] = s
            elif isinstance(data, SparseDataFrame):
                store[name] = pd.SparseDataFrame(data)
            else:
                if hasattr(data, 'dtype') and isinstance(
                        data.dtype, pd.types.dtypes.CategoricalDtype):
                    data = data.astype('O')
                else:
                    for col in data:
                        if isinstance(data[col].dtype,
                                      pd.types.dtypes.CategoricalDtype):
                            data[col] = data[col].astype('O')
                store[name] = data
            if hasattr(data, '_set_categories'):
                data._set_categories()
def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None):
    """Return the optimal data type given data, gene names and cell names.

    Parameters
    ----------
    data : array-like
    gene_names : `str`, array-like or `None` (default: None)
        Either a filename or an array containing a list of gene symbols or ids.
    cell_names : `str`, array-like or `None` (default: None)
        Either a filename or an array containing a list of cell barcodes.
    sparse : `bool` or `None` (default: None)
        If not `None`, overrides default sparsity of the data.
    """
    if gene_names is None and cell_names is None and \
            not isinstance(data, pd.DataFrame):
        # just a matrix
        if sparse is not None:
            if sparse:
                if not sp.issparse(data):
                    # return scipy.sparse.csr_matrix
                    data = sp.csr_matrix(data)
            elif sp.issparse(data) and not sparse:
                # return numpy.ndarray
                data = data.toarray()
        else:
            # return data as is
            pass
        return data
    else:
        gene_names = _parse_gene_names(gene_names, data)
        cell_names = _parse_cell_names(cell_names, data)
        # dataframe with index and/or columns
        if sparse is None:
            # let the input data decide
            sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data)
        if sparse and gene_names is not None and \
                len(np.unique(gene_names)) < len(gene_names):
            warnings.warn(
                "Duplicate gene names detected! Forcing dense matrix",
                RuntimeWarning)
            sparse = False
        if sparse:
            # return pandas.SparseDataFrame
            if isinstance(data, pd.DataFrame):
                if gene_names is not None:
                    data.columns = gene_names
                if cell_names is not None:
                    data.index = cell_names
                if not isinstance(data, pd.SparseDataFrame):
                    data = data.to_sparse(fill_value=0.0)
            else:
                data = pd.SparseDataFrame(data, default_fill_value=0.0)
                data.index = cell_names
                data.columns = gene_names
        else:
            # return pandas.DataFrame
            if isinstance(data, pd.DataFrame):
                if gene_names is not None:
                    data.columns = gene_names
                if cell_names is not None:
                    data.index = cell_names
                if isinstance(data, pd.SparseDataFrame):
                    data = data.to_dense()
            else:
                if sp.issparse(data):
                    data = data.toarray()
                data = pd.DataFrame(data, index=cell_names, columns=gene_names)
        return data
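# A hedged usage sketch of the helper above. The matrix and names are made up,
# and `_parse_gene_names` / `_parse_cell_names` are assumed to come from the
# same module.
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.eye(3))
genes = ['g1', 'g2', 'g3']   # hypothetical gene symbols
cells = ['c1', 'c2', 'c3']   # hypothetical cell barcodes

sdf = _matrix_to_data_frame(X, gene_names=genes, cell_names=cells)  # sparse in -> SparseDataFrame out
ddf = _matrix_to_data_frame(X, gene_names=genes, cell_names=cells,
                            sparse=False)                           # force a dense DataFrame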
def add_text_features(data, strings, k=5, keep=True):

    ##### PROCESSING LOOP
    for var in strings:

        ### TEXT PREPROCESSING

        # replace NaN with empty string
        data[var][pd.isnull(data[var])] = ''

        # remove common words
        freq = pd.Series(' '.join(data[var]).split()).value_counts()[:10]
        #freq = list(freq.index)
        #data[var] = data[var].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
        #data[var].head()

        # remove rare words
        freq = pd.Series(' '.join(data[var]).split()).value_counts()[-10:]
        #freq = list(freq.index)
        #data[var] = data[var].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
        #data[var].head()

        # convert to lowercase
        data[var] = data[var].apply(lambda x: " ".join(x.lower() for x in x.split()))

        # remove punctuation
        data[var] = data[var].str.replace('[^\w\s]', '')

        ### COMPUTE BASIC FEATURES

        # word count
        data[var + '_word_count'] = data[var].apply(lambda x: len(str(x).split(" ")))
        data[var + '_word_count'][data[var] == ''] = 0

        # character count
        data[var + '_char_count'] = data[var].str.len().fillna(0).astype('int64')

        ##### COMPUTE TF-IDF FEATURES

        # import vectorizer
        tfidf = TfidfVectorizer(max_features=k,
                                lowercase=True,
                                norm='l2',
                                analyzer='word',
                                stop_words='english',
                                ngram_range=(1, 1))

        # compute TF-IDF
        vals = tfidf.fit_transform(data[var])
        vals = pd.SparseDataFrame(vals)
        vals.columns = [var + '_tfidf_' + str(p) for p in vals.columns]
        data = pd.concat([data, vals], axis=1)

        ### CORRECTIONS

        # remove raw text
        if keep == False:
            del data[var]

        # print dimensions
        #print(data.shape)

    # return data
    return data
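# A small, hedged usage sketch of add_text_features; the DataFrame and column
# name are invented, and TfidfVectorizer is assumed to be imported from
# scikit-learn as in the function body.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

toy = pd.DataFrame({'review': ['Great product, loved it!',
                               'Terrible. Broke after a day.',
                               None]})
toy = add_text_features(toy, strings=['review'], k=3)
print([c for c in toy.columns if c.startswith('review_')])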
def test_create_dataset_pandas(self):
    data = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
    ]
    column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                    'windy', 'play']
    df = pd.DataFrame(data, columns=column_names)
    # enforce the type of each column
    df['outlook'] = df['outlook'].astype('category')
    df['windy'] = df['windy'].astype('bool')
    df['play'] = df['play'].astype('category')
    # meta-information
    name = '%s-pandas_testing_dataset' % self._get_sentinel()
    description = 'Synthetic dataset created from a Pandas DataFrame'
    creator = 'OpenML tester'
    collection_date = '01-01-2018'
    language = 'English'
    licence = 'MIT'
    default_target_attribute = 'play'
    citation = 'None'
    original_data_url = 'http://openml.github.io/openml-python'
    paper_url = 'http://openml.github.io/openml-python'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )

    # Check that SparseDataFrame are supported properly
    sparse_data = scipy.sparse.coo_matrix((
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
    ))
    column_names = ['input1', 'input2', 'y']
    df = pd.SparseDataFrame(sparse_data, columns=column_names)
    # meta-information
    description = 'Synthetic dataset created from a Pandas SparseDataFrame'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'sparse_arff',
        "Wrong format for dataset"
    )

    # Check that we can overwrite the attributes
    data = [['a'], ['b'], ['c'], ['d'], ['e']]
    column_names = ['rnd_str']
    df = pd.DataFrame(data, columns=column_names)
    df['rnd_str'] = df['rnd_str'].astype('category')
    attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes=attributes,
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    downloaded_data = _get_online_dataset_arff(upload_did)
    self.assertEqual(
        downloaded_data,
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    self.assertTrue(
        '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}' in downloaded_data)
def SingleLearningThread(folderName, rawEmails_dtm, summarySentenceList,
                         cAmount, gammaAmount):
    # Remove the CleanText as we only want that when printing out the info at the end
    rawEmails = rawEmails_dtm[[
        'TopOneSentence', 'TopTwoSentence', 'TopThreeSentence',
        'TopFourSentence', 'TopFiveSentence', 'SentenceLengthBeforeStop',
        'CosineSimilarity'
    ]].astype(float)

    accuracy_Array = []
    # initialize folds
    kf = KFold(n_splits=3, shuffle=True, random_state=7)
    splitCounter = 1
    # The internet told me to split it like this
    for train_index, test_index in kf.split(rawEmails, rawEmails):
        # Create a new model for each test set and then put its accuracy in an array
        clf = svm.SVC(C=cAmount, cache_size=5000, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3,
                      gamma=gammaAmount, kernel='rbf', max_iter=-1,
                      probability=False, shrinking=True, tol=.001,
                      verbose=False)
        # fit and transform training into vector matrix
        clf.fit(rawEmails.iloc[train_index].values,
                summarySentenceList[train_index])
        category_prediction_test = clf.predict(rawEmails.iloc[test_index].values)
        accuracy_Array.append(
            metrics.f1_score(summarySentenceList[test_index],
                             category_prediction_test))

        outputCSVDataframe = pd.concat([
            pd.SparseDataFrame(rawEmails_dtm.iloc[test_index]).reset_index(drop=True),
            pd.DataFrame(list(summarySentenceList[test_index].astype(int)),
                         columns=['Actual']).reset_index(drop=True),
            pd.DataFrame(category_prediction_test.astype(int),
                         columns=['Predicted'])
        ], axis=1)
        outputCSVDataframe.to_csv('Output/SVM/UK_' + folderName +
                                  str(splitCounter) + '.csv',
                                  encoding='utf-8', index=False)
        splitCounter += 1

        if (metrics.f1_score(summarySentenceList[test_index],
                             category_prediction_test) > 0):
            statsLock.acquire()
            if isRunPerThread:
                global foldersName
                statsArray.append({
                    'Learning_Type': foldersName,
                    'cAmount': cAmount,
                    'gammaAmount': gammaAmount,
                    'F1_Score': metrics.f1_score(summarySentenceList[test_index],
                                                 category_prediction_test),
                    'Confusion_Matrix': metrics.confusion_matrix(
                        summarySentenceList[test_index],
                        category_prediction_test)
                })
            else:
                statsArray.append({
                    'Learning_Type': 'raw_SVM',
                    'cAmount': cAmount,
                    'gammaAmount': gammaAmount,
                    'F1_Score': metrics.f1_score(summarySentenceList[test_index],
                                                 category_prediction_test),
                    'Confusion_Matrix': metrics.confusion_matrix(
                        summarySentenceList[test_index],
                        category_prediction_test)
                })
            statsLock.release()
user_movie_ratings = pd.pivot_table(ratings_title, index='userId',
                                    columns='title', values='rating')
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)

# To have sklearn run k-means clustering on a dataset with missing values like
# this, we will first cast it to the [sparse csr matrix](https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.sparse.csr_matrix.html)
# type defined in the SciPy library.
#
# To convert from a pandas dataframe to a sparse matrix, we'll have to convert
# to SparseDataFrame, then use pandas' `to_coo()` method for the conversion.
#
# Note: `to_coo()` was only added in later versions of pandas. If you run into
# an error with the next cell, make sure pandas is up to date.

# In[51]:

sparse_ratings = csr_matrix(pd.SparseDataFrame(most_rated_movies_1k).to_coo())

# ## Let's cluster!
# With k-means, we have to specify k, the number of clusters. Let's
# arbitrarily try k=20. (A better way to pick k is the elbow method, as
# illustrated above; that would take some processing time to run, however.)

# In[52]:

# 20 clusters
predictions = KMeans(n_clusters=20, algorithm='full').fit_predict(sparse_ratings)

# To visualize some of these clusters, we'll plot each cluster as a heat map:

# In[53]:

max_users = 70
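# A minimal sketch of the elbow method mentioned above for picking k, assuming
# `sparse_ratings` from the cell above; the k range here is arbitrary. Fit
# KMeans for each candidate k and look for the bend in the inertia curve.
import matplotlib.pyplot as plt

possible_k = range(2, 31, 2)
inertias = [KMeans(n_clusters=k, algorithm='full').fit(sparse_ratings).inertia_
            for k in possible_k]
plt.plot(list(possible_k), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()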
print('Loading dataset...')
train = pd.read_csv("/Users/yumatakenaka/Data/ratings_sample.csv")
print('Finished')

# Randomly sample from train
# train = train.sample(frac=0.01)

# Turn the id columns into categorical dtypes
userId_categorical = pd.api.types.CategoricalDtype(
    categories=sorted(train.userId.unique()), ordered=True)
movieId_categorical = pd.api.types.CategoricalDtype(
    categories=sorted(train.movieId.unique()), ordered=True)

# Create new columns using the categorical dtype instances
row = train.userId.astype(userId_categorical).cat.codes
col = train.movieId.astype(movieId_categorical).cat.codes

# Fill the matrix with the rating values
sparse_matrix = csr_matrix((train["rating"], (row, col)),
                           shape=(userId_categorical.categories.size,
                                  movieId_categorical.categories.size))

# Turn the sparse matrix into a DataFrame
train_pivot = pd.SparseDataFrame(sparse_matrix,
                                 index=userId_categorical.categories,
                                 columns=movieId_categorical.categories,
                                 default_fill_value=0,
                                 dtype='int')

# %%
# Set the important arguments such as n_neighbors, algorithm and metric
knn = NearestNeighbors(n_neighbors=9, algorithm='brute', metric='cosine')
# Train the model on the preprocessed dataset
model_knn = knn.fit(train_pivot)

# %%
def movie_prediction(movie):
    distance, indice = model_knn.kneighbors(
        train_pivot.iloc[train_pivot.index == movie].values.reshape(1, -1),
        n_neighbors=11)
    for i in range(0, len(distance.flatten())):
        if i == 0:
            print('Recommendations if you like the movie {0}:\n'.format(
                train_pivot[train_pivot.index == movie].index[0]))
        else:
            print('{0}: {1} with distance: {2}'.format(
                i, train_pivot.index[indice.flatten()[i]],
                distance.flatten()[i]))
def LoopThroughDocuments(filePath, folderName):
    fileNames = os.listdir(filePath)
    dataframe = pd.DataFrame(columns=[
        'RawFileName', 'FileName', 'CleanText', 'CleanTextNoPunc',
        'FirstSentence', 'SecondSentence', 'ThirdSentence', 'FourthSentence',
        'FifthSentence', 'TopOneSentence', 'TopTwoSentence',
        'TopThreeSentence', 'TopFourSentence', 'TopFiveSentence',
        'SentenceLengthBeforeStop', 'CosineSimilarity'
    ])
    dataframeNoStop = pd.DataFrame(columns=[
        'RawFileName', 'FileName', 'CleanText', 'CleanTextNoPunc',
        'FirstSentence', 'SecondSentence', 'ThirdSentence', 'FourthSentence',
        'FifthSentence', 'TopOneSentence', 'TopTwoSentence',
        'TopThreeSentence', 'TopFourSentence', 'TopFiveSentence',
        'SentenceLengthBeforeStop', 'CosineSimilarity'
    ])

    # Don't worry about reading files in if there is no summary atm
    if 'summary.txt' not in fileNames:
        return dataframe, dataframeNoStop

    queryTFIDF.append(folderName)

    # used for index creation while adding into a new dataframe
    counter = 0

    # loop through all the files in the folder
    for fileName in fileNames:
        f = open(os.path.join(os.path.abspath(filePath), fileName), 'r',
                 encoding='ISO-8859-1')
        # Read file and make copy of the file
        rawText = f.read().lower()
        f.close()
        RawTextNoStopWords = rawText + ' '  # this makes it a deep copy

        # Remove stop words
        for stopword in stop_words.ENGLISH_STOP_WORDS:
            RawTextNoStopWords = re.sub(r'\b' + stopword.lower() + r'\b', '',
                                        RawTextNoStopWords)

        # Split strings, remove sentences without a space in them, strip ends,
        # remove newline characters. Split on "." plus any whitespace, on ?!;,
        # on ".*" plus two or more dashes, or on ". word whitespace" / ",\n".
        rawText = re.split(r'\.\s+|[?!;]|\.*\-{2,}|\.\w\s|,\n+\s*', rawText)
        RawTextNoStopWords = re.split(r'\.\s+|[?!;]|\.*\-{2,}|\.\w\s|,\n+\s*',
                                      RawTextNoStopWords)
        rawText = [string.strip() for string in rawText if ' ' in string]
        RawTextNoStopWords = [
            string.strip() for string in RawTextNoStopWords if ' ' in string
        ]
        rawText = [re.sub('[\n]', r'', string) for string in rawText]
        RawTextNoStopWords = [
            re.sub('[\n]', r'', string) for string in RawTextNoStopWords
        ]

        # #####################################################################
        # Get Word Count Of Sentence
        # #####################################################################
        RawSentenceLength = []
        CleanSentenceLength = []
        for sentenceCount in range(0, len(rawText)):
            RawSentenceLength.append(len(rawText[sentenceCount].split()))
        for sentenceCount in range(0, len(RawTextNoStopWords)):
            CleanSentenceLength.append(
                len(RawTextNoStopWords[sentenceCount].split()))

        # If there are no valid sentences in email, continue loop
        if not RawSentenceLength:
            continue

        maxVal = max(RawSentenceLength)
        normalized_RawSentenceLength = [
            x / float(maxVal) for x in RawSentenceLength
        ]
        maxVal = max(CleanSentenceLength)
        normalized_CleanSentenceLength = [
            x / float(maxVal) for x in CleanSentenceLength
        ]

        # #####################################################################
        # Get sentence relative position in the document
        # #####################################################################
        isFirstRaw = np.zeros(len(rawText))
        isSecondRaw = np.zeros(len(rawText))
        isThirdRaw = np.zeros(len(rawText))
        isFourthRaw = np.zeros(len(rawText))
        isFifthRaw = np.zeros(len(rawText))
        isFirstNoStop = np.zeros(len(RawTextNoStopWords))
        isSecondNoStop = np.zeros(len(RawTextNoStopWords))
        isThirdNoStop = np.zeros(len(RawTextNoStopWords))
        isFourthNoStop = np.zeros(len(RawTextNoStopWords))
        isFifthNoStop = np.zeros(len(RawTextNoStopWords))
        RawTopTwoSentence = np.zeros(len(rawText))
        RawTopThreeSentence = np.zeros(len(rawText))
        RawTopFourSentence = np.zeros(len(rawText))
        RawTopFiveSentence = np.zeros(len(rawText))
        CleanTopTwoSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopThreeSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopFourSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopFiveSentence = np.zeros(len(RawTextNoStopWords))

        # Set up sentence locality count over the first five sentences (or all
        # of them when the document is shorter than five sentences)
        for count in range(min(len(rawText), 5)):
            if count == 0:
                isFirstRaw[count] = 1
                RawTopTwoSentence[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 1:
                isSecondRaw[count] = 1
                RawTopTwoSentence[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 2:
                isThirdRaw[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 3:
                isFourthRaw[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            else:
                isFifthRaw[count] = 1
                RawTopFiveSentence[count] = 1
        for count in range(min(len(RawTextNoStopWords), 5)):
            if count == 0:
                isFirstNoStop[count] = 1
                CleanTopTwoSentence[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 1:
                isSecondNoStop[count] = 1
                CleanTopTwoSentence[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 2:
                isThirdNoStop[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 3:
                isFourthNoStop[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            else:
                isFifthNoStop[count] = 1
                CleanTopFiveSentence[count] = 1

        # Assign bit that states whether the sentence is the first, second,
        # ..., fifth (We'll see if this makes a difference)

        # Assigns the summary into the dataframe
        if fileName == 'summary.txt':
            # Create dataframe and concat it to what exists (if something exists)
            # Add all sentences into dataframe
            textObject = {
                'RawFileName': folderName,
                'FileName': folderName + '__summary',
                'CleanText': rawText,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstRaw,
                'SecondSentence': isSecondRaw,
                'ThirdSentence': isThirdRaw,
                'FourthSentence': isFourthRaw,
                'FifthSentence': isFifthRaw,
                'TopOneSentence': isFirstRaw,
                'TopTwoSentence': RawTopTwoSentence,
                'TopThreeSentence': RawTopThreeSentence,
                'TopFourSentence': RawTopFourSentence,
                'TopFiveSentence': RawTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_RawSentenceLength,
                'CosineSimilarity': 0
            }
            textObjectNoStopWords = {
                'RawFileName': folderName,
                'FileName': folderName + '__summary',
                'CleanText': RawTextNoStopWords,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstNoStop,
                'SecondSentence': isSecondNoStop,
                'ThirdSentence': isThirdNoStop,
                'FourthSentence': isFourthNoStop,
                'FifthSentence': isFifthNoStop,
                'TopOneSentence': isFirstNoStop,
                'TopTwoSentence': CleanTopTwoSentence,
                'TopThreeSentence': CleanTopThreeSentence,
                'TopFourSentence': CleanTopFourSentence,
                'TopFiveSentence': CleanTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_CleanSentenceLength,
                'CosineSimilarity': 0
            }
        # Checks to see if the text file is a number and if it is read it into
        # the main dataframe
        elif fileName.split('.')[0].isnumeric():
            # Create dataframe and concat it to what exists (if something exists)
            # Add all sentences into dataframe
            # if rawtext is 0 for some reason replace with empty strings
            textObject = {
                'RawFileName': folderName,
                'FileName': folderName + '__' + str(counter),
                'CleanText': rawText,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstRaw,
                'SecondSentence': isSecondRaw,
                'ThirdSentence': isThirdRaw,
                'FourthSentence': isFourthRaw,
                'FifthSentence': isFifthRaw,
                'TopOneSentence': isFirstRaw,
                'TopTwoSentence': RawTopTwoSentence,
                'TopThreeSentence': RawTopThreeSentence,
                'TopFourSentence': RawTopFourSentence,
                'TopFiveSentence': RawTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_RawSentenceLength,
                'CosineSimilarity': 0
            }
            textObjectNoStopWords = {
                'RawFileName': folderName,
                'FileName': folderName + '__' + str(counter),
                'CleanText': RawTextNoStopWords,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstNoStop,
                'SecondSentence': isSecondNoStop,
                'ThirdSentence': isThirdNoStop,
                'FourthSentence': isFourthNoStop,
                'FifthSentence': isFifthNoStop,
                'TopOneSentence': isFirstNoStop,
                'TopTwoSentence': CleanTopTwoSentence,
                'TopThreeSentence': CleanTopThreeSentence,
                'TopFourSentence': CleanTopFourSentence,
                'TopFiveSentence': CleanTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_CleanSentenceLength,
                'CosineSimilarity': 0
            }
            counter += 1

        if dataframeNoStop.empty:
            dataframeNoStop = pd.DataFrame.from_dict(textObjectNoStopWords)
        else:
            dataframeNoStop = pd.concat([
                dataframeNoStop,
                pd.DataFrame.from_dict(textObjectNoStopWords)
            ], ignore_index=True, sort=False)
        if dataframe.empty:
            dataframe = pd.DataFrame.from_dict(textObject)
        else:
            dataframe = pd.concat(
                [dataframe, pd.DataFrame.from_dict(textObject)],
                ignore_index=True, sort=False)

    # Remove punctuation from all sentences
    dataframeReset = dataframe.reset_index(drop=False)
    exclude = set(strng.punctuation)
    for index, row in dataframeReset.iterrows():
        sentence = ''.join(ch for ch in row['CleanText'] if ch not in exclude)
        dataframeReset.loc[index, 'CleanTextNoPunc'] = sentence
    dataframeNoStopReset = dataframeNoStop.reset_index(drop=False)
    for index, row in dataframeNoStopReset.iterrows():
        sentence = ''.join(ch for ch in row['CleanText'] if ch not in exclude)
        dataframeNoStopReset.loc[index, 'CleanTextNoPunc'] = sentence

    # ########################################################################
    # Runs NaiveBayes and SVM per thread to see what performs best
    # ########################################################################
    if isRunPerThread:
        global foldersName
        foldersName = folderName

        isSummarySentence = np.zeros(len(dataframeReset))
        goodSentences = dataframeReset[
            dataframeReset['FileName'].str.contains('summary')]
        summarySentenceList = dataframeReset['CleanText'].isin(
            goodSentences['CleanText'])

        # #######
        # Set up Summary Sentence Array
        # #######
        vect = TfidfVectorizer(ngram_range=(1, 2))
        vect.fit(dataframeReset['CleanText'])
        rawVector = vect.transform(
            np.array([re.sub('_', r' ', folderName.lower())]))
        tfidfDataFrame = vect.fit_transform(dataframeReset['CleanText'])

        # Raw Emails
        for index, row in dataframeReset.iterrows():
            cosineSim = metrics.pairwise.cosine_similarity(
                tfidfDataFrame[index], rawVector)[0][0]
            if cosineSim != 0:
                dataframeReset.loc[index, 'CosineSimilarity'] = cosineSim
        maxVal = max(dataframeReset['CosineSimilarity'])
        if maxVal != 0:
            for index, row in dataframeReset.iterrows():
                dataframeReset.loc[index, 'CosineSimilarity'] = row[
                    'CosineSimilarity'] / float(maxVal)

        # Create and assign the start of the return array the answer for the
        # first accuracy score
        accuracy_Array = []

        # initialize folds
        kf = KFold(n_splits=3, shuffle=True, random_state=7)
        splitCounter = 1
        # The internet told me to split it like this
        for train_index, test_index in kf.split(dataframeReset, dataframeReset):
            # Create a new naive_bayes model for each test set and then put
            # its accuracy in an array
            nb = MultinomialNB()
            vect = TfidfVectorizer(ngram_range=(1, 2))
            # fit and transform training into vector matrix
            emails_train_dtm = vect.fit_transform(
                dataframeReset['CleanTextNoPunc'].iloc[train_index].values)
            emails_test_dtm = vect.transform(
                dataframeReset['CleanTextNoPunc'].iloc[test_index].values)
            # Fit and then compare the predictions
            nb.fit(emails_train_dtm, summarySentenceList[train_index])
            category_prediction_test = nb.predict(emails_test_dtm)
            accuracy_Array.append(
                metrics.f1_score(summarySentenceList[test_index],
                                 category_prediction_test))
            outputCSVDataframe = pd.concat([
                pd.SparseDataFrame(emails_test_dtm).reset_index(drop=True),
                pd.DataFrame(list(summarySentenceList[test_index].astype(int)),
                             columns=['Actual']).reset_index(drop=True),
                pd.DataFrame(category_prediction_test.astype(int),
                             columns=['Predicted'])
            ], axis=1)
            outputCSVDataframe.to_csv('Output/TFIDF/UK_' + folderName +
                                      str(splitCounter) + '.csv',
                                      encoding='utf-8', index=False)
            splitCounter += 1

        if len(accuracy_Array) > 0:
            naiveStatsArray.append({
                'ThreadName': folderName,
                'F1_Score': sum(accuracy_Array) / float(len(accuracy_Array)),
                'Confusion_Matrix': metrics.confusion_matrix(
                    summarySentenceList[test_index], category_prediction_test)
            })

        # ##########
        # SVM
        # ##########
        threads = []
        for cAmount in np.linspace(1, 15, 15):
            for gammaAmount in np.linspace(.01, .1, 10):
                threads.append(
                    Thread(target=SingleLearningThread,
                           args=(folderName,
                                 dataframeReset[[
                                     'CleanTextNoPunc', 'TopOneSentence',
                                     'TopTwoSentence', 'TopThreeSentence',
                                     'TopFourSentence', 'TopFiveSentence',
                                     'SentenceLengthBeforeStop',
                                     'CosineSimilarity'
                                 ]], summarySentenceList, cAmount,
                                 gammaAmount)))
                threads[-1].start()

    return dataframeReset, dataframeNoStopReset
def transform(self, X):
    return pd.SparseDataFrame(
        data=super().transform(X),
        columns=self.column_names_,
        index=X.index if isinstance(X, pd.DataFrame) else None,
        default_fill_value=0)
def geodesic_matrix(x, tn_ids=None, directed=False, weight='weight'):
    """ Generates geodesic ("along-the-arbor") distance matrix for treenodes
    of given neuron.

    Parameters
    ----------
    x :         CatmaidNeuron | CatmaidNeuronList
                If list, must contain a SINGLE neuron.
    tn_ids :    list | numpy.ndarray, optional
                Treenode IDs. If provided, will compute distances only FROM
                this subset to all other nodes.
    directed :  bool, optional
                If True, pairs without a child->parent path will be returned
                with ``distance = "inf"``.
    weight :    'weight' | None, optional
                If 'weight', distances are given as physical length.
                If ``None``, distances are given as number of nodes.

    Returns
    -------
    pd.SparseDataFrame
                Geodesic distance matrix. Distances in nanometres.

    See Also
    --------
    :func:`~pymaid.distal_to`
        Check if a node A is distal to node B.
    :func:`~pymaid.dist_between`
        Get point-to-point geodesic distances.
    """
    if isinstance(x, core.CatmaidNeuronList):
        if len(x) == 1:
            x = x[0]
        else:
            raise ValueError('Cannot process more than a single neuron.')
    elif isinstance(x, core.CatmaidNeuron):
        pass
    else:
        raise ValueError('Unable to process data of type "{0}"'.format(type(x)))

    if x.igraph and config.use_igraph:
        nodeList = x.igraph.vs.get_attribute_values('node_id')
        # Matrix is ordered by vertex number
        m = _igraph_to_sparse(x.igraph, weight_attr=weight)
    else:
        nodeList = tuple(x.graph.nodes())
        m = nx.to_scipy_sparse_matrix(x.graph, nodeList, weight=weight)

    if not isinstance(tn_ids, type(None)):
        tn_ids = set(utils._make_iterable(tn_ids))
        tn_indices = tuple(i for i, node in enumerate(nodeList)
                           if node in tn_ids)
        ix = [nodeList[i] for i in tn_indices]
    else:
        tn_indices = None
        ix = nodeList

    dmat = csgraph.dijkstra(m, directed=directed, indices=tn_indices)

    return pd.SparseDataFrame(dmat, columns=nodeList, index=ix,
                              default_fill_value=float('inf'))
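# A hedged usage sketch of geodesic_matrix; it assumes a CATMAID connection is
# already set up and that the skeleton ID (16 here) exists on the server --
# both are placeholders, not values from the original code.
import pymaid

# rm = pymaid.CatmaidInstance(server_url, api_token)  # connection assumed to exist
n = pymaid.get_neuron(16)                             # hypothetical skeleton ID
dmat = geodesic_matrix(n)                             # node-by-node distances in nm
sub = geodesic_matrix(n, tn_ids=dmat.index[:10])      # distances FROM a node subset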
def postPredictions_KMeans(input_cluster_algorithm, input_cluster_num, rating_percent):
    # Store the KMeans results in the DB
    print("postKMeansResult started")
    if not input_cluster_num or input_cluster_num > 20:
        input_cluster_num = 20

    movie_queryset = Movie.objects.all()
    rating_limit = int(len(Rating.objects.all()) * (rating_percent / 100))
    rating_queryset = Rating.objects.all()[:rating_limit]

    movies = to_df(movie_queryset)
    ratings = to_df(rating_queryset)
    movies = movies.rename(columns={'id': 'movieId'})
    ratings = ratings.rename(columns={'userid': 'userId', 'movieid': 'movieId'})
    print('The dataset contains: ', len(ratings), ' ratings of ',
          len(movies), ' movies.')

    #####
    # Merge the two tables then pivot so we have a Users X Movies dataframe
    ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')
    user_movie_ratings = pd.pivot_table(ratings_title, index='userId',
                                        columns='title', values='rating')
    most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)
    sparse_ratings = csr_matrix(pd.SparseDataFrame(most_rated_movies_1k).to_coo())
    # sparse_ratings = csr_matrix(pd.SparseDataFrame(user_movie_ratings).to_coo())
    # print(sparse_ratings)

    if not input_cluster_algorithm:
        input_cluster_algorithm = 'Kmeans'

    # 20 clusters
    if input_cluster_algorithm == 'Kmeans':
        predictions = KMeans(n_clusters=input_cluster_num,
                             algorithm='full').fit_predict(sparse_ratings)
    if input_cluster_algorithm == 'EM':
        predictions = GaussianMixture(
            n_components=input_cluster_num).fit_predict(sparse_ratings.toarray())
    if input_cluster_algorithm == 'Hierarchical':
        predictions = AgglomerativeClustering(
            n_clusters=input_cluster_num, affinity='euclidean',
            linkage='ward').fit_predict(sparse_ratings.toarray())
    if input_cluster_algorithm == 'Kmeans_self':
        # For KMeans_self, turn the data back into list form
        dense_ratings = sparse_ratings.todense()
        # print(dok_matrix(a))
        matrix_ratings = dok_matrix(dense_ratings)
        rowcol_ratings = list(matrix_ratings.keys())
        value_ratings = list(matrix_ratings.values())
        km_self_ratings = []
        for i in range(len(rowcol_ratings)):
            km_self_ratings.append(
                [rowcol_ratings[i][0], rowcol_ratings[i][1], value_ratings[i]])
        clus = KMeans_algo(20)
        predictions = clus.train(km_self_ratings)
    if input_cluster_algorithm == 'KNN':
        predictions = NearestNeighbors(n_neighbors=20,
                                       algorithm='auto').fit(sparse_ratings)
        print("pre: ", predictions)
    # predictions = AgglomerativeClustering(n_clusters=20, affinity='euclidean', linkage='ward').fit_predict(sparse_ratings)

    print("predictions")
    print(predictions)
    print('predictions len ', len(predictions))
    # At this point the predictions can be saved to the DB.
    return predictions
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# In[2]:

fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]

# In[3]:

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_,
                               default_fill_value=False)
sparse_df

# In[4]:

frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5

# In[5]:

frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1

# In[6]:
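# The association_rules import above is never used in the cells shown; a
# minimal follow-up sketch (the metric and threshold are arbitrary choices,
# not from the original notebook):
rules = association_rules(frequent_itemsets1, metric="confidence",
                          min_threshold=0.6)
rules.sort_values("lift", ascending=False).head()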
prevmov = data[0][1]
#print "===="
for i in data:
    if i[1] == prevmov:
        ud[i[0]] = float(i[2])
    else:
        md[prevmov] = dict(ud)
        ud.clear()
        prevmov = i[1]
        ud[i[0]] = float(i[2])
md[prevmov] = dict(ud)
#print "===="

df = pd.SparseDataFrame(md)
centarr = pd.SparseDataFrame(df - df.mean()).fillna(0)
simmat = pd.SparseDataFrame(np.dot(centarr.T, centarr))
n = np.linalg.norm(centarr, axis=0)
nt = np.linalg.norm(centarr.T, axis=1)
simmat = pd.DataFrame(simmat / n)
simmat = pd.DataFrame(simmat.T / nt)
kys = sorted(md.keys())  # dict.keys() is not sortable in-place on Python 3
simmat.columns = kys
simmat.index = kys
    strings = strings.lower()
    strings = strings.split()
    ps = PorterStemmer()
    strings = [
        ps.stem(word) for word in strings
        if not word in set(stopwords.words('english'))
    ]
    strings = ' '.join(strings)
    spam_corpus.append(strings)

# ham vector
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=20, analyzer='word')
cv_addr = cv.fit_transform(ham_corpus)
ham_vector = pd.SparseDataFrame(cv_addr,
                                columns=cv.get_feature_names(),
                                default_fill_value=0)
tmp = []
for col in cv.get_feature_names():
    tmp.append([col, sum(ham_vector[col])])
ham_gdf = pd.DataFrame(tmp, columns=['word', 'frequency']).sort_values(
    by=['frequency'], ascending=False).reset_index()
del ham_gdf['index']

# spam vector
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=20, analyzer='word')
cv_addr = cv.fit_transform(spam_corpus)
def head(self, n=1):
    """Display head of the sparse frame."""
    n = min(n, len(self._index))
    return pd.SparseDataFrame(self.data[:n, :].todense(),
                              index=self.index[:n],
                              columns=self.columns)
def test_large_sparse_dataframe_library_size():
    X = pd.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)),
                           default_fill_value=0.0)
    cell_sums = scprep.measure.library_size(X)
    assert cell_sums.shape[0] == X.shape[0]
classificacao = comentarios["sentiment"].replace(["neg", "pos"], [0, 1])
comentarios["classificacao"] = classificacao
comentarios.head()

"""Using CountVectorizer to create a bag of words: it splits out every word
found in each sentence."""

from sklearn.feature_extraction.text import CountVectorizer

# Example
textos = ["Assisti um filme ótimo", "Assisti um filme péssimo"]

vetorizar = CountVectorizer(lowercase=False)
bag_of_words = vetorizar.fit_transform(textos)
vetorizar.get_feature_names()

"""Creating a table from this test data to understand how it works. Instead
of storing the 0 values and taking up memory, it stores NaN, which is a null
value."""

vetorizado = pd.SparseDataFrame(bag_of_words,
                                columns=vetorizar.get_feature_names())
vetorizado

"""Starting the sentiment classification based on the imported spreadsheet."""

vetorizar = CountVectorizer(lowercase=False, max_features=50)
bag_of_words = vetorizar.fit_transform(comentarios["text_pt"])
print(bag_of_words.shape)

"""Splitting the data into train and test with sklearn.model_selection and
printing the resulting split sizes."""

from sklearn.model_selection import train_test_split

treino, teste, classe_treino, classe_teste = train_test_split(
    bag_of_words, comentarios["classificacao"], random_state=42)
print('Train: {treino}, Test: {teste}'.format(treino=treino.shape,
                                              teste=teste.shape))

"""Creating a linear model with logistic regression."""
def library_size_normalize(data, rescale='median'):
    """Performs L1 normalization on input data

    Performs L1 normalization on input data such that the sum of expression
    values for each cell sums to 1, then returns the normalized matrix to the
    metric space using the median UMI count per cell, effectively scaling all
    cells as if they were sampled evenly.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    rescale : {'mean', 'median'}, float or `None`, optional (default: 'median')
        Rescaling strategy. If 'mean' or 'median', normalized cells are
        scaled back up to the mean or median expression value. If a float,
        normalized cells are scaled up to the given value. If `None`, no
        rescaling is done and all cells will have normalized library size of 1.

    Returns
    -------
    data_norm : array-like, shape=[n_samples, n_features]
        Library size normalized output data
    """
    # pandas support
    columns, index = None, None
    if isinstance(data, pd.SparseDataFrame) or \
            pd.api.types.is_sparse(data):
        columns, index = data.columns, data.index
        data = data.to_coo()
    elif isinstance(data, pd.DataFrame):
        columns, index = data.columns, data.index

    if rescale == 'median':
        rescale = np.median(np.array(measure.library_size(data)))
        if rescale == 0:
            warnings.warn("Median library size is zero. "
                          "Rescaling to mean instead.", UserWarning)
            rescale = np.mean(np.array(measure.library_size(data)))
    elif rescale == 'mean':
        rescale = np.mean(np.array(measure.library_size(data)))
    elif isinstance(rescale, numbers.Number):
        pass
    elif rescale is None:
        rescale = 1
    else:
        raise ValueError("Expected rescale in ['median', 'mean'], a number "
                         "or `None`. Got {}".format(rescale))

    if sparse.issparse(data) and data.nnz >= 2**31:
        # check we can access elements by index
        try:
            data[0, 0]
        except TypeError:
            data = sparse.csr_matrix(data)
        # normalize in chunks - sklearn can't handle more than 2**31
        # non-zero elements
        #
        # determine maximum chunk size
        split = 2**30 // (data.nnz // data.shape[0])
        size_ok = False
        while not size_ok:
            for i in range(0, data.shape[0], split):
                if data[i:i + split, :].nnz >= 2**31:
                    split = split // 2
                    break
            else:
                size_ok = True
        # normalize
        data_norm = []
        for i in range(0, data.shape[0], split):
            data_norm.append(normalize(data[i:i + split, :], 'l1', axis=1))
        # combine chunks
        data_norm = sparse.vstack(data_norm)
    else:
        data_norm = normalize(data, norm='l1', axis=1)
        # norm='l1' computes the L1 norm of each sample;
        # axis=1 normalizes each sample independently

    data_norm = data_norm * rescale
    if columns is not None:
        # pandas dataframe
        if sparse.issparse(data_norm):
            data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0)
        else:
            data_norm = pd.DataFrame(data_norm)
        data_norm.columns = columns
        data_norm.index = index
    return data_norm
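# A quick, hedged check of the function above with a made-up matrix: the two
# library sizes are 2 and 8, so with the default rescale='median' every row of
# the output should sum to 5.0.
import numpy as np

X = np.array([[1., 1., 0.],
              [2., 2., 4.]])
X_norm = library_size_normalize(X)
print(X_norm.sum(axis=1))  # -> [5. 5.]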
    sR = sR[sR != 0]  # Remove Zeros
    # Return a Series to each row of a new DataFrame
    return pd.concat([sR, sT], axis='index')

print('> Tweet textual features')
dftextpost = dfP['tweet.text'].apply(tweet_textual_features)
dfP['tweet.parent_text'] = dftextpost['parent_text']  # Parent Text
dftextpost.drop(['parent_text'], axis='columns', inplace=True)

# TF-IDF
print('> TF-IDF for tweet')
tfidf = TfidfVectorizer(analyzer='word', stop_words='english',
                        ngram_range=(1, 1), max_df=0.9, min_df=5,
                        max_features=1000, binary=False)
X = tfidf.fit_transform(dfP['tweet.text'].values)
tfidf_feature_names = ['post_tfidf_(' + name + ')'
                       for name in tfidf.get_feature_names()]
dftfidf = pd.SparseDataFrame(X, columns=tfidf_feature_names, index=dfP.index)

# TF-IDF (for parent text)
print('> TF-IDF for parent text')
X = tfidf.fit_transform(dfP['tweet.parent_text'].values)
tfidf_feature_names = ['post_tfidf_parent_(' + name + ')'
                       for name in tfidf.get_feature_names()]
dftfidf_parent = pd.SparseDataFrame(X, columns=tfidf_feature_names).set_index(dfP.index)

# Final concat
dfI = pd.concat([
    dfI,             # Base features
    dfsent,          # Sentiment features
    dftextpost,      # Textual features
    dftfidf,         # TF-IDF features
    dftfidf_parent   # TF-IDF features on parent terms
], sort=False, axis='columns')
def logOrderBookSnapshots(self, symbol):
    """
    Log full depth quotes (price, volume) from this order book at some
    pre-determined frequency. Here we are looking at the actual log for this
    order book (i.e. are there snapshots to export, independent of the
    requested frequency).
    """
    def get_quote_range_iterator(s):
        """ Helper method for order book logging. Takes pandas Series and
        returns python range() from first to last element. """
        forbidden_values = [0, 19999900]  # TODO: Put constant value in more sensible place!
        quotes = sorted(s)
        for val in forbidden_values:
            try:
                quotes.remove(val)
            except ValueError:
                pass
        return quotes

    book = self.order_books[symbol]
    if book.book_log:
        print("Logging order book to file...")
        dfLog = book.book_log_to_df()
        dfLog.set_index('QuoteTime', inplace=True)
        dfLog = dfLog[~dfLog.index.duplicated(keep='last')]
        dfLog.sort_index(inplace=True)

        if str(self.book_freq).isdigit() and int(self.book_freq) == 0:
            # Save all possible information
            # Get the full range of quotes at the finest possible resolution.
            quotes = get_quote_range_iterator(dfLog.columns.unique())

            # Restructure the log to have multi-level rows of all possible
            # pairs of time and quote with volume as the only column.
            if not self.wide_book:
                filledIndex = pd.MultiIndex.from_product(
                    [dfLog.index, quotes], names=['time', 'quote'])
                dfLog = dfLog.stack()
                dfLog = dfLog.reindex(filledIndex)

            filename = f'ORDERBOOK_{symbol}_FULL'
        else:
            # Sample at frequency self.book_freq
            # With multiple quotes in a nanosecond, use the last one, then
            # resample to the requested freq.
            dfLog = dfLog.resample(self.book_freq).ffill()
            dfLog.sort_index(inplace=True)

            # Create a fully populated index at the desired frequency from
            # market open to close. Then project the logged data into this
            # complete index.
            time_idx = pd.date_range(self.mkt_open, self.mkt_close,
                                     freq=self.book_freq, closed='right')
            dfLog = dfLog.reindex(time_idx, method='ffill')
            dfLog.sort_index(inplace=True)

            if not self.wide_book:
                dfLog = dfLog.stack()
                dfLog.sort_index(inplace=True)

                # Get the full range of quotes at the finest possible resolution.
                quotes = get_quote_range_iterator(
                    dfLog.index.get_level_values(1).unique())

                # Restructure the log to have multi-level rows of all possible
                # pairs of time and quote with volume as the only column.
                filledIndex = pd.MultiIndex.from_product(
                    [time_idx, quotes], names=['time', 'quote'])
                dfLog = dfLog.reindex(filledIndex)

            filename = f'ORDERBOOK_{symbol}_FREQ_{self.book_freq}'

        # Final cleanup
        if not self.wide_book:
            dfLog = dfLog.rename('Volume')
            df = pd.SparseDataFrame(index=dfLog.index)
            df['Volume'] = dfLog
        else:
            df = dfLog
            df = df.reindex(sorted(df.columns), axis=1)

        # Archive the order book snapshots directly to a file named with the
        # symbol, rather than to the exchange agent log.
        self.writeLog(df, filename=filename)
        print("Order book logging complete!")
                 'eval_metric': ['error', 'auc'],
                 'seed': '2017'}
        self.model = xgb.train(param, dtrain,
                               num_boost_round=param['num_boost_round'],
                               early_stopping_rounds=50,
                               verbose_eval=1)
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        x_predict = self.pipeline.fit_transform(X)
        dpredict = xgb.DMatrix(x_predict)
        del x_predict
        predicted = pd.Series(self.model.predict(dpredict), index=X.index)
        del dpredict
        if 'after' in X.columns:
            return X.assign(after=X['after'].combine_first(
                X[predicted >= self.threshold]['before']))
        else:
            return X.assign(after=X[predicted >= self.threshold]['before'])


if __name__ == '__main__':
    df = pd.SparseDataFrame(
        ['в 1905 году', '123', '123', '-', '321', '&', '0546'] +
        'съешь ещё этих мягких французских булок, да выпей чаю по - фиг'.split(),
        columns=['before'])
    df['prev'] = df['before'].shift(1).fillna('').to_dense()
    df['next'] = df['before'].shift(-1).fillna('').to_dense()
    print(df)
    st = SelfTransformer(threshold=0.5,
                         modelpath='models/self.model.train_9517064_0.00117_0.3_500_6')
    print(st.fit_transform(df))
def coo_to_sparse_DF(m, sz):
    # Row indexing requires CSR; COO matrices are not subscriptable.
    m = m.tocsr()
    return pd.SparseDataFrame(
        [pd.SparseSeries(m[i].toarray().ravel()) for i in np.arange(sz)])
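# A toy usage sketch of the converter above.
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

m = coo_matrix(np.array([[0, 1], [2, 0]]))
print(coo_to_sparse_DF(m, m.shape[0]))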
def prepare_bag_of_apps_datasets(data_dir):
    # Based on: https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/low-ram-bag-of-apps-python/
    # First, check if the datasets have already been created
    boa_file_path_1 = os.path.join(data_dir, "bag_of_apps_train.h5")
    boa_file_path_2 = os.path.join(data_dir, "bag_of_apps_test.h5")
    if os.path.exists(boa_file_path_1) and os.path.exists(boa_file_path_2):
        logger.info("Reading Bag-of-Apps datasets from {} & {}".format(
            boa_file_path_1, boa_file_path_2))
        a = pd.read_hdf(boa_file_path_1, "a")
        b = pd.read_hdf(boa_file_path_2, "b")
        return a, b

    # Create the datasets
    logger.info("Preparing Bag-of-Apps datasets")
    app_labels = read_gz(data_dir, "app_labels.csv.gz")
    app_labels = app_labels.groupby("app_id")["label_id"]\
        .apply(lambda x: " ".join(str(s) for s in x))

    app_events = read_gz(data_dir, "app_events.csv.gz")
    app_events["app_labels"] = app_events["app_id"].map(app_labels)
    app_events = app_events.groupby("event_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_labels

    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": np.str})
    events["app_labels"] = events["event_id"].map(app_events)
    events = events.groupby("device_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_events

    pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": np.str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    _train = read_gz(data_dir, "gender_age_train.csv.gz")
    _train["app_labels"] = _train["device_id"].map(events)
    _train = pd.merge(_train, pbd, how="left", on="device_id", left_index=True)

    _test = read_gz(data_dir, "gender_age_test.csv.gz")
    _test["app_labels"] = _test["device_id"].map(events)
    _test = pd.merge(_test, pbd, how="left", on="device_id", left_index=True)
    del pbd
    del events

    df_all = pd.concat((_train, _test), axis=0, ignore_index=True)
    split_len = len(_train)
    vec = CountVectorizer(min_df=1, binary=1)
    df_all = df_all[["phone_brand", "device_model", "app_labels"]]\
        .astype(np.str).apply(lambda x: " ".join(s for s in x), axis=1)\
        .fillna("Missing")
    df_tfv = vec.fit_transform(df_all)  # 186716 x 2045 sparse matrix
    _train = df_tfv[:split_len, :]      # 74645 x 2045 sparse matrix
    _test = df_tfv[split_len:, :]       # 112071 x 2045 sparse matrix

    # Converting the sparse matrix into a DataFrame
    a = pd.SparseDataFrame([
        pd.SparseSeries(_train[i].toarray().ravel())
        for i in np.arange(_train.shape[0])
    ])
    b = pd.SparseDataFrame([
        pd.SparseSeries(_test[i].toarray().ravel())
        for i in np.arange(_test.shape[0])
    ])

    # Rename the columns
    app_labels_cols = ["a" + str(x) for x in np.arange(0, a.shape[1]).tolist()]
    d = dict(zip(np.arange(0, a.shape[1]).tolist(), app_labels_cols))
    a.rename(columns=d, inplace=True)
    b.rename(columns=d, inplace=True)

    # Write to file
    a.to_sparse(kind='block')\
        .to_hdf(boa_file_path_1, "a", mode="w", complib="blosc", complevel=9)
    b.to_sparse(kind='block')\
        .to_hdf(boa_file_path_2, "b", mode="w", complib="blosc", complevel=9)
    del _train
    del _test

    # TO USE, DO
    # train = pd.merge(train, a, left_index=True, right_index=True)
    return a, b  # bag-of-apps datasets
def convert_genesetlist(gslist, to, output_fname=None, verbose=False):
    '''
    Converts an input geneset list into another representation: gmt or gvm.
    Returns it. If `to == gmt` and an output file name is given, it will save
    the results to `output_fname`. If `output_fname` already exists, then the
    results saved to that file will be used.

    gslist : pandas.Series
        The geneset list to be converted.
    to : str
        Either 'gmt' or 'gvm'.
    output_fname : str
        The name of the file to save the results to.
    verbose : bool
        Control the frequency of print statements used when converting to gvm.
    '''
    if verbose:
        print('obtaining ' + output_fname)
    if to == 'gmt':
        # Create the gmt.
        gmt = [[annot] + [''] + genes
               for (annot, genes) in zip(gslist.index, gslist.values)]
        # Save it to the file if it does not exist yet.
        if output_fname is not None:
            if not file_exists(output_fname):
                with open(output_fname, 'w', newline='') as f:
                    writer = csv.writer(f, delimiter='\t')
                    for geneset in gmt:
                        writer.writerow(geneset)
        return gmt
    elif to == 'gvm':
        # If the gvm file already exists, load it and return it.
        if file_exists(output_fname):
            return open_gvm(output_fname)
        elif file_exists(output_fname.replace('gvm.csv', 'gvm.pkl')):
            return open_gvm(output_fname.replace('gvm.csv', 'gvm.pkl'))
        # Otherwise, create it.
        all_genes_set = {item for sublist in gslist for item in sublist}
        all_genes = pd.Series(sorted(all_genes_set))
        gslist = gslist.apply(set)
        gvm = [np.array(all_genes.isin(gs), dtype=bool) for gs in gslist]
        # Save the gvm file as a csv, or as a pickled pandas.SparseDataFrame
        # if it is too large.
        # Transpose matrix.
        if len(gvm) < 10000:
            gvm = pd.DataFrame(gvm).transpose()
        else:
            if verbose:
                print('getting coo_matrix for gvm with ' + str(len(gvm)) + ' columns.')
            gvm = coo_matrix(gvm, dtype=bool).transpose()
            if verbose:
                print('converting coo_matrix to sparse df')
            gvm = pd.SparseDataFrame(gvm, dtype=bool, default_fill_value=False)
            if verbose:
                print('obtained sparse df.')
        # Format.
        gvm.index = all_genes
        gvm.columns = gslist.index
        if output_fname is not None:
            if gvm.shape[1] < 10000:
                gvm = gvm.replace(to_replace=False, value='')
                gvm.to_csv(output_fname, sep='\t')
            else:
                gvm.to_pickle(output_fname.replace('gvm.csv', 'gvm.pkl'))
        return gvm
    else:
        raise ValueError('The desired representation (`to`) is unsupported: ' + to)
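# A hedged usage sketch with a toy geneset list. The output path is a
# placeholder, and `file_exists`/`open_gvm` are assumed to come from the
# surrounding module.
import pandas as pd

gslist = pd.Series([['TP53', 'EGFR'], ['EGFR', 'MYC', 'KRAS']],
                   index=['set_a', 'set_b'])
gmt = convert_genesetlist(gslist, to='gmt')                  # no file written
gvm = convert_genesetlist(gslist, to='gvm',
                          output_fname='toy_gvm.csv')        # hypothetical path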
def SparseDataFrame_deprecated(X, default_fill_value=0.0):
    return pd.SparseDataFrame(X, default_fill_value=default_fill_value)
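# pd.SparseDataFrame was removed in pandas 1.0, so the shim above only runs on
# older pandas. A sketch of the modern equivalent, assuming the input is a
# SciPy sparse matrix or an array-like:
import pandas as pd
import scipy.sparse as sp

def sparse_dataframe(X, fill_value=0.0):
    # pandas >= 0.25: sparse values live in ordinary DataFrames as SparseArray columns
    if sp.issparse(X):
        return pd.DataFrame.sparse.from_spmatrix(X)
    return pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=fill_value))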
def get_matrix_for_platform(self, exp, gene_list, mirna_list=None,
                            symmetrize=True, identifiers=True, tolower=False):
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    from collections import defaultdict
    from wrappers.input.utils import find_refseqs

    log.debug(gene_list)
    if mirna_list:
        log.debug(mirna_list)

    regex = "^[A-Z][A-Z]_[a-zA-Z0-9.]*"
    if len(filter(lambda x: x is not None,
                  map(lambda x: re.match(regex, str(x), re.IGNORECASE),
                      gene_list))) < (len(gene_list) * 0.5):
        new_g = []
        for gene in gene_list:
            rf = list(find_refseqs(gene))
            if len(rf) > 0:
                new_g.append(rf[0])
            if len(rf) == 0:
                new_g.append(gene)
        gene_list = new_g

    hasht = dict(zip(gene_list, range(len(gene_list))))
    mirna_hasht = dict()
    if mirna_list is not None:
        new_g = []
        for gene in mirna_list:
            rf = list(find_refseqs(gene))
            if len(rf) > 0:
                new_g.append(rf[0])
            else:
                new_g.append(gene)
        mirna_list = new_g
        mirna_hasht = dict(zip(mirna_list, range(len(mirna_list))))

    inter_hash = defaultdict(list)
    interactions = self.load_pairs()
    cols = []
    rows = []
    log.debug("transforming interactions")
    for ix in range(len(interactions)):
        a, b, val = interactions.iloc[ix]
        if mirna_list is not None:
            if self.x2_unit == 'mirbase':
                inter_hash[b].append([a, val])
            else:
                inter_hash[a].append([b, val])
        else:
            inter_hash[a].append([b, val])
    if exp:
        AllUpdated(exp.pk,
                   comment=u"Transforming interaction matrix done",
                   silent=False,
                   mode=NotifyMode.INFO).send()
    log.debug("transformation of interactions done")

    count = 0
    counter2 = 0
    counter3 = 0
    counter4 = 0
    size_hash = len(inter_hash)
    if mirna_list is None:
        for key, value in inter_hash.iteritems():
            count += 1
            if count % 500 == 0:
                log.debug("translating gene %d", count)
                if exp:
                    AllUpdated(exp.pk,
                               comment=u"Translating gene %s of %s" % (count, size_hash),
                               silent=False,
                               mode=NotifyMode.INFO).send()
            refseqs = find_refseqs(key)
            for refseq in refseqs:
                counter2 += 1
                if refseq not in hasht:
                    continue
                if refseq in hasht:
                    for (gene, strength) in value:
                        # new_inters.append((refseq, new_refseq, strength))
                        for new_refseq in find_refseqs(gene):
                            counter3 += 1
                            gi = refseq
                            gj = new_refseq
                            if gj not in hasht:
                                continue
                            counter4 += 1
                            val = strength
                            if tolower:
                                gi = gi.lower()
                                gj = gj.lower()
                            cols.append(hasht[gi])
                            rows.append(hasht[gj])
    else:
        for key, value in inter_hash.iteritems():
            count += 1
            if count % 500 == 0:
                log.debug("translating miRNA %d", count)
                if exp:
                    AllUpdated(exp.pk,
                               comment=u"Translating miRNA %s of %s" % (count, size_hash),
                               silent=False,
                               mode=NotifyMode.INFO).send()
            refseqs = find_refseqs(key)
            for refseq in refseqs:
                counter2 += 1
                if refseq not in mirna_hasht:
                    continue
                if refseq in mirna_hasht:
                    for (gene, strength) in value:
                        for new_refseq in find_refseqs(gene):
                            counter3 += 1
                            gi = refseq
                            gj = new_refseq
                            if gj not in hasht:
                                continue
                            counter4 += 1
                            val = strength
                            if tolower:
                                gi = gi.lower()
                                gj = gj.lower()
                            rows.append(mirna_hasht[gi])
                            cols.append(hasht[gj])

    # size = max(max(rows), max(cols)) + 1
    if exp:
        AllUpdated(exp.pk,
                   comment=u"%d interactions were found." % len(cols),
                   silent=False,
                   mode=NotifyMode.INFO).send()

    inters_matr = None
    # TODO fix for custom value of interactions
    if mirna_list is None:
        # inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)), (size, size))
        inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)),
                                    (len(gene_list), len(gene_list)))
    else:
        inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)),
                                    (len(mirna_list), len(gene_list)))
    # inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)), (max(rows) + 1, max(cols) + 1))

    if symmetrize:
        inters_matr = inters_matr + inters_matr.T
        inters_matr.data /= inters_matr.data

    if identifiers:
        inters_matr = inters_matr.tocsr()
        sparse_df = pd.SparseDataFrame([
            pd.SparseSeries(inters_matr[i].toarray().ravel())
            for i in np.arange(inters_matr.shape[0])
        ])
        # sparse_df = sparse_df.to_dense()
        if mirna_list is None:
            index = gene_list[:sparse_df.shape[0]]
            columns = gene_list[:sparse_df.shape[1]]
        else:
            index = mirna_list[:sparse_df.shape[0]]
            columns = gene_list[:sparse_df.shape[1]]
        if settings.CELERY_DEBUG:
            import sys
            sys.path.append(
                '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
            )
            import pydevd
            pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                            stderrToServer=True)
        # sparse_df['new_index'] = pd.Series(index, index=sparse_df.index)
        sparse_df.set_index([index], inplace=True)
        sparse_df.columns = columns
        return sparse_df
    return inters_matr
def extract_features(self, bag_of_words=False, lemmatize=True):
    self.posts['has_article'] = self.posts.article_name.apply(lambda x: x != None)
    self.posts['text_length'] = self.posts.text.apply(len)
    self.posts['num_hashtags'] = self.posts.hashtags.apply(len)
    self.posts['has_text'] = self.posts.text_length.apply(lambda x: x > 0)
    self.posts['num_linked_profiles'] = self.posts.linked_profiles.apply(len)
    self.posts['num_links'] = self.posts.links.apply(len)

    # Extract nltk features ---------------------------------------------------
    stop_words = set(nltk.corpus.stopwords.words("english"))  # Stop words to not consider

    # Tokenize
    self.posts['text_tokenized'] = self.posts.text.apply(nltk.tokenize.word_tokenize)
    self.posts['num_tokens'] = self.posts.text_tokenized.apply(len)

    # Tokenize - no punctuations
    no_punc_tokenizer = RegexpTokenizer(r'\w+')
    self.posts['text_tokenized_filtered'] = self.posts.text.apply(
        lambda words: [
            word.lower() for word in no_punc_tokenizer.tokenize(words)
            if word not in stop_words
        ])

    # Tokenize - lemmatize and count POS
    if lemmatize:
        def get_wordnet_pos(pos):
            pos = pos[0].upper()
            wordnet_tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            return wordnet_tag_dict.get(pos, wordnet.NOUN)

        lem = nltk.stem.wordnet.WordNetLemmatizer()  # Lemmatize words if possible
        self.posts['text_tokenized_lemmatized'] = self.posts.text_tokenized_filtered.apply(
            lambda words: [
                lem.lemmatize(word, get_wordnet_pos(pos))
                for word, pos in nltk.pos_tag(words)
            ])

        # Count POS
        pos = self.posts.text_tokenized_lemmatized.apply(
            lambda x: [pos for word, pos in nltk.pos_tag(x)])
        counted_basic = pos.apply(
            lambda x: Counter([get_wordnet_pos(word) for word in x]))
        counted = pos.apply(lambda x: Counter(x))
        for tag in ['a', 'n', 'r', 'v']:
            self.posts['num_pos_basic_' + tag] = counted_basic.apply(
                lambda x: x[tag] if x and tag in x else 0)
        for tag in set(counted.apply(lambda x: list(x.keys())).sum()):
            self.posts['num_pos_' + tag] = counted.apply(
                lambda x: x[tag] if x and tag in x else 0)

    # Collect punc info
    self.posts['num_words'] = self.posts.text_tokenized_filtered.apply(len)
    puncs = [('periods', '.'), ('exclamations', '!'), ('questionms', '?'),
             ('equals', '='), ('dollars', '$')]
    for name, punc in puncs:
        self.posts['num_' + name] = self.posts.text_tokenized.apply(
            lambda words: words.count(punc))
        self.posts['percent_' + name] = self.posts['num_' + name] / self.posts.num_tokens

    # Percent All Caps
    self.posts['percent_all_caps'] = self.posts.text_tokenized.apply(
        lambda tokens: [token.isupper() for token in tokens].count(True) /
        len(tokens) if len(tokens) else 0)

    # Percent Stop Words
    self.posts['percent_stop_words'] = self.posts.text_tokenized.apply(
        lambda x: eval(str(x))).apply(
            lambda tokens: [token.lower() in stop_words for token in tokens
                            ].count(True) / len(tokens) if len(tokens) else 0)

    # Bag of words model if wanted ---------------------------------------------
    if bag_of_words:
        count_vectorizer = CountVectorizer()
        tfidf_transformer = TfidfTransformer()
        bag_of_words_matrix = tfidf_transformer.fit_transform(
            count_vectorizer.fit_transform(self.posts.text))
        return self.posts.to_sparse().join(
            pd.SparseDataFrame(
                bag_of_words_matrix,
                columns=[
                    'word_' + x for x in count_vectorizer.get_feature_names()
                ]))

    # Sentiment Analysis
    analyser = SentimentIntensityAnalyzer()
    self.posts['sentiment'] = self.posts.text.apply(analyser.polarity_scores)

    # Readability
    self.posts['readability'] = self.posts.text.apply(lambda text: [
        textstat.smog_index(text),
        textstat.gunning_fog(text),
        textstat.flesch_kincaid_grade(text)
    ])

    # TTR
    self.posts['ttr'] = self.posts.text_tokenized_lemmatized.apply(
        lambda tokens: len(set(tokens)) / len(tokens) if len(tokens) else np.nan)

    # Syntax Tree
    # def calcDepth(text):
    #     parser = nltk.parse.corenlp.CoreNLPParser()
    #     def calcSingleDepth(sent):
    #         parse = next(parser.raw_parse(sentence))
    #         # parse.pretty_print()
    #         return parse.height()
    #     sentences = nltk.tokenize.sent_tokenize(text)
    #     totalDepth = 0
    #     for i in range(len(sentences)):
    #         sentence = sentences[i]
    #         totalDepth += calcSingleDepth(sentence)
    #     return totalDepth / len(sentences)
    # self.posts['depth'] = self.posts.text.apply(calcDepth)

    return self.posts