def biomtable_to_dataframe(biom_table_object):
    _bt = biom_table_object
    data = _bt.matrix_data.todense()
    out = pd.SparseDataFrame(data,
                             index=_bt.ids('observation'),
                             columns=_bt.ids('sample'))
    return out.to_dense()
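# A minimal usage sketch for the converter above, assuming the biom-format
# package is installed; the table contents and ids here are made up.
import numpy as np
from biom.table import Table

toy = Table(np.array([[1.0, 0.0], [0.0, 2.0]]),
            observation_ids=['obs1', 'obs2'],   # hypothetical observation ids
            sample_ids=['s1', 's2'])            # hypothetical sample ids
print(biomtable_to_dataframe(toy))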
def process_chunk_week(chunk: pd.SparseDataFrame, chunk_number: int,
                       week: int, min_usage: int):
    """ Asynchronously process the input data which is a certain week of a
    certain chunk. Save the result to a pickle file. """
    print('[%d] Processing week %d' % (week, week))

    # fix the columns
    print('[%d] Removing week level from columns' % week)
    chunk.columns = chunk.columns.droplevel(0)

    # get the dummy columns for hashtags
    print('[%d] Making dummies for hashtags' % week)
    dummies_hashtags: pd.SparseDataFrame = chunk['hashtags'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_hashtags_values = dummies_hashtags.values
    usage = dummies_hashtags_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_hashtags_values[:, usage < min_usage]
    dummies_hashtags = pd.SparseDataFrame(
        dummies_hashtags_values[:, high_usage],
        dummies_hashtags.index,
        dummies_hashtags.columns[high_usage].map(lambda c: 'hashtag_' + c))
    dummies_hashtags['other_hashtags'] = other.sum(1)
    print('[%d] There are %d hashtag columns' % (week, dummies_hashtags.shape[1]))

    # get the dummy columns for mentions
    print('[%d] Making dummies for mentions' % week)
    dummies_mentions: pd.SparseDataFrame = chunk['mentions'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_mentions_values = dummies_mentions.values
    usage = dummies_mentions_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_mentions_values[:, usage < min_usage]
    dummies_mentions = pd.SparseDataFrame(
        dummies_mentions_values[:, high_usage],
        dummies_mentions.index,
        dummies_mentions.columns[high_usage].map(lambda c: 'mention_' + c))
    dummies_mentions['other_mentions'] = other.sum(1)
    print('[%d] There are %d mention columns' % (week, dummies_mentions.shape[1]))

    # get the dummy columns for urls
    print('[%d] Making dummies for urls' % week)
    dummies_urls: pd.SparseDataFrame = chunk['urls'].apply(
        lambda v: v.lower() if type(v) == str else '').str.get_dummies(sep=',')
    dummies_urls_values = dummies_urls.values
    usage = dummies_urls_values.sum(0)
    high_usage = usage >= min_usage
    other = dummies_urls_values[:, usage < min_usage]
    dummies_urls = pd.SparseDataFrame(
        dummies_urls_values[:, high_usage],
        dummies_urls.index,
        dummies_urls.columns[high_usage].map(lambda c: 'url_' + c))
    dummies_urls['other_urls'] = other.sum(1)
    print('[%d] There are %d url columns' % (week, dummies_urls.shape[1]))

    # concatenate to one big data frame
    print('[%d] Concatenating dummies and copying tweets' % week)
    dummies: pd.SparseDataFrame = pd.concat(
        [dummies_hashtags, dummies_mentions, dummies_urls], axis=1)
    dummies['tweets'] = chunk['tweets']

    # save to a pickle
    print('[%d] Saving' % week)
    dummies.to_pickle('../data/chunks/chunk_%d_week_%d.pkl' % (chunk_number, week))
amgPd_training = amgPd[temp]
amgPd_training.to_csv(pathwrite + 'training_randomData.csv', header=True, index=False)
amgPd_test = amgPd[~temp]
amgPd_test.to_csv(pathwrite + 'test_randomData.csv', header=True, index=False)

### 1. hid Vs Category_list
user_u = list(sorted(amgPd.hid.unique()))
item_u = list(sorted(amgPd.Category_list.unique()))
row = amgPd.hid.astype('category', categories=user_u).cat.codes
col = amgPd.Category_list.astype('category', categories=item_u).cat.codes
data = np.array([1 for k in range(len(amgPd))])
sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))
df_tmp1 = pd.SparseDataFrame(
    [pd.SparseSeries(sparse_matrix[i].toarray().ravel(), fill_value=0)
     for i in np.arange(sparse_matrix.shape[0])],
    index=user_u, columns=item_u, default_fill_value=0)

finCols = ['hid']
len(finCols)
finCols.extend(df_tmp1.columns)
len(finCols)
dfMtrx = np.empty(shape=(df_tmp1.shape[0] + 1, df_tmp1.shape[1] + 1),
                  dtype=np.ndarray)
dfMtrx[:1, :][0] = finCols
dfMtrx[1:, 0] = user_u
dfMtrx[1:, 1:] = df_tmp1.values
print(dfMtrx.shape)
np.savetxt(pathwrite + 'Final_Data_recom1_fin.csv', dfMtrx, delimiter=",", fmt='%s')

### 1.1 hid Vs Category_list training data
def save(self, path=None, complevel=1, complib='zlib'):
    """
    Save the container as an HDF5 archive.

    Args:
        path (str): Path where to save the container
    """
    if path is None:
        path = self.hexuid + '.hdf5'
    elif os.path.isdir(path):
        path += os.sep + self.hexuid + '.hdf5'
    elif not (path.endswith('.hdf5') or path.endswith('.hdf')):
        raise ValueError('File path must have a ".hdf5" or ".hdf" extension.')
    with pd.HDFStore(path, 'w', complevel=complevel, complib=complib) as store:
        store['kwargs'] = pd.Series()
        store.get_storer('kwargs').attrs.metadata = self._rel()
        fc = 0  # Field counter (see special handling of fields below)
        for name, data in self._data().items():
            if hasattr(data, '_revert_categories'):
                data._revert_categories()
            name = name[1:] if name.startswith('_') else name
            if isinstance(data, Field):
                # Fields are handled separately
                fname = 'FIELD{}_'.format(fc) + name + '/'
                store[fname + 'data'] = pd.DataFrame(data)
                for i, field in enumerate(data.field_values):
                    ffname = fname + 'values' + str(i)
                    if isinstance(field, pd.Series):
                        store[ffname] = pd.Series(field)
                    else:
                        store[ffname] = pd.DataFrame(field)
                fc += 1
            elif isinstance(data, Series):
                s = pd.Series(data)
                if isinstance(data.dtype, pd.types.dtypes.CategoricalDtype):
                    s = s.astype('O')
                store[name] = s
            elif isinstance(data, DataFrame):
                store[name] = pd.DataFrame(data)
            elif isinstance(data, SparseSeries):
                s = pd.SparseSeries(data)
                if isinstance(data.dtype, pd.types.dtypes.CategoricalDtype):
                    s = s.astype('O')
                store[name] = s
            elif isinstance(data, SparseDataFrame):
                store[name] = pd.SparseDataFrame(data)
            else:
                if hasattr(data, 'dtype') and isinstance(
                        data.dtype, pd.types.dtypes.CategoricalDtype):
                    data = data.astype('O')
                else:
                    for col in data:
                        if isinstance(data[col].dtype,
                                      pd.types.dtypes.CategoricalDtype):
                            data[col] = data[col].astype('O')
                store[name] = data
            if hasattr(data, '_set_categories'):
                data._set_categories()
def _matrix_to_data_frame(data, gene_names=None, cell_names=None, sparse=None):
    """Return the optimal data type given data, gene names and cell names.

    Parameters
    ----------
    data : array-like
    gene_names : `str`, array-like or `None` (default: None)
        Either a filename or an array containing a list of gene symbols or ids.
    cell_names : `str`, array-like or `None` (default: None)
        Either a filename or an array containing a list of cell barcodes.
    sparse : `bool` or `None` (default: None)
        If not `None`, overrides default sparsity of the data.
    """
    if gene_names is None and cell_names is None and \
            not isinstance(data, pd.DataFrame):
        # just a matrix
        if sparse is not None:
            if sparse:
                if not sp.issparse(data):
                    # return scipy.sparse.csr_matrix
                    data = sp.csr_matrix(data)
            elif sp.issparse(data) and not sparse:
                # return numpy.ndarray
                data = data.toarray()
        else:
            # return data as is
            pass
        return data
    else:
        gene_names = _parse_gene_names(gene_names, data)
        cell_names = _parse_cell_names(cell_names, data)
        # dataframe with index and/or columns
        if sparse is None:
            # let the input data decide
            sparse = isinstance(data, pd.SparseDataFrame) or sp.issparse(data)
        if sparse and gene_names is not None and \
                len(np.unique(gene_names)) < len(gene_names):
            warnings.warn(
                "Duplicate gene names detected! Forcing dense matrix",
                RuntimeWarning)
            sparse = False
        if sparse:
            # return pandas.SparseDataFrame
            if isinstance(data, pd.DataFrame):
                if gene_names is not None:
                    data.columns = gene_names
                if cell_names is not None:
                    data.index = cell_names
                if not isinstance(data, pd.SparseDataFrame):
                    data = data.to_sparse(fill_value=0.0)
            else:
                data = pd.SparseDataFrame(data, default_fill_value=0.0)
                data.index = cell_names
                data.columns = gene_names
        else:
            # return pandas.DataFrame
            if isinstance(data, pd.DataFrame):
                if gene_names is not None:
                    data.columns = gene_names
                if cell_names is not None:
                    data.index = cell_names
                if isinstance(data, pd.SparseDataFrame):
                    data = data.to_dense()
            else:
                if sp.issparse(data):
                    data = data.toarray()
                data = pd.DataFrame(data, index=cell_names, columns=gene_names)
        return data
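# A hedged usage sketch of the helper above. The matrix and names are made up,
# and `_parse_gene_names` / `_parse_cell_names` are assumed to come from the
# same module.
import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.eye(3))
genes = ['g1', 'g2', 'g3']   # hypothetical gene symbols
cells = ['c1', 'c2', 'c3']   # hypothetical cell barcodes

sdf = _matrix_to_data_frame(X, gene_names=genes, cell_names=cells)  # sparse in -> SparseDataFrame out
ddf = _matrix_to_data_frame(X, gene_names=genes, cell_names=cells,
                            sparse=False)                           # force a dense DataFrame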
def add_text_features(data, strings, k=5, keep=True):

    ##### PROCESSING LOOP
    for var in strings:

        ### TEXT PREPROCESSING

        # replace NaN with empty string
        data[var][pd.isnull(data[var])] = ''

        # remove common words
        freq = pd.Series(' '.join(data[var]).split()).value_counts()[:10]
        #freq = list(freq.index)
        #data[var] = data[var].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
        #data[var].head()

        # remove rare words
        freq = pd.Series(' '.join(data[var]).split()).value_counts()[-10:]
        #freq = list(freq.index)
        #data[var] = data[var].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
        #data[var].head()

        # convert to lowercase
        data[var] = data[var].apply(lambda x: " ".join(x.lower() for x in x.split()))

        # remove punctuation
        data[var] = data[var].str.replace('[^\w\s]', '')

        ### COMPUTE BASIC FEATURES

        # word count
        data[var + '_word_count'] = data[var].apply(lambda x: len(str(x).split(" ")))
        data[var + '_word_count'][data[var] == ''] = 0

        # character count
        data[var + '_char_count'] = data[var].str.len().fillna(0).astype('int64')

        ##### COMPUTE TF-IDF FEATURES

        # import vectorizer
        tfidf = TfidfVectorizer(max_features=k,
                                lowercase=True,
                                norm='l2',
                                analyzer='word',
                                stop_words='english',
                                ngram_range=(1, 1))

        # compute TF-IDF
        vals = tfidf.fit_transform(data[var])
        vals = pd.SparseDataFrame(vals)
        vals.columns = [var + '_tfidf_' + str(p) for p in vals.columns]
        data = pd.concat([data, vals], axis=1)

        ### CORRECTIONS

        # remove raw text
        if keep == False:
            del data[var]

        # print dimensions
        #print(data.shape)

    # return data
    return data
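# A small, hedged usage sketch of add_text_features; the DataFrame and column
# name are invented, and TfidfVectorizer is assumed to be imported from
# scikit-learn as in the function body.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

toy = pd.DataFrame({'review': ['Great product, loved it!',
                               'Terrible. Broke after a day.',
                               None]})
toy = add_text_features(toy, strings=['review'], k=3)
print([c for c in toy.columns if c.startswith('review_')])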
def test_create_dataset_pandas(self):
    data = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
    ]
    column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                    'windy', 'play']
    df = pd.DataFrame(data, columns=column_names)
    # enforce the type of each column
    df['outlook'] = df['outlook'].astype('category')
    df['windy'] = df['windy'].astype('bool')
    df['play'] = df['play'].astype('category')
    # meta-information
    name = '%s-pandas_testing_dataset' % self._get_sentinel()
    description = 'Synthetic dataset created from a Pandas DataFrame'
    creator = 'OpenML tester'
    collection_date = '01-01-2018'
    language = 'English'
    licence = 'MIT'
    default_target_attribute = 'play'
    citation = 'None'
    original_data_url = 'http://openml.github.io/openml-python'
    paper_url = 'http://openml.github.io/openml-python'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )

    # Check that SparseDataFrame are supported properly
    sparse_data = scipy.sparse.coo_matrix((
        [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
    ))
    column_names = ['input1', 'input2', 'y']
    df = pd.SparseDataFrame(sparse_data, columns=column_names)
    # meta-information
    description = 'Synthetic dataset created from a Pandas SparseDataFrame'
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes='auto',
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    self.assertEqual(
        _get_online_dataset_arff(upload_did),
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    self.assertEqual(
        _get_online_dataset_format(upload_did),
        'sparse_arff',
        "Wrong format for dataset"
    )

    # Check that we can overwrite the attributes
    data = [['a'], ['b'], ['c'], ['d'], ['e']]
    column_names = ['rnd_str']
    df = pd.DataFrame(data, columns=column_names)
    df['rnd_str'] = df['rnd_str'].astype('category')
    attributes = {'rnd_str': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}
    dataset = openml.datasets.functions.create_dataset(
        name=name,
        description=description,
        creator=creator,
        contributor=None,
        collection_date=collection_date,
        language=language,
        licence=licence,
        default_target_attribute=default_target_attribute,
        row_id_attribute=None,
        ignore_attribute=None,
        citation=citation,
        attributes=attributes,
        data=df,
        version_label='test',
        original_data_url=original_data_url,
        paper_url=paper_url
    )
    upload_did = dataset.publish()
    downloaded_data = _get_online_dataset_arff(upload_did)
    self.assertEqual(
        downloaded_data,
        dataset._dataset,
        "Uploaded ARFF does not match original one"
    )
    self.assertTrue(
        '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}' in downloaded_data)
def SingleLearningThread(folderName, rawEmails_dtm, summarySentenceList,
                         cAmount, gammaAmount):
    # Remove the CleanText as we only want that when printing out the info at the end
    rawEmails = rawEmails_dtm[[
        'TopOneSentence', 'TopTwoSentence', 'TopThreeSentence',
        'TopFourSentence', 'TopFiveSentence', 'SentenceLengthBeforeStop',
        'CosineSimilarity'
    ]].astype(float)

    accuracy_Array = []
    # initialize folds
    kf = KFold(n_splits=3, shuffle=True, random_state=7)
    splitCounter = 1
    # The internet told me to split it like this
    for train_index, test_index in kf.split(rawEmails, rawEmails):
        # Create a new model for each test set and then put its accuracy in an array
        clf = svm.SVC(C=cAmount, cache_size=5000, class_weight=None, coef0=0.0,
                      decision_function_shape='ovr', degree=3,
                      gamma=gammaAmount, kernel='rbf', max_iter=-1,
                      probability=False, shrinking=True, tol=.001,
                      verbose=False)
        # fit and transform training into vector matrix
        clf.fit(rawEmails.iloc[train_index].values,
                summarySentenceList[train_index])
        category_prediction_test = clf.predict(rawEmails.iloc[test_index].values)
        accuracy_Array.append(
            metrics.f1_score(summarySentenceList[test_index],
                             category_prediction_test))

        outputCSVDataframe = pd.concat([
            pd.SparseDataFrame(rawEmails_dtm.iloc[test_index]).reset_index(drop=True),
            pd.DataFrame(list(summarySentenceList[test_index].astype(int)),
                         columns=['Actual']).reset_index(drop=True),
            pd.DataFrame(category_prediction_test.astype(int),
                         columns=['Predicted'])
        ], axis=1)
        outputCSVDataframe.to_csv('Output/SVM/UK_' + folderName +
                                  str(splitCounter) + '.csv',
                                  encoding='utf-8', index=False)
        splitCounter += 1

        if (metrics.f1_score(summarySentenceList[test_index],
                             category_prediction_test) > 0):
            statsLock.acquire()
            if isRunPerThread:
                global foldersName
                statsArray.append({
                    'Learning_Type': foldersName,
                    'cAmount': cAmount,
                    'gammaAmount': gammaAmount,
                    'F1_Score': metrics.f1_score(summarySentenceList[test_index],
                                                 category_prediction_test),
                    'Confusion_Matrix': metrics.confusion_matrix(
                        summarySentenceList[test_index],
                        category_prediction_test)
                })
            else:
                statsArray.append({
                    'Learning_Type': 'raw_SVM',
                    'cAmount': cAmount,
                    'gammaAmount': gammaAmount,
                    'F1_Score': metrics.f1_score(summarySentenceList[test_index],
                                                 category_prediction_test),
                    'Confusion_Matrix': metrics.confusion_matrix(
                        summarySentenceList[test_index],
                        category_prediction_test)
                })
            statsLock.release()
user_movie_ratings = pd.pivot_table(ratings_title, index='userId',
                                    columns='title', values='rating')
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)

# To have sklearn run k-means clustering on a dataset with missing values like
# this, we will first cast it to the [sparse csr matrix](https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.sparse.csr_matrix.html)
# type defined in the SciPy library.
#
# To convert from a pandas dataframe to a sparse matrix, we'll have to convert
# to SparseDataFrame, then use pandas' `to_coo()` method for the conversion.
#
# Note: `to_coo()` was only added in later versions of pandas. If you run into
# an error with the next cell, make sure pandas is up to date.

# In[51]:

sparse_ratings = csr_matrix(pd.SparseDataFrame(most_rated_movies_1k).to_coo())

# ## Let's cluster!
# With k-means, we have to specify k, the number of clusters. Let's
# arbitrarily try k=20. (A better way to pick k is the elbow method, as
# illustrated above; that would take some processing time to run, however.)

# In[52]:

# 20 clusters
predictions = KMeans(n_clusters=20, algorithm='full').fit_predict(sparse_ratings)

# To visualize some of these clusters, we'll plot each cluster as a heat map:

# In[53]:

max_users = 70
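# A minimal sketch of the elbow method mentioned above for picking k, assuming
# `sparse_ratings` from the cell above; the k range here is arbitrary. Fit
# KMeans for each candidate k and look for the bend in the inertia curve.
import matplotlib.pyplot as plt

possible_k = range(2, 31, 2)
inertias = [KMeans(n_clusters=k, algorithm='full').fit(sparse_ratings).inertia_
            for k in possible_k]
plt.plot(list(possible_k), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()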
print('Loading dataset...')
train = pd.read_csv("/Users/yumatakenaka/Data/ratings_sample.csv")
print('Finished')

# Randomly sample from train
# train = train.sample(frac=0.01)

# Turn the id columns into categorical dtypes
userId_categorical = pd.api.types.CategoricalDtype(
    categories=sorted(train.userId.unique()), ordered=True)
movieId_categorical = pd.api.types.CategoricalDtype(
    categories=sorted(train.movieId.unique()), ordered=True)

# Create new columns using the categorical dtype instances
row = train.userId.astype(userId_categorical).cat.codes
col = train.movieId.astype(movieId_categorical).cat.codes

# Fill the matrix with the rating values
sparse_matrix = csr_matrix((train["rating"], (row, col)),
                           shape=(userId_categorical.categories.size,
                                  movieId_categorical.categories.size))

# Turn the sparse matrix into a DataFrame
train_pivot = pd.SparseDataFrame(sparse_matrix,
                                 index=userId_categorical.categories,
                                 columns=movieId_categorical.categories,
                                 default_fill_value=0,
                                 dtype='int')

# %%
# Set the important arguments such as n_neighbors, algorithm and metric
knn = NearestNeighbors(n_neighbors=9, algorithm='brute', metric='cosine')
# Train the model on the preprocessed dataset
model_knn = knn.fit(train_pivot)

# %%
def movie_prediction(movie):
    distance, indice = model_knn.kneighbors(
        train_pivot.iloc[train_pivot.index == movie].values.reshape(1, -1),
        n_neighbors=11)
    for i in range(0, len(distance.flatten())):
        if i == 0:
            print('Recommendations if you like the movie {0}:\n'.format(
                train_pivot[train_pivot.index == movie].index[0]))
        else:
            print('{0}: {1} with distance: {2}'.format(
                i, train_pivot.index[indice.flatten()[i]],
                distance.flatten()[i]))
def LoopThroughDocuments(filePath, folderName):
    fileNames = os.listdir(filePath)
    dataframe = pd.DataFrame(columns=[
        'RawFileName', 'FileName', 'CleanText', 'CleanTextNoPunc',
        'FirstSentence', 'SecondSentence', 'ThirdSentence', 'FourthSentence',
        'FifthSentence', 'TopOneSentence', 'TopTwoSentence',
        'TopThreeSentence', 'TopFourSentence', 'TopFiveSentence',
        'SentenceLengthBeforeStop', 'CosineSimilarity'
    ])
    dataframeNoStop = pd.DataFrame(columns=[
        'RawFileName', 'FileName', 'CleanText', 'CleanTextNoPunc',
        'FirstSentence', 'SecondSentence', 'ThirdSentence', 'FourthSentence',
        'FifthSentence', 'TopOneSentence', 'TopTwoSentence',
        'TopThreeSentence', 'TopFourSentence', 'TopFiveSentence',
        'SentenceLengthBeforeStop', 'CosineSimilarity'
    ])

    # Don't worry about reading files in if there is no summary atm
    if 'summary.txt' not in fileNames:
        return dataframe, dataframeNoStop

    queryTFIDF.append(folderName)

    # used for index creation while adding into a new dataframe
    counter = 0

    # loop through all the files in the folder
    for fileName in fileNames:
        f = open(os.path.join(os.path.abspath(filePath), fileName), 'r',
                 encoding='ISO-8859-1')
        # Read file and make copy of the file
        rawText = f.read().lower()
        f.close()
        RawTextNoStopWords = rawText + ' '  # this makes it a deep copy

        # Remove stop words
        for stopword in stop_words.ENGLISH_STOP_WORDS:
            RawTextNoStopWords = re.sub(r'\b' + stopword.lower() + r'\b', '',
                                        RawTextNoStopWords)

        # Split strings, remove sentences without a space in them, strip ends,
        # remove newline characters. Split on "." plus any whitespace, on ?!;,
        # on ".*" plus two or more dashes, or on ". word whitespace" / ",\n".
        rawText = re.split(r'\.\s+|[?!;]|\.*\-{2,}|\.\w\s|,\n+\s*', rawText)
        RawTextNoStopWords = re.split(r'\.\s+|[?!;]|\.*\-{2,}|\.\w\s|,\n+\s*',
                                      RawTextNoStopWords)
        rawText = [string.strip() for string in rawText if ' ' in string]
        RawTextNoStopWords = [
            string.strip() for string in RawTextNoStopWords if ' ' in string
        ]
        rawText = [re.sub('[\n]', r'', string) for string in rawText]
        RawTextNoStopWords = [
            re.sub('[\n]', r'', string) for string in RawTextNoStopWords
        ]

        # #####################################################################
        # Get Word Count Of Sentence
        # #####################################################################
        RawSentenceLength = []
        CleanSentenceLength = []
        for sentenceCount in range(0, len(rawText)):
            RawSentenceLength.append(len(rawText[sentenceCount].split()))
        for sentenceCount in range(0, len(RawTextNoStopWords)):
            CleanSentenceLength.append(
                len(RawTextNoStopWords[sentenceCount].split()))

        # If there are no valid sentences in email, continue loop
        if not RawSentenceLength:
            continue

        maxVal = max(RawSentenceLength)
        normalized_RawSentenceLength = [
            x / float(maxVal) for x in RawSentenceLength
        ]
        maxVal = max(CleanSentenceLength)
        normalized_CleanSentenceLength = [
            x / float(maxVal) for x in CleanSentenceLength
        ]

        # #####################################################################
        # Get sentence relative position in the document
        # #####################################################################
        isFirstRaw = np.zeros(len(rawText))
        isSecondRaw = np.zeros(len(rawText))
        isThirdRaw = np.zeros(len(rawText))
        isFourthRaw = np.zeros(len(rawText))
        isFifthRaw = np.zeros(len(rawText))
        isFirstNoStop = np.zeros(len(RawTextNoStopWords))
        isSecondNoStop = np.zeros(len(RawTextNoStopWords))
        isThirdNoStop = np.zeros(len(RawTextNoStopWords))
        isFourthNoStop = np.zeros(len(RawTextNoStopWords))
        isFifthNoStop = np.zeros(len(RawTextNoStopWords))
        RawTopTwoSentence = np.zeros(len(rawText))
        RawTopThreeSentence = np.zeros(len(rawText))
        RawTopFourSentence = np.zeros(len(rawText))
        RawTopFiveSentence = np.zeros(len(rawText))
        CleanTopTwoSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopThreeSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopFourSentence = np.zeros(len(RawTextNoStopWords))
        CleanTopFiveSentence = np.zeros(len(RawTextNoStopWords))

        # Set up sentence locality count over the first five sentences (or all
        # of them when the document is shorter than five sentences)
        for count in range(min(len(rawText), 5)):
            if count == 0:
                isFirstRaw[count] = 1
                RawTopTwoSentence[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 1:
                isSecondRaw[count] = 1
                RawTopTwoSentence[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 2:
                isThirdRaw[count] = 1
                RawTopThreeSentence[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            elif count == 3:
                isFourthRaw[count] = 1
                RawTopFourSentence[count] = 1
                RawTopFiveSentence[count] = 1
            else:
                isFifthRaw[count] = 1
                RawTopFiveSentence[count] = 1
        for count in range(min(len(RawTextNoStopWords), 5)):
            if count == 0:
                isFirstNoStop[count] = 1
                CleanTopTwoSentence[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 1:
                isSecondNoStop[count] = 1
                CleanTopTwoSentence[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 2:
                isThirdNoStop[count] = 1
                CleanTopThreeSentence[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            elif count == 3:
                isFourthNoStop[count] = 1
                CleanTopFourSentence[count] = 1
                CleanTopFiveSentence[count] = 1
            else:
                isFifthNoStop[count] = 1
                CleanTopFiveSentence[count] = 1

        # Assign bit that states whether the sentence is the first, second,
        # ..., fifth (We'll see if this makes a difference)

        # Assigns the summary into the dataframe
        if fileName == 'summary.txt':
            # Create dataframe and concat it to what exists (if something exists)
            # Add all sentences into dataframe
            textObject = {
                'RawFileName': folderName,
                'FileName': folderName + '__summary',
                'CleanText': rawText,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstRaw,
                'SecondSentence': isSecondRaw,
                'ThirdSentence': isThirdRaw,
                'FourthSentence': isFourthRaw,
                'FifthSentence': isFifthRaw,
                'TopOneSentence': isFirstRaw,
                'TopTwoSentence': RawTopTwoSentence,
                'TopThreeSentence': RawTopThreeSentence,
                'TopFourSentence': RawTopFourSentence,
                'TopFiveSentence': RawTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_RawSentenceLength,
                'CosineSimilarity': 0
            }
            textObjectNoStopWords = {
                'RawFileName': folderName,
                'FileName': folderName + '__summary',
                'CleanText': RawTextNoStopWords,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstNoStop,
                'SecondSentence': isSecondNoStop,
                'ThirdSentence': isThirdNoStop,
                'FourthSentence': isFourthNoStop,
                'FifthSentence': isFifthNoStop,
                'TopOneSentence': isFirstNoStop,
                'TopTwoSentence': CleanTopTwoSentence,
                'TopThreeSentence': CleanTopThreeSentence,
                'TopFourSentence': CleanTopFourSentence,
                'TopFiveSentence': CleanTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_CleanSentenceLength,
                'CosineSimilarity': 0
            }
        # Checks to see if the text file is a number and if it is read it into
        # the main dataframe
        elif fileName.split('.')[0].isnumeric():
            # Create dataframe and concat it to what exists (if something exists)
            # Add all sentences into dataframe
            # if rawtext is 0 for some reason replace with empty strings
            textObject = {
                'RawFileName': folderName,
                'FileName': folderName + '__' + str(counter),
                'CleanText': rawText,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstRaw,
                'SecondSentence': isSecondRaw,
                'ThirdSentence': isThirdRaw,
                'FourthSentence': isFourthRaw,
                'FifthSentence': isFifthRaw,
                'TopOneSentence': isFirstRaw,
                'TopTwoSentence': RawTopTwoSentence,
                'TopThreeSentence': RawTopThreeSentence,
                'TopFourSentence': RawTopFourSentence,
                'TopFiveSentence': RawTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_RawSentenceLength,
                'CosineSimilarity': 0
            }
            textObjectNoStopWords = {
                'RawFileName': folderName,
                'FileName': folderName + '__' + str(counter),
                'CleanText': RawTextNoStopWords,
                'CleanTextNoPunc': '',
                'FirstSentence': isFirstNoStop,
                'SecondSentence': isSecondNoStop,
                'ThirdSentence': isThirdNoStop,
                'FourthSentence': isFourthNoStop,
                'FifthSentence': isFifthNoStop,
                'TopOneSentence': isFirstNoStop,
                'TopTwoSentence': CleanTopTwoSentence,
                'TopThreeSentence': CleanTopThreeSentence,
                'TopFourSentence': CleanTopFourSentence,
                'TopFiveSentence': CleanTopFiveSentence,
                'SentenceLengthBeforeStop': normalized_CleanSentenceLength,
                'CosineSimilarity': 0
            }
            counter += 1

        if dataframeNoStop.empty:
            dataframeNoStop = pd.DataFrame.from_dict(textObjectNoStopWords)
        else:
            dataframeNoStop = pd.concat([
                dataframeNoStop,
                pd.DataFrame.from_dict(textObjectNoStopWords)
            ], ignore_index=True, sort=False)
        if dataframe.empty:
            dataframe = pd.DataFrame.from_dict(textObject)
        else:
            dataframe = pd.concat(
                [dataframe, pd.DataFrame.from_dict(textObject)],
                ignore_index=True, sort=False)

    # Remove punctuation from all sentences
    dataframeReset = dataframe.reset_index(drop=False)
    exclude = set(strng.punctuation)
    for index, row in dataframeReset.iterrows():
        sentence = ''.join(ch for ch in row['CleanText'] if ch not in exclude)
        dataframeReset.loc[index, 'CleanTextNoPunc'] = sentence
    dataframeNoStopReset = dataframeNoStop.reset_index(drop=False)
    for index, row in dataframeNoStopReset.iterrows():
        sentence = ''.join(ch for ch in row['CleanText'] if ch not in exclude)
        dataframeNoStopReset.loc[index, 'CleanTextNoPunc'] = sentence

    # ########################################################################
    # Runs NaiveBayes and SVM per thread to see what performs best
    # ########################################################################
    if isRunPerThread:
        global foldersName
        foldersName = folderName

        isSummarySentence = np.zeros(len(dataframeReset))
        goodSentences = dataframeReset[
            dataframeReset['FileName'].str.contains('summary')]
        summarySentenceList = dataframeReset['CleanText'].isin(
            goodSentences['CleanText'])

        # #######
        # Set up Summary Sentence Array
        # #######
        vect = TfidfVectorizer(ngram_range=(1, 2))
        vect.fit(dataframeReset['CleanText'])
        rawVector = vect.transform(
            np.array([re.sub('_', r' ', folderName.lower())]))
        tfidfDataFrame = vect.fit_transform(dataframeReset['CleanText'])

        # Raw Emails
        for index, row in dataframeReset.iterrows():
            cosineSim = metrics.pairwise.cosine_similarity(
                tfidfDataFrame[index], rawVector)[0][0]
            if cosineSim != 0:
                dataframeReset.loc[index, 'CosineSimilarity'] = cosineSim
        maxVal = max(dataframeReset['CosineSimilarity'])
        if maxVal != 0:
            for index, row in dataframeReset.iterrows():
                dataframeReset.loc[index, 'CosineSimilarity'] = row[
                    'CosineSimilarity'] / float(maxVal)

        # Create and assign the start of the return array the answer for the
        # first accuracy score
        accuracy_Array = []

        # initialize folds
        kf = KFold(n_splits=3, shuffle=True, random_state=7)
        splitCounter = 1
        # The internet told me to split it like this
        for train_index, test_index in kf.split(dataframeReset, dataframeReset):
            # Create a new naive_bayes model for each test set and then put
            # its accuracy in an array
            nb = MultinomialNB()
            vect = TfidfVectorizer(ngram_range=(1, 2))
            # fit and transform training into vector matrix
            emails_train_dtm = vect.fit_transform(
                dataframeReset['CleanTextNoPunc'].iloc[train_index].values)
            emails_test_dtm = vect.transform(
                dataframeReset['CleanTextNoPunc'].iloc[test_index].values)
            # Fit and then compare the predictions
            nb.fit(emails_train_dtm, summarySentenceList[train_index])
            category_prediction_test = nb.predict(emails_test_dtm)
            accuracy_Array.append(
                metrics.f1_score(summarySentenceList[test_index],
                                 category_prediction_test))
            outputCSVDataframe = pd.concat([
                pd.SparseDataFrame(emails_test_dtm).reset_index(drop=True),
                pd.DataFrame(list(summarySentenceList[test_index].astype(int)),
                             columns=['Actual']).reset_index(drop=True),
                pd.DataFrame(category_prediction_test.astype(int),
                             columns=['Predicted'])
            ], axis=1)
            outputCSVDataframe.to_csv('Output/TFIDF/UK_' + folderName +
                                      str(splitCounter) + '.csv',
                                      encoding='utf-8', index=False)
            splitCounter += 1

        if len(accuracy_Array) > 0:
            naiveStatsArray.append({
                'ThreadName': folderName,
                'F1_Score': sum(accuracy_Array) / float(len(accuracy_Array)),
                'Confusion_Matrix': metrics.confusion_matrix(
                    summarySentenceList[test_index], category_prediction_test)
            })

        # ##########
        # SVM
        # ##########
        threads = []
        for cAmount in np.linspace(1, 15, 15):
            for gammaAmount in np.linspace(.01, .1, 10):
                threads.append(
                    Thread(target=SingleLearningThread,
                           args=(folderName,
                                 dataframeReset[[
                                     'CleanTextNoPunc', 'TopOneSentence',
                                     'TopTwoSentence', 'TopThreeSentence',
                                     'TopFourSentence', 'TopFiveSentence',
                                     'SentenceLengthBeforeStop',
                                     'CosineSimilarity'
                                 ]], summarySentenceList, cAmount,
                                 gammaAmount)))
                threads[-1].start()

    return dataframeReset, dataframeNoStopReset
def transform(self, X):
    return pd.SparseDataFrame(
        data=super().transform(X),
        columns=self.column_names_,
        index=X.index if isinstance(X, pd.DataFrame) else None,
        default_fill_value=0)
def geodesic_matrix(x, tn_ids=None, directed=False, weight='weight'):
    """ Generates geodesic ("along-the-arbor") distance matrix for treenodes
    of given neuron.

    Parameters
    ----------
    x :         CatmaidNeuron | CatmaidNeuronList
                If list, must contain a SINGLE neuron.
    tn_ids :    list | numpy.ndarray, optional
                Treenode IDs. If provided, will compute distances only FROM
                this subset to all other nodes.
    directed :  bool, optional
                If True, pairs without a child->parent path will be returned
                with ``distance = "inf"``.
    weight :    'weight' | None, optional
                If 'weight', distances are given as physical length.
                If ``None``, distances are given as number of nodes.

    Returns
    -------
    pd.SparseDataFrame
                Geodesic distance matrix. Distances in nanometres.

    See Also
    --------
    :func:`~pymaid.distal_to`
        Check if a node A is distal to node B.
    :func:`~pymaid.dist_between`
        Get point-to-point geodesic distances.
    """
    if isinstance(x, core.CatmaidNeuronList):
        if len(x) == 1:
            x = x[0]
        else:
            raise ValueError('Cannot process more than a single neuron.')
    elif isinstance(x, core.CatmaidNeuron):
        pass
    else:
        raise ValueError('Unable to process data of type "{0}"'.format(type(x)))

    if x.igraph and config.use_igraph:
        nodeList = x.igraph.vs.get_attribute_values('node_id')
        # Matrix is ordered by vertex number
        m = _igraph_to_sparse(x.igraph, weight_attr=weight)
    else:
        nodeList = tuple(x.graph.nodes())
        m = nx.to_scipy_sparse_matrix(x.graph, nodeList, weight=weight)

    if not isinstance(tn_ids, type(None)):
        tn_ids = set(utils._make_iterable(tn_ids))
        tn_indices = tuple(i for i, node in enumerate(nodeList)
                           if node in tn_ids)
        ix = [nodeList[i] for i in tn_indices]
    else:
        tn_indices = None
        ix = nodeList

    dmat = csgraph.dijkstra(m, directed=directed, indices=tn_indices)

    return pd.SparseDataFrame(dmat, columns=nodeList, index=ix,
                              default_fill_value=float('inf'))
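# A hedged usage sketch of geodesic_matrix; it assumes a CATMAID connection is
# already set up and that the skeleton ID (16 here) exists on the server --
# both are placeholders, not values from the original code.
import pymaid

# rm = pymaid.CatmaidInstance(server_url, api_token)  # connection assumed to exist
n = pymaid.get_neuron(16)                             # hypothetical skeleton ID
dmat = geodesic_matrix(n)                             # node-by-node distances in nm
sub = geodesic_matrix(n, tn_ids=dmat.index[:10])      # distances FROM a node subset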
def postPredictions_KMeans(input_cluster_algorithm, input_cluster_num, rating_percent):
    # Store the KMeans results in the DB
    print("postKMeansResult started")
    if not input_cluster_num or input_cluster_num > 20:
        input_cluster_num = 20

    movie_queryset = Movie.objects.all()
    rating_limit = int(len(Rating.objects.all()) * (rating_percent / 100))
    rating_queryset = Rating.objects.all()[:rating_limit]

    movies = to_df(movie_queryset)
    ratings = to_df(rating_queryset)
    movies = movies.rename(columns={'id': 'movieId'})
    ratings = ratings.rename(columns={'userid': 'userId', 'movieid': 'movieId'})
    print('The dataset contains: ', len(ratings), ' ratings of ',
          len(movies), ' movies.')

    #####
    # Merge the two tables then pivot so we have a Users X Movies dataframe
    ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')
    user_movie_ratings = pd.pivot_table(ratings_title, index='userId',
                                        columns='title', values='rating')
    most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)
    sparse_ratings = csr_matrix(pd.SparseDataFrame(most_rated_movies_1k).to_coo())
    # sparse_ratings = csr_matrix(pd.SparseDataFrame(user_movie_ratings).to_coo())
    # print(sparse_ratings)

    if not input_cluster_algorithm:
        input_cluster_algorithm = 'Kmeans'

    # 20 clusters
    if input_cluster_algorithm == 'Kmeans':
        predictions = KMeans(n_clusters=input_cluster_num,
                             algorithm='full').fit_predict(sparse_ratings)
    if input_cluster_algorithm == 'EM':
        predictions = GaussianMixture(
            n_components=input_cluster_num).fit_predict(sparse_ratings.toarray())
    if input_cluster_algorithm == 'Hierarchical':
        predictions = AgglomerativeClustering(
            n_clusters=input_cluster_num, affinity='euclidean',
            linkage='ward').fit_predict(sparse_ratings.toarray())
    if input_cluster_algorithm == 'Kmeans_self':
        # For KMeans_self, turn the data back into list form
        dense_ratings = sparse_ratings.todense()
        # print(dok_matrix(a))
        matrix_ratings = dok_matrix(dense_ratings)
        rowcol_ratings = list(matrix_ratings.keys())
        value_ratings = list(matrix_ratings.values())
        km_self_ratings = []
        for i in range(len(rowcol_ratings)):
            km_self_ratings.append(
                [rowcol_ratings[i][0], rowcol_ratings[i][1], value_ratings[i]])
        clus = KMeans_algo(20)
        predictions = clus.train(km_self_ratings)
    if input_cluster_algorithm == 'KNN':
        predictions = NearestNeighbors(n_neighbors=20,
                                       algorithm='auto').fit(sparse_ratings)
        print("pre: ", predictions)
    # predictions = AgglomerativeClustering(n_clusters=20, affinity='euclidean', linkage='ward').fit_predict(sparse_ratings)

    print("predictions")
    print(predictions)
    print('predictions len ', len(predictions))
    # At this point the predictions can be saved to the DB.
    return predictions
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# In[2]:

fin = open("T10I4D100K.txt", "r")
dataset = [[int(n) for n in line.split()] for line in fin]

# In[3]:

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset, sparse=True)
sparse_df = pd.SparseDataFrame(te_ary, columns=te.columns_,
                               default_fill_value=False)
sparse_df

# In[4]:

frequent_itemsets5 = apriori(sparse_df, min_support=0.5, use_colnames=True)
frequent_itemsets5

# In[5]:

frequent_itemsets1 = apriori(sparse_df, min_support=0.1, use_colnames=True)
frequent_itemsets1

# In[6]:
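# The association_rules import above is never used in the cells shown; a
# minimal follow-up sketch (the metric and threshold are arbitrary choices,
# not from the original notebook):
rules = association_rules(frequent_itemsets1, metric="confidence",
                          min_threshold=0.6)
rules.sort_values("lift", ascending=False).head()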
prevmov = data[0][1]
#print "===="
for i in data:
    if i[1] == prevmov:
        ud[i[0]] = float(i[2])
    else:
        md[prevmov] = dict(ud)
        ud.clear()
        prevmov = i[1]
        ud[i[0]] = float(i[2])
md[prevmov] = dict(ud)
#print "===="

df = pd.SparseDataFrame(md)
centarr = pd.SparseDataFrame(df - df.mean()).fillna(0)
simmat = pd.SparseDataFrame(np.dot(centarr.T, centarr))
n = np.linalg.norm(centarr, axis=0)
nt = np.linalg.norm(centarr.T, axis=1)
simmat = pd.DataFrame(simmat / n)
simmat = pd.DataFrame(simmat.T / nt)
kys = sorted(md.keys())  # dict.keys() is not sortable in-place on Python 3
simmat.columns = kys
simmat.index = kys
    strings = strings.lower()
    strings = strings.split()
    ps = PorterStemmer()
    strings = [
        ps.stem(word) for word in strings
        if not word in set(stopwords.words('english'))
    ]
    strings = ' '.join(strings)
    spam_corpus.append(strings)

# ham vector
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=20, analyzer='word')
cv_addr = cv.fit_transform(ham_corpus)
ham_vector = pd.SparseDataFrame(cv_addr,
                                columns=cv.get_feature_names(),
                                default_fill_value=0)
tmp = []
for col in cv.get_feature_names():
    tmp.append([col, sum(ham_vector[col])])
ham_gdf = pd.DataFrame(tmp, columns=['word', 'frequency']).sort_values(
    by=['frequency'], ascending=False).reset_index()
del ham_gdf['index']

# spam vector
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=20, analyzer='word')
cv_addr = cv.fit_transform(spam_corpus)
def head(self, n=1):
    """Display head of the sparse frame."""
    n = min(n, len(self._index))
    return pd.SparseDataFrame(self.data[:n, :].todense(),
                              index=self.index[:n],
                              columns=self.columns)
def test_large_sparse_dataframe_library_size():
    X = pd.SparseDataFrame(sparse.coo_matrix((10**7, 2 * 10**4)),
                           default_fill_value=0.0)
    cell_sums = scprep.measure.library_size(X)
    assert cell_sums.shape[0] == X.shape[0]
classificacao = comentarios["sentiment"].replace(["neg", "pos"], [0, 1])
comentarios["classificacao"] = classificacao
comentarios.head()

"""Using CountVectorizer to create a bag of words: it splits out every word
found in each sentence."""

from sklearn.feature_extraction.text import CountVectorizer

# Example
textos = ["Assisti um filme ótimo", "Assisti um filme péssimo"]

vetorizar = CountVectorizer(lowercase=False)
bag_of_words = vetorizar.fit_transform(textos)
vetorizar.get_feature_names()

"""Creating a table from this test data to understand how it works. Instead
of storing the 0 values and taking up memory, it stores NaN, which is a null
value."""

vetorizado = pd.SparseDataFrame(bag_of_words,
                                columns=vetorizar.get_feature_names())
vetorizado

"""Starting the sentiment classification based on the imported spreadsheet."""

vetorizar = CountVectorizer(lowercase=False, max_features=50)
bag_of_words = vetorizar.fit_transform(comentarios["text_pt"])
print(bag_of_words.shape)

"""Splitting the data into train and test with sklearn.model_selection and
printing the resulting split sizes."""

from sklearn.model_selection import train_test_split

treino, teste, classe_treino, classe_teste = train_test_split(
    bag_of_words, comentarios["classificacao"], random_state=42)
print('Train: {treino}, Test: {teste}'.format(treino=treino.shape,
                                              teste=teste.shape))

"""Creating a linear model with logistic regression."""
def library_size_normalize(data, rescale='median'):
    """Performs L1 normalization on input data

    Performs L1 normalization on input data such that the sum of expression
    values for each cell sums to 1, then returns the normalized matrix to the
    metric space using the median UMI count per cell, effectively scaling all
    cells as if they were sampled evenly.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    rescale : {'mean', 'median'}, float or `None`, optional (default: 'median')
        Rescaling strategy. If 'mean' or 'median', normalized cells are
        scaled back up to the mean or median expression value. If a float,
        normalized cells are scaled up to the given value. If `None`, no
        rescaling is done and all cells will have normalized library size of 1.

    Returns
    -------
    data_norm : array-like, shape=[n_samples, n_features]
        Library size normalized output data
    """
    # pandas support
    columns, index = None, None
    if isinstance(data, pd.SparseDataFrame) or \
            pd.api.types.is_sparse(data):
        columns, index = data.columns, data.index
        data = data.to_coo()
    elif isinstance(data, pd.DataFrame):
        columns, index = data.columns, data.index

    if rescale == 'median':
        rescale = np.median(np.array(measure.library_size(data)))
        if rescale == 0:
            warnings.warn("Median library size is zero. "
                          "Rescaling to mean instead.", UserWarning)
            rescale = np.mean(np.array(measure.library_size(data)))
    elif rescale == 'mean':
        rescale = np.mean(np.array(measure.library_size(data)))
    elif isinstance(rescale, numbers.Number):
        pass
    elif rescale is None:
        rescale = 1
    else:
        raise ValueError("Expected rescale in ['median', 'mean'], a number "
                         "or `None`. Got {}".format(rescale))

    if sparse.issparse(data) and data.nnz >= 2**31:
        # check we can access elements by index
        try:
            data[0, 0]
        except TypeError:
            data = sparse.csr_matrix(data)
        # normalize in chunks - sklearn can't handle more than 2**31
        # non-zero elements
        #
        # determine maximum chunk size
        split = 2**30 // (data.nnz // data.shape[0])
        size_ok = False
        while not size_ok:
            for i in range(0, data.shape[0], split):
                if data[i:i + split, :].nnz >= 2**31:
                    split = split // 2
                    break
            else:
                size_ok = True
        # normalize
        data_norm = []
        for i in range(0, data.shape[0], split):
            data_norm.append(normalize(data[i:i + split, :], 'l1', axis=1))
        # combine chunks
        data_norm = sparse.vstack(data_norm)
    else:
        data_norm = normalize(data, norm='l1', axis=1)
        # norm='l1' computes the L1 norm of each sample;
        # axis=1 normalizes each sample independently

    data_norm = data_norm * rescale
    if columns is not None:
        # pandas dataframe
        if sparse.issparse(data_norm):
            data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0.0)
        else:
            data_norm = pd.DataFrame(data_norm)
        data_norm.columns = columns
        data_norm.index = index
    return data_norm
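# A quick, hedged check of the function above with a made-up matrix: the two
# library sizes are 2 and 8, so with the default rescale='median' every row of
# the output should sum to 5.0.
import numpy as np

X = np.array([[1., 1., 0.],
              [2., 2., 4.]])
X_norm = library_size_normalize(X)
print(X_norm.sum(axis=1))  # -> [5. 5.]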
    sR = sR[sR != 0]  # Remove Zeros
    # Return a Series to each row of a new DataFrame
    return pd.concat([sR, sT], axis='index')

print('> Tweet textual features')
dftextpost = dfP['tweet.text'].apply(tweet_textual_features)
dfP['tweet.parent_text'] = dftextpost['parent_text']  # Parent Text
dftextpost.drop(['parent_text'], axis='columns', inplace=True)

# TF-IDF
print('> TF-IDF for tweet')
tfidf = TfidfVectorizer(analyzer='word', stop_words='english',
                        ngram_range=(1, 1), max_df=0.9, min_df=5,
                        max_features=1000, binary=False)
X = tfidf.fit_transform(dfP['tweet.text'].values)
tfidf_feature_names = ['post_tfidf_(' + name + ')'
                       for name in tfidf.get_feature_names()]
dftfidf = pd.SparseDataFrame(X, columns=tfidf_feature_names, index=dfP.index)

# TF-IDF (for parent text)
print('> TF-IDF for parent text')
X = tfidf.fit_transform(dfP['tweet.parent_text'].values)
tfidf_feature_names = ['post_tfidf_parent_(' + name + ')'
                       for name in tfidf.get_feature_names()]
dftfidf_parent = pd.SparseDataFrame(X, columns=tfidf_feature_names).set_index(dfP.index)

# Final concat
dfI = pd.concat([
    dfI,             # Base features
    dfsent,          # Sentiment features
    dftextpost,      # Textual features
    dftfidf,         # TF-IDF features
    dftfidf_parent   # TF-IDF features on parent terms
], sort=False, axis='columns')
def logOrderBookSnapshots(self, symbol):
    """
    Log full depth quotes (price, volume) from this order book at some
    pre-determined frequency. Here we are looking at the actual log for this
    order book (i.e. are there snapshots to export, independent of the
    requested frequency).
    """
    def get_quote_range_iterator(s):
        """ Helper method for order book logging. Takes pandas Series and
        returns python range() from first to last element. """
        forbidden_values = [0, 19999900]  # TODO: Put constant value in more sensible place!
        quotes = sorted(s)
        for val in forbidden_values:
            try:
                quotes.remove(val)
            except ValueError:
                pass
        return quotes

    book = self.order_books[symbol]
    if book.book_log:
        print("Logging order book to file...")
        dfLog = book.book_log_to_df()
        dfLog.set_index('QuoteTime', inplace=True)
        dfLog = dfLog[~dfLog.index.duplicated(keep='last')]
        dfLog.sort_index(inplace=True)

        if str(self.book_freq).isdigit() and int(self.book_freq) == 0:
            # Save all possible information
            # Get the full range of quotes at the finest possible resolution.
            quotes = get_quote_range_iterator(dfLog.columns.unique())

            # Restructure the log to have multi-level rows of all possible
            # pairs of time and quote with volume as the only column.
            if not self.wide_book:
                filledIndex = pd.MultiIndex.from_product(
                    [dfLog.index, quotes], names=['time', 'quote'])
                dfLog = dfLog.stack()
                dfLog = dfLog.reindex(filledIndex)

            filename = f'ORDERBOOK_{symbol}_FULL'
        else:
            # Sample at frequency self.book_freq
            # With multiple quotes in a nanosecond, use the last one, then
            # resample to the requested freq.
            dfLog = dfLog.resample(self.book_freq).ffill()
            dfLog.sort_index(inplace=True)

            # Create a fully populated index at the desired frequency from
            # market open to close. Then project the logged data into this
            # complete index.
            time_idx = pd.date_range(self.mkt_open, self.mkt_close,
                                     freq=self.book_freq, closed='right')
            dfLog = dfLog.reindex(time_idx, method='ffill')
            dfLog.sort_index(inplace=True)

            if not self.wide_book:
                dfLog = dfLog.stack()
                dfLog.sort_index(inplace=True)

                # Get the full range of quotes at the finest possible resolution.
                quotes = get_quote_range_iterator(
                    dfLog.index.get_level_values(1).unique())

                # Restructure the log to have multi-level rows of all possible
                # pairs of time and quote with volume as the only column.
                filledIndex = pd.MultiIndex.from_product(
                    [time_idx, quotes], names=['time', 'quote'])
                dfLog = dfLog.reindex(filledIndex)

            filename = f'ORDERBOOK_{symbol}_FREQ_{self.book_freq}'

        # Final cleanup
        if not self.wide_book:
            dfLog = dfLog.rename('Volume')
            df = pd.SparseDataFrame(index=dfLog.index)
            df['Volume'] = dfLog
        else:
            df = dfLog
            df = df.reindex(sorted(df.columns), axis=1)

        # Archive the order book snapshots directly to a file named with the
        # symbol, rather than to the exchange agent log.
        self.writeLog(df, filename=filename)
        print("Order book logging complete!")
                 'eval_metric': ['error', 'auc'],
                 'seed': '2017'}
        self.model = xgb.train(param, dtrain,
                               num_boost_round=param['num_boost_round'],
                               early_stopping_rounds=50,
                               verbose_eval=1)
        return self

    def transform(self, X: pd.DataFrame, y=None, *args, **kwargs):
        x_predict = self.pipeline.fit_transform(X)
        dpredict = xgb.DMatrix(x_predict)
        del x_predict
        predicted = pd.Series(self.model.predict(dpredict), index=X.index)
        del dpredict
        if 'after' in X.columns:
            return X.assign(after=X['after'].combine_first(
                X[predicted >= self.threshold]['before']))
        else:
            return X.assign(after=X[predicted >= self.threshold]['before'])


if __name__ == '__main__':
    df = pd.SparseDataFrame(
        ['в 1905 году', '123', '123', '-', '321', '&', '0546'] +
        'съешь ещё этих мягких французских булок, да выпей чаю по - фиг'.split(),
        columns=['before'])
    df['prev'] = df['before'].shift(1).fillna('').to_dense()
    df['next'] = df['before'].shift(-1).fillna('').to_dense()
    print(df)
    st = SelfTransformer(threshold=0.5,
                         modelpath='models/self.model.train_9517064_0.00117_0.3_500_6')
    print(st.fit_transform(df))
def coo_to_sparse_DF(m, sz):
    # Row indexing requires CSR; COO matrices are not subscriptable.
    m = m.tocsr()
    return pd.SparseDataFrame(
        [pd.SparseSeries(m[i].toarray().ravel()) for i in np.arange(sz)])
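# A toy usage sketch of the converter above.
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

m = coo_matrix(np.array([[0, 1], [2, 0]]))
print(coo_to_sparse_DF(m, m.shape[0]))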
def prepare_bag_of_apps_datasets(data_dir):
    # Based on: https://www.kaggle.com/xiaoml/talkingdata-mobile-user-demographics/low-ram-bag-of-apps-python/
    # First, check if the datasets have already been created
    boa_file_path_1 = os.path.join(data_dir, "bag_of_apps_train.h5")
    boa_file_path_2 = os.path.join(data_dir, "bag_of_apps_test.h5")
    if os.path.exists(boa_file_path_1) and os.path.exists(boa_file_path_2):
        logger.info("Reading Bag-of-Apps datasets from {} & {}".format(
            boa_file_path_1, boa_file_path_2))
        a = pd.read_hdf(boa_file_path_1, "a")
        b = pd.read_hdf(boa_file_path_2, "b")
        return a, b

    # Create the datasets
    logger.info("Preparing Bag-of-Apps datasets")
    app_labels = read_gz(data_dir, "app_labels.csv.gz")
    app_labels = app_labels.groupby("app_id")["label_id"]\
        .apply(lambda x: " ".join(str(s) for s in x))

    app_events = read_gz(data_dir, "app_events.csv.gz")
    app_events["app_labels"] = app_events["app_id"].map(app_labels)
    app_events = app_events.groupby("event_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_labels

    events = pd.read_csv(os.path.join(data_dir, "events.csv.gz"),
                         dtype={"device_id": np.str})
    events["app_labels"] = events["event_id"].map(app_events)
    events = events.groupby("device_id")["app_labels"]\
        .apply(lambda x: " ".join(str(s) for s in x))
    del app_events

    pbd = pd.read_csv(os.path.join(data_dir, "phone_brand_device_model.csv.gz"),
                      dtype={"device_id": np.str})
    pbd.drop_duplicates("device_id", keep="first", inplace=True)

    _train = read_gz(data_dir, "gender_age_train.csv.gz")
    _train["app_labels"] = _train["device_id"].map(events)
    _train = pd.merge(_train, pbd, how="left", on="device_id", left_index=True)

    _test = read_gz(data_dir, "gender_age_test.csv.gz")
    _test["app_labels"] = _test["device_id"].map(events)
    _test = pd.merge(_test, pbd, how="left", on="device_id", left_index=True)
    del pbd
    del events

    df_all = pd.concat((_train, _test), axis=0, ignore_index=True)
    split_len = len(_train)
    vec = CountVectorizer(min_df=1, binary=1)
    df_all = df_all[["phone_brand", "device_model", "app_labels"]]\
        .astype(np.str).apply(lambda x: " ".join(s for s in x), axis=1)\
        .fillna("Missing")
    df_tfv = vec.fit_transform(df_all)  # 186716 x 2045 sparse matrix
    _train = df_tfv[:split_len, :]      # 74645 x 2045 sparse matrix
    _test = df_tfv[split_len:, :]       # 112071 x 2045 sparse matrix

    # Converting the sparse matrix into a DataFrame
    a = pd.SparseDataFrame([
        pd.SparseSeries(_train[i].toarray().ravel())
        for i in np.arange(_train.shape[0])
    ])
    b = pd.SparseDataFrame([
        pd.SparseSeries(_test[i].toarray().ravel())
        for i in np.arange(_test.shape[0])
    ])

    # Rename the columns
    app_labels_cols = ["a" + str(x) for x in np.arange(0, a.shape[1]).tolist()]
    d = dict(zip(np.arange(0, a.shape[1]).tolist(), app_labels_cols))
    a.rename(columns=d, inplace=True)
    b.rename(columns=d, inplace=True)

    # Write to file
    a.to_sparse(kind='block')\
        .to_hdf(boa_file_path_1, "a", mode="w", complib="blosc", complevel=9)
    b.to_sparse(kind='block')\
        .to_hdf(boa_file_path_2, "b", mode="w", complib="blosc", complevel=9)
    del _train
    del _test

    # TO USE, DO
    # train = pd.merge(train, a, left_index=True, right_index=True)
    return a, b  # bag-of-apps datasets
def convert_genesetlist(gslist, to, output_fname=None, verbose=False):
    '''
    Converts an input geneset list into another representation: gmt or gvm.
    Returns it. If `to == gmt` and an output file name is given, it will save
    the results to `output_fname`. If `output_fname` already exists, then the
    results saved to that file will be used.

    gslist : pandas.Series
        The geneset list to be converted.
    to : str
        Either 'gmt' or 'gvm'.
    output_fname : str
        The name of the file to save the results to.
    verbose : bool
        Control the frequency of print statements used when converting to gvm.
    '''
    if verbose:
        print('obtaining ' + output_fname)
    if to == 'gmt':
        # Create the gmt.
        gmt = [[annot] + [''] + genes
               for (annot, genes) in zip(gslist.index, gslist.values)]
        # Save it to the file if it does not exist yet.
        if output_fname is not None:
            if not file_exists(output_fname):
                with open(output_fname, 'w', newline='') as f:
                    writer = csv.writer(f, delimiter='\t')
                    for geneset in gmt:
                        writer.writerow(geneset)
        return gmt
    elif to == 'gvm':
        # If the gvm file already exists, load it and return it.
        if file_exists(output_fname):
            return open_gvm(output_fname)
        elif file_exists(output_fname.replace('gvm.csv', 'gvm.pkl')):
            return open_gvm(output_fname.replace('gvm.csv', 'gvm.pkl'))
        # Otherwise, create it.
        all_genes_set = {item for sublist in gslist for item in sublist}
        all_genes = pd.Series(sorted(all_genes_set))
        gslist = gslist.apply(set)
        gvm = [np.array(all_genes.isin(gs), dtype=bool) for gs in gslist]
        # Save the gvm file as a csv, or as a pickled pandas.SparseDataFrame
        # if it is too large.
        # Transpose matrix.
        if len(gvm) < 10000:
            gvm = pd.DataFrame(gvm).transpose()
        else:
            if verbose:
                print('getting coo_matrix for gvm with ' + str(len(gvm)) + ' columns.')
            gvm = coo_matrix(gvm, dtype=bool).transpose()
            if verbose:
                print('converting coo_matrix to sparse df')
            gvm = pd.SparseDataFrame(gvm, dtype=bool, default_fill_value=False)
            if verbose:
                print('obtained sparse df.')
        # Format.
        gvm.index = all_genes
        gvm.columns = gslist.index
        if output_fname is not None:
            if gvm.shape[1] < 10000:
                gvm = gvm.replace(to_replace=False, value='')
                gvm.to_csv(output_fname, sep='\t')
            else:
                gvm.to_pickle(output_fname.replace('gvm.csv', 'gvm.pkl'))
        return gvm
    else:
        raise ValueError('The desired representation (`to`) is unsupported: ' + to)
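# A hedged usage sketch with a toy geneset list. The output path is a
# placeholder, and `file_exists`/`open_gvm` are assumed to come from the
# surrounding module.
import pandas as pd

gslist = pd.Series([['TP53', 'EGFR'], ['EGFR', 'MYC', 'KRAS']],
                   index=['set_a', 'set_b'])
gmt = convert_genesetlist(gslist, to='gmt')                  # no file written
gvm = convert_genesetlist(gslist, to='gvm',
                          output_fname='toy_gvm.csv')        # hypothetical path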
def SparseDataFrame_deprecated(X, default_fill_value=0.0):
    return pd.SparseDataFrame(X, default_fill_value=default_fill_value)
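# pd.SparseDataFrame was removed in pandas 1.0, so the shim above only runs on
# older pandas. A sketch of the modern equivalent, assuming the input is a
# SciPy sparse matrix or an array-like:
import pandas as pd
import scipy.sparse as sp

def sparse_dataframe(X, fill_value=0.0):
    # pandas >= 0.25: sparse values live in ordinary DataFrames as SparseArray columns
    if sp.issparse(X):
        return pd.DataFrame.sparse.from_spmatrix(X)
    return pd.DataFrame(X).astype(pd.SparseDtype(float, fill_value=fill_value))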
def get_matrix_for_platform(self, exp, gene_list, mirna_list=None,
                            symmetrize=True, identifiers=True, tolower=False):
    if settings.CELERY_DEBUG:
        import sys
        sys.path.append(
            '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
        )
        import pydevd
        pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                        stderrToServer=True)

    from collections import defaultdict
    from wrappers.input.utils import find_refseqs

    log.debug(gene_list)
    if mirna_list:
        log.debug(mirna_list)

    regex = "^[A-Z][A-Z]_[a-zA-Z0-9.]*"
    if len(filter(lambda x: x is not None,
                  map(lambda x: re.match(regex, str(x), re.IGNORECASE),
                      gene_list))) < (len(gene_list) * 0.5):
        new_g = []
        for gene in gene_list:
            rf = list(find_refseqs(gene))
            if len(rf) > 0:
                new_g.append(rf[0])
            if len(rf) == 0:
                new_g.append(gene)
        gene_list = new_g

    hasht = dict(zip(gene_list, range(len(gene_list))))
    mirna_hasht = dict()
    if mirna_list is not None:
        new_g = []
        for gene in mirna_list:
            rf = list(find_refseqs(gene))
            if len(rf) > 0:
                new_g.append(rf[0])
            else:
                new_g.append(gene)
        mirna_list = new_g
        mirna_hasht = dict(zip(mirna_list, range(len(mirna_list))))

    inter_hash = defaultdict(list)
    interactions = self.load_pairs()
    cols = []
    rows = []
    log.debug("transforming interactions")
    for ix in range(len(interactions)):
        a, b, val = interactions.iloc[ix]
        if mirna_list is not None:
            if self.x2_unit == 'mirbase':
                inter_hash[b].append([a, val])
            else:
                inter_hash[a].append([b, val])
        else:
            inter_hash[a].append([b, val])
    if exp:
        AllUpdated(exp.pk,
                   comment=u"Transforming interaction matrix done",
                   silent=False,
                   mode=NotifyMode.INFO).send()
    log.debug("transformation of interactions done")

    count = 0
    counter2 = 0
    counter3 = 0
    counter4 = 0
    size_hash = len(inter_hash)
    if mirna_list is None:
        for key, value in inter_hash.iteritems():
            count += 1
            if count % 500 == 0:
                log.debug("translating gene %d", count)
                if exp:
                    AllUpdated(exp.pk,
                               comment=u"Translating gene %s of %s" % (count, size_hash),
                               silent=False,
                               mode=NotifyMode.INFO).send()
            refseqs = find_refseqs(key)
            for refseq in refseqs:
                counter2 += 1
                if refseq not in hasht:
                    continue
                if refseq in hasht:
                    for (gene, strength) in value:
                        # new_inters.append((refseq, new_refseq, strength))
                        for new_refseq in find_refseqs(gene):
                            counter3 += 1
                            gi = refseq
                            gj = new_refseq
                            if gj not in hasht:
                                continue
                            counter4 += 1
                            val = strength
                            if tolower:
                                gi = gi.lower()
                                gj = gj.lower()
                            cols.append(hasht[gi])
                            rows.append(hasht[gj])
    else:
        for key, value in inter_hash.iteritems():
            count += 1
            if count % 500 == 0:
                log.debug("translating miRNA %d", count)
                if exp:
                    AllUpdated(exp.pk,
                               comment=u"Translating miRNA %s of %s" % (count, size_hash),
                               silent=False,
                               mode=NotifyMode.INFO).send()
            refseqs = find_refseqs(key)
            for refseq in refseqs:
                counter2 += 1
                if refseq not in mirna_hasht:
                    continue
                if refseq in mirna_hasht:
                    for (gene, strength) in value:
                        for new_refseq in find_refseqs(gene):
                            counter3 += 1
                            gi = refseq
                            gj = new_refseq
                            if gj not in hasht:
                                continue
                            counter4 += 1
                            val = strength
                            if tolower:
                                gi = gi.lower()
                                gj = gj.lower()
                            rows.append(mirna_hasht[gi])
                            cols.append(hasht[gj])

    # size = max(max(rows), max(cols)) + 1
    if exp:
        AllUpdated(exp.pk,
                   comment=u"%d interactions were found." % len(cols),
                   silent=False,
                   mode=NotifyMode.INFO).send()

    inters_matr = None
    # TODO fix for custom value of interactions
    if mirna_list is None:
        # inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)), (size, size))
        inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)),
                                    (len(gene_list), len(gene_list)))
    else:
        inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)),
                                    (len(mirna_list), len(gene_list)))
    # inters_matr = sp.coo_matrix((np.ones(len(cols)), (rows, cols)), (max(rows) + 1, max(cols) + 1))

    if symmetrize:
        inters_matr = inters_matr + inters_matr.T
        inters_matr.data /= inters_matr.data

    if identifiers:
        inters_matr = inters_matr.tocsr()
        sparse_df = pd.SparseDataFrame([
            pd.SparseSeries(inters_matr[i].toarray().ravel())
            for i in np.arange(inters_matr.shape[0])
        ])
        # sparse_df = sparse_df.to_dense()
        if mirna_list is None:
            index = gene_list[:sparse_df.shape[0]]
            columns = gene_list[:sparse_df.shape[1]]
        else:
            index = mirna_list[:sparse_df.shape[0]]
            columns = gene_list[:sparse_df.shape[1]]
        if settings.CELERY_DEBUG:
            import sys
            sys.path.append(
                '/Migration/skola/phd/projects/miXGENE/mixgene_project/wrappers/pycharm-debug.egg'
            )
            import pydevd
            pydevd.settrace('localhost', port=6901, stdoutToServer=True,
                            stderrToServer=True)
        # sparse_df['new_index'] = pd.Series(index, index=sparse_df.index)
        sparse_df.set_index([index], inplace=True)
        sparse_df.columns = columns
        return sparse_df
    return inters_matr
def extract_features(self, bag_of_words=False, lemmatize=True):
    self.posts['has_article'] = self.posts.article_name.apply(lambda x: x != None)
    self.posts['text_length'] = self.posts.text.apply(len)
    self.posts['num_hashtags'] = self.posts.hashtags.apply(len)
    self.posts['has_text'] = self.posts.text_length.apply(lambda x: x > 0)
    self.posts['num_linked_profiles'] = self.posts.linked_profiles.apply(len)
    self.posts['num_links'] = self.posts.links.apply(len)

    # Extract nltk features ---------------------------------------------------
    stop_words = set(nltk.corpus.stopwords.words("english"))  # Stop words to not consider

    # Tokenize
    self.posts['text_tokenized'] = self.posts.text.apply(nltk.tokenize.word_tokenize)
    self.posts['num_tokens'] = self.posts.text_tokenized.apply(len)

    # Tokenize - no punctuations
    no_punc_tokenizer = RegexpTokenizer(r'\w+')
    self.posts['text_tokenized_filtered'] = self.posts.text.apply(
        lambda words: [
            word.lower() for word in no_punc_tokenizer.tokenize(words)
            if word not in stop_words
        ])

    # Tokenize - lemmatize and count POS
    if lemmatize:
        def get_wordnet_pos(pos):
            pos = pos[0].upper()
            wordnet_tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            return wordnet_tag_dict.get(pos, wordnet.NOUN)

        lem = nltk.stem.wordnet.WordNetLemmatizer()  # Lemmatize words if possible
        self.posts['text_tokenized_lemmatized'] = self.posts.text_tokenized_filtered.apply(
            lambda words: [
                lem.lemmatize(word, get_wordnet_pos(pos))
                for word, pos in nltk.pos_tag(words)
            ])

        # Count POS
        pos = self.posts.text_tokenized_lemmatized.apply(
            lambda x: [pos for word, pos in nltk.pos_tag(x)])
        counted_basic = pos.apply(
            lambda x: Counter([get_wordnet_pos(word) for word in x]))
        counted = pos.apply(lambda x: Counter(x))
        for tag in ['a', 'n', 'r', 'v']:
            self.posts['num_pos_basic_' + tag] = counted_basic.apply(
                lambda x: x[tag] if x and tag in x else 0)
        for tag in set(counted.apply(lambda x: list(x.keys())).sum()):
            self.posts['num_pos_' + tag] = counted.apply(
                lambda x: x[tag] if x and tag in x else 0)

    # Collect punc info
    self.posts['num_words'] = self.posts.text_tokenized_filtered.apply(len)
    puncs = [('periods', '.'), ('exclamations', '!'), ('questionms', '?'),
             ('equals', '='), ('dollars', '$')]
    for name, punc in puncs:
        self.posts['num_' + name] = self.posts.text_tokenized.apply(
            lambda words: words.count(punc))
        self.posts['percent_' + name] = self.posts['num_' + name] / self.posts.num_tokens

    # Percent All Caps
    self.posts['percent_all_caps'] = self.posts.text_tokenized.apply(
        lambda tokens: [token.isupper() for token in tokens].count(True) /
        len(tokens) if len(tokens) else 0)

    # Percent Stop Words
    self.posts['percent_stop_words'] = self.posts.text_tokenized.apply(
        lambda x: eval(str(x))).apply(
            lambda tokens: [token.lower() in stop_words for token in tokens
                            ].count(True) / len(tokens) if len(tokens) else 0)

    # Bag of words model if wanted ---------------------------------------------
    if bag_of_words:
        count_vectorizer = CountVectorizer()
        tfidf_transformer = TfidfTransformer()
        bag_of_words_matrix = tfidf_transformer.fit_transform(
            count_vectorizer.fit_transform(self.posts.text))
        return self.posts.to_sparse().join(
            pd.SparseDataFrame(
                bag_of_words_matrix,
                columns=[
                    'word_' + x for x in count_vectorizer.get_feature_names()
                ]))

    # Sentiment Analysis
    analyser = SentimentIntensityAnalyzer()
    self.posts['sentiment'] = self.posts.text.apply(analyser.polarity_scores)

    # Readability
    self.posts['readability'] = self.posts.text.apply(lambda text: [
        textstat.smog_index(text),
        textstat.gunning_fog(text),
        textstat.flesch_kincaid_grade(text)
    ])

    # TTR
    self.posts['ttr'] = self.posts.text_tokenized_lemmatized.apply(
        lambda tokens: len(set(tokens)) / len(tokens) if len(tokens) else np.nan)

    # Syntax Tree
    # def calcDepth(text):
    #     parser = nltk.parse.corenlp.CoreNLPParser()
    #     def calcSingleDepth(sent):
    #         parse = next(parser.raw_parse(sentence))
    #         # parse.pretty_print()
    #         return parse.height()
    #     sentences = nltk.tokenize.sent_tokenize(text)
    #     totalDepth = 0
    #     for i in range(len(sentences)):
    #         sentence = sentences[i]
    #         totalDepth += calcSingleDepth(sentence)
    #     return totalDepth / len(sentences)
    # self.posts['depth'] = self.posts.text.apply(calcDepth)

    return self.posts