Example #1
import math
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def bow(corpus):
    # Bag-of-words matrix, L1-normalized so each row sums to 1.
    bow = CountVectorizer(max_features=50000,
                          ngram_range=(1, 1),
                          dtype=np.float64).fit_transform(corpus).toarray()
    bow /= bow.sum(axis=1, keepdims=True)  # L1 normalization
    # bow /= np.linalg.norm(bow, axis=1, keepdims=True)  # L2 normalization (alternative)
    bow[np.isnan(bow)] = 0  # empty documents divide by zero; reset them to all zeros
    return bow
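A quick usage sketch (a made-up three-document corpus, not from the original source) showing that each row of the L1-normalized bag-of-words sums to 1 and that the NaN guard leaves empty documents as all-zero rows:

docs = ["the cat sat on the mat", "the dog sat", ""]  # hypothetical corpus; last doc is empty
X = bow(docs)
print(X.shape)        # (3, vocabulary size)
print(X.sum(axis=1))  # [1. 1. 0.] -- NumPy warns about the 0/0 row; the isnan guard resets it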
def getXMLTagFeatures(posts):
    # Extract HTML tag names from each post body and vectorize them
    # (getTags and sparseLog are project helpers defined elsewhere).
    tagLists = []
    for i in range(len(posts)):
        curr = BeautifulSoup(posts[i]['Body'], 'html.parser')
        tagLists += [" ".join(getTags(curr))]
    X_binary = CountVectorizer(binary=True).fit_transform(tagLists)
    X_counts = CountVectorizer().fit_transform(tagLists)
    X_sums = sparseLog(X_counts.sum(axis=1))
    return X_sums, X_binary, X_counts
def getTopicTagFeatures(questionPosts, answerPosts):
    # Map each question's topic tags onto its answers via ParentId.
    tagsByQuestion = {}
    for qPost in questionPosts:
        curr = BeautifulSoup(qPost['Tags'], 'html.parser')
        tagsByQuestion[qPost['Id']] = " ".join(getTags(curr))
    answerTags = []
    for aPost in answerPosts:
        answerTags += [tagsByQuestion[aPost['ParentId']]]
    X_binary = CountVectorizer(binary=True).fit_transform(answerTags)
    X_sums = sparseLog(X_binary.sum(axis=1))
    return X_sums, X_binary
def TFIDF(sentences):
    # Term counts, scaled by the largest column sum (the most frequent term overall).
    vector = CountVectorizer().fit_transform(sentences).toarray()
    maxFreq = max(vector.sum(axis=0))
    vector = vector / maxFreq
    # ni[c]: number of documents that contain term c.
    N = len(vector)
    ni = [0] * len(vector[0])
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            if vector[l][c] != 0:
                ni[c] += 1
    # Weight each entry by its inverse document frequency log(N / ni).
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            vector[l][c] = vector[l][c] * math.log(N / ni[c])
    return vector
def TFIDFQuery(sentences):
    # Treat the concatenated sentences as a single query document.
    query = [" ".join(sentences)]
    # Fit the vocabulary on the corpus, then project the query onto the same
    # columns so the document frequencies below line up with the query's columns.
    cv = CountVectorizer()
    vector = cv.fit_transform(sentences).toarray()
    queryvector = cv.transform(query).toarray()
    maxFreq = max(vector.sum(axis=0))
    queryvector = queryvector / maxFreq
    # ni[c]: number of corpus documents that contain term c.
    N = len(vector)
    ni = [0] * len(vector[0])
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            if vector[l][c] != 0:
                ni[c] += 1
    # Weight the query counts by the corpus inverse document frequencies.
    for l in range(len(queryvector)):
        for c in range(len(queryvector[l])):
            queryvector[l][c] = queryvector[l][c] * math.log(N / ni[c])
    return queryvector
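A minimal usage sketch (made-up sentences, not from the original source). Terms that occur in every sentence get weight 0 because log(N / n_i) = 0, and the query vector is expressed over the corpus vocabulary:

sentences = ["the cat sat on the mat", "the dog sat", "the dog barked"]
doc_vectors = TFIDF(sentences)        # shape: (3, vocabulary size)
query_vector = TFIDFQuery(sentences)  # shape: (1, vocabulary size)
print(doc_vectors.shape, query_vector.shape)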
def calculate_containment(df, n, answer_filename):
    """Calculates the containment between a given answer text and its associated source text.
       This function creates a count of ngrams (of size n) for each text file in our data,
       then calculates the containment by finding the ngram counts for a given answer text
       and its associated source text, and computing the normalized intersection of those counts.
       :param df: A dataframe with the columns
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text.
    """
    # Look up the answer row and the original (source) row for the same task;
    # get_dict_from_row is a helper defined elsewhere, and Class == -1 selects the source text.
    answer_row = get_dict_from_row(df.loc[df.File == answer_filename])
    source_row = get_dict_from_row(df.loc[(df.Task == answer_row['Task']) & (df.Class == -1)])

    # Count n-grams of both texts over a shared vocabulary.
    answer_counts, source_counts = CountVectorizer(ngram_range=(n, n)).fit_transform([
        answer_row['Text'],
        source_row['Text'],
    ]).toarray()

    # Containment: size of the n-gram count intersection, normalized by the answer's n-gram total.
    return np.minimum(answer_counts, source_counts).sum() / answer_counts.sum()
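For intuition, a hand-checkable sketch of the same containment computation on made-up strings with n = 1; every answer unigram also appears in the source, so the containment comes out to 1.0:

answer = "the quick brown fox"
source = "the quick brown fox jumped"
a_counts, s_counts = CountVectorizer(ngram_range=(1, 1)).fit_transform([answer, source]).toarray()
print(np.minimum(a_counts, s_counts).sum() / a_counts.sum())  # 1.0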
Example #7
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm
from sklearn.feature_extraction.text import CountVectorizer

def tfidf(data, sublinear_tf=False, norm='l2', use_idf=True, smooth_idf=True, **kwargs):

    # Count the raw term occurrences (as floats so the sublinear log fits in place).
    tf = CountVectorizer(**kwargs).fit_transform(data).astype(np.float64)

    # Sublinear tf: replace each non-zero count c with 1 + log(c).
    if sublinear_tf:
        tf[tf != 0] = 1 + np.log(tf[tf != 0])

    # Divide by each document's total to obtain the term frequency tf.
    tf = csr_matrix(tf / tf.sum(axis=1))

    # D: total number of documents; d: number of documents each word appears in.
    D = tf.shape[0] + int(smooth_idf)
    d = tf.getnnz(axis=0) + int(smooth_idf)

    # Apply the idf formula.
    idf = 1 + np.log(D / d)

    # Multiply tf * idf when requested.
    res = (tf.multiply(idf) if use_idf else tf)

    # Normalize each row to the requested norm ('l1' or 'l2').
    return csr_matrix(res / sparse_norm(res, int(norm[1:]), axis=1).reshape(-1, 1))
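A small sanity-check sketch (made-up corpus, not from the original source): with the default arguments, the result should match scikit-learn's TfidfVectorizer, because dividing by the per-document sum is a row-wise constant that the final l2 normalization cancels out:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat", "the dog barked loudly"]
print(np.allclose(tfidf(docs).toarray(),
                  TfidfVectorizer().fit_transform(docs).toarray()))  # True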
Example #8
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Example corpus (in Russian).
corpus = [
    'Казнить нельзя, помиловать. Нельзя наказывать.',
    'Казнить, нельзя помиловать. Нельзя освободить.', 'Нельзя не помиловать.',
    'Обязательно освободить.'
]

# Get the word counts.
TF = CountVectorizer().fit_transform(corpus).toarray()

# Build the IDF. Unfortunately, this assignment only needs vectorizer.idf_.
# In the standard case the computation would end on this line;
# usually TFIDF = vectorizer.fit_transform(corpus).
vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True)
vectorizer.fit_transform(corpus)

# From IDF back to DF: with smooth_idf=False, idf = ln(N / df) + 1, so df / N = 1 / exp(idf - 1).
word_doc_freq = 1 / np.exp(vectorizer.idf_ - 1)

# Normalize TF and smooth it with a logarithm (an assignment requirement).
TFIDF = np.log(TF / TF.sum(axis=1, keepdims=True) + 1) / word_doc_freq

# Scale the features.
scaledTFIDF = StandardScaler().fit_transform(TFIDF)

# Multiply by np.sqrt((4-1)/4) to convert from DDOF(0) to DDOF(1) for 4 texts
# (an assignment requirement).
scaledTFIDF *= np.sqrt(3 / 4)

# Print in order of increasing DF.
for l in scaledTFIDF[:, np.argsort(word_doc_freq)]:
    print(" ".join(["%.2f" % d for d in l]))