Example #1
import math
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

def bow(corpus):
    # Bag-of-words matrix, L1-normalized so each row sums to 1.
    bow = CountVectorizer(max_features=50000,
                          ngram_range=(1, 1),
                          dtype=np.float64).fit_transform(corpus).toarray()
    bow /= bow.sum(axis=1, keepdims=True)  # L1 normalization
    # bow /= np.linalg.norm(bow, axis=1, keepdims=True)  # L2 normalization (alternative)
    bow[np.isnan(bow)] = 0  # empty documents divide by zero; reset them to all zeros
    return bow
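A quick usage sketch (a made-up three-document corpus, not from the original source) showing that each row of the L1-normalized bag-of-words sums to 1 and that the NaN guard leaves empty documents as all-zero rows:

docs = ["the cat sat on the mat", "the dog sat", ""]  # hypothetical corpus; last doc is empty
X = bow(docs)
print(X.shape)        # (3, vocabulary size)
print(X.sum(axis=1))  # [1. 1. 0.] -- NumPy warns about the 0/0 row; the isnan guard resets it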
def getXMLTagFeatures(posts):
    # Extract HTML tag names from each post body and vectorize them
    # (getTags and sparseLog are project helpers defined elsewhere).
    tagLists = []
    for i in range(len(posts)):
        curr = BeautifulSoup(posts[i]['Body'], 'html.parser')
        tagLists += [" ".join(getTags(curr))]
    X_binary = CountVectorizer(binary=True).fit_transform(tagLists)
    X_counts = CountVectorizer().fit_transform(tagLists)
    X_sums = sparseLog(X_counts.sum(axis=1))
    return X_sums, X_binary, X_counts
def getTopicTagFeatures(questionPosts, answerPosts):
    # Map each question's topic tags onto its answers via ParentId.
    tagsByQuestion = {}
    for qPost in questionPosts:
        curr = BeautifulSoup(qPost['Tags'], 'html.parser')
        tagsByQuestion[qPost['Id']] = " ".join(getTags(curr))
    answerTags = []
    for aPost in answerPosts:
        answerTags += [tagsByQuestion[aPost['ParentId']]]
    X_binary = CountVectorizer(binary=True).fit_transform(answerTags)
    X_sums = sparseLog(X_binary.sum(axis=1))
    return X_sums, X_binary
def TFIDF(sentences):
    # Term counts, scaled by the largest column sum (the most frequent term overall).
    vector = CountVectorizer().fit_transform(sentences).toarray()
    maxFreq = max(vector.sum(axis=0))
    vector = vector / maxFreq
    # ni[c]: number of documents that contain term c.
    N = len(vector)
    ni = [0] * len(vector[0])
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            if vector[l][c] != 0:
                ni[c] += 1
    # Weight each entry by its inverse document frequency log(N / ni).
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            vector[l][c] = vector[l][c] * math.log(N / ni[c])
    return vector
def TFIDFQuery(sentences):
    # Treat the concatenated sentences as a single query document.
    query = [" ".join(sentences)]
    # Fit the vocabulary on the corpus, then project the query onto the same
    # columns so the document frequencies below line up with the query's columns.
    cv = CountVectorizer()
    vector = cv.fit_transform(sentences).toarray()
    queryvector = cv.transform(query).toarray()
    maxFreq = max(vector.sum(axis=0))
    queryvector = queryvector / maxFreq
    # ni[c]: number of corpus documents that contain term c.
    N = len(vector)
    ni = [0] * len(vector[0])
    for l in range(len(vector)):
        for c in range(len(vector[l])):
            if vector[l][c] != 0:
                ni[c] += 1
    # Weight the query counts by the corpus inverse document frequencies.
    for l in range(len(queryvector)):
        for c in range(len(queryvector[l])):
            queryvector[l][c] = queryvector[l][c] * math.log(N / ni[c])
    return queryvector
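A minimal usage sketch (made-up sentences, not from the original source). Terms that occur in every sentence get weight 0 because log(N / n_i) = 0, and the query vector is expressed over the corpus vocabulary:

sentences = ["the cat sat on the mat", "the dog sat", "the dog barked"]
doc_vectors = TFIDF(sentences)        # shape: (3, vocabulary size)
query_vector = TFIDFQuery(sentences)  # shape: (1, vocabulary size)
print(doc_vectors.shape, query_vector.shape)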
def calculate_containment(df, n, answer_filename):
    """Calculates the containment between a given answer text and its associated source text.
       This function creates a count of ngrams (of size n) for each text file in our data,
       then calculates the containment by finding the ngram counts for a given answer text
       and its associated source text, and computing the normalized intersection of those counts.
       :param df: A dataframe with the columns
           'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'
       :param n: An integer that defines the ngram size
       :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'
       :return: A single containment value that represents the similarity
           between an answer text and its source text.
    """
    # Look up the answer row and the original (source) row for the same task;
    # get_dict_from_row is a helper defined elsewhere, and Class == -1 selects the source text.
    answer_row = get_dict_from_row(df.loc[df.File == answer_filename])
    source_row = get_dict_from_row(df.loc[(df.Task == answer_row['Task']) & (df.Class == -1)])

    # Count n-grams of both texts over a shared vocabulary.
    answer_counts, source_counts = CountVectorizer(ngram_range=(n, n)).fit_transform([
        answer_row['Text'],
        source_row['Text'],
    ]).toarray()

    # Containment: size of the n-gram count intersection, normalized by the answer's n-gram total.
    return np.minimum(answer_counts, source_counts).sum() / answer_counts.sum()
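For intuition, a hand-checkable sketch of the same containment computation on made-up strings with n = 1; every answer unigram also appears in the source, so the containment comes out to 1.0:

answer = "the quick brown fox"
source = "the quick brown fox jumped"
a_counts, s_counts = CountVectorizer(ngram_range=(1, 1)).fit_transform([answer, source]).toarray()
print(np.minimum(a_counts, s_counts).sum() / a_counts.sum())  # 1.0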
Example #7
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm
from sklearn.feature_extraction.text import CountVectorizer

def tfidf(data, sublinear_tf=False, norm='l2', use_idf=True, smooth_idf=True, **kwargs):

    # Count the raw term occurrences (as floats so the sublinear log fits in place).
    tf = CountVectorizer(**kwargs).fit_transform(data).astype(np.float64)

    # Sublinear tf: replace each non-zero count c with 1 + log(c).
    if sublinear_tf:
        tf[tf != 0] = 1 + np.log(tf[tf != 0])

    # Divide by each document's total to obtain the term frequency tf.
    tf = csr_matrix(tf / tf.sum(axis=1))

    # D: total number of documents; d: number of documents each word appears in.
    D = tf.shape[0] + int(smooth_idf)
    d = tf.getnnz(axis=0) + int(smooth_idf)

    # Apply the idf formula.
    idf = 1 + np.log(D / d)

    # Multiply tf * idf when requested.
    res = (tf.multiply(idf) if use_idf else tf)

    # Normalize each row to the requested norm ('l1' or 'l2').
    return csr_matrix(res / sparse_norm(res, int(norm[1:]), axis=1).reshape(-1, 1))
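A small sanity-check sketch (made-up corpus, not from the original source): with the default arguments, the result should match scikit-learn's TfidfVectorizer, because dividing by the per-document sum is a row-wise constant that the final l2 normalization cancels out:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat", "the dog sat", "the dog barked loudly"]
print(np.allclose(tfidf(docs).toarray(),
                  TfidfVectorizer().fit_transform(docs).toarray()))  # True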
Example #8
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Example corpus (in Russian).
corpus = [
    'Казнить нельзя, помиловать. Нельзя наказывать.',
    'Казнить, нельзя помиловать. Нельзя освободить.', 'Нельзя не помиловать.',
    'Обязательно освободить.'
]

# Get the word counts.
TF = CountVectorizer().fit_transform(corpus).toarray()

# Build the IDF. Unfortunately, this assignment only needs vectorizer.idf_.
# In the standard case the computation would end on this line;
# usually TFIDF = vectorizer.fit_transform(corpus).
vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True)
vectorizer.fit_transform(corpus)

# From IDF back to DF: with smooth_idf=False, idf = ln(N / df) + 1, so df / N = 1 / exp(idf - 1).
word_doc_freq = 1 / np.exp(vectorizer.idf_ - 1)

# Normalize TF and smooth it with a logarithm (an assignment requirement).
TFIDF = np.log(TF / TF.sum(axis=1, keepdims=True) + 1) / word_doc_freq

# Scale the features.
scaledTFIDF = StandardScaler().fit_transform(TFIDF)

# Multiply by np.sqrt((4-1)/4) to convert from DDOF(0) to DDOF(1) for 4 texts
# (an assignment requirement).
scaledTFIDF *= np.sqrt(3 / 4)

# Print in order of increasing DF.
for l in scaledTFIDF[:, np.argsort(word_doc_freq)]:
    print(" ".join(["%.2f" % d for d in l]))