Example #1
0
def extract_features3(rows, corpuses, calc_target=calc_target, vectorizer=None, n_components=15):
    """Pick the comment-term counts that score highest against the target (chi2).

    Args:
        rows: 2-D array; column 2 is handed to ``calc_target`` to build the
            target.
        corpuses: 2-D array of text; column 0 is the comment corpus.
        calc_target: callable turning ``rows[:, 2]`` into the target vector.
        vectorizer: object with ``fit``/``transform`` producing a sparse
            term-count matrix. When ``None`` a fresh ``CountVectorizer`` is
            created for this call.
        n_components: number of features kept by ``SelectKBest``.

    Returns:
        Tuple ``(features, target)`` where ``features`` is the dense array of
        the selected count columns and ``target`` is the output of
        ``calc_target``.
    """
    # A default instance in the signature would be created once and then
    # re-fitted (mutated) by every call; build one per call instead.
    if vectorizer is None:
        vectorizer = CountVectorizer()

    comments = corpuses[:, 0]
    vectorizer.fit(comments)
    # the count matrix for the comment corpus
    c_counts = vectorizer.transform(comments)

    target = calc_target(rows[:, 2])
    selector = SelectKBest(chi2, k=n_components)
    # The selector output is sparse here; callers receive a dense array.
    return (selector.fit_transform(c_counts, target).toarray(), target)
Example #2
0
def extract_features(rows, corpuses, calc_target=calc_target, vectorizer=None):
    """Build scaled features from ``rows[:, 0]`` and comment/parent similarity.

    Args:
        rows: 2-D array; column 0 is used as a feature (assumed numeric, since
            it is passed to ``preprocessing.scale`` -- confirm with caller) and
            column 2 is handed to ``calc_target``.
        corpuses: 2-D array of text; column 0 is the comment corpus and
            column 1 is the corpus of the parent comment/submission.
        calc_target: callable turning ``rows[:, 2]`` into the target vector.
        vectorizer: object with ``fit``/``transform`` producing a term-count
            matrix. When ``None`` a fresh ``CountVectorizer`` is created for
            this call.

    Returns:
        Tuple ``(features, target)``: ``features`` is the result of
        ``preprocessing.scale`` applied to the two-column matrix
        ``[rows[:, 0], similarity]``; ``target`` is the output of
        ``calc_target``.
    """
    # A default instance in the signature would be created once and then
    # re-fitted (mutated) by every call; build one per call instead.
    if vectorizer is None:
        vectorizer = CountVectorizer()

    comments = corpuses[:, 0]
    parents = corpuses[:, 1]

    # Fit ONCE on both corpora. Two consecutive fit() calls do not accumulate:
    # the second replaces the vocabulary learned by the first, which would
    # leave only parent-corpus terms and silently drop comment-only terms.
    vectorizer.fit(np.concatenate([comments, parents]))

    # the count matrix for the comment corpus
    c_counts = vectorizer.transform(comments)
    # the count matrix for the corpus of the parent comment/submission
    p_counts = vectorizer.transform(parents)

    # Row-wise similarity between each comment and its own parent. Each call
    # compares two single-row matrices, so ``[0]`` is a length-1 row and the
    # stacked result has shape (n_rows, 1).
    n_rows = c_counts.shape[0]
    similarity = np.array(
        [cosine_similarity(c_counts[i:i + 1], p_counts[i:i + 1])[0] for i in range(n_rows)]
    )

    # Turn the first column of ``rows`` into an (n_rows, 1) column so it can
    # be stacked next to ``similarity``.
    first_col = rows[:, 0]
    r0 = np.reshape(first_col, (first_col.shape[0], 1))

    return (preprocessing.scale(np.hstack([r0, similarity])), calc_target(rows[:, 2]))
Example #3
0
def extract_features2(rows, corpuses, calc_target=calc_target, vectorizer=None, n_components=15):
    """Reduce comment-term counts with truncated SVD and scale the result.

    Args:
        rows: 2-D array; column 2 is handed to ``calc_target`` to build the
            target.
        corpuses: 2-D array of text; column 0 is the comment corpus.
        calc_target: callable turning ``rows[:, 2]`` into the target vector.
        vectorizer: object with ``fit``/``transform`` producing a term-count
            matrix. When ``None`` a fresh ``CountVectorizer`` is created for
            this call.
        n_components: number of components requested from ``TruncatedSVD``.

    Returns:
        Tuple ``(features, target)``: ``features`` is the SVD projection of
        the count matrix after ``preprocessing.scale``; ``target`` is the
        output of ``calc_target``.
    """
    # A default instance in the signature would be created once and then
    # re-fitted (mutated) by every call; build one per call instead.
    if vectorizer is None:
        vectorizer = CountVectorizer()

    comments = corpuses[:, 0]
    vectorizer.fit(comments)
    # the count matrix for the comment corpus
    c_counts = vectorizer.transform(comments)

    svd = TruncatedSVD(n_components=n_components)
    reduced = svd.fit_transform(c_counts)
    return (preprocessing.scale(reduced), calc_target(rows[:, 2]))