Example #1
# Imports needed by the remainder of this (truncated) snippet.
import sys
import codecs
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

fid.close()
f2.close()
#    i = i + 1
#    print(i)

print(len(corpus))

vectorizer = CountVectorizer()
transformer = TfidfTransformer()

tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
weight = tfidf.toarray()

vectorP = vectorizer.get_params()
tfidfP = transformer.get_params()

print('vectorP:', vectorP)
print('tfidfP:', tfidfP)

joblib.dump(vectorizer, "vectorizer" + str(sys.argv[1]) + ".m")
joblib.dump(transformer, "tfidf" + str(sys.argv[1]) + ".m")

resName = "BaiduTfidf_Result.txt"
result = codecs.open(resName, 'w', 'utf-8')
for j in range(len(word)):
    result.write(word[j] + ' ')
result.write('\r\n\r\n')

for i in range(len(weight)):
    # print("------- tf-idf weights of the words in document", i, "-------")
    for j in range(len(word)):
        result.write(str(weight[i][j]) + ' ')
    result.write('\r\n\r\n')
result.close()
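
The models persisted above with joblib.dump can be reloaded to score unseen
text. A minimal sketch, assuming the same sys.argv[1] suffix used at dump
time and a hypothetical new_docs list:

import sys
import joblib

vectorizer = joblib.load("vectorizer" + str(sys.argv[1]) + ".m")
transformer = joblib.load("tfidf" + str(sys.argv[1]) + ".m")

new_docs = ["some unseen text"]  # hypothetical new documents
new_tfidf = transformer.transform(vectorizer.transform(new_docs))
print(new_tfidf.shape)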
Example #2
# Imports for this snippet; my_analyzer, ngram, m_features and the
# `discussions` corpus are assumed to be defined earlier in the script.
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)
from sklearn.decomposition import LatentDirichletAllocation

# head of the call reconstructed to mirror the TfidfVectorizer call below
cv = CountVectorizer(max_df=0.95, min_df=2, max_features=m_features,
                     analyzer=my_analyzer, ngram_range=ngram)
tf = cv.fit_transform(discussions)

print("\nVectorizer Parameters\n", cv, "\n")


# LDA For Term Frequency x Doc Matrix
n_topics        = 9
max_iter        = 5
learning_offset = 20.
learning_method = 'online'
# LDA for TF-IDF x Doc Matrix
# First Create Term-Frequency/Inverse Doc Frequency by Review Matrix
# This requires constructing Term Freq. x Doc. matrix first
tf_idf = TfidfTransformer()
print("\nTF-IDF Parameters\n", tf_idf.get_params(),"\n")
tf_idf = tf_idf.fit_transform(tf)
# Or you can construct the TF/IDF matrix from the data
tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=m_features,\
                             analyzer=my_analyzer, ngram_range=ngram)
tf_idf = tfidf_vect.fit_transform(discussions)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,\
                                learning_method=learning_method, \
                                learning_offset=learning_offset, \
                                random_state=12345)
lda.fit_transform(tf_idf)
print('{:.<22s}{:>6d}'.format("Number of Reviews", tf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms", tf.shape[1]))
print("\nTopics Identified using LDA with TF_IDF")
Example #3
# Imports assumed by this function; load_to_df and TfdcTransformer are
# project-local helpers, not part of scikit-learn.
import numpy as np
from time import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = load_to_df(train_url)

    # vectorizer
    vec = CountVectorizer(ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    s_time = time()
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())

    sequences = train_df[column]
    # with probability drop_words, randomly drop ~35% of the words in a row
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])

    X = vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))
    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    else:
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)

    verbose and print("transformer params:", trans.get_params())
    y = np.array((train_df["class"]).astype(int))
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = load_to_df(test_url)
        X_test = vec.transform(test_df[column])
        X_test = trans.transform(X_test)

    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
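
A hypothetical call to the function above; the file paths and parameter
values are illustrative assumptions, and the csv is expected to contain
'article' and 'class' columns:

X, y, X_test = generate_vectors("train.csv",          # hypothetical path
                                test_url="test.csv",  # hypothetical path
                                column="article",
                                trans_type="idf",
                                max_n=2,
                                max_features=20000,
                                balanced=True,        # becomes use_idf here
                                verbose=True)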
Example #4
# Imports assumed by this function; TfdcTransformer is a project-local helper.
import numpy as np
import pandas as pd
from time import time
from sklearn.feature_extraction.text import (CountVectorizer,
                                             HashingVectorizer,
                                             TfidfTransformer)
from sklearn.preprocessing import MultiLabelBinarizer


def generate_vectors(train_url,
                     test_url=None,
                     column='article',
                     trans_type=None,
                     max_n=1,
                     min_df=1,
                     max_df=1.0,
                     max_features=1,
                     sublinear_tf=True,
                     balanced=False,
                     re_weight=0,
                     verbose=False,
                     drop_words=0,
                     multilabel_out=False,
                     label_col='subjects',
                     only_single=True,
                     shuffle=True,
                     apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf', 'hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer, it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: randomly delete some words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: re sample train data
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test

    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))

    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        # filter on label_col rather than a hard-coded column name
        train_df = train_df[train_df[label_col].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    vec = CountVectorizer(analyzer=analyzer,
                          ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]
    # with probability drop_words, randomly drop ~35% of the words in a row
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            sequences.at[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])
    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)
    else:
        trans = HashingVectorizer(analyzer=analyzer,
                                  ngram_range=(1, max_n),
                                  n_features=max_features,
                                  token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(
            test_df[column])
        X_test = trans.transform(X_test)
    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
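
A hypothetical multilabel call to this version; paths, columns and sizes are
illustrative assumptions. With trans_type='hashing', max_features is passed
to HashingVectorizer as n_features and the CountVectorizer is bypassed:

X, y, X_test = generate_vectors("train_multi.csv",         # hypothetical path
                                test_url="test_multi.csv",  # hypothetical path
                                column="word_seg",
                                trans_type="hashing",
                                max_n=2,
                                max_features=2 ** 18,
                                multilabel_out=True,   # y via MultiLabelBinarizer
                                label_col="subjects",
                                only_single=False,
                                verbose=True)
print(X.shape, y.shape)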