Example #1
def dtm_matrix(lessonpath):
    # lesson number
    lessonname = lessonpath.split("/")[-2]

    # creating corpus of txt files
    corpusText(lessonpath)

    # finding the paths of the text files
    corpuspath = "C:/Users/eabalo/Desktop/STAAR35Analyses/data/corpus"
    filepaths = glob.glob(corpuspath + "/" + lessonname + "/*.txt")

    # script names
    docindex = [w.split("-")[-1].split(".")[0] for w in filepaths]

    # building a document-term matrix
    vectorizer = CountVectorizer(input="filename")

    dtm = vectorizer.fit_transform(filepaths)

    # lexicon of words in lesson
    # vocab = vectorizer.get_feature_names()

    # converting to numpy arrays
    dtm = dtm.toarray()

    # vocab = np.array(vocab)

    return dtm, docindex, lessonname
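As a hedged aside (not part of the original source): the core of dtm_matrix is CountVectorizer's input="filename" mode, which opens each path and counts its tokens. A minimal self-contained sketch, assuming a hypothetical directory of .txt files:

import glob
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical corpus layout; the directory is assumed to contain .txt files.
filepaths = glob.glob("corpus/lesson01/*.txt")
docindex = [p.split("-")[-1].split(".")[0] for p in filepaths]

vectorizer = CountVectorizer(input="filename")        # read and tokenize each file
dtm = vectorizer.fit_transform(filepaths).toarray()   # documents x vocabulary counts
print(dtm.shape, docindex)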
Example #2
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None, # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
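A hedged usage sketch for find_common_words with invented sentences; note that get_feature_names() was renamed get_feature_names_out() in newer scikit-learn releases, so the call above assumes an older version.

from sklearn.feature_extraction.text import CountVectorizer  # required by find_common_words above

# Invented example texts for illustration only.
docs = ["the cat sat on the mat", "the dog sat on the log"]
vocab, names = find_common_words(docs, num_most_frequent_words=5)
print(vocab)   # dict mapping each kept term to its column index
print(names)   # the same terms, ordered by column index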
Example #3
def test_vectorizer_unicode():
    # Tests that the count vectorizer works with Cyrillic text.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example #4
def feature_extraction(in_file_name):
    in_file = codecs.open(in_file_name, 'r', 'latin-1')  # preprocessed corpus
    corpus = []
    while True:
      # iterate over the documents
      doc = in_file.readline().strip()
      if doc == '':  # readline() returns an empty string at end of file
        break
      corpus.append(doc)
    
    # build three feature spaces: binary occurrence, tf, and tf-idf
    # note: the number of features kept can be limited via CountVectorizer's max_features parameter
    max_features = None
    #max_features = 10000
    bin_vectorizer = CountVectorizer(max_features=max_features, binary=True, min_df=3)  # binary occurrence markers
    print 'calculating term occurrence feature...'
    term_occurence = bin_vectorizer.fit_transform(corpus)  # binary features
    '''
    tf_vectorizer = CountVectorizer(max_features=max_features,min_df=3)
    term_counts = tf_vectorizer.fit_transform(corpus)
    #tf = normalize(term_counts, axis=1, norm='l2')
    print 'calculating tf feature...'
    tf_transformer = TfidfTransformer(norm='l1', use_idf=False)
    tf = tf_transformer.fit_transform(term_counts)  # tf features
    '''
    '''
    print 'calculating tf-idf feature...'
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(term_counts)  # tf-idf features
    '''
    return bin_vectorizer, term_occurence
Example #5
def BoW(texts, vectorizerType="count", min_df=3):
    """Takes a list of texts and creates a BoWs object

    Args
    ----
    texts: List of Strings
        all the texts
    vectorizerType: String
        One of "count" or "tfidf"
    min_df: int
        Minimum number of documents a term must appear in to be kept

    Returns
    -------
    CV: Vectorizer object
        One of CountVectorizer or TfidfVectorizer
    BoWs: scipy sparse matrix
        The document-term matrix produced by fit_transform
    """

    if vectorizerType == "count":
        CV = CountVectorizer(min_df=min_df)
    elif vectorizerType == "tfidf":
        CV = TfidfVectorizer(min_df=min_df)

    BoWs = CV.fit_transform(texts)

    return CV, BoWs
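A hedged usage sketch of BoW with invented texts (min_df lowered so the tiny corpus keeps some terms); it assumes CountVectorizer and TfidfVectorizer are imported as the snippet expects.

# Invented texts for illustration; real input would be the full document list.
texts = ["apples and oranges", "apples and pears", "oranges and pears"]
cv, bows = BoW(texts, vectorizerType="count", min_df=1)
print(bows.shape)              # (number of documents, vocabulary size)
print(sorted(cv.vocabulary_))  # the learned vocabulary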
Example #6
def check_webshell(clf,dir):
    all=0
    all_php=0
    webshell=0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)


    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fullpath = os.path.join(path, filename)
            t = load_file(fullpath)
            t_list=[]
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all+=1
            if filename.endswith('.php'):
                all_php+=1
            if y_pred[0] == 1:
                print "%s is webshell" % fulepath
                webshell+=1

    print "Scan %d files(%d php files),%d files is webshell" %(all,all_php,webshell)
Example #7
def get_data(dir):
    titles = []
    titles_label = []
    os.path.walk(dir, visit, [titles, titles_label])
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    titles_vocab_mat = vectorizer.fit_transform(titles)
    # Numpy arrays are easy to work with, so convert the result to an array
    #print vectorizer.vocabulary_  # a dict, the value is the index
    train_data_features = titles_vocab_mat.toarray()
    print train_data_features.shape
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print '/'.join(vocab)
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    total_words = 0
    for i in train_data_features:
        #print sum(i)
        total_words += sum(i)
    print total_words
    weka(vocab, dist, train_data_features, total_words, titles_label)
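As the comments above say, fit_transform() both learns the vocabulary and encodes the documents in one call. A small self-contained illustration with made-up titles:

from sklearn.feature_extraction.text import CountVectorizer

# Made-up titles, standing in for the ones collected by os.path.walk above.
titles = ["machine learning basics", "learning python", "python machine tips"]
vec = CountVectorizer(analyzer="word", max_features=5000)
mat = vec.fit_transform(titles)   # fits the vocabulary AND transforms the titles
print(vec.vocabulary_)            # dict: term -> column index
print(mat.toarray())              # dense document-term count matrix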
Example #8
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
    to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print topicsByGame

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topicsByGame
Example #9
def bag_of_words():
    with open("twenty_train.p", "rb") as f:
        twenty_train = pickle.load(f)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    X_train_counts.shape
    count_vect.vocabulary_.get(u'algorithm')
    occurrences_to_frequencies(X_train_counts, twenty_train, count_vect)
Example #10
    def setTestInputforNN(self, collection={}, sel_words=[]):
        list_of_strings = []
        list_of_salary = []
        count = 0
        sel_words_set = set(sel_words)
        sel_words_list = list(sel_words_set)
        for document in collection:
            count += 1
            title = document.getTitle()
            description = document.getDescription()
            salary = int(document.getSalaryNorm())
            words = re.split(" ", title) + re.split(" ", description)
            # words = [x for x in words if x in sel_words]
            wordsUnique = set(words)
            wordsUnique = wordsUnique & sel_words_set
            words = [x for x in words if x in wordsUnique]
            documentString = " ".join(words)
            list_of_strings.append(documentString)
            list_of_salary.append(salary)

            if not (count % 15000):
                break

        vectorizer = CountVectorizer(vocabulary=sel_words, min_df=1)
        self.inp = vectorizer.fit_transform(list_of_strings)
        from sklearn.externals import joblib

        joblib.dump(self.inp.tocsr(), "test_dataset_in.joblib")

        self.inp_size = len(list_of_strings)
        output = np.array(list_of_salary)
        self.target = output.reshape(len(list_of_strings), 1)
        joblib.dump(self.target, "test_dataset_out.joblib")

        return [self.inp, self.target]
Example #11
def bagOfWord(X):
    vectorizer = CountVectorizer(min_df=8, token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(X)
    with open('./model/vectorizer.pkl','wb') as fr:
        print('save text vectorizer to ./model/')
        pickle.dump(vectorizer,fr)
    return X
Example #12
	def feature_extraction(self):
		vectorizer = CountVectorizer(binary=True, stop_words='english')
		corpus = []
		for doc in self.person.doc_list:
			corpus.append(doc.title + ' ' + doc.snippet)
		self.train = vectorizer.fit_transform(corpus)
		return vectorizer
Example #13
def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer = text_process, min_df = 0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    
    return vocab
Example #14
	def __init__(self, plot_vectorizer = 'count', tokenizer = None, lda = False, use_genre_vecs = False):
		t = None
		if tokenizer == 'named_entity':
			t = NETokenizer()
		elif tokenizer == 'lemma':
			t = LemmaTokenizer()
		self.use_genre_vecs = use_genre_vecs
		self.binary = plot_vectorizer == 'binary'
		if plot_vectorizer == 'tfidf':
			self.vectorizer = TfidfVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		elif plot_vectorizer == 'binary':
			self.vectorizer = CountVectorizer(analyzer = "word",	\
				tokenizer = t,	\
				preprocessor = None, \
				stop_words = 'english', \
				binary = True)
		else:
			self.vectorizer = CountVectorizer(analyzer = "word",   \
				tokenizer = t,    \
				preprocessor = None, \
				stop_words = 'english')
		if lda:
			self.lda = LatentDirichletAllocation(n_topics=20, max_iter=2,	\
				learning_method='online', learning_offset=10.,	\
				random_state=0)
		else:
			self.lda = None
Example #15
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x=[]
    y=[]

    webshell_files_list = load_files_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)
    x=CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x,y
Example #16
def getFeature():
    with open(os.path.join('spam_filter_train.txt'), 'r') as f:
        trainData = f.readlines()
    with open(os.path.join('spam_filter_test.txt'), 'r') as f:
        testData = f.readlines()
    data = trainData + testData
    trainNum, testNum = len(trainData), len(testData)
    del trainData
    del testData

    for i in range(len(data)):
        data[i] = data[i].replace('\n', '').split('\t')[1]
    # lemmatize
    lemmatized = []
    wnl = WordNetLemmatizer()
    for line in data:
        lemmatized.append([wnl.lemmatize(word) for word in line.split(' ')])
    # remove stopwords
    stopwordRemoved = []
    sw = set(stopwords.words('english'))
    for line in lemmatized:
        stopwordRemoved.append(' '.join([x for x in line if x not in sw]))
    # tf feature
    vec = CountVectorizer()
    features = vec.fit_transform(stopwordRemoved)

    with open('trainFeatures.pkl', 'wb') as f:
        cPickle.dump(features[:trainNum], f)
    with open('testFeatures.pkl', 'wb') as f:
        cPickle.dump(features[trainNum:], f)
Example #17
def get_feature_by_opcode():
    global white_count
    global black_count
    global max_features
    global webshell_dir
    global whitefile_dir
    print "max_features=%d webshell_dir=%s whitefile_dir=%s" % (max_features,webshell_dir,whitefile_dir)
    x=[]
    y=[]

    webshell_files_list = load_files_opcode_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_opcode_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    #print x
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)

    x=CV.fit_transform(x).toarray()

    return x,y
Example #18
def token_count_pac(pac_id,           \
                    limit       = 'ALL', \
                    ngram_range = (2,2), \
                    min_df      = 5):
    
    conn    = psql.connect("dbname='keyword-influence'")
    cursor  = conn.cursor()

    cursor.execute("SELECT id, speaking                       \
                    FROM words                                \
                    WHERE id IN (                             \
                        SELECT id                             \
                        FROM words                            \
                        WHERE bioguide_id IN(                 \
                            SELECT bioguide_id                \
                            FROM pac_contrib as pc            \
                            INNER JOIN congress as c          \
                            ON pc.fec_candidate_id = c.fec_id \
                            WHERE pac_id = '"+ pac_id +"'));")
    sql_result = cursor.fetchall()

    counter   = CountVectorizer(stop_words  = corpus.stopwords.words('english'), \
                                ngram_range = ngram_range,                       \
                                min_df      = min_df)
    chunks    = map(lambda x: x[1], sql_result)
    counts    = counter.fit_transform(chunks)
    vocab     = counter.get_feature_names()
    vocab     = dict(zip(range(len(vocab)),vocab))
    
    return [counts, vocab]
Example #19
def getCount(artName):
    artLst = []
    #artDict = {}
    for fn in os.listdir(indir):
        if not fn.endswith('.xml'): continue
        if ':' in fn:
            fn = fn.replace(':','/')
        fn = fn.decode('utf-8')
        #fn = unicodedata.normalize("NFC",fn)
        fn_de = unidecode(fn)
        newfn = fn_de[:-4]
        #print 'artName: ',artName, 'eval: ', newfn
        newfn = newfn.lower()
        if newfn == artName:
            # print "found article begin processing"
            #print fn
            if '/' in fn:
                fn = fn.replace('/',':')
            fullname = os.path.join(indir, fn)
            tree = ET.parse(fullname)
            root = tree.getroot()
            page = root.find('{http://www.mediawiki.org/xml/export-0.7/}page')

            revisions = page.findall('{http://www.mediawiki.org/xml/export-0.7/}revision')
            for s in revisions:
                txt = s.find('{http://www.mediawiki.org/xml/export-0.7/}text')
                artLst.append(txt.text)
            artLst = filter(None,[one for one in artLst])
            # print "processing done; begin counting"
            vectorizer = CountVectorizer(min_df=1,token_pattern='([^\[\|\]\s\.\!\=\{\}\;\<\>\?\"\'\#\(\)\,\*]+)')
            X = vectorizer.fit_transform(artLst)
            artDict = dict(zip(vectorizer.get_feature_names(),np.asarray(X.sum(axis=0)).ravel()))
        
            return artDict
    return -1
Example #20
def bag_of_words_to_list(lines, max_features):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool
    # removing stopwords
    vectorizer = CountVectorizer(
        stop_words = 'english'
        ,max_features = max_features
        )
    
    #TfidfVectorizer i need to check this

    print('>> Removing stopwords...')
    # lets remove stopwords
    lines = remove_stopwords(lines,2)

    print('>> Stemming...')
    # lets stem it
    lines =stemming(lines,3)

    print('>> Doing bag of words...')
    #lets do the bag of words
    bag_of_words = vectorizer.fit_transform(lines)



    #uncomment to visualize the words and how many times are used
    #printing_bow(bag_of_words,vectorizer)

    return(vectorizer.get_feature_names(),bag_of_words.toarray())
Example #21
def wordMoverDistance(d1, d2):
    ###d1 list
    ###d2 list
    # Rule out words that not in vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])
    #print d1
    #print d2
    vect = CountVectorizer().fit([d1,d2])
    feature_names = vect.get_feature_names()
    W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]] #Word Matrix
    D_ = euclidean_distances(W_) # Distance Matrix
    D_ = D_.astype(np.double)
    #D_ /= D_.max()  # Normalize for comparison
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    ### EMD
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    #print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
    emd_d = emd(v_1, v_2, D_) ## WMD
    #print emd_d
    return emd_d
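wordMoverDistance depends on two globals not shown in the snippet: vocab_dict (word -> row index) and W (an embedding matrix), plus pyemd's emd and the older scikit-learn API it uses. A hedged sketch with tiny random stand-in embeddings, only to show how those pieces fit together; real usage would load pre-trained word vectors.

import numpy as np

# Stand-in vocabulary and embedding matrix (random, for illustration only).
words = ["obama", "president", "speaks", "media", "press", "greets"]
vocab_dict = {w: i for i, w in enumerate(words)}      # word -> row in W
W = np.random.RandomState(0).rand(len(words), 50)     # fake 50-d embeddings

d1 = ["obama", "speaks", "media"]
d2 = ["president", "greets", "press"]
print(wordMoverDistance(d1, d2))   # smaller = more similar under these embeddings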
Example #22
def race_tfidf(data, can_be_noun_arg, stop_words):
    print 
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf='True')
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms,_ = zip(*sorted_voc)
    for i,k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i,:])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
Example #23
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    """
    Convert a list of filenames into a sparse csr_matrix, where
    each row is a file and each column represents a unique word.
    Use sklearn's CountVectorizer: http://goo.gl/eJ2PJ5
    Params:
        filenames.......list of review file names
        tokenizer_fn....the function used to tokenize each document
        min_df..........remove terms from the vocabulary that don't appear
                        in at least this many documents
        max_df..........remove terms from the vocabulary that appear in more
                        than this fraction of documents
        binary..........If true, each document is represented by a binary
                        vector, where 1 means a term occurs at least once in
                        the document. If false, the term frequency is used instead.
        ngram_range.....A tuple (n,m) means to use phrases of length n to m inclusive.
                        E.g., (1,2) means consider unigrams and bigrams.
    Return:
        A tuple (X, vec), where X is the csr_matrix of feature vectors,
        and vec is the CountVectorizer object.
    """
    vectorizer = CountVectorizer(tokenizer=tokenizer_fn, min_df=min_df, max_df=max_df, binary=binary, ngram_range=ngram_range, dtype=int)
    X = vectorizer.fit_transform(filenames)
    return (X, vectorizer)
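Note that do_vectorize hands the filename strings straight to CountVectorizer (whose default input is 'content'), so reading the files is left to the external tokenize function. A hedged usage sketch with an explicit file-reading tokenizer and hypothetical paths:

import re

def read_and_tokenize(path):
    # Hypothetical tokenizer: open the file at `path` and split it into words.
    with open(path, encoding="utf-8") as f:
        return re.findall(r"\w+", f.read().lower())

review_files = ["reviews/pos_001.txt", "reviews/neg_001.txt"]   # hypothetical paths
X, vec = do_vectorize(review_files, tokenizer_fn=read_and_tokenize,
                      min_df=1, binary=True, ngram_range=(1, 2))
print(X.shape, len(vec.vocabulary_))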
Example #24
def train_vectorizer(corpus, max_features=10000):
    """ Train the vectorizer """
    print "training the vectorizer..."
    vectorizer = CountVectorizer(decode_error='ignore', max_features=max_features)
    vectorizer.fit(corpus)
    print "ok"
    return vectorizer
Example #25
def vocab_size(texts, min_count=[1,2,3,4,5], visualise=False, save=False):
    """Plots vocab size as a function of minimum letter count

    Args
    ----
    texts: list of Strings
        List of all the texts

    Returns
    -------
    sizes: List of ints
        Size of vocabulary
    """
    sizes = []
    for i in min_count:
        CV = CountVectorizer(min_df = i)
        BoWs = CV.fit_transform(texts)
        sizes.append(BoWs.shape[1])

    if visualise:
        plt.clf()
        plt.plot(min_count, sizes, 'bo-')
        if save:
            plt.savefig("Count_vs_vocabSize.png")

    return sizes
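A hedged usage sketch of vocab_size with invented texts; with visualise left at False, the matplotlib import assumed by the snippet is never touched.

# Invented texts; the vocabulary can only shrink as min_df rises.
texts = ["the cat sat", "the cat ran", "a dog ran", "a dog sat", "the end"]
print(vocab_size(texts, min_count=[1, 2, 3]))   # a non-increasing list of sizes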
Example #26
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs+1) / (3+1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
Example #27
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
Example #28
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """

    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix
    
    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}


    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
Example #29
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Query the tree for each tweet's neighbors:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only count neighbors for tweets with more than two distinct terms.
            neighbors = tree2.radius_neighbors(x, radius = self.sensitivity)[1]
            if x.getnnz() > 2:
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
Example #30
def vectorize_test(texts, vocab=None):
  # With a fixed vocabulary, transform() alone is enough; without one the
  # vectorizer must be fitted first, so fall back to fit_transform().
  if vocab:
    vectorizer = CountVectorizer(min_df=0, stop_words="english",
      vocabulary=vocab)
    features = vectorizer.transform(texts)
  else:
    vectorizer = CountVectorizer(min_df=0, stop_words="english")
    features = vectorizer.fit_transform(texts)
  return features
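The pattern behind vectorize_test, as a hedged standalone sketch with invented texts: learn the vocabulary on the training split, then hand that vocabulary to a second vectorizer so the test matrix shares the same columns.

from sklearn.feature_extraction.text import CountVectorizer

train_texts = ["good movie", "bad movie", "good plot"]   # invented examples
test_texts = ["good acting", "bad plot"]

train_vec = CountVectorizer(stop_words="english")
X_train = train_vec.fit_transform(train_texts)

# Reuse the learned vocabulary so the test matrix shares the training columns,
# which is what passing vocab to vectorize_test above accomplishes.
test_vec = CountVectorizer(stop_words="english", vocabulary=train_vec.vocabulary_)
X_test = test_vec.transform(test_texts)
print(X_train.shape, X_test.shape)   # same number of columns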