def vectorize_test(texts, vocab=[]):
    vectorizer = CountVectorizer(min_df=0, stop_words="english")
    if len(vocab) > 0:
        # With a fixed vocabulary the vectorizer can transform without fitting.
        vectorizer = CountVectorizer(min_df=0, stop_words="english", vocabulary=vocab)
        features = vectorizer.transform(texts)
    else:
        # Without a supplied vocabulary the vectorizer must be fitted first.
        features = vectorizer.fit_transform(texts)
    return features
def single_batch(self, tweets):
    """Performs an approximate nearest neighbors search on tweets in the database
    passed to it. The database must be a list of tweets (text of the tweets only).
    Returns the indices of tweets with nearby neighbors (i.e. spam tweets). These
    indices correspond to indices within the batch of tweets fed to this function."""
    # Vectorize and fit tree:
    vect2 = CountVectorizer(stop_words=self.custom_stop_words)
    X2 = vect2.fit_transform(tweets)
    tree2 = LSHForest()
    tree2.fit(X2)
    # Build tree:
    n_neighbors = []
    neighbors_indices = []
    working_batch_size = len(tweets)
    for x in vect2.transform(tweets):
        if len(n_neighbors) % 100 == 0:
            print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
        # Only deal with tweets that are longer than 3 words.
        neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
        if x.getnnz() > 2:
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)
        else:
            n_neighbors.append(1)
            neighbors_indices.append(np.array([np.array([0])]))
    neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]
    return neighbors_indices
def handle_doc(word_set, rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path + docs)
        print "start to handle the --> " + docs
        for file_d in files:
            d_path = rs_path + docs + '/' + file_d  # get the single file path
            with open(d_path, 'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line, pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1, :], doc_cat[:-1]
def dtm_matrix(lessonpath):
    # lesson number
    lessonname = lessonpath.split("/")[-2]
    # creating corpus of txt files
    corpusText(lessonpath)
    # finding the paths of the text files
    corpuspath = "C:/Users/eabalo/Desktop/STAAR35Analyses/data/corpus"
    filepaths = glob.glob(corpuspath + "/" + lessonname + "/*.txt")
    # script names
    docindex = [w.split("-")[-1].split(".")[0] for w in filepaths]
    # building a document-term matrix
    vectorizer = CountVectorizer(input="filename")
    dtm = vectorizer.fit_transform(filepaths)
    # lexicon of words in lesson
    # vocab = vectorizer.get_feature_names()
    # converting to numpy arrays
    dtm = dtm.toarray()
    # vocab = np.array(vocab)
    return dtm, docindex, lessonname
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
def vocab_size(texts, min_count=[1, 2, 3, 4, 5], visualise=False, save=False):
    """Plots vocabulary size as a function of the minimum document frequency (min_df)

    Args
    ----
    texts: list of Strings
        List of all the texts

    Returns
    -------
    sizes: List of ints
        Size of the vocabulary for each value of min_df
    """
    sizes = []
    for i in min_count:
        CV = CountVectorizer(min_df=i)
        BoWs = CV.fit_transform(texts)
        sizes.append(BoWs.shape[1])
    if visualise:
        plt.clf()
        plt.plot(min_count, sizes, 'bo-')
        if save:
            plt.savefig("Count_vs_vocabSize.png")
    return sizes
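# A minimal usage sketch for vocab_size (the toy corpus below is illustrative
# only; any list of raw document strings works):
example_texts = ["the cat sat", "a cat and a dog", "dog and cat"]
sizes = vocab_size(example_texts, min_count=[1, 2, 3], visualise=False)
print(sizes)  # the vocabulary shrinks as min_df grows, e.g. [5, 3, 1] here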
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None,  # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
def do_vectorize(filenames, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1, 1)):
    """
    Convert a list of filenames into a sparse csr_matrix, where
    each row is a file and each column represents a unique word.
    Use sklearn's CountVectorizer: http://goo.gl/eJ2PJ5

    Params:
        filenames.......list of review file names
        tokenizer_fn....the function used to tokenize each document
        min_df..........remove terms from the vocabulary that don't appear
                        in at least this many documents
        max_df..........remove terms from the vocabulary that appear in more
                        than this fraction of documents
        binary..........If true, each document is represented by a binary vector,
                        where 1 means a term occurs at least once in the document.
                        If false, the term frequency is used instead.
        ngram_range.....A tuple (n,m) means to use phrases of length n to m inclusive.
                        E.g., (1,2) means consider unigrams and bigrams.
    Return:
        A tuple (X, vec), where X is the csr_matrix of feature vectors,
        and vec is the CountVectorizer object.
    """
    vectorizer = CountVectorizer(tokenizer=tokenizer_fn, min_df=min_df,
                                 max_df=max_df, binary=binary,
                                 ngram_range=ngram_range, dtype=int)
    X = vectorizer.fit_transform(filenames)
    return (X, vectorizer)
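# A minimal usage sketch: the file list below is hypothetical, and `tokenize`
# is the project's own tokenizer referenced in the signature above.
review_filenames = ['reviews/pos1.txt', 'reviews/neg1.txt']  # hypothetical paths
X, vec = do_vectorize(review_filenames, min_df=1, ngram_range=(1, 2))
print(X.shape)               # (number of files, vocabulary size)
print(len(vec.vocabulary_))  # equals X.shape[1]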
def BoW(texts, vectorizerType="count", min_df=3):
    """Takes a list of texts and creates a BoWs object

    Args
    ----
    texts: List of Strings
        all the texts
    vectorizerType: String
        One of "count" or "tfidf"
    min_df: int
        Minimum number of documents a word must appear in

    Returns
    -------
    CV: Vectorizer object
        One of CountVectorizer or TfidfVectorizer
    BoWs: sparse matrix
        The document-term matrix returned by fit_transform
    """
    if vectorizerType == "count":
        CV = CountVectorizer(min_df=min_df)
    elif vectorizerType == "tfidf":
        CV = TfidfVectorizer(min_df=min_df)
    BoWs = CV.fit_transform(texts)
    return CV, BoWs
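# A small usage sketch of BoW with a toy corpus (illustrative strings only):
CV, BoWs = BoW(["spam spam eggs", "eggs and ham", "spam and ham"],
               vectorizerType="count", min_df=1)
print(BoWs.shape)  # (3, 4) for this toy corpus: spam, eggs, and, ham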
def wordMoverDistance(d1, d2):
    ### d1 list
    ### d2 list
    # Rule out words that not in vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])
    # print d1
    # print d2
    vect = CountVectorizer().fit([d1, d2])
    feature_names = vect.get_feature_names()
    W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]]  # Word Matrix
    D_ = euclidean_distances(W_)  # Distance Matrix
    D_ = D_.astype(np.double)
    # D_ /= D_.max()  # Normalize for comparison
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    ### EMD
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    # print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
    emd_d = emd(v_1, v_2, D_)  ## WMD
    # print emd_d
    return emd_d
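# A hedged usage sketch: wordMoverDistance relies on module-level globals that
# must already exist -- `vocab_dict` (word -> row index into the embedding
# matrix) and `W` (the embedding matrix itself) -- plus `emd` from the pyemd
# package. The two token lists below are illustrative only.
from pyemd import emd  # emd(first_histogram, second_histogram, distance_matrix)

dist = wordMoverDistance(["obama", "speaks", "media", "illinois"],
                         ["president", "greets", "press", "chicago"])
print(dist)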
def feature_extraction(in_file_name):
    in_file = codecs.open(in_file_name, 'r', 'latin-1')  # preprocessed corpus
    corpus = []
    while True:  # iterate over the documents
        doc = in_file.readline().strip()
        if doc == '':  # readline() returns an empty string at end of file
            break
        corpus.append(doc)
    # Build three kinds of feature spaces: binary, tf and tf-idf
    # Note: the number of retained features can be limited with CountVectorizer's max_features parameter
    max_features = None
    # max_features = 10000
    bin_vectorizer = CountVectorizer(max_features=max_features, binary=True, min_df=3)  # binary occurrence markers
    print 'calculating term occurence feature...'
    term_occurence = bin_vectorizer.fit_transform(corpus)  # binary features
    '''
    tf_vectorizer = CountVectorizer(max_features=max_features, min_df=3)
    term_counts = tf_vectorizer.fit_transform(corpus)
    # tf = normalize(term_counts, axis=1, norm='l2')
    print 'calculating tf feature...'
    tf_transformer = TfidfTransformer(norm='l1', use_idf=False)
    tf = tf_transformer.fit_transform(term_counts)  # tf features
    '''
    '''
    print 'calculating tf-idf feature...'
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(term_counts)  # tf-idf features
    '''
    return bin_vectorizer, term_occurence
def getCount(artName):
    artLst = []
    # artDict = {}
    for fn in os.listdir(indir):
        if not fn.endswith('.xml'):
            continue
        if ':' in fn:
            fn = fn.replace(':', '/')
        fn = fn.decode('utf-8')
        # fn = unicodedata.normalize("NFC",fn)
        fn_de = unidecode(fn)
        newfn = fn_de[:-4]
        # print 'artName: ', artName, 'eval: ', newfn
        newfn = newfn.lower()
        if newfn == artName:
            # print "found article begin processing"
            # print fn
            if '/' in fn:
                fn = fn.replace('/', ':')
            fullname = os.path.join(indir, fn)
            tree = ET.parse(fullname)
            root = tree.getroot()
            page = root.find('{http://www.mediawiki.org/xml/export-0.7/}page')
            revisions = page.findall('{http://www.mediawiki.org/xml/export-0.7/}revision')
            for s in revisions:
                txt = s.find('{http://www.mediawiki.org/xml/export-0.7/}text')
                artLst.append(txt.text)
            artLst = filter(None, [one for one in artLst])
            # print "processing done; begin counting"
            vectorizer = CountVectorizer(min_df=1, token_pattern='([^\[\|\]\s\.\!\=\{\}\;\<\>\?\"\'\#\(\)\,\*]+)')
            X = vectorizer.fit_transform(artLst)
            artDict = dict(zip(vectorizer.get_feature_names(), np.asarray(X.sum(axis=0)).ravel()))
            return artDict
    return -1
def get_data(dir):
    titles = []
    titles_label = []
    os.path.walk(dir, visit, [titles, titles_label])
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    titles_vocab_mat = vectorizer.fit_transform(titles)
    # Numpy arrays are easy to work with, so convert the result to an array
    # print vectorizer.vocabulary_  # a dict, the value is the index
    train_data_features = titles_vocab_mat.toarray()
    print train_data_features.shape
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print '/'.join(vocab)
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    total_words = 0
    for i in train_data_features:
        # print sum(i)
        total_words += sum(i)
    print total_words
    weka(vocab, dist, train_data_features, total_words, titles_label)
def get_feature_by_opcode():
    global white_count
    global black_count
    global max_features
    global webshell_dir
    global whitefile_dir
    print "max_features=%d webshell_dir=%s whitefile_dir=%s" % (max_features, webshell_dir, whitefile_dir)
    x = []
    y = []

    webshell_files_list = load_files_opcode_re(webshell_dir)
    y1 = [1] * len(webshell_files_list)
    black_count = len(webshell_files_list)

    wp_files_list = load_files_opcode_re(whitefile_dir)
    y2 = [0] * len(wp_files_list)
    white_count = len(wp_files_list)

    x = webshell_files_list + wp_files_list
    # print x
    y = y1 + y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(x).toarray()
    return x, y
def check_webshell(clf, dir):
    all = 0
    all_php = 0
    webshell = 0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)

    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fullpath = os.path.join(path, filename)
            t = load_file(fullpath)
            t_list = []
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all += 1
            if filename.endswith('.php'):
                all_php += 1
            if y_pred[0] == 1:
                print "%s is webshell" % fullpath
                webshell += 1

    print "Scan %d files(%d php files), %d files are webshell" % (all, all_php, webshell)
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]
    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x = []
    y = []

    webshell_files_list = load_files_re(webshell_dir)
    y1 = [1] * len(webshell_files_list)
    black_count = len(webshell_files_list)

    wp_files_list = load_files_re(whitefile_dir)
    y2 = [0] * len(wp_files_list)
    white_count = len(wp_files_list)

    x = webshell_files_list + wp_files_list
    y = y1 + y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()
    return x, y
def train_vectorizer(corpus, max_features=10000):
    """ Train the vectorizer """
    print "training the vectorizer..."
    vectorizer = CountVectorizer(decode_error='ignore', max_features=max_features)
    vectorizer.fit(corpus)
    print "ok"
    return vectorizer
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    # tf-idf of the term 'is' in the third document, computed by hand
    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs + 1) / (3 + 1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)
    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
def bag_of_words_to_list(lines, max_features):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool, removing stopwords
    vectorizer = CountVectorizer(stop_words='english', max_features=max_features)
    # TfidfVectorizer - i need to check this
    print('>> Removing stopwords...')
    # lets remove stopwords
    lines = remove_stopwords(lines, 2)
    print('>> Stemming...')
    # lets stem it
    lines = stemming(lines, 3)
    print('>> Doing bag of words...')
    # lets do the bag of words
    bag_of_words = vectorizer.fit_transform(lines)
    # uncomment to visualize the words and how many times they are used
    # printing_bow(bag_of_words, vectorizer)
    return (vectorizer.get_feature_names(), bag_of_words.toarray())
def race_tfidf(data, can_be_noun_arg, stop_words):
    print
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    # print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf='True')
    tfidf.fit(counts)
    # print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms, _ = zip(*sorted_voc)
    for i, k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i, :])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    # print tf_idf_matrix.todense().shape
    return freqs
def getFeature():
    with open(os.path.join('spam_filter_train.txt'), 'r') as f:
        trainData = f.readlines()
    with open(os.path.join('spam_filter_test.txt'), 'r') as f:
        testData = f.readlines()
    data = trainData + testData
    trainNum, testNum = len(trainData), len(testData)
    del trainData
    del testData
    for i in range(len(data)):
        data[i] = data[i].replace('\n', '').split('\t')[1]
    # lemmatize
    lemmatized = []
    wnl = WordNetLemmatizer()
    for line in data:
        lemmatized.append([wnl.lemmatize(word) for word in line.split(' ')])
    # remove stopwords
    stopwordRemoved = []
    sw = set(stopwords.words('english'))
    for line in lemmatized:
        stopwordRemoved.append(' '.join([x for x in line if x not in sw]))
    # tf feature
    vec = CountVectorizer()
    features = vec.fit_transform(stopwordRemoved)
    with open('trainFeatures.pkl', 'wb') as f:
        cPickle.dump(features[:trainNum], f)
    with open('testFeatures.pkl', 'wb') as f:
        cPickle.dump(features[trainNum:], f)
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation
    and count vectorizer to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print topicsByGame
    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topicsByGame
def setTestInputforNN(self, collection={}, sel_words=[]):
    list_of_strings = []
    list_of_salary = []
    count = 0
    sel_words_set = set(sel_words)
    sel_words_list = list(sel_words_set)
    for document in collection:
        count += 1
        title = document.getTitle()
        description = document.getDescription()
        salary = (int)(document.getSalaryNorm())
        words = re.split(" ", title) + re.split(" ", description)
        # words = [x for x in words if x in sel_words]
        wordsUnique = set(words)
        wordsUnique = wordsUnique & sel_words_set
        words = [x for x in words if x in wordsUnique]
        documentString = " ".join(words)
        list_of_strings.append(documentString)
        list_of_salary.append(salary)
        if not (count % 15000):
            break
    vectorizer = CountVectorizer(vocabulary=sel_words, min_df=1)
    self.inp = vectorizer.fit_transform(list_of_strings)

    from sklearn.externals import joblib
    joblib.dump(self.inp.tocsr(), "test_dataset_in.joblib")

    self.inp_size = len(list_of_strings)
    output = np.array(list_of_salary)
    self.target = output.reshape(len(list_of_strings), 1)
    joblib.dump(self.target, "test_dataset_out.joblib")
    return [self.inp, self.target]
def bagOfWord(X):
    vectorizer = CountVectorizer(min_df=8, token_pattern=r"(?u)\b\w+\b")
    X = vectorizer.fit_transform(X)
    with open('./model/vectorizer.pkl', 'wb') as fr:
        print('save text vectorizer to ./model/')
        pickle.dump(vectorizer, fr)
    return X
def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer=text_process, min_df=0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    return vocab
def feature_extraction(self):
    vectorizer = CountVectorizer(binary=True, stop_words='english')
    corpus = []
    for doc in self.person.doc_list:
        corpus.append(doc.title + ' ' + doc.snippet)
    self.train = vectorizer.fit_transform(corpus)
    return vectorizer
def bag_of_words():
    # pickle.load expects a file object, not a path string
    with open("twenty_train.p", "rb") as f:
        twenty_train = pickle.load(f)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    X_train_counts.shape
    count_vect.vocabulary_.get(u'algorithm')
    occurrences_to_frequencies(X_train_counts, twenty_train, count_vect)
def __init__(self, plot_vectorizer='count', tokenizer=None, lda=False, use_genre_vecs=False):
    t = None
    # use == for string comparison; `is` only tests object identity
    if tokenizer == 'named_entity':
        t = NETokenizer()
    elif tokenizer == 'lemma':
        t = LemmaTokenizer()
    self.use_genre_vecs = use_genre_vecs
    self.binary = plot_vectorizer == 'binary'
    if plot_vectorizer == 'tfidf':
        self.vectorizer = TfidfVectorizer(analyzer="word",
                                          tokenizer=t,
                                          preprocessor=None,
                                          stop_words='english')
    elif plot_vectorizer == 'binary':
        self.vectorizer = CountVectorizer(analyzer="word",
                                          tokenizer=t,
                                          preprocessor=None,
                                          stop_words='english',
                                          binary=True)
    else:
        self.vectorizer = CountVectorizer(analyzer="word",
                                          tokenizer=t,
                                          preprocessor=None,
                                          stop_words='english')
    if lda:
        self.lda = LatentDirichletAllocation(n_topics=20, max_iter=2,
                                             learning_method='online', learning_offset=10.,
                                             random_state=0)
    else:
        self.lda = None
def token_count_pac(pac_id, limit='ALL', ngram_range=(2, 2), min_df=5):
    conn = psql.connect("dbname='keyword-influence'")
    cursor = conn.cursor()
    cursor.execute("SELECT id, speaking \
                    FROM words \
                    WHERE id IN ( \
                        SELECT id \
                        FROM words \
                        WHERE bioguide_id IN( \
                            SELECT bioguide_id \
                            FROM pac_contrib as pc \
                            INNER JOIN congress as c \
                            ON pc.fec_candidate_id = c.fec_id \
                            WHERE pac_id = '" + pac_id + "'));")
    sql_result = cursor.fetchall()
    counter = CountVectorizer(stop_words=corpus.stopwords.words('english'),
                              ngram_range=ngram_range,
                              min_df=min_df)
    chunks = map(lambda x: x[1], sql_result)
    counts = counter.fit_transform(chunks)
    vocab = counter.get_feature_names()
    vocab = dict(zip(range(len(vocab)), vocab))
    return [counts, vocab]
train = pd.read_csv("../datasets/dataset_1/train.csv", header='infer', index_col=None)
x_train, x_test, y_train, y_test = train_test_split(train["SentimentText"], train["Sentiment"],
                                                    random_state=1000, test_size=0.3)

# nb_classes = np.max(y_train) + 1
# from keras.utils import np_utils
# Y_train = np_utils.to_categorical(y_train, nb_classes)
# Y_test = np_utils.to_categorical(y_test, nb_classes)

# >>> COUNT VECTORIZER >>>
count_vect = CountVectorizer()
X = count_vect.fit_transform(x_train)

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

# >>> TUNING BATCH SIZE AND EPOCHS >>>
def create_model():
    # Build the model
    model = Sequential()
    input_dim = X.shape[1]
    model.add(Dense(512, input_dim=input_dim))
    model.add(Dense(num_classes))
def kfold_cross(data):
    kf = KFold(n_splits=30)
    num_test = 1
    # totals for averaging
    accuracy_tot = 0
    precision_tot = 0
    recall_tot = 0
    f_p_tot = 0
    f_measure_tot = 0
    for train_index, test_index in kf.split(data):
        # create train and test lists
        train_msg = list()
        train_classe = list()
        test_msg = list()
        test_classe = list()
        print("\nStarting test N°", str(num_test))
        for i in train_index:
            train_msg.append(data[i][0])
            train_classe.append(data[i][1])
        for j in test_index:
            test_msg.append(data[j][0])
            test_classe.append(data[j][1])
        # call count vectorizer that will create a matrix of token counts
        v = CountVectorizer()
        train_matrix = v.fit_transform(train_msg)
        # call multinomial naive bayes
        clf = MultinomialNB()
        clf.fit(train_matrix, train_classe)
        # create a matrix for the test list
        test_matrix = v.transform(test_msg)
        # predict with multinomial the test set
        predicted = clf.predict(test_matrix)
        # create a matrix of dim [2,2]
        conf_matrix = confusion_matrix(test_classe, predicted)
        tn, fp, fn, tp = conf_matrix.ravel()
        accuracy = (accuracy_score(test_classe, predicted)) * 100
        precision = precision_score(test_classe, predicted, pos_label='malware')
        recall = recall_score(test_classe, predicted, pos_label='malware')
        f_p_rate = fp / (fp + tn)
        f_measure = 2 * (precision * recall) / (precision + recall)
        accuracy_tot += accuracy
        precision_tot += precision
        recall_tot += recall
        f_p_tot += f_p_rate
        f_measure_tot += f_measure
        # print all the results
        print('Confusion matrix:')
        print(confusion_matrix(test_classe, predicted))
        print('Accuracy is: ' + str(accuracy)[:5] + '%')
        print('Precision: ' + str(precision)[:5])
        print('Recall: ' + str(recall)[:5])
        print('False-positive rating: ' + str(f_p_rate)[:5])
        print('F-measure: ' + str(f_measure)[:5])
        num_test += 1
    print('\nPerformance Evaluation, AVG Values:')
    print('Accuracy: ' + str(accuracy_tot / 30)[:5] + '%')
    print('Precision: ' + str(precision_tot / 30)[:5])
    print('Recall: ' + str(recall_tot / 30)[:5])
    print('False-positive rating: ' + str(f_p_tot / 30)[:5])
    print('F-measure: ' + str(f_measure_tot / 30)[:5])
email_data = email_data.loc[email_data.text != " ", :]  # There are no empty spaces

## Creating a matrix of token counts for the entire text document
def split_if_words(i):
    return [word for word in i.split(" ")]

predictors = email_data.iloc[:, 1]
target = email_data.iloc[:, 0]

# Splitting the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, stratify=target)

## Convert email text into a word count matrix, i.e. bag of words
email_bow = CountVectorizer(analyzer=split_if_words).fit(email_data["text"])

## For all the emails doing the transformation
all_emails_matrix = email_bow.transform(email_data["text"])
all_emails_matrix.shape  # (5559, 6661)

## For training data
train_emails_matrix = email_bow.transform(x_train)
train_emails_matrix.shape  # (3891, 6661)

## For test data
test_emails_matrix = email_bow.transform(x_test)
test_emails_matrix.shape
all_the_text = all_the_text.decode('utf-8', 'ignore')
essay_list = re.split('#@\d+', all_the_text)

file_object = open('NGSL_lemmatized')
try:
    all_the_text = file_object.read()
finally:
    file_object.close()
Dict = all_the_text.split()

punct = [',', '.', '?', '!', ':', '\'', '"']
# Dict.append(punct)

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(Dict)

charac_num = []        # 0. number of characters
tokens_num = []        # 1. number of tokens
difftokens_num = []    # 2. number of different tokens
diff_to_tokens = []    # 3. 2/1
sentence_num = []      # 4. number of sentences
aver_charac = []       # 5. 0/1
aver_tokens = []       # 6. 1/5
words_in_NGSL = []     # 7. ratio of words in NGSL
E1_percent = []        # 8. ratio of E1 error
bag_of_words = []      # 9. bag_of_words
feature_vector = []
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import json

if __name__ == "__main__":
    # corpus = ["我 來到 北京 清華大學",        # segmentation result of the first text, words separated by spaces
    #           "他 來到 了 網易 杭研 大廈",     # segmentation result of the second text
    #           "小明 碩士 畢業 於 中國 科學院",  # segmentation result of the third text
    #           "我 愛 北京 天安門"]            # segmentation result of the fourth text
    corpus = []
    with open("E:/AB104/Expedia/Hotels-City-Suites-Kaohsiung-Chenai_comments.json", "r") as a:
        Com_list = json.load(a)
    for i in Com_list:
        for j in i["comment_collection"]:
            corpus.append(j["comment"])
    vectorizer = CountVectorizer(ngram_range=(2, 2))  # converts the texts into a term-frequency matrix; element a[i][j] is the count of term j in text i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every term
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
    weight = tfidf.toarray()  # extract the tf-idf matrix; element a[i][j] is the tf-idf weight of term j in text i
    for i in range(len(weight)):  # print the tf-idf weights per text: the outer loop iterates over texts, the inner loop over the terms of each text
        print u"------- tf-idf weights of the terms in text", i + 1, u"------"
        for j in range(len(word)):
            print word[j], weight[i][j]
# In[35]:

# using countvectorizer to convert a collection of text documents to a matrix of token counts

# In[36]:

from sklearn.feature_extraction.text import CountVectorizer

# In[37]:

bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])

# In[38]:

print(len(bow_transformer.vocabulary_))

# In[39]:

# example text message and get its bag-of-words counts as a vector

# In[40]:
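# A possible body for the empty cell above -- a sketch only: it assumes the
# `messages` DataFrame from earlier in this notebook and picks an arbitrary row.
sample_message = messages['message'][3]
sample_bow = bow_transformer.transform([sample_message])
print(sample_bow)        # sparse (row, token_index) -> count pairs for this message
print(sample_bow.shape)  # (1, vocabulary size)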
data = pd.read_csv("train.csv")

def show_topic(model, feature_names, top):
    for index, distribution in enumerate(model.components_):
        sorted_word_indices = distribution.argsort()[::-1][:top]
        print(f"Topic {index}:")
        print(" ".join([feature_names[i] for i in sorted_word_indices]))

# CountVectorizer
tf_vectorizer = CountVectorizer(  # set up your CountVectorizer
    tokenizer=lambda text: [
        token.text for token in nlp(text)
        if not token.is_stop and not token.is_punct
    ],
    max_df=.8,          # 0.8 (float) * 25,000 documents
    min_df=25,          # 25 (int) out of 25,000 documents
    max_features=1000,  # top 1000 tokens
)
tf = tf_vectorizer.fit_transform(data["review"])

lda = LatentDirichletAllocation(  # set up your LatentDirichletAllocation
    n_components=20,  # 20 topics; can be any other number
    learning_method="online",
    learning_offset=50,
    random_state=2020,
)
lda.fit(tf)
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Exploring the data
emails = fetch_20newsgroups(categories=['rec.sport.baseball', 'rec.sport.hockey'])
print(emails.target_names)
print(emails.data[5])
print(emails.target[5])     # 1
print(emails.target_names)  # rec.sport.hockey

# Making the Training and Test Sets
train_emails = fetch_20newsgroups(categories=['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'],
                                  subset='train', shuffle=True, random_state=108)
test_emails = fetch_20newsgroups(categories=['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'],
                                 subset='test', shuffle=True, random_state=108)

# Counting words
counter = CountVectorizer()
counter.fit(test_emails.data + train_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

# Making a Naive Bayes Classifier
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)
print(classifier.score(test_counts, test_emails.target))  # 0.9974

# Test other datasets
def linear_SVM(x_train, y_train, x_test, y_test):
    sgd = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                          random_state=42, max_iter=5, tol=None))])
    sgd.fit(x_train, y_train)
    y_pred = sgd.predict(x_test)
    print("Linear SVM: " + str(accuracy_score(y_pred, y_test)))
    print(classification_report(y_test, y_pred, target_names=flairs))
print(twt)
sentiment = model.predict(twt, batch_size=1, verbose=7)[0]
if (np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

train_clean_tweet = []
for tweet in train['TweetText']:
    train_clean_tweet.append(tweet)
test_clean_tweet = []
for tweet in test['TweetText']:
    test_clean_tweet.append(tweet)

from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(analyzer="word")
train_features = v.fit_transform(train_clean_tweet)
test_features = v.transform(test_clean_tweet)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

Classifiers = [
    LogisticRegression(C=0.000000001, solver='liblinear', max_iter=200),
    KNeighborsClassifier(3),
from flask import Flask, render_template, request
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pickle

df = pd.read_csv('movie.csv')
similarity_matrix = pickle.load(open('similarity_matrix.pkl', 'rb'))
cv = CountVectorizer(stop_words='english')

app = Flask(__name__)

@app.route('/')
def hello():
    return render_template('index.html')

@app.route('/recommend', methods=['POST'])
def recommend():
    movie_name = request.form.get('movie_name')
    movie_list = recommender(movie_name)
    return render_template('index.html', movie_list=movie_list)

def recommender(movie_name):
    # find the index of this movie
    index_pos = df[df['title'] == movie_name].index[0]
    # calculate similarity
    recommended_movie_index = sorted(list(
        enumerate(similarity_matrix[index_pos])), reverse=True,
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD  # needed for the dimensionality reduction below
import joblib

df = pd.read_csv(r'C:\Users\meysam-sadat\Downloads\Compressed\emails.csv')

my_vectorize = CountVectorizer()
x = my_vectorize.fit_transform(df['text'])
y = df.spam.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

truncated = TruncatedSVD(n_components=1000)
x_train = truncated.fit_transform(x_train)
x_test = truncated.transform(x_test)

clf = MLPClassifier(hidden_layer_sizes=(200, 50))
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))

joblib.dump(clf, 'spam_pam_detector.joblib')
model = joblib.load('spam_pam_detector.joblib')
prediction = model.predict(x_test)
# ```
#
# Apply `CountVectorizer` to the `newsgroups` data set and find the number of
# times the word _phone_ appears in the corpus. Answer with a single scalar.

# In[42]:

categories = ['sci.electronics', 'comp.graphics', 'rec.motorcycles']
newsgroup = fetch_20newsgroups(subset="train", categories=categories, shuffle=True, random_state=42)

# In[43]:

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(newsgroup.data)

# In[44]:

df_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
df_words['phone'].sum()

# In[45]:

def q6():
    df_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
def naiveBayes(x_train, y_train, x_test, y_test):
    nb = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB())])
    nb.fit(x_train, y_train)
    y_pred = nb.predict(x_test)
    print("Naive Bayes: " + str(accuracy_score(y_pred, y_test)))
    print(classification_report(y_test, y_pred, target_names=flairs))
    for j in range(col):
        if j < i:
            prod[index] = np.inner(X[:, i], X[:, j])
            index += 1
    return prod

# Define the original dimension
n = int()
m = int(5000)

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# categories = ['sci.med']
textdata = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

count_vect = CountVectorizer()
word_counts = count_vect.fit_transform(textdata.data)
tf_transformer = TfidfTransformer(use_idf=False).fit(word_counts)
# the result is a scipy.sparse.csr.csr_matrix
X = tf_transformer.transform(word_counts)

# Clip the dataset
X = X[:, :m]
# normalize and transpose the dataset
X = normalize(X, 'l2')
X = X.asfptype()
X = X.transpose()
print("The text data frequency matrix size is:", X.shape)

prod_init = get_inner_product(scipy.eye(m) * X)
[dummy, n] = X.shape
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
stop_words = ['a', 'of', 'for']

# ngram_range controls the n in n-gram; e.g. ngram_range=(1,2) extracts both uni-gram and bi-gram features
# max_features limits the vocabulary size; if set, terms are ranked by frequency and the list is truncated
count_vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words,
    max_features=100,
    ngram_range=(1, 3))
count_vec.fit(documents)
# or use fit_transform instead
# sparse_matrix = count_vec.fit_transform(documents)

# the vocabulary learned by count_vec
count_vec.vocabulary_
print(len(count_vec.vocabulary_))
print(count_vec.vocabulary_['trees'])  # the id assigned to 'trees'

# sparse matrix
sparse_matrix = count_vec.transform(documents)
# convert to a dense matrix
matrix = sparse_matrix.toarray()
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

root = Tk()
root.title('Review Analyser')
root.geometry("500x500")
    LogisticRegression(random_state=0, solver="lbfgs", multi_class="auto", max_iter=5000, n_jobs=-1),
    MultinomialNB(),
    DecisionTreeClassifier(random_state=0),
    SVC(kernel="rbf", gamma="scale"),
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1),
    RandomForestClassifier(n_estimators=100, n_jobs=-1)
]

# Vectorizers
vectorizers = [
    CountVectorizer(input="content", encoding="utf-8", tokenizer=lambda x: x.split(",")),
    TfidfVectorizer(analyzer="word", token_pattern=r"([^,]+)"),
    CountVectorizer(input="content", encoding="utf-8", tokenizer=lambda x: x.split(","), ngram_range=(3, 3)),
    TfidfVectorizer(analyzer="word", token_pattern=r"([^,]+)", ngram_range=(3, 3)),
    CountVectorizer(input="content", encoding="utf-8", tokenizer=lambda x: x.split(","), ngram_range=(4, 4)),
    TfidfVectorizer(analyzer="word", token_pattern=r"([^,]+)",
# ****** Create a bag of words from the training set
#
# print "Creating the bag of words...\n"

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.
train_data_features = vectorizer.fit_transform(d_train)

# Numpy arrays are easy to work with, so convert the result to an
# array
train_data_features = train_data_features.toarray()

# ******* Train a random forest using the bag of words
#
def bag_of_words(self):
    self.bag_of_words_matrix = CountVectorizer().fit_transform(self.sentences)
    if debug:
        return self.bag_of_words_matrix
# get the sample weights
trainFilePath = "bydata-train_03"
sample_weights_array = weights(trainFilePath)
print(sample_weights_array)

# read the files
train = datasets.load_files("bydata-train_03")  # training files
test = datasets.load_files("bydata-test")       # test files
# print("train target:", train.target_names[:])  # target_names: the category names
# print("index:", train.target[:])               # target: indices into target_names

# Tokenize the documents to build a vocabulary, then encode each new document
# against that vocabulary; the result is a vector whose length equals the
# vocabulary size, where each entry is the count of that word in the document
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.data)
# print('X_train_counts:', X_train_counts.toarray())

## one step
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None).\
    fit(X_train_tfidf, train.target, sample_weight=np.array(sample_weights_array))  # the classifier

X_new_counts = count_vect.transform(test.data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)  # indices of the predicted categories
print('predicted:', predicted)
topic summary
"""
import pickle
import random

# for consistent testing
random.seed(1532525625823)

raw_data = pickle.load(open("pickles/list-of-reviews.p", "rb"))

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
train_counts = count_vect.fit_transform(random.sample(raw_data, 30000))
raw_data = None

btr = pickle.load(open("pickles/dict-of-business-to-reviews.p", "rb"))
test_counts = count_vect.transform(btr["Appliance Service Center"])

tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)
test_tfidf = tfidf_transformer.transform(test_counts)

dtm = train_tfidf
dtm_test = test_tfidf
vocab = count_vect.get_feature_names()
def get_vectorizer(conf):
    if conf["pond"] is None:
        V = CountVectorizer(ngram_range=conf["N"], analyzer=conf["Tok"])
    elif conf["pond"] == "tf-idf":
        V = TfidfVectorizer(ngram_range=conf["N"])
    return V
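# A small usage sketch: `conf` is assumed to be a dict with keys "pond"
# (None for raw counts, or "tf-idf"), "N" (the ngram_range) and "Tok"
# (the analyzer, e.g. "word" or "char"); the values below are illustrative.
conf = {"pond": None, "N": (1, 2), "Tok": "word"}
V = get_vectorizer(conf)
X = V.fit_transform(["some example text", "another example"])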
from sklearn import svm
import numpy as np
from sklearn.model_selection import GridSearchCV

test_file = 'test_data.txt'
strategy_instance = helper.strategy()
class_0 = strategy_instance.class0
class_1 = strategy_instance.class1
class_all = class_0 + class_1
class_all = [' '.join(i) for i in class_all]

vectorizer = CountVectorizer(stop_words=None)
vectorizer.fit(class_all)
## print(vectorizer.vocabulary_)
tmp_dic = vectorizer.vocabulary_
word_list = sorted(tmp_dic, key=lambda x: tmp_dic[x])

train_data = vectorizer.transform(class_all).toarray()
train_label = np.array([0] * 360 + [1] * 180)

test = []
with open(test_file) as f:
    for line in f:
        test.append(line)
def I_BOW(data_time=['2014-07-01', '2014-12-31'], vec_time=['2014-07-01', '2014-12-31'],
          max_features=100, fit=False):
    global weibo_train_data
    global weibo_predict_data
    global features_log

    print "loading data..."
    if data_time[0] > '2014-12-31':
        data = weibo_predict_data.copy()
        data['context_clean'] = pd.Series.from_csv('../data/predict_context_clean.csv')
    else:
        data = weibo_train_data.copy()
        data['context_clean'] = pd.Series.from_csv('../data/train_context_clean.csv')
    data.context_clean = data.context_clean.apply(lambda x: json.loads(x))
    data.context_clean = data.context_clean.apply(lambda x: ' '.join(x))

    if fit == True:
        print 'fitting and transforming...'
        data_time = vec_time
        vectorizer = CountVectorizer(analyzer="word",
                                     tokenizer=None,
                                     preprocessor=None,
                                     stop_words=None,
                                     max_features=max_features)
        features = vectorizer.fit_transform(
            data[(data['time'] <= data_time[1])
                 & (data['time'] >= data_time[0])].context_clean)
        print 'saving models...'
        joblib.dump(vectorizer, '../others/' + '_'.join(vec_time) + '_' + str(max_features) + '.vectorizer')
    else:
        print 'transforming...'
        vectorizer = joblib.load('../others/' + '_'.join(vec_time) + '_' + str(max_features) + '.vectorizer')
        features = vectorizer.transform(
            data[(data['time'] <= data_time[1])
                 & (data['time'] >= data_time[0])].context_clean)

    columns = ['I_BOW_' + str(i + 1) for i in range(max_features)]
    features = pd.DataFrame(features.toarray(), columns=columns)

    # write log
    print 'saving features...'
    feature_name = 'I_BOW_' + '_'.join(data_time) + '_' + '_'.join(vec_time) + '_' + str(max_features)
    feature_address = '../features/' + feature_name + '.feature'
    features.to_csv(feature_address)
    usage = "train" if fit == True else "test"
    description = "Bag of Words in word count from " + str(data_time[0]) + " to " + \
                  data_time[1] + " using top " + str(max_features) + " words"
    print "writing logs..."
    log = [feature_name, 'I_BOW', data_time,
           {'max_features': max_features, 'vec_time': vec_time},
           'I', feature_address, usage, description, list(features.values.shape)]
    writeLog(log, "features_log")
    return features
list1 = []
for i in range(0, 5536):
    mail = df.Message[i]
    # print(mail)
    mail = re.sub('[^a-zA-Z]', ' ', mail)
    mail = mail.lower()
    mailwords = mail.split()
    mailwords = [
        ps.stem(word) for word in mailwords
        if word not in stopwords.words('english')
    ]
    mail = ' '.join(mailwords)
    list1.append(mail)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(list1).toarray()
y = df.Status.values

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
def classify():
    # inp1 = raw_input("Enter No. of Positive Examples (Total Positive Sample is 69): ")
    # inp2 = raw_input("Enter No. of Negative Examples (Total Positive Sample is 69): ")
    inp = raw_input(
        "Enter 'a' for inclusion statements or 'b' for interventions or 'c' for exclusion statements: "
    )
    inp1 = 67
    inp2 = 67
    trn_data = []
    trn_cat = []
    p1 = 0
    p2 = 0
    p3 = 0
    if inp == 'a':
        stng = 'training_corpora/inclusion_statements/jaccard/'
    elif inp == 'b':
        stng = 'training_corpora/interventions/jaccard/'
    elif inp == 'c':
        stng = 'training_corpora/exclusion_statements/jaccard/'
    else:
        print("Wrong input")
        exit

    # Preparing Positive Training Samples
    for i in range(1, int(inp1) + 1):
        trn_pos = stng + 'positive/pv' + str(i) + '.txt'  # File location (training using Jaccard)
        text = codecs.open(trn_pos, encoding='utf-8', mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ', s)  # Remove special characters
            trn_data.append(s)
            trn_cat.append(0)
            p1 = p1 + 1

    # Preparing Negative Training Samples
    for i in range(1, int(inp2) + 1):
        trn_neg = stng + 'negative/ng' + str(i) + '.txt'  # File location (training using Jaccard)
        text = codecs.open(trn_neg, encoding='utf-8', mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ', s)  # Remove special characters
            trn_data.append(s)
            trn_cat.append(1)
            p2 = p2 + 1

    # A pipeline of different parameters of the classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer(token_pattern=r'\b\w+\b')),
        ('tfidf', TfidfTransformer()),
        ('svr', svm.SVC(kernel='linear', class_weight='balanced')),
    ])
    # Fix the values of the parameters using Grid Search and cross validation on the training samples
    parameters = {
        'vect__min_df': (2, 3),
        'vect__ngram_range': ((1, 2), (1, 3)),  # unigrams or bigrams
        'tfidf__use_idf': (True, False),
        'svr__C': (0.1, 10, 100, 1000),
    }
    grid = grid_search.GridSearchCV(pipeline, parameters, cv=10)
    grid.fit(trn_data, trn_cat)
    # print("The best classifier is: ", grid.best_estimator_)
    clf = grid.best_estimator_

    # Classification of the test samples using the fixed pipeline
    tst_map = codecs.open('test_samples/test_file_map.txt', encoding='utf-8', mode='r').readlines()
    for b in range(0, len(tst_map), 2):
        # File name of test file to check
        fl = tst_map[b].strip('\n\r')
        print fl
        data = []
        tst_data = []
        p3 = 0
        # Preparing Test Samples
        text = codecs.open('test_samples/' + fl + '.txt', encoding='utf-8', mode='r').readlines()
        text = ''.join(text)
        sentences = tokenize.sent_tokenize(text)
        for s in sentences:  # Extracting sentences
            s = re.sub('[^a-zA-Z0-9.?:!$\n]', ' ', s)  # Remove special characters
            tst_data.append(s)
            p3 = p3 + 1
        data.extend(trn_data)
        data.extend(tst_data)
        if inp == 'a':
            out = codecs.open('output/inclusion_statements/svm/' + fl + '_svm.txt',
                              encoding='utf-8', mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) + '\n\n')
            out.write('The Inclusion Statements are as Follow: \n\n')
        elif inp == 'b':
            out = codecs.open('output/interventions/svm/' + fl + '_svm.txt',
                              encoding='utf-8', mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) + '\n\n')
            out.write('The Interventions are as Follow: \n\n')
        elif inp == 'c':
            out = codecs.open('output/exclusion_statements/svm/' + fl + '_svm.txt',
                              encoding='utf-8', mode='w')  # Output file
            out.write('\n Using SVM Classifier: \n\n')
            out.write('Total No. of Sentences in the Reference: ' + str(p3) + '\n\n')
            out.write('The Exclusion Statements are as Follow: \n\n')
        # Results
        nps = 0
        clf.fit(trn_data, trn_cat)
        predicted = clf.predict(tst_data)
        for i in range(0, len(predicted)):
            if predicted[i] == 0:
                nps = nps + 1
                # print 'Relevant Sentence ' + str(nps)
                # print '\n' + data[p1 + p2 + i] + '\n'
                out.write('\n' + str(nps) + ") " + tst_data[i] + '\n')
        print("Total No. of Positive Sentences: %d" % nps)
df.drop(df.index[630:680], inplace=True)
df.drop(df.index[680:750], inplace=True)
df.drop(df.index[750:800], inplace=True)
df.drop(df.index[800:850], inplace=True)
df.drop(df.index[850:880], inplace=True)
df.drop(df.index[880:900], inplace=True)
df.drop(df.index[900:911], inplace=True)

X_train = df['URL']
y_train = df['Category']
X_test = dt['URL']
y_test = dt['Category']

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(X_train, y_train)

from sklearn.model_selection import RandomizedSearchCV
n_iter_search = 5
# Pipeline parameters are addressed as <step>__<param> (double underscore)
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3)
}
gs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=n_iter_search)
gs_clf = gs_clf.fit(X_train, y_train)

# read url file
cf = pd.read_csv('./urls.csv')
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
corpus = []
with open("sirgawain.txt") as f:
    for line in f:
        corpus.append(line.rstrip())
X = vectorizer.fit_transform(corpus)
print X.toarray()
print vectorizer.get_feature_names()
tweets = dataset['Text']
classes = dataset['Sentimento']

# Build the data frame
tweets_Dataframe = pd.DataFrame({'Text': tweets, 'Classificacao': classes})

# Check how many tweets the dataset contains
print(len(tweets_Dataframe))
# Check how many examples of each class exist
print(tweets_Dataframe.Classificacao.value_counts())

# Show it as a bar chart
fig = plt.figure(figsize=(8, 6))
tweets_Dataframe.groupby('Classificacao').count().plot.bar(ylim=0)
plt.show()

# normalisation techniques: oversampling and undersampling
vectorizer = CountVectorizer(ngram_range=(1, 2))
bow = vectorizer.fit_transform(tweets)
# Inspect the Bag of Words. Remember to take the slice with fewer rows [:50]
# bow_data_frame = pd.DataFrame(bow.A, columns=vectorizer.get_feature_names())

# normalise occurrences by word frequency
tfidf_transformer = TfidfTransformer()
bow = tfidf_transformer.fit_transform(bow)
# Inspect the Bag of Words after TF-IDF. Remember to take the slice with fewer rows [:50]
# bow_data_frame = pd.DataFrame(bow.A, columns=vectorizer.get_feature_names())

modelo = MultinomialNB()
modelo.fit(bow, classes)