def non_supervised(args):
    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    print('number of topics')
    print(k)
    local_itr = float(args[4])
    global_itr = float(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    global X, y
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # train the model
    model.unsupervised_fit(X)
    semi_supervised(X, y, model, n_pos)
def vectorize_text_features(self):
    """
    Transform the text from the book features (name, authors, description,
    publication year) into vectors that will be used to compute book feature
    similarity.
    """
    # Create tf-idf vectors from the combined text features.
    vectorizer = TfidfVectorizer()
    self.tfidf_vectors = vectorizer.fit_transform(
        self.book_df['target'].replace(np.nan, "n/a"))
    self.tfidf_features = vectorizer.get_feature_names_out()
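# A hypothetical follow-up sketch: once vectorize_text_features has run on some
# instance (here called `recommender`, an assumed name), the stored tf-idf vectors
# can be turned into a pairwise book-similarity matrix with cosine similarity:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(recommender.tfidf_vectors)   # shape (n_books, n_books)
most_similar_to_first = similarity[0].argsort()[::-1][1:6]  # top 5 neighbours of book 0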
class TfIdfSearchEngine(object):

    def __init__(self, search_base: List[Tuple[str, str]], tokenizer: Callable = None):
        self.document_ids = []
        self.documents = []
        for doc_id, document in search_base:
            self.document_ids.append(doc_id)
            self.documents.append(document)
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer)
        self.X = self.vectorizer.fit_transform(self.documents)
        self.vocabulary = self.vectorizer.get_feature_names_out()

    def search(self, query: str):
        q = self.vectorizer.transform([query])
        s = cosine_similarity(q, self.X)
        ranking = [(i, self.document_ids[i], score)
                   for i, score in sorted(enumerate(s[0]), key=lambda x: -x[1])]
        return ranking

    @staticmethod
    def feedback(ranking: List[Tuple[int, str, str]], ground_truth: Set, top_k: int):
        tp, fp, fn, tn = list(), list(), list(), list()
        retrieved = ranking[:top_k]
        non_retrieved = ranking[top_k:]
        for j, (i, doc_id, score) in enumerate(retrieved):
            if doc_id in ground_truth:
                tp.append(doc_id)
            else:
                fp.append(doc_id)
        for j, (i, doc_id, score) in enumerate(non_retrieved):
            if doc_id in ground_truth:
                fn.append(doc_id)
            else:
                tn.append(doc_id)
        return tp, fp, fn, tn
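# A minimal usage sketch for the class above. The toy search base, query, and
# ground-truth set are invented for illustration; the imports assume scikit-learn
# plus the standard typing module:
from typing import Callable, List, Set, Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

search_base = [
    ("d1", "the cat sat on the mat"),
    ("d2", "dogs and cats are common pets"),
    ("d3", "the stock market fell sharply today"),
]
engine = TfIdfSearchEngine(search_base)
ranking = engine.search("cat on a mat")  # (index, doc_id, score) sorted by similarity
tp, fp, fn, tn = TfIdfSearchEngine.feedback(ranking, {"d1", "d2"}, top_k=2)
print(ranking[0])  # best match, e.g. (0, 'd1', <score>)
print(tp, fp)      # relevant vs. non-relevant ids among the top_k retrieved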
print(datos.info())

# Data pre-processing
archivos = datos['Text'].values.astype("U")
#print(archivos)

vectorizacion = TfidfVectorizer(max_df=0.8, stop_words='english')  # word-frequency (tf-idf) weighting
caracteristicas = vectorizacion.fit_transform(archivos)  # transform all documents into tf-idf feature vectors
some = caracteristicas.toarray()
terms = vectorizacion.get_feature_names_out()  # get the feature (term) names produced by the transformation

dist = cosine_similarity(caracteristicas[0:62], caracteristicas)
print("Cosine similarity: ", dist, sep='\n')
#print(cosine_similarity(caracteristicas[0:62], caracteristicas))

# build the linkage matrix with Ward clustering (scipy computes Euclidean distances between the rows of dist)
matriz_enlace = ward(dist)
print("Linkage matrix: ", matriz_enlace, sep='\n')

# Dendrogram visualization
fig, ax = plt.subplots(figsize=(15, 20))  # figure size
ax = dendrogram(matriz_enlace, orientation="top")  # set the plot properties
plt.tick_params(
    axis='x',  # apply the changes to the x axis
    name="MiniBatchKMeans\nwith LSA on tf-idf vectors",
)

# %%
# Top terms per cluster
# ---------------------
#
# Since :class:`~sklearn.feature_extraction.text.TfidfVectorizer` can be
# inverted we can identify the cluster centers, which provide an intuition of
# the most influential words **for each cluster**. See the example script
# :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
# for a comparison with the most predictive words **for each target class**.

original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

# %%
# HashingVectorizer
# -----------------
# An alternative vectorization can be done using a
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` instance, which
# does not provide IDF weighting as this is a stateless model (the fit method
# does nothing). When IDF weighting is needed it can be added by pipelining the
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` output to a
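# The note above is cut off mid-sentence; it is describing piping HashingVectorizer
# output into a TfidfTransformer. A minimal sketch of that combination, where
# `documents` is a placeholder for the corpus used elsewhere in the example and the
# n_features value is an arbitrary assumption:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

hashing_tfidf = make_pipeline(
    HashingVectorizer(stop_words="english", n_features=50_000),
    TfidfTransformer(),
)
X_hashed = hashing_tfidf.fit_transform(documents)  # IDF weighting added on top of stateless hashing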
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(stop_words='english',
                               max_features=None,
                               max_df=0.5,
                               min_df=2)
data = tfidf_vector.fit_transform(data_cleaned)

kmeans.fit(data)
clusters = kmeans.labels_
print(Counter(clusters))

import numpy as np

cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

terms = tfidf_vector.get_feature_names_out()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(),
                                     key=lambda x: x[1],
                                     reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (Frobenius norm)')

# Fit the NMF model
print('\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
      "divergence) with tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
class TextTransformer(object):

    def __init__(self):
        self._new_columns = []
        self._old_column = None
        self._max_features = 100
        self._vectorizer = None

    def fit(self, X, column):
        self._old_column = column
        self._vectorizer = TfidfVectorizer(
            analyzer="word",
            stop_words="english",
            lowercase=True,
            max_features=self._max_features,
        )
        x = X[column][~pd.isnull(X[column])]
        self._vectorizer.fit(x)
        for f in list(self._vectorizer.get_feature_names_out()):
            new_col = self._old_column + "_" + f
            self._new_columns += [new_col]

    def transform(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore",
                                  category=pd.errors.PerformanceWarning)
            ii = ~pd.isnull(X[self._old_column])
            x = X[self._old_column][ii]
            vect = self._vectorizer.transform(x)
            for f in self._new_columns:
                X[f] = 0.0
            X.loc[ii, self._new_columns] = vect.toarray()
            X.drop(self._old_column, axis=1, inplace=True)
        return X

    def to_json(self):
        for k in self._vectorizer.vocabulary_.keys():
            self._vectorizer.vocabulary_[k] = int(self._vectorizer.vocabulary_[k])
        data_json = {
            "new_columns": list(self._new_columns),
            "old_column": self._old_column,
            "vocabulary": self._vectorizer.vocabulary_,
            "fixed_vocabulary": self._vectorizer.fixed_vocabulary_,
            "idf": list(self._vectorizer.idf_),
        }
        return data_json

    def from_json(self, data_json):
        self._new_columns = data_json.get("new_columns", None)
        self._old_column = data_json.get("old_column", None)
        vocabulary = data_json.get("vocabulary")
        fixed_vocabulary = data_json.get("fixed_vocabulary")
        idf = data_json.get("idf")
        if vocabulary is not None and fixed_vocabulary is not None and idf is not None:
            self._vectorizer = TfidfVectorizer(
                analyzer="word",
                stop_words="english",
                lowercase=True,
                max_features=self._max_features,
            )
            self._vectorizer.vocabulary_ = vocabulary
            self._vectorizer.fixed_vocabulary_ = fixed_vocabulary
            self._vectorizer.idf_ = idf
def load_dataset(verbose=False, remove=()):
    """Load and vectorize the 20 newsgroups dataset."""

    data_train = fetch_20newsgroups(
        subset="train",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )
    data_test = fetch_20newsgroups(
        subset="test",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data using a sparse vectorizer
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5,
                                 stop_words="english")
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # compute size of loaded data
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(f"{len(data_train.data)} documents - "
              f"{data_train_size_mb:.2f}MB (training set)")
        print(f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)")
        print(f"{len(target_names)} categories")
        print(f"vectorize training done in {duration_train:.3f}s "
              f"at {data_train_size_mb / duration_train:.3f}MB/s")
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(f"vectorize testing done in {duration_test:.3f}s "
              f"at {data_test_size_mb / duration_test:.3f}MB/s")
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names
# computing the cosine similarity between the sentences in the array
cosine_similarity_matrix = cosine_similarity(array)
results = create_dataframe(cosine_similarity_matrix, corpus)
results
"""**a)** TF-IDF vector representation with cosine similarity

---

"""

# creating a TF-IDF vectorizer
cv_tf = TfidfVectorizer(stop_words='english')
array_tf = cv_tf.fit_transform(corpus).toarray()
pd.DataFrame(array_tf, columns=cv_tf.get_feature_names_out())

# computing the cosine similarity between the sentences in the array
cosine_similarity_matrix = cosine_similarity(array_tf)
results = create_dataframe(cosine_similarity_matrix, corpus)
results
"""**Question 2**: Formulate a binary text-classification problem consistent with your dataset.

**a)** Determine the label of the documents (separating the documents into well-defined classes)

---

*1. Based on the most frequent class, determine whether the tweet is about that disaster or not.*
"""
def generate_counts(text_df, text_column="abstract", tfidf=True, min_df=50, max_df=0.5):
    """Generate tf-idf weights for unigrams/bigrams derived from textual data.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with two columns ('id' and 'text'). D = document.
    text_column : :obj:`str`, optional
        Name of the column containing the text to vectorize. Default is 'abstract'.
    tfidf : :obj:`bool`, optional
        If True, use tf-idf weighting; otherwise use raw term counts. Default is True.
    min_df, max_df : :obj:`int` or :obj:`float`, optional
        Minimum and maximum document frequencies passed to the vectorizer.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is 'id' and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
    """
    if text_column not in text_df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")

    # Remove rows with empty text cells
    orig_ids = text_df["id"].tolist()
    text_df = text_df.fillna("")
    keep_ids = text_df.loc[text_df[text_column] != "", "id"]
    text_df = text_df.loc[text_df["id"].isin(keep_ids)]

    if len(keep_ids) != len(orig_ids):
        LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies")

    ids = text_df["id"].tolist()
    text = text_df[text_column].tolist()
    stoplist = op.join(get_resource_path(), "neurosynth_stoplist.txt")
    with open(stoplist, "r") as fo:
        stop_words = fo.read().splitlines()

    if tfidf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    else:
        vectorizer = CountVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    weights = vectorizer.fit_transform(text).toarray()

    if hasattr(vectorizer, "get_feature_names_out"):
        # scikit-learn >= 1.0.0
        names = vectorizer.get_feature_names_out()
    else:
        # scikit-learn < 1.0.0
        # To remove when we drop support for 3.6 and increase minimum sklearn version to 1.0.0.
        names = vectorizer.get_feature_names()

    names = [str(name) for name in names]
    weights_df = pd.DataFrame(weights, columns=names, index=ids)
    weights_df.index.name = "id"
    return weights_df
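# A hypothetical usage sketch for generate_counts. The DataFrame below is invented;
# min_df is lowered from its default of 50 only so the toy corpus keeps some terms,
# and the call still relies on the package's bundled neurosynth stoplist:
import pandas as pd

toy_df = pd.DataFrame(
    {
        "id": ["study-1", "study-2", "study-3"],
        "abstract": [
            "fMRI study of working memory and attention",
            "resting state connectivity in the default mode network",
            "",  # empty abstracts are dropped before vectorization
        ],
    }
)
weights_df = generate_counts(toy_df, text_column="abstract", tfidf=True, min_df=1)
print(weights_df.shape)  # (documents retained, unigram/bigram terms)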
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

model = TPBG(
    k,
    alpha=0.05,
    beta=0.0001,
    local_max_itr=10,
    global_max_itr=10,
    local_threshold=1e-6,
    global_threshold=1e-6,
    save_interval=-1,
    feature_names=vect.get_feature_names_out(),
    silence=False,
)

# randomly select one class and n_pos labeled examples from it
choosed_cls = list(set(y_train))[randint(0, n_class - 1)]
selected_idx = np.random.choice(np.where(y_train == choosed_cls)[0],
                                size=n_pos, replace=False)

# mark everything else with -1 (unlabeled)
y_train[[i for i in range(len(y_train)) if i not in selected_idx]] = -1

# train the model
model.fit(X_train, y_train)
def main():
    args = sys.argv
    # param = Params(args[0])
    # csv_file = "/home/thiagodepaulo/exp/text-collections/Sequence_of_words_CSV/CSTR.csv"
    # n_pos = 5
    # k = 4
    # local_itr = 10
    # global_itr = 10
    # alpha = 0.05
    # beta = 0.0001
    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    local_itr = float(args[4])
    global_itr = float(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    target_name = list(set(y))
    n_class = len(target_name)
    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # randomly select one class and n_pos labeled examples from it
    choosed_cls = target_name[randint(0, n_class - 1)]
    selected_idx = np.random.choice(np.where(y == choosed_cls)[0],
                                    size=n_pos, replace=False)

    # mark everything else with -1 (unlabeled)
    y_train = np.copy(y)
    y_train[[i for i in range(len(y)) if i not in selected_idx]] = -1
    X_test, y_test = remove_rows(X, y, selected_idx)

    def eval_func(model):
        y_predict = model.predict(X_test)
        y_predict = [1 if c == choosed_cls else 0 for c in y_predict]
        y_test2 = [1 if c == choosed_cls else 0 for c in y_test]
        # compute the evaluation metrics
        labels = [0, 1]
        names = ["others", choosed_cls]
        report = classification_report(y_test2, y_predict,
                                       labels=labels, target_names=names)
        print('\n' + report + '\n')

    # attach the evaluation function
    model.eval_func = eval_func

    # train the model
    model.fit(X, y_train)
def KMeansClassifier(text):
    ## Cyber attacks are divided into eight categories
    ## Marketplaces are a point of interest
    ## Data collected per category from files
    malware = open('src\\main\\resources\\static\\cyber_threats\\malware.txt', 'r').read()
    phishing = open('src\\main\\resources\\static\\cyber_threats\\phishing.txt', 'r').read()
    MITM = open('src\\main\\resources\\static\\cyber_threats\\MITM.txt', 'r').read()
    DoS = open('src\\main\\resources\\static\\cyber_threats\\DoS.txt', 'r').read()
    SQL_injection = open('src\\main\\resources\\static\\cyber_threats\\SQL_injection.txt', 'r').read()
    zero_day = open('src\\main\\resources\\static\\cyber_threats\\zero_day.txt', 'r').read()
    XSS = open('src\\main\\resources\\static\\cyber_threats\\XSS.txt', 'r').read()
    credential_reuse = open('src\\main\\resources\\static\\cyber_threats\\credential_reuse.txt', 'r').read()
    carding = open('src\\main\\resources\\static\\cyber_threats\\carding.txt', 'r').read()
    marketplace = open('src\\main\\resources\\static\\cyber_threats\\marketplace.txt', 'r').read()

    ## Unify all texts
    document = [
        malware, phishing, MITM, DoS, SQL_injection, zero_day, XSS,
        credential_reuse, carding, marketplace
    ]

    ## TF-IDF on texts
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(document)

    ## Apply the K-Means algorithm on data
    true_k = 10
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                   n_init=1, random_state=3425)
    model.fit(X)
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()

    # Create a list of the 15 most common words for each category
    marketplace = [terms[ind] for ind in order_centroids[0, :15]]
    MITM = [terms[ind] for ind in order_centroids[1, :15]]
    malware = [terms[ind] for ind in order_centroids[2, :15]]
    SQL_injection = [terms[ind] for ind in order_centroids[3, :15]]
    carding = [terms[ind] for ind in order_centroids[4, :15]]
    credential_reuse = [terms[ind] for ind in order_centroids[5, :15]]
    zero_day = [terms[ind] for ind in order_centroids[6, :15]]
    XSS = [terms[ind] for ind in order_centroids[7, :15]]
    phishing = [terms[ind] for ind in order_centroids[8, :15]]
    DoS = [terms[ind] for ind in order_centroids[9, :15]]

    text = text.lower()
    if len(keywords_in_text(malware, text)) > 2:
        return "Malware"
    elif len(keywords_in_text(phishing, text)) > 2:
        return "Phishing"
    elif len(keywords_in_text(MITM, text)) > 2:
        return "MITM"
    elif len(keywords_in_text(DoS, text)) > 2:
        return "DoS"
    elif len(keywords_in_text(SQL_injection, text)) > 2:
        return "SQL Injection"
    elif len(keywords_in_text(zero_day, text)) > 2:
        return "Zero Day"
    elif len(keywords_in_text(XSS, text)) > 2:
        return "XSS"
    elif len(keywords_in_text(credential_reuse, text)) > 2:
        return "Credential Reuse"
    elif len(keywords_in_text(carding, text)) > 2:
        return "Carding"
    elif len(keywords_in_text(marketplace, text)) > 2:
        return "Marketplace"
    else:
        return "Undefined"