Example no. 1
def non_supervised(args):

    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    print('number of topics')
    print(k)
    local_itr = float(args[4])
    global_itr = float(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    global X, y
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # train the model
    model.unsupervised_fit(X)
    semi_supervised(X, y, model, n_pos)
Example no. 2
def vectorize_text_features(self):
    """
    Transform the text from the book features (name, authors, description, publication year) into
    vectors that will be used to compute book feature similarity.
    """
    # create TF-IDF vectors
    vectorizer = TfidfVectorizer()
    self.tfidf_vectors = vectorizer.fit_transform(self.book_df['target'].replace(np.nan, "n/a"))
    self.tfidf_features = vectorizer.get_feature_names_out()
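
The docstring above notes that these vectors are later used to compute book feature similarity. As a hedged, self-contained sketch (not part of the original class; the toy book texts and variable names are invented), pairwise similarity over such TF-IDF vectors could be computed with scikit-learn's cosine_similarity:

# Hypothetical sketch: pairwise cosine similarity between TF-IDF vectors,
# mirroring how self.tfidf_vectors above could be used downstream.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

book_texts = [
    "wizard school adventure novel",
    "space opera with wizards and empires",
    "a cookbook of quick pasta recipes",
]
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(book_texts)

# similarity[i, j] is the cosine similarity between book i and book j
similarity = cosine_similarity(tfidf_vectors)
print(similarity.round(2))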
Example no. 3
class TfIdfSearchEngine(object):
    def __init__(self,
                 search_base: List[Tuple[str, str]],
                 tokenizer: Callable = None):
        self.document_ids = []
        self.documents = []
        for doc_id, document in search_base:
            self.document_ids.append(doc_id)
            self.documents.append(document)
        self.vectorizer = TfidfVectorizer(tokenizer=tokenizer)
        self.X = self.vectorizer.fit_transform(self.documents)
        self.vocabulary = self.vectorizer.get_feature_names_out()

    def search(self, query: str):
        q = self.vectorizer.transform([query])
        s = cosine_similarity(q, self.X)
        ranking = [(i, self.document_ids[i], score)
                   for i, score in sorted(enumerate(s[0]), key=lambda x: -x[1])
                   ]
        return ranking

    @staticmethod
    def feedback(ranking: List[Tuple[int, str, float]], ground_truth: Set,
                 top_k: int):
        tp, fp, fn, tn = list(), list(), list(), list()
        retrieved = ranking[:top_k]
        non_retrieved = ranking[top_k:]
        for j, (i, doc_id, score) in enumerate(retrieved):
            if doc_id in ground_truth:
                tp.append(doc_id)
            else:
                fp.append(doc_id)
        for j, (i, doc_id, score) in enumerate(non_retrieved):
            if doc_id in ground_truth:
                fn.append(doc_id)
            else:
                tn.append(doc_id)
        return tp, fp, fn, tn
Example no. 4
print(datos.info())

# data pre-processing

archivos = datos['Text'].values.astype("U")

#print(archivos)
vectorizacion = TfidfVectorizer(max_df=0.8,
                                stop_words='english')  # word-frequency weighting

caracteristicas = vectorizacion.fit_transform(
    archivos
)  # transform the documents into TF-IDF feature vectors

some = caracteristicas.toarray()
terms = vectorizacion.get_feature_names_out(
)  # names of the features produced by the transformation

# cosine distance = 1 - cosine similarity
dist = 1 - cosine_similarity(caracteristicas[0:62], caracteristicas)

print("Cosine distance: ", dist, sep='\n')
#print(1 - cosine_similarity(caracteristicas[0:62], caracteristicas))
matriz_enlace = ward(
    dist
)  # build the linkage matrix with Ward's method on the cosine distances
print("Linkage matrix: ", matriz_enlace, sep='\n')
# dendrogram visualization
fig, ax = plt.subplots(figsize=(15, 20))  # figure size
ax = dendrogram(matriz_enlace, orientation="top")
# set the tick properties
plt.tick_params(
    axis='x',  # apply the changes to the x axis
Example no. 5
    name="MiniBatchKMeans\nwith LSA on tf-idf vectors",
)

# %%
# Top terms per cluster
# ---------------------
#
# Since :class:`~sklearn.feature_extraction.text.TfidfVectorizer` can be
# inverted we can identify the cluster centers, which provide an intuition of
# the most influential words **for each cluster**. See the example script
# :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
# for a comparison with the most predictive words **for each target class**.

original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

# %%
# HashingVectorizer
# -----------------
# An alternative vectorization can be done using a
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` instance, which
# does not provide IDF weighting as this is a stateless model (the fit method
# does nothing). When IDF weighting is needed it can be added by pipelining the
# :class:`~sklearn.feature_extraction.text.HashingVectorizer` output to a
# :class:`~sklearn.feature_extraction.text.TfidfTransformer` instance.
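
As a hedged illustration of the pipelining described above (not taken from the original scikit-learn example; the three-document corpus is invented), a HashingVectorizer can be chained with a TfidfTransformer so the hashed counts receive IDF weighting:

# Sketch only: HashingVectorizer (stateless, no IDF) followed by a
# TfidfTransformer that adds the IDF weighting.
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

docs = [
    "graphics card driver update",
    "space shuttle launch delayed",
    "hockey playoff game tonight",
]
hashing_tfidf = make_pipeline(
    HashingVectorizer(stop_words="english", n_features=2**16),
    TfidfTransformer(),
)
X_hashed = hashing_tfidf.fit_transform(docs)
print(X_hashed.shape)  # (3, 65536), a sparse matrix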
Example no. 6
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer(stop_words='english',
                               max_features=None,
                               max_df=0.5,
                               min_df=2)

data = tfidf_vector.fit_transform(data_cleaned)
kmeans.fit(data)

clusters = kmeans.labels_
print(Counter(clusters))

import numpy as np

cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}

terms = tfidf_vector.get_feature_names_out()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(),
                                     key=lambda x: x[1],
                                     reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()
Example no. 7
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
plot_top_words(nmf, tfidf_feature_names, n_top_words,
               'Topics in NMF model (Frobenius norm)')

# Fit the NMF model
print(
    '\n' * 2, "Fitting the NMF model (generalized Kullback-Leibler "
    "divergence) with tf-idf features, n_samples=%d and n_features=%d..." %
    (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components,
          random_state=1,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
          alpha=.1,
Example no. 8
class TextTransformer(object):
    def __init__(self):
        self._new_columns = []
        self._old_column = None
        self._max_features = 100
        self._vectorizer = None

    def fit(self, X, column):
        self._old_column = column
        self._vectorizer = TfidfVectorizer(
            analyzer="word",
            stop_words="english",
            lowercase=True,
            max_features=self._max_features,
        )

        x = X[column][~pd.isnull(X[column])]
        self._vectorizer.fit(x)
        for f in list(self._vectorizer.get_feature_names_out()):
            new_col = self._old_column + "_" + f
            self._new_columns += [new_col]

    def transform(self, X):
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore",
                                  category=pd.errors.PerformanceWarning)
            ii = ~pd.isnull(X[self._old_column])
            x = X[self._old_column][ii]
            vect = self._vectorizer.transform(x)

            for f in self._new_columns:
                X[f] = 0.0

            X.loc[ii, self._new_columns] = vect.toarray()
            X.drop(self._old_column, axis=1, inplace=True)
        return X

    def to_json(self):
        for k in self._vectorizer.vocabulary_.keys():
            self._vectorizer.vocabulary_[k] = int(
                self._vectorizer.vocabulary_[k])

        data_json = {
            "new_columns": list(self._new_columns),
            "old_column": self._old_column,
            "vocabulary": self._vectorizer.vocabulary_,
            "fixed_vocabulary": self._vectorizer.fixed_vocabulary_,
            "idf": list(self._vectorizer.idf_),
        }
        return data_json

    def from_json(self, data_json):
        self._new_columns = data_json.get("new_columns", None)
        self._old_column = data_json.get("old_column", None)
        vocabulary = data_json.get("vocabulary")
        fixed_vocabulary = data_json.get("fixed_vocabulary")
        idf = data_json.get("idf")
        if vocabulary is not None and fixed_vocabulary is not None and idf is not None:
            self._vectorizer = TfidfVectorizer(
                analyzer="word",
                stop_words="english",
                lowercase=True,
                max_features=self._max_features,
            )
            self._vectorizer.vocabulary_ = vocabulary
            self._vectorizer.fixed_vocabulary_ = fixed_vocabulary
            self._vectorizer.idf_ = idf
Example no. 9
def load_dataset(verbose=False, remove=()):
    """Load and vectorize the 20 newsgroups dataset."""

    data_train = fetch_20newsgroups(
        subset="train",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    data_test = fetch_20newsgroups(
        subset="test",
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data using a sparse vectorizer
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 min_df=5,
                                 stop_words="english")
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:

        # compute size of loaded data
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(f"{len(data_train.data)} documents - "
              f"{data_train_size_mb:.2f}MB (training set)")
        print(
            f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)"
        )
        print(f"{len(target_names)} categories")
        print(f"vectorize training done in {duration_train:.3f}s "
              f"at {data_train_size_mb / duration_train:.3f}MB/s")
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(f"vectorize testing done in {duration_test:.3f}s "
              f"at {data_test_size_mb / duration_test:.3f}MB/s")
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names
Example no. 10
# computing the cosine similarity between the sentences in the array
cosine_similarity_matrix = cosine_similarity(array)
results = create_dataframe(cosine_similarity_matrix, corpus)
results

"""**a)** Representação vetorial TF-IDF com similaridade do cosseno

---


"""

# creating a TF-IDF vectorizer
cv_tf = TfidfVectorizer(stop_words='english')
array_tf = cv_tf.fit_transform(corpus).toarray()
pd.DataFrame(array_tf, columns=cv_tf.get_feature_names_out())

# computing the cosine similarity between the sentences in the array
cosine_similarity_matrix = cosine_similarity(array_tf)
results = create_dataframe(cosine_similarity_matrix, corpus)
results

"""**Questão 2**: Elabore um problema de classificação binária de textos coerente com sua base.

**a)** Determine o rótulo dos documentos (separando os documentos em classes bem definidas)

---

*1.   Com base na classe mais frequente, determinar se o tweet é sobre esse desastre ou não.*
"""
Example no. 11
def generate_counts(text_df,
                    text_column="abstract",
                    tfidf=True,
                    min_df=50,
                    max_df=0.5):
    """Generate tf-idf weights for unigrams/bigrams derived from textual data.

    Parameters
    ----------
    text_df : (D x 2) :obj:`pandas.DataFrame`
        A DataFrame with an 'id' column and a text column (named by
        ``text_column``). D = document.
    text_column : :obj:`str`, optional
        Name of the column in ``text_df`` containing the text to vectorize.
        Default is "abstract".
    tfidf : :obj:`bool`, optional
        If True, return tf-idf weights; if False, return raw term counts.
        Default is True.
    min_df, max_df : :obj:`int` or :obj:`float`, optional
        Minimum and maximum document-frequency thresholds passed to the
        vectorizer. Defaults are 50 and 0.5.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is 'id' and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
    """
    if text_column not in text_df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")

    # Remove rows with empty text cells
    orig_ids = text_df["id"].tolist()
    text_df = text_df.fillna("")
    keep_ids = text_df.loc[text_df[text_column] != "", "id"]
    text_df = text_df.loc[text_df["id"].isin(keep_ids)]

    if len(keep_ids) != len(orig_ids):
        LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies")

    ids = text_df["id"].tolist()
    text = text_df[text_column].tolist()
    stoplist = op.join(get_resource_path(), "neurosynth_stoplist.txt")
    with open(stoplist, "r") as fo:
        stop_words = fo.read().splitlines()

    if tfidf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    else:
        vectorizer = CountVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=None,
            stop_words=stop_words,
        )
    weights = vectorizer.fit_transform(text).toarray()

    if hasattr(vectorizer, "get_feature_names_out"):
        # scikit-learn >= 1.0.0
        names = vectorizer.get_feature_names_out()
    else:
        # scikit-learn < 1.0.0
        # To remove when we drop support for 3.6 and increase minimum sklearn version to 1.0.0.
        names = vectorizer.get_feature_names()

    names = [str(name) for name in names]
    weights_df = pd.DataFrame(weights, columns=names, index=ids)
    weights_df.index.name = "id"
    return weights_df
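
For reference, a self-contained sketch of the same weighting step outside the package (the stoplist file, LGR logger, and get_resource_path helper above belong to the surrounding module; the toy abstracts and relaxed min_df/max_df thresholds below are invented for the example): TF-IDF weights over unigrams/bigrams, returned as a DataFrame indexed by 'id'.

# Standalone sketch of the weighting performed by generate_counts, without
# the package-specific stoplist; thresholds are relaxed for the tiny corpus.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

text_df = pd.DataFrame({
    "id": ["study-1", "study-2", "study-3"],
    "abstract": [
        "working memory task with fMRI",
        "reward learning and decision making",
        "memory encoding during decision making",
    ],
})
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=1.0)
weights = vectorizer.fit_transform(text_df["abstract"]).toarray()
weights_df = pd.DataFrame(
    weights,
    columns=vectorizer.get_feature_names_out(),
    index=text_df["id"],
)
weights_df.index.name = "id"
print(weights_df.shape)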
Example no. 12
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True)

model = TPBG(
    k,
    alpha=0.05,
    beta=0.0001,
    local_max_itr=10,
    global_max_itr=10,
    local_threshold=1e-6,
    global_threshold=1e-6,
    save_interval=-1,
    feature_names=vect.get_feature_names_out(),
    silence=False,
)

# randomly select one class and n_pos labeled examples
choosed_cls = list(set(y_train))[randint(0, n_class - 1)]
selected_idx = np.random.choice(np.where(y_train == choosed_cls)[0],
                                size=n_pos,
                                replace=False)

# mark all the remaining examples with -1
y_train[[i for i in range(len(y_train)) if i not in selected_idx]] = -1

# train the model
model.fit(X_train, y_train)
Example no. 13
def main():
    args = sys.argv
    # param = Params(args[0])
    # csv_file = "/home/thiagodepaulo/exp/text-collections/Sequence_of_words_CSV/CSTR.csv"
    # n_pos = 5
    # k = 4
    # local_itr = 10
    # global_itr = 10
    # alpha = 0.05
    # beta = 0.0001
    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    local_itr = float(args[4])
    global_itr = float(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    target_name = list(set(y))
    n_class = len(target_name)
    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # randomly select one class and n_pos labeled examples
    choosed_cls = target_name[randint(0, n_class - 1)]
    selected_idx = np.random.choice(np.where(y == choosed_cls)[0],
                                    size=n_pos,
                                    replace=False)

    # mark all the remaining examples with -1
    y_train = np.copy(y)
    y_train[[i for i in range(len(y)) if i not in selected_idx]] = -1

    X_test, y_test = remove_rows(X, y, selected_idx)

    def eval_func(model):
        y_predict = model.predict(X_test)
        y_predict = [1 if c == choosed_cls else 0 for c in y_predict]
        y_test2 = [1 if c == choosed_cls else 0 for c in y_test]

        # compute the metrics
        labels = [0, 1]
        names = ["others", choosed_cls]
        report = classification_report(y_test2,
                                       y_predict,
                                       labels=labels,
                                       target_names=names)
        print('\n' + report + '\n')

    # attach the evaluation function
    model.eval_func = eval_func

    # train the model
    model.fit(X, y_train)
Example no. 14
def KMeansClassifier(text):
    ## Cyber attacks are divided into eight categories
    ## Marketplaces are an additional point of interest

    ## Data collected per category from files
    malware = open('src\\main\\resources\\static\\cyber_threats\\malware.txt',
                   'r').read()
    phishing = open(
        'src\\main\\resources\\static\\cyber_threats\\phishing.txt',
        'r').read()
    MITM = open('src\\main\\resources\\static\\cyber_threats\\MITM.txt',
                'r').read()
    DoS = open('src\\main\\resources\\static\\cyber_threats\\DoS.txt',
               'r').read()
    SQL_injection = open(
        'src\\main\\resources\\static\\cyber_threats\\SQL_injection.txt',
        'r').read()
    zero_day = open(
        'src\\main\\resources\\static\\cyber_threats\\zero_day.txt',
        'r').read()
    XSS = open('src\\main\\resources\\static\\cyber_threats\\XSS.txt',
               'r').read()
    credential_reuse = open(
        'src\\main\\resources\\static\\cyber_threats\\credential_reuse.txt',
        'r').read()
    carding = open('src\\main\\resources\\static\\cyber_threats\\carding.txt',
                   'r').read()
    marketplace = open(
        'src\\main\\resources\\static\\cyber_threats\\marketplace.txt',
        'r').read()

    ## Unify all texts
    document = [
        malware, phishing, MITM, DoS, SQL_injection, zero_day, XSS,
        credential_reuse, carding, marketplace
    ]

    ## TF-IDF on texts
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(document)

    ## Apply the K-Means algorithm on data
    true_k = 10
    model = KMeans(n_clusters=true_k,
                   init='k-means++',
                   max_iter=100,
                   n_init=1,
                   random_state=3425)
    model.fit(X)

    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()

    # Build a list of the top 15 centroid terms for each cluster
    marketplace = list()
    for ind in order_centroids[0, :15]:
        marketplace.append(terms[ind])

    MITM = list()
    for ind in order_centroids[1, :15]:
        MITM.append(terms[ind])

    malware = list()
    for ind in order_centroids[2, :15]:
        malware.append(terms[ind])

    SQL_injection = list()
    for ind in order_centroids[3, :15]:
        SQL_injection.append(terms[ind])

    carding = list()
    for ind in order_centroids[4, :15]:
        carding.append(terms[ind])

    credential_reuse = list()
    for ind in order_centroids[5, :15]:
        credential_reuse.append(terms[ind])

    zero_day = list()
    for ind in order_centroids[6, :15]:
        zero_day.append(terms[ind])

    XSS = list()
    for ind in order_centroids[7, :15]:
        XSS.append(terms[ind])

    phishing = list()
    for ind in order_centroids[8, :15]:
        phishing.append(terms[ind])

    DoS = list()
    for ind in order_centroids[9, :15]:
        DoS.append(terms[ind])

    text = text.lower()

    if len(keywords_in_text(malware, text)) > 2:
        return "Malware"
    elif len(keywords_in_text(phishing, text)) > 2:
        return "Phishing"
    elif len(keywords_in_text(MITM, text)) > 2:
        return "MITM"
    elif len(keywords_in_text(DoS, text)) > 2:
        return "DoS"
    elif len(keywords_in_text(SQL_injection, text)) > 2:
        return "SQL Injection"
    elif len(keywords_in_text(zero_day, text)) > 2:
        return "Zero Day"
    elif len(keywords_in_text(XSS, text)) > 2:
        return "XSS"
    elif len(keywords_in_text(credential_reuse, text)) > 2:
        return "Credential Reuse"
    elif len(keywords_in_text(carding, text)) > 2:
        return "Carding"
    elif len(keywords_in_text(marketplace, text)) > 2:
        return "Marketplace"
    else:
        return "Undefined"