Beispiel #1
0
from Tools.getStackOverflow import getStackOverflow
from random import sample

# Load the StackOverflow dataset through the project's getStackOverflow helper.
# NOTE(review): the dataset location is a hard-coded, machine-specific path;
# the commented-out call below appears to be the same dataset at a different
# local path -- confirm before removing it.
# SOvrflow = getStackOverflow("/Volumes/Files/Work/Research/Information Retrieval/1) Data/StackOverflow-Dataset/")
SOvrflow = getStackOverflow("/home/sounak/Datasets/StackOverflow-Dataset/")
# presumably the raw documents -- TODO confirm against Tools.getStackOverflow
data = SOvrflow.getData()
# presumably the class label of each document -- TODO confirm
labels = SOvrflow.getTarget()

## Binarize labels ##
# Replace the raw labels with the binary indicator matrix produced by
# scikit-learn's LabelBinarizer (one-vs-all encoding). The fitted binarizer is
# kept in `lb`, which exposes inverse_transform for mapping back to the
# original label values.
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
# Report the shape of the encoded label matrix.
print("Label dimention : ", labels.shape)

# Term selection with the project's `chisqure` helper.
# presumably chi-square feature selection that keeps `feature_count` (1500)
# terms -- TODO confirm against Tools.Feature_Extraction.
from Tools.Feature_Extraction import chisqure
selected_terms = chisqure(data, labels, feature_count = 1500)

## Process dataset ##
# Build the model inputs from the documents and the selected terms.
# get_Embeddings returns four values, unpacked here as the document vectors,
# the embeddings, a size descriptor (`maxSize`) and the embedding vocabulary;
# their exact types/shapes are not visible from this file section.
# NOTE(review): get_Embeddings is not imported in the visible part of the
# file -- confirm it is defined or imported elsewhere.
data_vectors, embeddings, maxSize, embedding_vocab = get_Embeddings(data, selected_terms)


#-------------------------------------------Classification-------------------------------------------

# Floating-point accumulators initialised to zero.
# presumably running totals of recall, precision and F1 over the
# cross-validation folds -- the code that updates them is not visible here.
totrec = 0.0
totprec = 0.0
totF1 = 0.0

# 5-fold cross-validation splitter. KFold does not shuffle by default, so the
# folds are consecutive slices of the data in its current order.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
# Project classifier classes; none of them is instantiated in the visible lines.
from Tools.Classifier import CNN_Classifier, RNN_Classifier, Nested_CNN_Classifier
Beispiel #2
0
# Fetch the raw text of every test document by id.
# NOTE(review): `reuters`, `test_docs_id` and `train_docs_id` are defined
# outside the visible part of the file; `reuters` looks like a corpus reader
# exposing raw()/categories() (presumably nltk.corpus.reuters) -- confirm.
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

## Transform multi-label targets ##
# Each document can carry several categories, so the targets are encoded as a
# binary indicator matrix with one column per category.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# Fit on the training categories only; this fixes the set of columns.
train_labels = mlb.fit_transform(
    [reuters.categories(doc_id) for doc_id in train_docs_id])
# Report the shape of the encoded training label matrix.
print("Label dimention : ", train_labels.shape)
# Encode the test categories with the already-fitted binarizer so the test
# matrix uses the same columns as the training matrix.
test_labels = mlb.transform(
    [reuters.categories(doc_id) for doc_id in test_docs_id])

## Process dataset ##
# Run both document collections through the project's preprocessing helper;
# the processed results replace the original train/test documents.
from Tools import Utils
train_docs, test_docs = Utils.preprocess(train_docs, test_docs)
# Term selection is driven by the training documents and labels only.
# presumably chi-square feature selection that keeps `feature_count` (500)
# terms -- TODO confirm against Tools.Feature_Extraction.
from Tools.Feature_Extraction import chisqure
selected_terms = chisqure(train_docs, train_labels, feature_count=500)
# Debug output, disabled:
# print(len(train_docs), " ; ", len(test_docs))
# Build the model inputs. get_Embeddings returns five values, unpacked here as
# the train/test document vectors, the embeddings, a size descriptor
# (`maxSize`) and the embedding vocabulary; their exact types/shapes are not
# visible from this file section.
# NOTE(review): neither `get_Embeddings` nor `dataset` is defined in the
# visible part of the file -- confirm both exist earlier in the script.
train_doc_vectors, test_doc_vectors, embeddings, maxSize, embedding_vocab = get_Embeddings(
    dataset, train_docs, test_docs, selected_terms)
# Debug output, disabled:
# print("Doc Vector : ", train_doc_vectors[10])

#-------------------------------------------Classification-------------------------------------------

from Tools.Classifier import HNN_RR_Classifier, HNN_CR_Classifier, HNN_RC_Classifier, KerasBlog_CNN_Classifier

# classifier = HNN_RC_Classifier(RNN_output_size=256, filter_sizes=[3,7], filter_counts=[150,300], pool_windows=[6,21], learning_rate=0.001, batch_size=7, num_epochs=100)
# new = classifier.predict(np.array(train_doc_vectors), train_labels, np.array(test_doc_vectors), test_labels, embeddings, maxSize[0], maxSize[1], train_labels.shape[1])

classifier = KerasBlog_CNN_Classifier(filter_sizes=[5, 5],
                                      filter_counts=[300, 300],
                                      pool_windows=[2, 2],