# StackOverflow text-classification pipeline: load the dataset, binarize the
# labels, select discriminative terms via chi-square, build embedding vectors,
# and set up 5-fold cross-validation for the neural classifiers.
from Tools.getStackOverflow import getStackOverflow
from random import sample

SOvrflow = getStackOverflow("/home/sounak/Datasets/StackOverflow-Dataset/")
data = SOvrflow.getData()
labels = SOvrflow.getTarget()

## Binarize Labels ##
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
labels = lb.fit_transform(labels)
print("Label dimension : ", labels.shape)

# Chi-square feature selection: keep the 1500 terms most correlated with the labels.
from Tools.Feature_Extraction import chisqure

selected_terms = chisqure(data, labels, feature_count=1500)

## Process Dataset ##
# NOTE(review): get_Embeddings is not imported or defined in this chunk —
# presumably brought into scope elsewhere in the file; verify.
data_vectors, embeddings, maxSize, embedding_vocab = get_Embeddings(data, selected_terms)

# ------------------------------- Classification -------------------------------
# Running totals for recall / precision / F1 accumulated across the CV folds.
totrec = 0.0
totprec = 0.0
totF1 = 0.0

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

from Tools.Classifier import CNN_Classifier, RNN_Classifier, Nested_CNN_Classifier
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id] # Transform multilabel labels ## from sklearn.preprocessing import MultiLabelBinarizer mlb = MultiLabelBinarizer() train_labels = mlb.fit_transform( [reuters.categories(doc_id) for doc_id in train_docs_id]) print("Label dimention : ", train_labels.shape) test_labels = mlb.transform( [reuters.categories(doc_id) for doc_id in test_docs_id]) ## Process Dataset ## from Tools import Utils train_docs, test_docs = Utils.preprocess(train_docs, test_docs) from Tools.Feature_Extraction import chisqure selected_terms = chisqure(train_docs, train_labels, feature_count=500) # print(len(train_docs), " ; ", len(test_docs)) train_doc_vectors, test_doc_vectors, embeddings, maxSize, embedding_vocab = get_Embeddings( dataset, train_docs, test_docs, selected_terms) # print("Doc Vector : ", train_doc_vectors[10]) #-------------------------------------------Classification------------------------------------------- from Tools.Classifier import HNN_RR_Classifier, HNN_CR_Classifier, HNN_RC_Classifier, KerasBlog_CNN_Classifier # classifier = HNN_RC_Classifier(RNN_output_size=256, filter_sizes=[3,7], filter_counts=[150,300], pool_windows=[6,21], learning_rate=0.001, batch_size=7, num_epochs=100) # new = classifier.predict(np.array(train_doc_vectors), train_labels, np.array(test_doc_vectors), test_labels, embeddings, maxSize[0], maxSize[1], train_labels.shape[1]) classifier = KerasBlog_CNN_Classifier(filter_sizes=[5, 5], filter_counts=[300, 300], pool_windows=[2, 2],