Ejemplo n.º 1
0
def training():
    """Train the model one epoch at a time and checkpoint every tenth epoch.

    Configuration is read from module-level names (``input_dim``,
    ``window_size``, ``predict_days``, ``data_frequency``, ``total_epochs``,
    ``batchSize``).  Checkpoints and the pickled list of per-epoch losses are
    written into a directory whose name encodes that configuration.
    """
    # get data && model
    [x_train, y_train], s = getData(input_dim, window_size, predict_days,
                                    data_frequency, "train", False)
    model = get_model(x_train)
    loss = []

    # Fit && save model/history
    path = f"./model/epoch_{total_epochs},dim_{input_dim},win_{window_size},freq_{data_frequency}/"
    # makedirs also creates a missing "./model" parent.  The mode must be an
    # octal literal: decimal 755 is 0o1363, which is not rwxr-xr-x.
    os.makedirs(path, mode=0o755, exist_ok=True)

    #TODO: if there's existing model, pick up and keep training

    # Visualize the model
    print(model.summary())

    # train: one fit() call per epoch so the loss history can be accumulated
    # and intermediate checkpoints written.
    for i in range(total_epochs):
        print(f'epoch: {i + 1}/{total_epochs}')
        history = model.fit(x_train, y_train, epochs=1, batch_size=batchSize)
        loss += history.history['loss']
        if (i + 1) % 10 == 0:
            # The file name carries the zero-based epoch index (epoch_9 is
            # written after the tenth epoch).
            model.save(f'{path}epoch_{i}.h5')

    # save the loss curve next to the checkpoints
    with open(f'{path}loss', 'wb') as fp:
        pickle.dump(loss, fp)
Ejemplo n.º 2
0
def training():
    """Train the classification model, saving a checkpoint after every epoch.

    Configuration is read from module-level names (``input_dim``,
    ``window_size``, ``predict_days``, ``total_epochs``, ``batchSize``).
    Checkpoints and the pickled list of per-epoch losses are written into a
    directory under ``./model/class/`` whose name encodes that configuration.
    """
    # get data && model
    [x_train, y_train], s = getData(input_dim, window_size, predict_days,
                                    "train", True)
    model = get_model(x_train)
    loss = []

    # Fit && save model/history
    path = f"./model/class/epoch_{total_epochs},dim_{input_dim},win_{window_size}/"
    # makedirs also creates the missing "./model/class" parents.  The mode
    # must be an octal literal: decimal 755 is 0o1363, which is not rwxr-xr-x.
    os.makedirs(path, mode=0o755, exist_ok=True)

    # Visualize the model
    #     print(model.summary())

    # train: one fit() call per epoch so every epoch gets its own checkpoint
    for i in range(total_epochs):
        print(f'epoch: {i + 1}/{total_epochs}')
        history = model.fit(x_train, y_train, epochs=1, batch_size=batchSize)
        # Checkpoint names use the zero-based epoch index.
        model.save(f'{path}epoch_{i}.h5')
        loss += history.history['loss']

    # save the loss curve next to the checkpoints
    with open(f'{path}loss', 'wb') as fp:
        pickle.dump(loss, fp)
Ejemplo n.º 3
0
def predict(epoch=300, dim=4, win=49, pred=20, freq=5):
    """Run a saved model on the test split and return rescaled predictions.

    The model file is looked up under ``./model/draw/`` using a name built
    from ``epoch``, ``dim``, ``win`` and ``freq``.

    Returns:
        The string ``"no such model!"`` when that file does not exist;
        otherwise the first row produced by ``scaler_list[0].inverse_transform``
        applied to the first output column of the model.
    """
    # Load ground truth and test data (the ground truth is not used further
    # in this function).
    real_stock_price = getGT(pred, freq)
    (x_test, y_test), scaler_list = getData(dim, win, pred, freq, "test")

    # Reset the backend session before loading a model.
    K.clear_session()
    model_name = f"./model/draw/epoch_{epoch},dim_{dim},win_{win},freq_{freq}.h5"
    if os.path.isfile(model_name):
        network = load_model(model_name)
        raw_output = network.predict(x_test)

        # Keep only the first value of every output row.
        first_column = [row[0] for row in raw_output]

        # Undo the scaling on a single-row matrix and hand back that row.
        as_single_row = np.reshape(first_column, (1, -1))
        rescaled = scaler_list[0].inverse_transform(as_single_row)
        return rescaled[0]

    return "no such model!"
Ejemplo n.º 4
0
def run(num_feature, isCF, isDF, isCustom):
    """Train the multi-class LR model and write dev/test predictions.

    Reads the train, dev and test texts, preprocesses them into
    bag-of-words form, builds the data matrices with either
    ``preprocess.getData_custom`` (when ``isCustom`` is truthy) or
    ``preprocess.getData``, trains with ``multiLR.train`` and writes the hard
    and soft predictions for the dev and test sets through ``iohelper``.
    """

    def _clean_and_bag(raw_texts):
        # Two passes, as before: preprocess every text, then convert each to BOW.
        cleaned = [preprocess.preprocess(doc, stop_words) for doc in raw_texts]
        return [preprocess.toBOW(doc) for doc in cleaned]

    lst_train_text, lst_train_stars = iohelper.readTrain()
    lst_dev_text = iohelper.readDev()
    lst_test_text = iohelper.readTest()
    stop_words = iohelper.readStopWords()

    lst_train_BOW = _clean_and_bag(lst_train_text)
    lst_dev_BOW = _clean_and_bag(lst_dev_text)
    lst_test_BOW = _clean_and_bag(lst_test_text)
    print("PREPROCESS FINISHED!")

    # Pick the matrix builder once, then call it with the shared arguments.
    build_matrices = preprocess.getData_custom if isCustom else preprocess.getData
    train_data, dev_data, test_data = build_matrices(
        lst_train_BOW, lst_dev_BOW, lst_test_BOW, num_feature, isCF, isDF)
    print("DATA MATRIX GENERATED!")

    W = multiLR.train(train_data, lst_train_stars)

    print("START PREDICT ON DEVELOPMENT DATA!")
    dev_hard, dev_soft = multiLR.pred(dev_data, W)
    iohelper.writeDevPred(dev_hard, dev_soft)

    print("START PREDICT ON TEST DATA!")
    test_hard, test_soft = multiLR.pred(test_data, W)
    iohelper.writeTestPred(test_hard, test_soft)
Ejemplo n.º 5
0
def genData():
    """Export the train and dev matrices to ``../data/svm_train`` / ``svm_dev``.

    Reads the train and dev texts, preprocesses them into bag-of-words form,
    builds the data matrices with ``preprocess.getData`` (2000 features,
    ``isCF=False``, ``isDF=True``, empty test set) and writes them out with
    ``writeTrain`` and ``writeTest``.
    """

    def _clean_and_bag(raw_texts):
        # Two passes, as before: preprocess every text, then convert each to BOW.
        cleaned = [preprocess.preprocess(doc, stop_words) for doc in raw_texts]
        return [preprocess.toBOW(doc) for doc in cleaned]

    lst_train_text, lst_train_stars = iohelper.readTrain()
    lst_dev_text = iohelper.readDev()
    stop_words = iohelper.readStopWords()

    lst_train_BOW = _clean_and_bag(lst_train_text)
    lst_dev_BOW = _clean_and_bag(lst_dev_text)
    print("PREPROCESS FINISHED!")

    # No test split is needed here, so an empty list is passed and the third
    # result is discarded.
    matrices = preprocess.getData(lst_train_BOW, lst_dev_BOW, [], 2000,
                                  isCF=False, isDF=True)
    train_data, dev_data, _ = matrices
    print("DATA MATRIX GENERATED!")

    writeTrain(train_data, lst_train_stars, "../data/svm_train")
    writeTest(dev_data, "../data/svm_dev")
Ejemplo n.º 6
0
import pandas as pd
import matplotlib.pyplot as plt

from keras import backend as K
from keras.models import load_model
from preprocess import getData

## Variable
# Script configuration; total_epochs, input_dim and window_size also select
# the checkpoint directory built below.
MSE = []
total_epochs = 5
input_dim = 4
window_size = 60
predict_days = 20

# Get data
# getData is asked for the "test" split; the trailing True is passed through
# unchanged (NOTE(review): presumably a classification flag -- confirm in
# preprocess.getData).
[x_test, y_test], scaler_list = getData(input_dim, window_size, predict_days,
                                        "test", True)

## Model predict
# One checkpoint per epoch is expected in this directory (epoch_0.h5 ...).
path = f"./model/class/epoch_{total_epochs},dim_{input_dim},win_{window_size}/"
for i in range(total_epochs):
    # NOTE(review): `time` is not among the imports visible in this snippet --
    # confirm it is imported.
    start = time.time()
    # Reset the backend session before loading the next checkpoint.
    K.clear_session()

    # load model
    model = load_model(f'{path}epoch_{i}.h5')
    print(f'read model: epoch_{i}.h5')
    output = model.predict(x_test)

    acc = 0
    total_guess = predict_days
    # The body of this loop lies beyond the end of this excerpt.
    for j in range(len(output)):
Ejemplo n.º 7
0
def main():
    """Train a sentence-pair classifier and checkpoint the best dev epoch.

    Parses the command line (device id, data and GloVe directories, optional
    pickled vocabulary/vectors, model name, ``-t`` smoke-test switch), builds
    or loads the vocabulary and word vectors, holds out 10% of the data for
    validation and trains for a fixed number of epochs.  Whenever validation
    accuracy improves, the model is saved in the current working directory as
    ``<model>.<epoch>``.  Progress is logged to ``train.log``.
    """
    out_dir = os.getcwd()
    parser = argparse.ArgumentParser(
        description='Get Training path and Glove path')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--data', '-d', default=out_dir, help='data path')
    parser.add_argument('--glove',
                        '-w',
                        default=out_dir,
                        help='File to read glove vectors')
    parser.add_argument('-t',
                        action='store_true',
                        default=False,
                        dest='testing')
    parser.add_argument('--setvocab',
                        '-v',
                        default=None,
                        help="Providing already setup vocab")
    parser.add_argument('--setvectors', '-c', default=None)
    parser.add_argument('--model', '-m', default='NTIFullTreeMatching')
    args = parser.parse_args()
    gpu = args.gpu
    train_path = os.path.join(args.data, 'sts2016_train.stasis.csv')
    glove_path = os.path.join(args.glove, 'glove.840B.300d.txt')
    vocab_path = args.setvocab
    vectors_path = args.setvectors
    model_name = args.model
    logging_file = "train.log"
    FORMAT = "%(asctime)-15s %(message)s"
    if logging_file is None:
        logging.basicConfig(format=FORMAT, level=logging.DEBUG)
    else:
        logging.basicConfig(filename=logging_file,
                            format=FORMAT,
                            level=logging.DEBUG)
    logging.info("The output directory is %s", out_dir)
    logging.info("The glove path is %s", glove_path)
    logging.info("The data path is %s", train_path)
    if gpu < 0:
        logging.info("The program is running for CPU")
    else:
        logging.info("The program is running for GPU")

    n_epoch = 40  # number of epochs
    n_units = 300  # number of units per layer
    batch_size = 32  # minibatch size
    eval_batch = 64
    max_dev = 0
    max_tr = 0
    max_test = 0  # never updated below: no test split is evaluated
    max_epch = 0

    # Unused afterwards, but kept: the draw advances NumPy's global RNG, so
    # removing it would change the shuffles of a seeded run.
    EMPTY = np.random.uniform(-0.1, 0.1, (1, 300)).astype(np.float32)
    #preprocessing
    vocab = None
    vectors = None
    data = preprocess.getData(train_path)
    # NOTE(review): pickle.load executes code embedded in the file; only pass
    # vocabulary/vector pickles from a trusted source.
    # Pickles are opened in binary mode and closed deterministically.
    if vocab_path is not None:
        with open(vocab_path, "rb") as vocab_file:
            vocab = pickle.load(vocab_file)
        logging.info("Vocabulary loaded")
    else:
        vocab, frequency_dictionary = preprocess.create_vocab(data)
        with open(os.path.join(out_dir, "train_vocab.pkl"), "wb") as vocab_file:
            pickle.dump(vocab, vocab_file)
        logging.info("Dumping the training vocabulary in %s",
                     out_dir + "/train_vocab.pkl")
        logging.info("Vocabulary created")
    if vectors_path is None:
        vectors = preprocess.get_vectors(vocab, glove_path, out_dir)
        logging.info("Vectors formed")
    else:
        with open(vectors_path, "rb") as vectors_file:
            vectors = pickle.load(vectors_file)
        logging.info("Vectors loaded")
    train_data, validate_data = train_test_split(data, test_size=0.1)
    # `.values` yields the same array as the removed DataFrame.as_matrix().
    train_data = train_data.values
    validate_data = validate_data.values
    train_dataset, train_labels = preprocess.prepare_sentence_data(
        train_data, vectors)
    dataset, labels = preprocess.prepare_sentence_data(validate_data, vectors)
    validation_dataset, validation_labels = dataset, labels
    if args.testing:
        # Smoke test: keep only the first 100 examples of each split.
        logging.info("Just Testing!")
        train_dataset, train_labels = train_dataset[0:100], train_labels[0:100]
        validation_dataset, validation_labels = dataset[0:100], labels[0:100]

    logging.info("The training size is %d", len(train_labels))
    logging.info("The validation size is %d", len(validation_labels))
    model = None
    if model_name == "BILSTM":
        model = BILSTM(n_units, gpu)
    else:
        model = NTIFullTreeMatching(n_units, gpu)
    model.init_optimizer()
    n_train = len(train_labels)
    n_dev = len(validation_labels)
    logging.debug("Training Begins")

    #training code
    # six.moves.range is used for every loop so the function does not depend
    # on the Python-2-only xrange.
    for i in six.moves.range(0, n_epoch):
        logging.debug("epoch={}".format(i))
        #Shuffle the data
        shuffle = np.random.permutation(n_train)
        preds = []
        preds_true = []
        aLoss = 0
        ss = 0
        begin_time = time.time()
        for j in six.moves.range(0, n_train, batch_size):
            c_b = shuffle[j:min(j + batch_size, n_train)]
            ys = preprocess.batch(train_labels, c_b)
            preds_true.extend(ys)
            y_data = np.array(ys, dtype=np.int32)
            sent_batch = preprocess.batch(train_dataset, c_b)
            sent_batch = preprocess.stack_pairs(sent_batch)
            y_s, loss = model.train(sent_batch, y_data)
            aLoss = aLoss + loss.data
            preds.extend(y_s)
            ss = ss + 1
        logging.debug("loss:%f", aLoss / ss)
        logging.debug('secs per train epoch={}'.format(time.time() -
                                                       begin_time))
        f1_tr = accuracy_score(preds_true, preds)
        logging.debug('train accuracy_score={}'.format(f1_tr))
        logging.debug(confusion_matrix(preds_true, preds))

        # Validation pass over the held-out split, in order.
        preds = []
        preds_true = []
        for j in six.moves.range(0, n_dev, eval_batch):
            ys = validation_labels[j:j + eval_batch]
            preds_true.extend(ys)
            y_data = np.array(ys, dtype=np.int32)
            sent_batch = validation_dataset[j:j + eval_batch]
            sent_batch = preprocess.stack_pairs(sent_batch)
            y_s = model.predict(sent_batch)
            preds.extend(y_s)
        f1_dev = accuracy_score(preds_true, preds)
        logging.debug('dev accuracy_score={}'.format(f1_dev))
        logging.debug(confusion_matrix(preds_true, preds))
        if f1_dev > max_dev:
            max_dev = f1_dev
            max_tr = f1_tr
            max_epch = i
            logging.info('saving model')
            model.save(out_dir + '/' + model_name + '.' + str(i))
        # The scores are fractions in [0, 1]; "%d" truncated them to 0, so
        # they are formatted as floats.  (The values come from accuracy_score
        # even though the message calls them f1-score.)
        logging.info(
            "best results so far (dev): epoch=%d  dev f1-score=%f  test f1-score=%f",
            max_epch, max_dev, max_test)
Ejemplo n.º 8
0
# TODO: Have more sophisticated argument parser
conf_file = sys.argv[1]

# Build the configuration object; parameters and the db connection are
# reached through it from here on.
params = Conf(conf_file)

# Fall back to every subreddit when no topics were configured.
# Each document should be one line (submission + comments)
topics = params.topics if params.topics else params.db.subreddit_list()

# Fetch the documents in Reddit format
print("Retrieving documents...")
reddit_documents = preprocess.getData(
    topics,
    params.comment_level,
    params.num_docs,
    params.db,
)
print(reddit_documents)

# Preprocess them; the documents are printed again afterwards
print("Preprocessing documents...")
preprocess.preprocess(
    reddit_documents,
    params.max_word_length,
    params.min_word_length,
    params.stopwords,
    params.stem,
)
print(reddit_documents)
# TODO: removal_threshold and removal_perc

# Split into train and test
print("Splitting into train and test sets...")
train, test = utils.partition(reddit_documents, 0.9)

# Persist the parameter configuration as metadata in the db
print("Saving metadata to mongodb...")
metadata = utils.createMetaData(params)
result = params.db.add_metadata(metadata)
Ejemplo n.º 9
0
def testClassification():
    """Report the unlabelled reports closest to each SVM decision boundary.

    For every entry of ``REPORT_FILES_LABELLED`` a linear SVM is fitted on the
    labelled LSI vectors (after trimming surplus "negative" examples) and
    applied to the unlabelled vectors of the same report file.  Every
    unlabelled report whose absolute decision score is below the threshold is
    written -- corpus index, score, predicted label, then the report text --
    to ``labelClassification.csv`` in a timestamped ``label_tests/`` directory.
    """
    threshold = 0.001
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    #convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [
        list(x) for x in zip(*gensim.matutils.corpus2dense(
            corpus, corpus.num_terms, dtype=np.float64))
    ]
    reports = preprocess.getReports()

    # Create the output directory; the name has minute resolution, so a
    # second run within the same minute must not fail on an existing one.
    directory = "label_tests/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # The labelled data is at the start of each file's slice of the
            # corpus.  The offsets are computed once instead of once per
            # range bound.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelled_end = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            last = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(first, labelled_end)]
            unlabelledCorpus = [corpusList[i] for i in range(labelled_end, last)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # Drop the earliest (len - 2 * positives) negatives.  The surplus
            # is clamped at zero: the previous counting loop never hit its
            # stop condition for a non-positive surplus and deleted every
            # negative example.
            positives = list(labels).count("positive")
            surplus = max(0, len(labels) - 2 * positives)
            negatives = [
                idx for idx, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort by score (then label, then vector), ascending
            ranked = sorted(
                zip(output_scores_test, output_test, unlabelledCorpus))

            # save the near-boundary results to file
            for score, predicted, vector in ranked:
                if abs(score) < threshold:
                    # Map the vector back to its position in the full corpus
                    # (first match if several reports share a vector).
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow("")
                    writer.writerow([reportIdx, score, predicted])
                    writer.writerow([reports[reportIdx]])
Ejemplo n.º 10
0
def main():
    """Train and evaluate the age and gender estimators on an 80/20 split.

    Loads ``num_total_data`` rows via ``preprocess.getData``, trains both
    estimators on the first 80% of rows, evaluates them on the remainder and
    prints the evaluation metrics and a confusion matrix for each task.
    Column 0 of a row is used as the feature, column 2 as the age label and
    column 3 as the gender label.
    """
    total_data = preprocess.getData(img_size, num_total_data)
    split_at = int(num_total_data * 0.8)

    # Builtin int replaces np.int below: np.int was only an alias for it,
    # deprecated in NumPy 1.20 and removed in 1.24.

    # first 80% rows
    train_data = total_data[:split_at]
    train_feature = np.array([row[0] for row in train_data]).astype(np.float32)
    train_age_label = np.array([row[2] for row in train_data]).astype(int).flatten()  # age
    train_gender_label = np.array([row[3] for row in train_data]).astype(int).flatten()  # gender

    # the last 20% rows
    eval_data = total_data[split_at:]
    eval_feature = np.array([row[0] for row in eval_data]).astype(np.float32)
    eval_age_label = np.array([row[2] for row in eval_data]).astype(int).flatten()  # age
    eval_gender_label = np.array([row[3] for row in eval_data]).astype(int).flatten()  # gender

    # build the classifier
    age_classifier = tf.estimator.Estimator(neural_network_age_model_fn, model_dir=age_model_dir)
    gender_classifier = tf.estimator.Estimator(neural_network_gender_model_fn, model_dir=gender_model_dir)

    # train the model (num_epochs=None repeats the data; `steps` bounds training)
    age_train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_feature},
        y=train_age_label,
        batch_size=125,
        num_epochs=None,
        shuffle=True)
    age_classifier.train(input_fn=age_train_input_fn,
        steps=num_steps)

    gender_train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_feature},
        y=train_gender_label,
        batch_size=125,
        num_epochs=None,
        shuffle=True)
    gender_classifier.train(input_fn=gender_train_input_fn,
        steps=num_steps)

    # evaluate the model
    age_eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_feature},
        y=eval_age_label,
        num_epochs=1,
        shuffle=False)
    age_eval_results = age_classifier.evaluate(input_fn=age_eval_input_fn)
    print("age evaluation results:" + str(age_eval_results))

    gender_eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_feature},
        y=eval_gender_label,
        num_epochs=1,
        shuffle=False)
    gender_eval_results = gender_classifier.evaluate(input_fn=gender_eval_input_fn)
    print("gender evaluation results:" + str(gender_eval_results))

    # print confusion matrix for age; shuffle=False keeps the predictions
    # aligned with eval_age_label
    age_predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_feature},
        num_epochs=1,
        shuffle=False)
    age_predictions = list(age_classifier.predict(age_predict_input_fn))
    predicted_ages = [p["classes"] for p in age_predictions]
    age_confusion_matrix = tf.confusion_matrix(labels=eval_age_label, predictions=predicted_ages, num_classes=age_num_classes)
    with tf.Session():
        # eval() picks up the session opened by the enclosing `with`.
        print('Age Confusion Matrix: \n\n', age_confusion_matrix.eval())

    # print confusion matrix for gender
    gender_predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_feature},
        num_epochs=1,
        shuffle=False)
    gender_predictions = list(gender_classifier.predict(gender_predict_input_fn))
    predicted_genders = [p["classes"] for p in gender_predictions]
    gender_confusion_matrix = tf.confusion_matrix(labels=eval_gender_label, predictions=predicted_genders, num_classes=gender_num_classes)
    with tf.Session():
        print('Gender Confusion Matrix: \n\n', gender_confusion_matrix.eval())
Ejemplo n.º 11
0
from preprocess import getData
from preprocess import getGT
from keras import backend as K
from keras.models import load_model

## Variable
# Script configuration; these values are passed to getGT/getData below.
MSE = []
total_epochs = 300
input_dim = 4
window_size = 60
predict_days = 20
data_frequency = 1

## Get data
# Ground truth for the predicted window, plus the "test" split and the list
# of scalers returned alongside it.
real_stock_price = getGT(predict_days, data_frequency)
[x_test, y_test], scaler_list = getData(input_dim, window_size, predict_days,
                                        data_frequency, "test")

## Model predict
# load model
# path = f"./model/epoch_{total_epochs},dim_{input_dim},win_{window_size}/"
# The model is read from a fixed file under ./model/draw/; the configuration
# above does not influence which file is loaded.
path = f"./model/draw/"
lstm = load_model(f'{path}000.h5')
label1 = 'test1'
# rnn = load_model(f'{path}50unit.h5')
# label2 = 'test2'
lstm_output = lstm.predict(x_test)
# rnn_output = rnn.predict(x_test)

# get all the close price
lstm_close_price = []
# The body of this loop lies beyond the end of this excerpt.
for j in range(len(lstm_output)):
Ejemplo n.º 12
0
def testClassification(threshold, fileType):
    """Fit a linear SVM for one report type and list near-boundary reports.

    The labelled LSI vectors of ``Cleaned<fileType>Labelled.csv`` (after
    trimming surplus "negative" examples) train the classifier, which is then
    applied to the unlabelled vectors of ``Cleaned<fileType>Full.csv``.
    Unlabelled reports whose absolute decision score is below ``threshold``
    are written to ``labelClassification.csv``, and the accumulated model
    coefficients are written to ``coef.csv``.

    Args:
        threshold: upper bound (exclusive) on ``abs(score)`` for a report to
            be written out.
        fileType: name fragment used to build the report file names and
            passed to ``preprocess.getReports``.
    """
    REPORT_FILES = [('Cleaned' + fileType + 'Full.csv')]
    REPORT_FILES_LABELLED = [('Cleaned' + fileType + 'Labelled.csv')]
    DIAGNOSES = [fileType]

    corpus = gensim.corpora.MmCorpus('../model_files/reports_lsi.mm')
    #convert the corpus to a numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64))]
    reports = preprocess.getReports(fileType)

    with open("labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([DIAGNOSES[j], "", ""])  # Added "" for csv parsing

            # The labelled data is at the start of each file's slice of the
            # corpus.  The offsets are computed once instead of once per
            # range bound.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelled_end = first + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            last = first + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(first, labelled_end)]
            unlabelledCorpus = [corpusList[i] for i in range(labelled_end, last)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # Drop the earliest (len - 2 * positives) negatives.  The surplus
            # is clamped at zero: the previous counting loop never hit its
            # stop condition for a non-positive surplus and deleted every
            # negative example.
            positives = list(labels).count("positive")
            surplus = max(0, len(labels) - 2 * positives)
            negatives = [idx for idx, label in enumerate(labels) if label == "negative"]
            deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # Single-argument parenthesised prints give the same output on
            # Python 2 and also parse on Python 3.
            print("")
            print("Model parameters:")
            print(classifier.coef_)
            print("")
            print("L2 norm of current model: " + str(np.linalg.norm(classifier.coef_)))
            print("")

            # NOTE(review): `parameters` is not defined in this function;
            # presumably a module-level list that accumulates coefficient
            # rows across calls -- confirm.
            for coef_row in classifier.coef_:
                parameters.append(coef_row)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort by score (then label, then vector), ascending
            ranked = sorted(zip(output_scores_test, output_test, unlabelledCorpus))

            # save the near-boundary results to file
            for score, predicted, vector in ranked:
                if abs(score) < threshold:
                    # Map the vector back to its position in the full corpus
                    # (first match if several reports share a vector).
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow([reportIdx, score, predicted])
                    writer.writerow([reports[reportIdx], "", ""])  # Added extra "" to make csv parsing work

    # Write model parameters to file
    with open("coef.csv", 'w') as fout:
        coef_writer = csv.writer(fout)
        for coef_row in parameters:
            coef_writer.writerow(coef_row)

    print("Model parameters saved to file.")
Ejemplo n.º 13
0
def labelClassificationD2V():
    """Evaluate a linear SVM on Doc2Vec vectors of the labelled reports.

    For every entry of ``REPORT_FILES_LABELLED`` a vector is inferred for
    each labelled report, surplus "negative" examples are trimmed, and
    ``numFolds`` random 87/13 train/test splits are run.  Each split adds a
    train and a test ROC curve to one figure per diagnosis (saved into a
    timestamped ``label_classification/`` directory) and appends score,
    predicted label, expected label and report text for every test example
    to ``labelClassification.csv`` in that directory.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits per diagnosis
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled data is at the start of each file's slice of the
            # data set; the offsets are computed once.
            first = preprocess.getNumReports(REPORT_FILES[:j])
            labelled_end = first + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledReports = []
            labelledCorpus = []
            for i in range(first, labelled_end):
                labelledReports.append(reports[i])
                labelledCorpus.append(model.infer_vector(processedReports[i]))
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            # Snapshot taken before trimming, so its indices stay aligned
            # with labelledReports.
            corpusList = [list(x) for x in labelledCorpus]

            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # Drop the earliest (len - 2 * positives) negatives.  The surplus
            # is clamped at zero: the previous counting loop never hit its
            # stop condition for a non-positive surplus and deleted every
            # negative example.
            positives = list(labels).count("positive")
            surplus = max(0, len(labels) - 2 * positives)
            negatives = [
                idx for idx, label in enumerate(labels) if label == "negative"
            ]
            deletes = negatives[:surplus]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            for n in range(0, numFolds):
                # split training and test data
                train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_labelledCorpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_labelledCorpus)
                output_train = classifier.predict(train_labelledCorpus)
                output_scores_test = classifier.decision_function(
                    test_labelledCorpus)
                output_scores_train = classifier.decision_function(
                    train_labelledCorpus)

                # Sort by (score, predicted, expected).  The vectors are kept
                # out of the key: comparing two numpy rows on a full tie
                # raises "truth value of an array is ambiguous".
                sortList = sorted(
                    zip(output_scores_test, output_test, test_labels,
                        test_labelledCorpus),
                    key=lambda item: item[:3])
                output_scores_test, output_test, test_labels, test_labelledCorpus = zip(
                    *sortList)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels,
                                                output_scores_test,
                                                pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels,
                                                  output_scores_train,
                                                  pos_label="positive")

                # Label only the first fold so the legend has one entry per
                # curve type.  The labels now match the data they describe
                # (they used to be swapped).
                plt.plot(fp_test,
                         tp_test,
                         'r',
                         label="test" if n == 0 else "")
                plt.plot(fp_train,
                         tp_train,
                         'b',
                         label="train" if n == 0 else "")
                plt.legend(loc='lower right')
                plt.savefig(directory + name)

                # save result to file
                for r in range(len(test_labels)):
                    # Look the vector up in the untrimmed snapshot to find
                    # the matching report text.
                    reportIdx = corpusList.index(list(test_labelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([
                        output_scores_test[r], output_test[r], test_labels[r]
                    ])
                    writer.writerow([labelledReports[reportIdx]])
        # plt.show()
Ejemplo n.º 14
0
def labelClassificationRNN(learn=True):
    """Fit and evaluate a linear SVM per diagnosis on the RNN report vectors.

    For each entry of REPORT_FILES_LABELLED the labelled report vectors are
    repeatedly split at random into train / cross-validation / test parts and
    a linear SVC is fitted for every candidate C value. Per-report test
    results are appended to labelClassification.csv and the ROC curves of the
    C value with the highest cross-validation AUC are saved as a figure, all
    inside a time-stamped directory under label_classification/.

    Args:
        learn: If True, search a fixed grid of C values and pickle the best
            value per diagnosis to ./model_files/svm_c_values.pkl. If False,
            load the C values from that file and evaluate only those.
    """
    c_values_path = './model_files/svm_c_values.pkl'
    if learn:
        # One row per candidate; each row holds the candidate for all four
        # diagnoses. The 0.001 candidate used to be overwritten by the next
        # assignment and was therefore never tried.
        c_vals = [[c] * 4 for c in (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1)]
        optimal_c = [[0, 0, 0, 0]]
    else:
        # NOTE: pickle.load must only be used on trusted files; this one is
        # written by the learn=True branch at the end of this function.
        with open(c_values_path, 'rb') as c_file:
            c_vals = pickle.load(c_file)
        optimal_c = c_vals
    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()

    numFolds = 5  # number of random splits evaluated per C value
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # The labelled reports sit at the start of each diagnosis' slice
            # of the data set; collect their texts and vectors.
            start = preprocess.getNumReports(REPORT_FILES[:j])
            stop = start + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledReports = [reports[i] for i in range(start, stop)]
            labelledCorpus = [reportVectors[i][:] for i in range(start, stop)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Plain-list copy used to map a test vector back to its report.
            corpusList = [list(x) for x in labelledCorpus]

            best = None  # curves and areas of the best C value seen so far
            best_area_cv = -1
            for c_value in c_vals:
                # Labels and decision scores pooled over all folds.
                all_test_labels, all_test_scores = [], []
                all_cv_labels, all_cv_scores = [], []
                all_train_labels, all_train_scores = [], []
                for _ in range(numFolds):
                    # split training and test data
                    train_corpus, test_corpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)
                    # Split off the last 20% of the training set for cross validation
                    split = int(0.8 * len(train_corpus))
                    cv_corpus = train_corpus[split:]
                    train_corpus = train_corpus[:split]
                    cv_labels = train_labels[split:]
                    train_labels = train_labels[:split]

                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_corpus, train_labels)
                    output_test = classifier.predict(test_corpus)
                    output_scores_test = classifier.decision_function(test_corpus)

                    all_test_labels.extend(test_labels)
                    all_test_scores.extend(output_scores_test)
                    all_cv_labels.extend(cv_labels)
                    all_cv_scores.extend(classifier.decision_function(cv_corpus))
                    all_train_labels.extend(train_labels)
                    all_train_scores.extend(classifier.decision_function(train_corpus))

                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(list(test_corpus[r]))
                        # Wrap in a list: a bare string would be written as
                        # one character per CSV column.
                        writer.writerow(["With c value: " + str(c_value[j])])
                        writer.writerow([output_scores_test[r], output_test[r], test_labels[r]])
                        writer.writerow([labelledReports[reportIdx]])

                # ROC curves and areas over the pooled folds
                fp_test, tp_test, _ = roc_curve(all_test_labels, all_test_scores, pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels, all_cv_scores, pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels, all_train_scores, pos_label="positive")
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)

                # Keep this C value if its cross-validation AUC is a new best
                if area_cv > best_area_cv:
                    best_area_cv = area_cv
                    optimal_c[0][j] = c_value[j]
                    best = {
                        "test": (fp_test, tp_test, area_test),
                        "cv": (fp_cv, tp_cv, area_cv),
                        "train": (fp_train, tp_train, area_train),
                    }

            if best is None:
                # No candidate produced a comparable CV AUC (e.g. it was NaN),
                # so there is nothing meaningful to plot for this diagnosis.
                continue

            # plot the pooled ROC curves for the optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " + str(optimal_c[0][j]))
            for part, colour in (("test", 'b'), ("cv", 'g'), ("train", 'r')):
                fp, tp, area = best[part]
                plt.plot(fp, tp, colour, label='%s(area = %0.2f)' % (part, area))
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    if learn:
        # Pickle streams are bytes, so the file is opened in binary mode.
        with open(c_values_path, 'wb') as c_file:
            pickle.dump(optimal_c, c_file)
Ejemplo n.º 15
0
# reduction = 'lsa'
# method = 'l2'		# or frobenius
# dimensions = 500

# # DISTANCE CLASSIFICATION FUNCTION 
# distance = 'cosine'

# # NUMBER OF NEIGHBORS 
# neighbors = 1

'''
Data
'''
# Get a list of RedditDocument objects
# NOTE(review): the connection details passed to RedditDB below (host, port and
# what look like a user name and password) are hard-coded in source; consider
# loading them from configuration or the environment instead.
rdb = db.RedditDB("blacksun.cs.mcgill.ca", 31050, 'ejacques', 'shellcentershell', "reddit_topics")
# topics, comment_level and num_docs are defined outside this excerpt.
documents = P.getData(topics, comment_level, num_docs, rdb)
print "done getting documents:", len(documents)

'''
Preprocessing
'''
# Apply some basic preprocessing functions to it.
# Default is not to stem.
# max_word_length, min_word_length and stopwords are defined outside this excerpt.
P.preprocess(documents, max_word_length=max_word_length, min_word_length=min_word_length, stopwords=stopwords)

# Add websites to documents
P.addWebsites(documents)

print "done preprocessing"

# Divide into training and test set 
Ejemplo n.º 16
0
def labelClassificationD2V():
    """Evaluate a linear SVM per diagnosis on inferred Doc2Vec report vectors.

    For each entry of REPORT_FILES_LABELLED, vectors are inferred for the
    labelled reports with the Doc2Vec model stored in
    ./model_files/reports.doc2vec_model, the surplus of negative examples is
    dropped, and a linear SVC is fitted on numFolds random train/test splits.
    The per-fold train and test ROC curves are drawn into one figure per
    diagnosis and the per-report test results are written to
    labelClassification.csv, all inside a time-stamped directory under
    label_classification/.
    """
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

    reports = preprocess.getReports()
    processedReports = preprocess.getProcessedReports()

    numFolds = 5  # number of random train/test splits per diagnosis
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled reports sit at the start of each diagnosis' slice
            # of the data set; collect their texts and inferred vectors.
            start = preprocess.getNumReports(REPORT_FILES[:j])
            stop = start + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledReports = [reports[i] for i in range(start, stop)]
            labelledCorpus = [model.infer_vector(processedReports[i]) for i in range(start, stop)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]
            # Plain-list copy (taken before balancing, so it lines up with
            # labelledReports) used to map a test vector back to its report.
            corpusList = [list(x) for x in labelledCorpus]

            # Temporary class balancing: drop the first negative examples so
            # that len(labels) - 2 * positives of them are removed. The count
            # is clamped at zero; the previous loop never hit its stop
            # condition in that case and could delete every negative example.
            negatives = [i for i, label in enumerate(labels) if label == "negative"]
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            for n in range(numFolds):
                # split training and test data
                train_corpus, test_corpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_corpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_corpus)
                output_scores_test = classifier.decision_function(test_corpus)
                output_scores_train = classifier.decision_function(train_corpus)

                # Order the test results by score, then labels. The vectors
                # are excluded from the sort key because comparing numpy rows
                # on a full tie raises ValueError.
                ranked = sorted(
                    zip(output_scores_test, output_test, test_labels, test_corpus),
                    key=lambda row: row[:3])
                output_scores_test, output_test, test_labels, test_corpus = zip(*ranked)

                # build roc curve and plot
                fp_test, tp_test, _ = roc_curve(test_labels, output_scores_test, pos_label="positive")
                fp_train, tp_train, _ = roc_curve(train_labels, output_scores_train, pos_label="positive")

                # Only the first fold contributes legend entries. The labels
                # now match the plotted data (they used to be swapped) and the
                # colours follow labelClassification: test blue, train red.
                plt.plot(fp_test, tp_test, 'b', label="test" if n == 0 else "")
                plt.plot(fp_train, tp_train, 'r', label="train" if n == 0 else "")

                # save result to file
                for r in range(len(test_labels)):
                    reportIdx = corpusList.index(list(test_corpus[r]))
                    writer.writerow([])
                    writer.writerow([output_scores_test[r], output_test[r], test_labels[r]])
                    writer.writerow([labelledReports[reportIdx]])

            # Saving once after all folds yields the same final image as
            # re-saving after every fold.
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
Ejemplo n.º 17
0
def labelClassification():
    """Evaluate a linear SVM per diagnosis on the vectors in reports_lsi.mm.

    For each entry of REPORT_FILES_LABELLED, the labelled document vectors are
    taken from the corpus stored in ./model_files/reports_lsi.mm, the surplus
    of negative examples is dropped, and a linear SVC is fitted on numFolds
    random train/test splits. The results of all folds are pooled into one
    train and one test ROC curve per diagnosis, which are saved as a figure;
    the per-report test results are written to labelClassification.csv. All
    output goes to a time-stamped directory under label_classification/.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # Convert the corpus to a dense matrix and transpose it into a list of
    # per-document vectors.
    dense = gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64)
    corpusList = [list(x) for x in zip(*dense)]
    reports = preprocess.getReports()

    numFolds = 5  # number of random train/test splits per diagnosis
    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled reports sit at the start of each diagnosis' slice
            # of the data set; collect their vectors.
            start = preprocess.getNumReports(REPORT_FILES[:j])
            stop = start + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            labelledCorpus = [corpusList[i] for i in range(start, stop)]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Temporary class balancing: drop the first negative examples so
            # that len(labels) - 2 * positives of them are removed. The count
            # is clamped at zero; the previous loop never hit its stop
            # condition in that case and could delete every negative example.
            negatives = [i for i, label in enumerate(labels) if label == "negative"]
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # Labels and decision scores pooled over all folds.
            all_test_labels, all_output_scores_test = [], []
            all_train_labels, all_output_scores_train = [], []
            for _ in range(numFolds):
                # split training and test data
                train_corpus, test_corpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(train_corpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_corpus)
                output_scores_test = classifier.decision_function(test_corpus)
                output_scores_train = classifier.decision_function(train_corpus)

                # Order the test results by score, then labels. The vectors
                # are excluded from the sort key because comparing numpy rows
                # on a full tie raises ValueError.
                ranked = sorted(
                    zip(output_scores_test, output_test, test_labels, test_corpus),
                    key=lambda row: row[:3])

                all_test_labels.extend(row[2] for row in ranked)
                all_output_scores_test.extend(row[0] for row in ranked)
                all_train_labels.extend(train_labels)
                all_output_scores_train.extend(output_scores_train)

                # save result to file
                for score, predicted, expected, vector in ranked:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow([])
                    writer.writerow([score, predicted, expected])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curves over the pooled folds
            fp_test, tp_test, _ = roc_curve(all_test_labels, all_output_scores_test, pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels, all_output_scores_train, pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)
            # Plot the pooled ROC curves
            plt.plot(fp_test, tp_test, 'b', label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train, tp_train, 'r', label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
Ejemplo n.º 18
0
from preprocess import getData
import pickle

# Interval lengths to export; 15 and 30 were used previously as well.
intervals = [60]
for i in intervals:
    # getData returns the data and its unnormalized counterpart; only the
    # unnormalized version is written out.
    data, data_unnormalized = getData(i)
    # Use a context manager so the output file is flushed and closed
    # deterministically instead of leaking the handle.
    with open(str(i) + "_unnormalized.p", "wb") as out_file:
        pickle.dump(data_unnormalized, out_file)
Ejemplo n.º 19
0
	topics = params.topics

# Now save metadata to db to remember parameter configuration
print "Saving metadata to mongodb..."
metadata = utils.createMetaData(params)
# The return value is used twice below: as a truthy/falsy success flag and as
# the id embedded in the train/test file names.
result = params.db.add_metadata(metadata)
if result:
	print "Save successful."
else:
	print "Save not successful."

# Get the documents in Reddit format
print "Retrieving documents..."
# NOTE(review): the paths are built from `result` even when the save above
# reported failure -- confirm that this is intended.
trainpath = "../data/scala/llda_train_"+str(result)+".csv"
testpath = "../data/scala/llda_test_"+str(result)+".csv"
# getData receives the two output paths together with the preprocessing
# parameters; its return value is not used here.
preprocess.getData(topics, params.comment_level, params.num_docs, params.db, trainpath, testpath, params.max_word_length, params.min_word_length, params.stopwords, params.stem)
# Preprocess these 
#print "Preprocessing documents..."
#preprocess.preprocess(reddit_documents, params.max_word_length, params.min_word_length, params.stopwords, params.stem)
#''' TODO: removal_threshold and removal_perc '''

# Split into train and test 
#print "Splitting into train and test sets..."
#train,test = utils.partition(reddit_documents, .9)

# Print each document to file
# Add metadata's db id to filename to be able to match up to metadata in db
#timestamp = '_'.join(str(datetime.today()).split())
#ftrain = open(trainpath, "wt")
#ftest = open(testpath, "wt")
Ejemplo n.º 20
0
# Will keep track of total counts
word_frequencies = {}
# Each subreddit has associated list of documents
documents = {}
# Each subreddit has submission- and comment-level date ranges
dates = {}

# NOTE(review): time.clock() was removed in Python 3.8; time.perf_counter() is
# the usual replacement if this script is ever ported from Python 2.
start = time.clock()
# Get documents from mongodb while preprocessing and
# counting overall word frequencies
for topic in topics:
	# st is taken once per topic; s is reset before each timed step and the
	# elapsed time (time.clock() - s) is printed after the step.
	st = time.clock()
	print topic 
	s = time.clock()
	docs = P.getData([topic], comment_level, num_docs)

	# Find date range for this topic (at submission level)
	subsd, subed = utils.submission_date_range(docs)
	# Find date range for this topic (at comment level)
	commsd, commed = utils.comment_date_range(docs)
	# SSD/SED hold the pair returned by submission_date_range, CSD/CED the
	# pair returned by comment_date_range.
	dates[topic] = {"SSD":subsd, "SED": subed, "CSD": commsd, "CED":commed}

	print "\t",len(docs), "documents:", time.clock()-s
	s = time.clock()
	P.preprocess(docs, max_word_length=max_word_length, min_word_length=min_word_length, stopwords='long', stem=False)
	print "\tdone preprocessing:", time.clock()-s
	documents[topic] = docs
	print "\tcalculating total frequencies..."
	s = time.clock()
	for doc in docs:
Ejemplo n.º 21
0
def explore_data():
    """Print the shape of the first array returned by getData()."""
    x_values, _y_values = getData()
    print(x_values.shape)
Ejemplo n.º 22
0
def testClassification():
    """Write out unlabelled reports that lie close to the SVM decision boundary.

    For each entry of REPORT_FILES_LABELLED a linear SVC is fitted on the
    labelled document vectors (after dropping the surplus of negative
    examples) and applied to the remaining, unlabelled vectors of the same
    diagnosis. Every unlabelled report whose absolute decision score is below
    the threshold is written to labelClassification.csv inside a time-stamped
    directory under label_tests/.
    """
    threshold = 0.001  # maximum |decision score| for a report to be written
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # Convert the corpus to a dense matrix and transpose it into a list of
    # per-document vectors.
    dense = gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64)
    corpusList = [list(x) for x in zip(*dense)]
    reports = preprocess.getReports()

    # Create the output directory. The existence check matches the sibling
    # functions and avoids an error when two runs start in the same minute.
    directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # Within each diagnosis' slice of the data set the labelled
            # reports come first, followed by the unlabelled ones.
            start = preprocess.getNumReports(REPORT_FILES[:j])
            labelledStop = start + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])
            diagnosisStop = start + preprocess.getNumReports([REPORT_FILES[j]])
            labelledCorpus = [corpusList[i] for i in range(start, labelledStop)]
            unlabelledIdx = list(range(labelledStop, diagnosisStop))
            unlabelledCorpus = [corpusList[i] for i in unlabelledIdx]
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Temporary class balancing: drop the first negative examples so
            # that len(labels) - 2 * positives of them are removed. The count
            # is clamped at zero; the previous loop never hit its stop
            # condition in that case and could delete every negative example.
            negatives = [i for i, label in enumerate(labels) if label == "negative"]
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # Sort by score (then label, then vector). Each entry carries its
            # corpus index along, so the report is looked up directly rather
            # than by searching corpusList for an equal vector, which could
            # return a different report with an identical vector.
            ranked = sorted(zip(output_scores_test, output_test, unlabelledCorpus, unlabelledIdx))

            # save result to file
            for score, predicted, _vector, reportIdx in ranked:
                if abs(score) < threshold:
                    writer.writerow([])
                    writer.writerow([reportIdx, score, predicted])
                    writer.writerow([reports[reportIdx]])
Ejemplo n.º 23
0
def labelClassification():
    """Evaluate a linear SVM per diagnosis on the vectors in reports_lsi.mm.

    For each entry of REPORT_FILES_LABELLED, the labelled document vectors are
    taken from the corpus stored in ./model_files/reports_lsi.mm, the surplus
    of negative examples is dropped, and a linear SVC is fitted on numFolds
    random train/test splits. The results of all folds are pooled into one
    train and one test ROC curve per diagnosis, which are saved as a figure;
    the per-report test results are written to labelClassification.csv. All
    output goes to a time-stamped directory under label_classification/.
    """
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # Convert the corpus to a dense matrix and transpose it into a list of
    # per-document vectors.
    dense = gensim.matutils.corpus2dense(corpus,
                                         corpus.num_terms,
                                         dtype=np.float64)
    corpusList = [list(x) for x in zip(*dense)]
    reports = preprocess.getReports()

    numFolds = 5  # number of random train/test splits per diagnosis
    # Create the output directory
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow([])
            writer.writerow([])
            writer.writerow([DIAGNOSES[j]])

            # initialise figure and plot
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC")

            # The labelled reports sit at the start of each diagnosis' slice
            # of the data set; collect their vectors.
            start = preprocess.getNumReports(REPORT_FILES[:j])
            stop = start + preprocess.getNumReports(
                [REPORT_FILES_LABELLED[j]])
            labelledCorpus = [corpusList[i] for i in range(start, stop)]
            labels = np.asarray(
                preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            # Temporary class balancing: drop the first negative examples so
            # that len(labels) - 2 * positives of them are removed. The count
            # is clamped at zero; the previous loop never hit its stop
            # condition in that case and could delete every negative example.
            negatives = [
                i for i, label in enumerate(labels) if label == "negative"
            ]
            surplus = len(labels) - 2 * list(labels).count("positive")
            deletes = negatives[:max(surplus, 0)]
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)

            # Labels and decision scores pooled over all folds.
            all_test_labels, all_output_scores_test = [], []
            all_train_labels, all_output_scores_train = [], []
            for _ in range(numFolds):
                # split training and test data
                train_corpus, test_corpus, train_labels, test_labels = train_test_split(
                    labelledCorpus, labels, test_size=0.13)

                # build classifier
                classifier = svm.SVC(kernel='linear').fit(
                    train_corpus, train_labels)

                # compute output label and corresponding score
                output_test = classifier.predict(test_corpus)
                output_scores_test = classifier.decision_function(test_corpus)
                output_scores_train = classifier.decision_function(
                    train_corpus)

                # Order the test results by score, then labels. The vectors
                # are excluded from the sort key because comparing numpy rows
                # on a full tie raises ValueError.
                ranked = sorted(zip(output_scores_test, output_test,
                                    test_labels, test_corpus),
                                key=lambda row: row[:3])

                all_test_labels.extend(row[2] for row in ranked)
                all_output_scores_test.extend(row[0] for row in ranked)
                all_train_labels.extend(train_labels)
                all_output_scores_train.extend(output_scores_train)

                # save result to file
                for score, predicted, expected, vector in ranked:
                    reportIdx = corpusList.index(list(vector))
                    writer.writerow([])
                    writer.writerow([score, predicted, expected])
                    writer.writerow([reports[reportIdx]])

            # generate the roc curves over the pooled folds
            fp_test, tp_test, _ = roc_curve(all_test_labels,
                                            all_output_scores_test,
                                            pos_label="positive")
            fp_train, tp_train, _ = roc_curve(all_train_labels,
                                              all_output_scores_train,
                                              pos_label="positive")

            # Calculate the area under the curves
            area_test = auc(fp_test, tp_test)
            area_train = auc(fp_train, tp_train)
            # Plot the pooled ROC curves
            plt.plot(fp_test,
                     tp_test,
                     'b',
                     label='test(area = %0.2f)' % area_test)
            plt.plot(fp_train,
                     tp_train,
                     'r',
                     label='train(area = %0.2f)' % area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
Ejemplo n.º 24
0
from sklearn.utils import shuffle
from preprocess import getData


#### The following function gets an indicator matrix from the targets
def y2indicator(y, K):
    """Return a len(y) x K array of zeros with a 1 at column y[i] of row i."""
    one_hot = np.zeros((len(y), K))
    for row in range(one_hot.shape[0]):
        one_hot[row, y[row]] = 1
    return one_hot


#### Getting our data, shuffling it and defining the test and train sets
X, Y = getData()
X, Y = shuffle(X, Y)
Y = Y.astype(np.int32)  # integer targets, as y2indicator uses them as column indices
D = X.shape[1]  # size of the second axis of X
K = len(set(Y))  # number of distinct target values

# Everything except the last 100 shuffled rows is used for training.
X_train = X[:-100]
Y_train = Y[:-100]
Y_train_ind = y2indicator(Y_train, K)

# The last 100 shuffled rows are held out for testing.
X_test = X[-100:]
Y_test = Y[-100:]
Y_test_ind = y2indicator(Y_test, K)

#### Initializing the weights
# D x K matrix of samples from a standard normal distribution.
W = np.random.randn(D, K)
Ejemplo n.º 25
0
def labelClassificationRNN(learn=True):
    if learn:
        c_vals = [[0.001, 0.001, 0.001, 0.001]]
        c_vals = [[0.005, 0.005, 0.005, 0.005]]
        c_vals.append([0.01, 0.01, 0.01, 0.01])
        c_vals.append([0.05, 0.05, 0.05, 0.05])
        c_vals.append([0.1, 0.1, 0.1, 0.1])
        c_vals.append([0.5, 0.5, 0.5, 0.5])
        c_vals.append([1, 1, 1, 1])
        optimal_c = [[0, 0, 0, 0]]
    else:
        file = open('./model_files/svm_c_values.pkl', 'r')
        c_vals = pickle.load(file)
        optimal_c = c_vals
        file.close()
    reports = preprocess.getReports()
    reportVectors = rnn.loadReportVecs()

    numFolds = 5  # number of folds for cross validation
    directory = "label_classification/" + datetime.datetime.now().strftime(
        '%m_%d_%H_%M') + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])
            # fetch corpus and labels
            labelledReports = []
            labelledCorpus = list()
            # The labeled data is at the start of the data set
            # Get the ids in the corpus of these first labeled examples for each class
            for i in range(
                    preprocess.getNumReports(REPORT_FILES[:j]),
                    preprocess.getNumReports(REPORT_FILES[:j]) +
                    preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledReports.append(reports[i])
                labelledCorpus.append(reportVectors[i][:])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]
                                                    ]))[:, 2]
            corpusList = [list(x) for x in labelledCorpus]
            ############### THIS CODE BLOCK REMOVES THE NUMBER OF NEGATIVE LABELS TO EQUALISE THE DISTRIBUTION OF CLASS LABELS. TO BE REMOVED IN FUTURE.
            # count = 0
            # deletes = []
            # for x in range(len(labels)):
            # 	if (labels[x] == "negative"):
            # 		count = count + 1
            # 		deletes.append(x)
            # 	if (count == (len(labels)-(list(labels).count("positive"))*2)):
            # 		break
            # labelledCorpus = np.delete(labelledCorpus,deletes,axis=0)
            # labels = np.delete(labels,deletes)
            ##################
            best_area_cv = -1
            for c_value in c_vals:
                for n in range(numFolds):
                    # split training and test data
                    train_labelledCorpus, test_labelledCorpus, train_labels, test_labels = train_test_split(
                        labelledCorpus, labels, test_size=0.15)
                    # Split of the last 20% of training set for cross validation
                    cv_labelledCorpus = train_labelledCorpus[
                        int(0.8 * len(train_labelledCorpus)):]
                    train_labelledCorpus = train_labelledCorpus[:int(
                        0.8 * len(train_labelledCorpus))]
                    cv_labels = train_labels[int(0.8 * len(train_labels)):]
                    train_labels = train_labels[:int(0.8 * len(train_labels))]
                    # build classifier
                    classifier = svm.SVC(C=c_value[j], kernel='linear').fit(
                        train_labelledCorpus, train_labels)
                    # compute output label and corresponding score
                    output_test = classifier.predict(test_labelledCorpus)
                    output_cv = classifier.predict(cv_labelledCorpus)
                    output_train = classifier.predict(train_labelledCorpus)
                    output_scores_test = classifier.decision_function(
                        test_labelledCorpus)
                    output_scores_train = classifier.decision_function(
                        train_labelledCorpus)
                    output_scores_cv = classifier.decision_function(
                        cv_labelledCorpus)

                    if n == 0:
                        all_test_labels = tuple(test_labels)
                        all_output_scores_test = tuple(output_scores_test)
                        all_cv_labels = tuple(cv_labels)
                        all_output_scores_cv = tuple(output_scores_cv)
                        all_train_labels = tuple(train_labels)
                        all_output_scores_train = tuple(output_scores_train)
                    else:
                        all_test_labels = all_test_labels + tuple(test_labels)
                        all_output_scores_test = all_output_scores_test + tuple(
                            output_scores_test)
                        all_cv_labels = all_cv_labels + tuple(cv_labels)
                        all_output_scores_cv = all_output_scores_cv + tuple(
                            output_scores_cv)
                        all_train_labels = all_train_labels + tuple(
                            train_labels)
                        all_output_scores_train = all_output_scores_train + tuple(
                            output_scores_train)
                    # save result for fold to file
                    for r in range(len(test_labels)):
                        reportIdx = corpusList.index(
                            list(test_labelledCorpus[r]))
                        writer.writerow("With c value: " + str(c_value[j]))
                        writer.writerow([
                            output_scores_test[r], output_test[r],
                            test_labels[r]
                        ])
                        writer.writerow([labelledReports[reportIdx]])
                # generate the roc curve
                fp_test, tp_test, _ = roc_curve(all_test_labels,
                                                all_output_scores_test,
                                                pos_label="positive")
                fp_cv, tp_cv, _ = roc_curve(all_cv_labels,
                                            all_output_scores_cv,
                                            pos_label="positive")
                fp_train, tp_train, _ = roc_curve(all_train_labels,
                                                  all_output_scores_train,
                                                  pos_label="positive")

                # Calculate the area under the curves
                area_test = auc(fp_test, tp_test)
                area_cv = auc(fp_cv, tp_cv)
                area_train = auc(fp_train, tp_train)
                # Store c value,tps, fps and aucs if cv auc is new best
                if area_cv > best_area_cv:
                    optimal_c[0][j] = c_value[j]
                    best_fp_test = fp_test
                    best_tp_test = tp_test
                    best_fp_cv = fp_cv
                    best_tp_cv = tp_cv
                    best_fp_train = fp_train
                    best_tp_train = tp_train
                    best_area_test = area_test
                    best_area_cv = area_cv
                    best_area_train = area_train
            # initialise and plot the average ROC curves for optimal c value
            name = DIAGNOSES[j] + " ROC"
            plt.figure(name)
            plt.xlabel("False Positive")
            plt.ylabel("True Positive")
            plt.title(DIAGNOSES[j] + " ROC: c value of " +
                      str(optimal_c[0][j]))
            plt.plot(best_fp_test,
                     best_tp_test,
                     'b',
                     label='test(area = %0.2f)' % best_area_test)
            plt.plot(best_fp_cv,
                     best_tp_cv,
                     'g',
                     label='cv(area = %0.2f)' % best_area_cv)
            plt.plot(best_fp_train,
                     best_tp_train,
                     'r',
                     label='train(area = %0.2f)' % best_area_train)
            plt.legend(loc='lower right')
            plt.savefig(directory + name)
    writeFile.close()
    if learn:
        file = open('./model_files/svm_c_values.pkl', 'w')
        pickle.dump(optimal_c, file)
        file.close()
Ejemplo n.º 26
0
import sys
import random
import preprocess
import search
# Command-line script: read a query file and print the reports that
# search.search returns for it.
#
# The input file (path given as the first command-line argument) is read as:
#   line 1: the search text passed to search.search
#   line 2: output mode, "notonlyreturndates" or "onlyreturndates"
#   line 3: a value whose first four characters parse as a year; generated
#           report years are drawn from 2000 up to the year before it
# Output is one header row followed by one row per search result, with
# fields separated by "<,>".
if len(sys.argv) < 2:
    print("ERROR: Please specify an input file")
    sys.exit()
fileName = str(sys.argv[1])
# Context manager so the file handle is closed as soon as it has been read.
with open(fileName) as inputFile:
    fileText = [row.rstrip('\n') for row in inputFile]

# Header row for each supported output mode.
headerByMode = {
    "notonlyreturndates": "Req No.<,>Report Date<,>Report",
    "onlyreturndates": "Req No.<,>Report Date",
}

# Validate the whole layout up front so a short file or a malformed year is
# reported with the script's own error message rather than a traceback.
layoutOk = len(fileText) >= 3 and fileText[1] in headerByMode
if layoutOk:
    try:
        # Loop-invariant upper bound for the generated year.
        lastYear = int(fileText[2][0:4]) - 1
    except ValueError:
        layoutOk = False
if not layoutOk:
    print("ERROR: input file layout error")
    sys.exit()

print(headerByMode[fileText[1]])

data = preprocess.getData()

includeReport = fileText[1] == "notonlyreturndates"
similarReports = search.search("lsi", 50, fileText[0])
for reportIdx in similarReports:
    # The report date is randomly generated, not read from the data.
    # NOTE(review): random.randint raises ValueError when lastYear < 2000;
    # confirm whether input years of 2000 or earlier can occur.
    year = random.randint(2000, lastYear)
    month = random.randint(1, 12)
    # Day stays within 1..28, presumably so it is valid for every month.
    date = random.randint(1, 28)
    report = data[reportIdx[0]]
    fields = [report[0], str(year) + str(month).zfill(2) + str(date).zfill(2)]
    if includeReport:
        fields.append(report[1])
    print("<,>".join(fields))