Example 1
def generate_sentences(data_files):
    """
    Generates SentenceRecord objects from the given data files. Tokenises each report into sentences, preprocesses
    each sentence, and stores the report_id and report class on the resulting object.
    Note: Currently the report class is just the index of the data file it was extracted from; a global mapping would be better.
    :param data_files: List of paths to raw CSV data files whose rows have the format 'report_id, report'
    :return: List of SentenceRecord objects extracted from the files.
    """
    list_sentence_record = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    for i, file_name in enumerate(data_files):
        with open(file_name, 'rb') as csv_file:  # avoid shadowing the Python 2 built-in 'file'
            csv_file.readline()  # skip the header line
            reader = csv.reader(csv_file)
            for report in reader:
                for sentence in tokenizer.tokenize(report[1]):
                    tmp = SentenceRecord(sentence)
                    tmp.processed_sentence = " ".join(
                        preprocess.textPreprocess(sentence, removeNegationsFromSentences=False))
                    tmp.report_id = report[0]
                    tmp.report_class = i
                    list_sentence_record.append(tmp)

    return list_sentence_record
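
A minimal usage sketch, assuming two hypothetical CSV files of reports and that the project's preprocess module and SentenceRecord class are importable. The file names here are placeholders, not the project's actual data files:

# Hypothetical file names; each CSV has a header row followed by 'report_id, report' rows.
data_files = ["negative_reports.csv", "positive_reports.csv"]
records = generate_sentences(data_files)

for record in records[:5]:
    print record.report_id, record.report_class
    print record.processed_sentence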
Example 2
def search(model, numResults, searchTerm):
	"""
	Searches the indexed reports for searchTerm using the chosen model ('bow', 'tfidf',
	'lsi', 'lda', 'doc2vec' or 'rnn') and returns the numResults most similar reports.
	"""
	dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
	origSearchTerm = searchTerm
	searchTerm = preprocess.textPreprocess(searchTerm)
	# searchTerm = preprocess.getDerivations(searchTerm)
	if not searchTerm:
		return []
	if model == "bow":
		index = gensim.similarities.SparseMatrixSimilarity.load('./model_files/reports.index')
		index.num_best = numResults

		searchTerm_bow = dictionary.doc2bow(searchTerm)

		similarReports = index[searchTerm_bow]
	elif model == "tfidf":
		tfidf_model = gensim.models.TfidfModel.load('./model_files/reports.tfidf_model')

		tfidf_index = gensim.similarities.SparseMatrixSimilarity.load('./model_files/reports_tfidf.index')
		tfidf_index.num_best = numResults

		searchTerm_bow = dictionary.doc2bow(searchTerm)
		searchTerm_tfidf = tfidf_model[searchTerm_bow]

		similarReports = tfidf_index[searchTerm_tfidf]
	elif model == "lsi":
		tfidf_model = gensim.models.TfidfModel.load('./model_files/reports.tfidf_model')
		lsi_model = gensim.models.LsiModel.load('./model_files/reports.lsi_model')

		lsi_index = gensim.similarities.MatrixSimilarity.load('./model_files/reports_lsi.index')
		lsi_index.num_best = numResults

		searchTerm_bow = dictionary.doc2bow(searchTerm)
		searchTerm_tfidf = tfidf_model[searchTerm_bow]
		searchTerm_lsi = lsi_model[searchTerm_tfidf]

		similarReports = lsi_index[searchTerm_lsi]
	elif model == "lda":
		lda_model = gensim.models.LdaModel.load('./model_files/reports.lda_model')

		lda_index = gensim.similarities.MatrixSimilarity.load('./model_files/reports_lda.index')
		lda_index.num_best = numResults

		searchTerm_bow = dictionary.doc2bow(searchTerm)
		searchTerm_lda = lda_model[searchTerm_bow]

		similarReports = lda_index[searchTerm_lda]
	elif model == "doc2vec":
		model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")

		searchTerm_docvec = model.infer_vector(searchTerm)

		similarReports = model.docvecs.most_similar([searchTerm_docvec],topn=numResults)
	elif model == "rnn":
		searchTerm_rnn = rnn.getReportSearchTerm(origSearchTerm)

		similarReports = rnn.most_similar_reports(searchTerm_rnn,topn=numResults)
	else:
		# Unrecognised model name
		return 0

	return similarReports
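
A short usage sketch, assuming the dictionary, model, and index files under ./model_files/ have already been built. With num_best set, the gensim similarity indexes return (report index, similarity score) pairs; the query string below is purely illustrative:

# Query the TF-IDF index for the ten reports most similar to a free-text search term.
results = search("tfidf", 10, "no evidence of pulmonary embolism")
for report_index, score in results:
    print report_index, score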
Example 3
def input_game():
    """
    This game accepts a 'report sentence' from a user and attempts to classify it.
    :return: None
    """
    print "Write a sentence to see if the system thinks it's diagnostic:"
    sys.stdout.write("> ")
    ans = sys.stdin.readline()
    print ans
    if ans and ans.rstrip() != "":
        ans = ans.rstrip()
        processed_sentence = " ".join(
            preprocess.textPreprocess(ans, removeNegationsFromSentences=False))
        if processed_sentence == "":
            return
        print processed_sentence
        probs = pipe.predict_proba([processed_sentence])[0]
        print probs
        if probs[0] > probs[1]:
            print "[Negative] That output is not diagnostic with a confidence of " + str(probs[0])
        else:
            print "[Positive] That output is diagnostic with a confidence of " + str(probs[1])