def main(): print("Running on BIO-NLP data\n\n") home_dir = "../dl4nlp" # The hyper-parameters of the word embedding trained model window_size = 5 embed_vector_size = 50 min_count = 1000 # Define the data files data_folder = os.path.join("..\\", "sample_data", "drugs_and_diseases") train_file_path = os.path.join(data_folder, "Drug_and_Disease_train.txt") test_file_path = os.path.join(data_folder, "Drug_and_Disease_test.txt") data_file_path = os.path.join(data_folder, "unlabeled_test_sample.txt") resources_pickle_file = os.path.join(home_dir, "models", "resources.pkl") embedding_pickle_file = os.path.join(home_dir, "models", "w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl" \ .format(embed_vector_size, window_size, min_count)) print("embedding_pickle_file= {}".format(embedding_pickle_file)) # The hyperparameters of the LSTM trained model #network_type= 'unidirectional' network_type = 'bidirectional' num_layers = 2 num_hidden_units = 150 num_epochs = 10 batch_size = 50 dropout = 0.2 reg_alpha = 0.0 model_file_path = os.path.join(home_dir,'models','lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\ format(network_type, num_hidden_units, num_layers, num_epochs, embed_vector_size, window_size, min_count)) print("Training the model... num_epochs = {}, num_layers = {}, num_hidden_units = {}".\ format(num_epochs, num_layers,num_hidden_units)) reader = DataReader() entityExtractor = EntityExtractor(reader, embedding_pickle_file) entityExtractor.train (train_file_path, \ output_resources_pickle_file = resources_pickle_file, \ network_type = network_type, \ num_epochs = num_epochs, \ batch_size = batch_size, \ dropout = dropout, \ reg_alpha = reg_alpha, \ num_hidden_units = num_hidden_units, \ num_layers = num_layers) #Save the model entityExtractor.save(model_file_path) print("Done.")
def init(): """ Initialise SD model """ from keras.models import Sequential from keras.models import load_model import h5py global entityExtractor start = t.default_timer() home_dir = os.getcwd() print("home_dir = {}".format(home_dir)) resources_pickle_file = os.path.join(home_dir, "resources.pkl") if not os.path.exists(resources_pickle_file): print("The model companion resources pickle file ({}) doesn't exist.". format(resources_pickle_file)) model_file_path = os.path.join(home_dir, 'model.h5') if not os.path.exists(model_file_path): print("The neural model file ({}) doesn't exist.".format( model_file_path)) print("Starting the model prediction ...") reader = DataReader(input_resources_pickle_file=resources_pickle_file) entityExtractor = EntityExtractor(reader) try: #load the model print("Loading the entity extraction model {}".format(model_file_path)) entityExtractor.load(model_file_path) entityExtractor.print_summary() except: print("can't load the entity extraction model") pass end = t.default_timer() loadTimeMsg = "Model loading time: {0} ms".format( round((end - start) * 1000, 2)) logger.info(loadTimeMsg)
'''
Created on Oct 28, 2013

@author: lasitha
'''
from FilteredTweetWriter import FilteredTweetWriter
from MongoReader import MongoReader
from EntityExtractor import EntityExtractor
from pymongo import Connection
from sentimentanalysis.classifier.classifier import Classifier
from sentimentanalysis.classifier.config import POSITIVE, NEGATIVE, HARD_TO_CLASSIFY
from time import sleep

reader = MongoReader()
entityExtractor = EntityExtractor()

while True:
    data = reader.getData()
    if data is not None:
        tweetText = data['text']
    else:
        print 'Program is sleeping'
        sleep(20)
        continue
    enti = entityExtractor.tagEntity(tweetText)
#mode = 'train'
mode = 'evaluate'
#mode = 'score'

K.clear_session()
with K.get_session() as sess:
    K.set_session(sess)
    graphr = K.get_session().graph
    with graphr.as_default():
        if mode == 'train':
            print("Training the model... num_epochs = {}, num_layers = {}".format(num_epochs, num_layers))
            reader = DataReader(num_classes, vector_size=embed_vector_size)
            entityExtractor = EntityExtractor(reader, embedding_pickle_file)
            entityExtractor.train(local_train_file_path,
                                  network_type=network_type,
                                  num_epochs=num_epochs,
                                  num_hidden_units=num_hidden_units,
                                  num_layers=num_layers)
            entityExtractor.save_tag_map(tag_to_idx_map_file)
            # Save the model
            entityExtractor.save(model_file_path)
        elif mode == 'evaluate':
            # Evaluate the model
            print("Evaluating the model...")
            reader = DataReader(num_classes, max_seq_length=max_seq_length,
# Combine predicted type and status into status_classification_info
status_classification_info = StatusClassifier.finalize_classification_info_object(status_classification_info)
status_result_file = "status_results.txt"
StatusClassifier.evaluate_status_classification(status_classification_info, status_result_file, TEST_FOLD)

##################################
#### EXTRACTION PIPELINE #########
##################################
# NOTE: MUST CHANGE PARAMETER stanford_ner_path to your 'stanford-ner.jar' path
STAN_NER_DIR = "/home/wlane/stanford-ner-2015-04-20/stanford-ner.jar"

# Train
EntityExtractor.train(training_doc_objs, stanford_ner_path=STAN_NER_DIR)

# Test
EntityExtractor.test(status_classification_info, stanford_ner_path=STAN_NER_DIR)

# DEBUG -- place a breakpoint here, inspect the status_classification object,
# and make sure it has everything we need
test = 0

##################################################################
#### PUTTING EXTRACTION AND STATUS PREDICTIONS TOGETHER ##########
##################################################################
# DATA FOR COMBINING ATTRIBUTES TO EVENTS:
# Where do I find the CRF classification output?
#   - status_classification.one_of_the_sentences_in_sent_objs.tok_sent_with_crf_classification
#   - This contains a dictionary of {attrib_type: crf_output}
#   - attrib_type is in the domain {Temporal, Method, Type, Amount, History}
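# The comments above describe where the per-sentence CRF output lives. What
# follows is a minimal, hypothetical sketch of walking that structure; the
# attribute names (sent_objs, tok_sent_with_crf_classification) are inferred
# from the comments above and are not a confirmed API.
ATTRIB_TYPES = {"Temporal", "Method", "Type", "Amount", "History"}

def dump_crf_output(status_classification):
    # Each sentence object is assumed to carry an {attrib_type: crf_output} dict.
    for sent in status_classification.sent_objs:
        for attrib_type, crf_output in sent.tok_sent_with_crf_classification.items():
            if attrib_type in ATTRIB_TYPES:
                print(attrib_type, crf_output)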
def main(): print("Running on BIO-NLP data\n\n") from sys import platform if platform == "win32": home_dir = "C:\\dl4nlp" else: home_dir = os.path.join(os.path.expanduser('~'), "dl4nlp") print("home_dir = {}".format(home_dir)) # The hyper-parameters of the word embedding trained model window_size = 5 embed_vector_size = 50 min_count = 400 data_folder = os.path.join("sample_data", "drugs_and_diseases") test_file_path = os.path.join(data_folder, "Drug_and_Disease_test.txt") resources_pickle_file = os.path.join(home_dir, "models", "resources.pkl") # The hyper-parameters of the LSTM trained model #network_type= 'unidirectional' network_type = 'bidirectional' #embed_vector_size = 50 num_classes = 7 + 1 max_seq_length = 613 num_layers = 2 num_hidden_units = 150 num_epochs = 10 batch_size = 50 dropout = 0.2 reg_alpha = 0.0 print("Initializing data...") model_file_path = os.path.join(home_dir,'models','lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\ format(network_type, num_hidden_units, num_layers, num_epochs, embed_vector_size, window_size, min_count)) K.clear_session() with K.get_session() as sess: K.set_session(sess) graphr = K.get_session().graph with graphr.as_default(): # Evaluate the model print("Evaluating the model...") reader = DataReader( input_resources_pickle_file=resources_pickle_file) entityExtractor = EntityExtractor(reader) #load the model print("Loading the model from file {} ...".format(model_file_path)) entityExtractor.load(model_file_path) entityExtractor.print_summary() if not os.path.exists(os.path.join(home_dir, "output")): os.makedirs(os.path.join(home_dir, "output")) # make sure that the input test data file is in IOB format output_prediction_file = os.path.join(home_dir, "output", "prediction_output.tsv") evaluation_report, confusion_matrix = entityExtractor.evaluate_model( test_file_path, output_prediction_file) print(evaluation_report) print(confusion_matrix) ######################################################### # from the commmand line interface, # (1) change directory to \code\02_modeling\03_model_evaluation # (2) run the following perl evaluation script # "C:\Program Files\Git\usr\bin\perl.exe" Drug_and_Disease_eval.pl ..\..\..\sample_data\drugs_and_diseases\Drug_and_Disease_test.txt C:\dl4nlp\output\prediction_output.tsv ######################################################### K.clear_session() K.set_session(None) print("Done.")
def upload_file_():
    try:
        print("Here in uploader")
        file = request.files['file']
        pname = request.form['pname']
        print(file)
        print(pname)
        if request.method == 'POST':
            file = request.files['file']
            pname = request.form['pname']
            lang = request.form['lang']
            if file.filename == '':
                print("file name is empty")
                return redirect(url_for('addfiles.html', message='No selected file'))
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.filename.replace(" ", "_")
                print("file", file.filename)
                file.save(os.path.join(app.config['UPLOAD_PATH_PDF'], filename))
                datetime_now = datetime.datetime.now()
                formatted_date = datetime_now.strftime('%Y-%m-%d')
                db = pymysql.connect(app.config["DATABASEIP"], app.config["DB_USER"],
                                     app.config["DB_PASSWORD"], app.config["DATABASE"])
                cur = db.cursor()
                sql = ('INSERT INTO Project_Files '
                       '(FileName,ProjectName,ProjectUserID,UploadDate,UploadPath,Nodes,Edges,FileEntities,URL) '
                       'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                entityExtractor_ = None
                document_url = None
                if ".docx" in file.filename:
                    # Convert the .docx to HTML so it can be served for viewing
                    with open(app.config['UPLOAD_PATH_PDF'] + file.filename, "rb") as docx_file:
                        result = mammoth.convert_to_html(docx_file)
                        html = result.value  # The generated HTML
                        temp = file.filename.replace(".docx", "")
                        Html_file = open(app.config['UPLOAD_PATH_PDF'] + temp + ".html", "w")
                        Html_file.write(html)
                        Html_file.close()
                    document_url = "http://george.runmy.tech:5000/static/web/" + temp + ".html"
                    entityExtractor_ = EntityExtractor(lang, app.config['UPLOAD_PATH_PDF'] + file.filename,
                                                       pname.strip("'"), document_url,
                                                       app.config['GOOGLE_API_KEY'], app.config['NLP_API_KEY'],
                                                       app.config["DATABASEIP"], app.config["DB_USER"],
                                                       app.config["DB_PASSWORD"], app.config["DATABASE"],
                                                       file.filename)
                    entityExtractor_.getEntityDocxJson()
                elif ".txt" in file.filename:
                    # Wrap each paragraph of the text file in <p> tags and save as HTML
                    temp = file.filename.replace(".txt", "")
                    data = ""
                    html = "<html></html>"
                    soup = BeautifulSoup(html)
                    htmltag = soup.find('html')
                    body = soup.new_tag("body")
                    with open(app.config['UPLOAD_PATH_PDF'] + file.filename, "r") as myfile:
                        data = myfile.read()
                    paras = data.split("\n\n")
                    for para in paras:
                        html = "<p></p>"
                        souppara = BeautifulSoup(html)
                        ptag = souppara.find('p')
                        ptag.insert(0, NavigableString(para))
                        body.append(ptag)
                    htmltag.append(body)
                    html_page = soup.prettify("utf-8")
                    with open(app.config['UPLOAD_PATH_PDF'] + temp + ".html", "wb+") as filewriter:
                        filewriter.write(html_page)
                    document_url = "http://george.runmy.tech:5000/static/web/" + temp + ".html"
                    entityExtractor_ = EntityExtractor(lang, app.config['UPLOAD_PATH_PDF'] + filename,
                                                       pname.strip("'"), document_url,
                                                       app.config['GOOGLE_API_KEY'], app.config['NLP_API_KEY'],
                                                       app.config["DATABASEIP"], app.config["DB_USER"],
                                                       app.config["DB_PASSWORD"], app.config["DATABASE"],
                                                       file.filename)
                    entityExtractor_.getEntityTxtJson()
                elif ".msg" in file.filename:
                    #pythoncom.CoInitialize()
                    #outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
                    #temp = file.filename.replace(".msg", "")
                    #msg = outlook.OpenSharedItem(app.config['UPLOAD_PATH_PDF'] + file.filename)
                    #data = msg.Body
                    #os.system('cd /home/sanam/Test ; msgconvert chunmun.msg')
                    # Convert the Outlook .msg file to .eml, then extract the plain-text body
                    print('cd ' + app.config['UPLOAD_PATH_PDF'] + '; ' + 'msgconvert ' + file.filename)
                    os.system('cd ' + app.config['UPLOAD_PATH_PDF'] + '; ' + 'msgconvert ' + file.filename)
                    with open(app.config['UPLOAD_PATH_PDF'] + file.filename + '.eml', 'rb') as fp:
                        # select a specific email file from the list
                        msg = BytesParser(policy=policy.default).parse(fp)
                        data = msg.get_body(preferencelist=('plain')).get_content()
                    # print the email content
                    print(data)
                    temp = file.filename.replace(".msg", "")
                    html = "<html></html>"
                    soup = BeautifulSoup(html)
                    htmltag = soup.find('html')
                    body = soup.new_tag("body")
                    paras = data.split("\n\n")
                    for para in paras:
                        html = "<p></p>"
                        souppara = BeautifulSoup(html, features="lxml")
                        ptag = souppara.find('p')
                        ptag.insert(0, NavigableString(para))
                        body.append(ptag)
                    htmltag.append(body)
                    html_page = soup.prettify("utf-8")
                    with open(app.config['UPLOAD_PATH_PDF'] + temp + ".html", "wb+") as filewriter:
                        filewriter.write(html_page)
                    with open(app.config['UPLOAD_PATH_PDF'] + temp + ".txt", "w+") as filewriter:
                        print("Converting .msg into text")
                        filewriter.write(data)
                        print("Converted .msg into text")
                    document_url = "http://george.runmy.tech:5000/static/web/" + temp + ".html"
                    entityExtractor_ = EntityExtractor(lang, app.config['UPLOAD_PATH_PDF'] + temp + ".txt",
                                                       pname.strip("'"), document_url,
                                                       app.config['GOOGLE_API_KEY'], app.config['NLP_API_KEY'],
                                                       app.config["DATABASEIP"], app.config["DB_USER"],
                                                       app.config["DB_PASSWORD"], app.config["DATABASE"],
                                                       file.filename)
                    entityExtractor_.getEntityTxtJson()
                else:
                    temp = file.filename.replace(".pdf", "")
                    document_url = "http://george.runmy.tech:5000/static/web/viewer.html?file=" + file.filename
                    entityExtractor_ = EntityExtractor(lang, app.config['UPLOAD_PATH_PDF'] + file.filename,
                                                       pname.strip("'"), document_url,
                                                       app.config['GOOGLE_API_KEY'], app.config['NLP_API_KEY'],
                                                       app.config["DATABASEIP"], app.config["DB_USER"],
                                                       app.config["DB_PASSWORD"], app.config["DATABASE"],
                                                       file.filename)
                    searchable = entityExtractor_.isSearchablePDF()
                    if searchable:
                        entityExtractor_.getEntityPDFJson()
                    else:
                        # The PDF has no text layer, so OCR it first
                        print("Have to do OCR")
                        document_url = "http://george.runmy.tech:5000/static/web/viewer.html?file=" + file.filename
                        OCR.pdf_splitter(app.config['UPLOAD_PATH_PDF'] + filename,
                                         app.config['UPLOAD_PATH_PDF'] + temp + ".txt",
                                         app.config['OCR_API_KEY'])
                        entityExtractor_ = EntityExtractor(lang, app.config['UPLOAD_PATH_PDF'] + temp + ".txt",
                                                           pname.strip("'"), document_url,
                                                           app.config['GOOGLE_API_KEY'], app.config['NLP_API_KEY'],
                                                           app.config["DATABASEIP"], app.config["DB_USER"],
                                                           app.config["DB_PASSWORD"], app.config["DATABASE"],
                                                           file.filename)
                        entityExtractor_.getEntityTxtJson()

                print(entityExtractor_.getEntities())
                args = (file.filename, pname.strip("'"), session['user'].strip("'"),
                        formatted_date.strip("'"), app.config['UPLOAD_PATH_PDF'],
                        entityExtractor_.getNodesList(), entityExtractor_.getEdgeList(),
                        entityExtractor_.getEntities(), document_url)
                if entityExtractor_ is not None:
                    del entityExtractor_
                # Execute the SQL command
                cur.execute(sql, args)
                # Commit your changes in the database
                db.commit()
                # return redirect(url_for('success', n=str(email)))
                # session['user'] = email
                db.close()
                return render_template('addfiles.html', email=session['user'],
                                       projectList=session['projectList'],
                                       message="File is successfully uploaded and processed")
            else:
                return render_template('addfiles.html', email=session['user'],
                                       projectList=session['projectList'],
                                       message='File Extension not allowed')
    except Exception as e:
        print("Error in upload handler: " + str(e))
        print(''.join(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)))
        return render_template('addfiles.html', projectList=session['projectList'],
                               email=session['user'],
                               message='Exception in file processing')
def main(): print("Running on advisor conversation") b_download_embedding_files = False b_train = True b_evaluate = True b_score = False #Specify the path where to store the downloaded files from sys import platform if platform == "win32": home_dir = "C:\\dl4nlp" else: home_dir = os.path.join(os.path.expanduser('~'), "dl4nlp") print("home_dir = {}".format(home_dir)) # The hyper-parameters of the word embedding trained model window_size = 5 embed_vector_size = 50 min_count = 400 # Define the data files data_folder = os.path.join("sample_data", "advisorConversations") train_file_path = os.path.join(data_folder, "advisorConversations_train.txt") test_file_path = os.path.join(data_folder, "advisorConversations_test.txt") data_file_path = os.path.join(data_folder, "unlabeled_advisorConversations.txt") resources_pickle_file = os.path.join(home_dir, "models", "resources.pkl") embedding_pickle_file = os.path.join(home_dir, "models", "w2vmodel_advisorConversations_vs_{}_ws_{}_mc_{}.pkl" \ .format(embed_vector_size, window_size, min_count)) print("embedding_pickle_file= {}".format(embedding_pickle_file)) if b_download_embedding_files == True: #Specify the string to look for in blob names from your container embedding_folder_name = "word2vec_advisorConversations_model_vs_{}_ws_{}_mc_{}_parquet_files".\ format(embed_vector_size, window_size, min_count) print("embedding_folder_name= {}".format(embedding_folder_name)) embedding_full_path = os.path.join(home_dir, "models", embedding_folder_name) print("embedding_full_path= {}".format(embedding_full_path)) #download the parquet files from Blob storage download_embedding_parquet_files_from_storage(embedding_full_path, embedding_folder_name, num_parquet_files=1000) save_embeddings_to_pickle_file(embedding_full_path, embedding_pickle_file, embed_vector_size) print("Done") # The hyperparameters of the LSTM trained model #network_type= 'unidirectional' network_type = 'bidirectional' #embed_vector_size = 50 num_layers = 2 num_hidden_units = 150 num_epochs = 10 batch_size = 50 dropout = 0.2 reg_alpha = 0.0 model_file_path = os.path.join(home_dir,'models','lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\ format(network_type, num_hidden_units, num_layers, num_epochs, embed_vector_size, window_size, min_count)) K.clear_session() with K.get_session() as sess: K.set_session(sess) graphr = K.get_session().graph with graphr.as_default(): if b_train == True: print("Training the model... 
num_epochs = {}, num_layers = {}, num_hidden_units = {}".\ format(num_epochs, num_layers,num_hidden_units)) reader = DataReader() entityExtractor = EntityExtractor(reader, embedding_pickle_file) entityExtractor.train (train_file_path, \ output_resources_pickle_file = resources_pickle_file, \ network_type = network_type, \ num_epochs = num_epochs, \ batch_size = batch_size, \ dropout = dropout, \ reg_alpha = reg_alpha, \ num_hidden_units = num_hidden_units, \ num_layers = num_layers) #Save the model entityExtractor.save(model_file_path) if b_evaluate == True: # Evaluate the model print("Evaluating the model...") reader = DataReader( input_resources_pickle_file=resources_pickle_file) entityExtractor = EntityExtractor(reader) #load the model print("Loading the model from file {} ...".format( model_file_path)) entityExtractor.load(model_file_path) entityExtractor.print_summary() if not os.path.exists(os.path.join(home_dir, "output")): os.makedirs(os.path.join(home_dir, "output")) # make sure that the input test data file is in IOB format output_prediction_file = os.path.join(home_dir, "output", "prediction_output.tsv") evaluation_report, confusion_matrix = entityExtractor.evaluate_model( test_file_path, output_prediction_file) print(evaluation_report) print(confusion_matrix) if b_score == True: print("Starting the model prediction ...") reader = DataReader( input_resources_pickle_file=resources_pickle_file) entityExtractor = EntityExtractor(reader) #load the model print("Loading the model from file {} ...".format( model_file_path)) entityExtractor.load(model_file_path) entityExtractor.print_summary() predicted_tags = entityExtractor.predict_2(data_file_path) if not os.path.exists(os.path.join(home_dir, "output")): os.makedirs(os.path.join(home_dir, "output")) output_prediction_file = os.path.join(home_dir, "output", "prediction_output.tsv") with open(output_prediction_file, 'w') as f: for ind, line in enumerate(predicted_tags): f.write("{}\t{}\n".format(ind, line)) K.clear_session() K.set_session(None) print("Done.")
def init(): """ Initialise SD model """ global entityExtractor start = t.default_timer() home_dir = os.getcwd() print("home_dir = {}".format(home_dir)) # define the word2vec embedding model hyperparameters window_size = 5 vector_size = 50 min_count = 400 #download_embedding_parquet_files_from_storage() #embedding_pickle_file = save_embeddings_to_pickle_file() embedding_pickle_file = os.path.join(home_dir, "w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl" \ .format(vector_size, window_size, min_count)) if not os.path.exists(embedding_pickle_file): print("The word embeddings pickle file ({}) doesn't exist.".format( embedding_pickle_file)) tag_to_idx_map_file = os.path.join(home_dir, "tag_map.tsv") if not os.path.exists(tag_to_idx_map_file): print("The entity types index mapping file ({}) doesn't exist.".format( tag_to_idx_map_file)) # define the LSTM model hyperparameters network_type = 'unidirectional' # network_type= 'bidirectional' num_classes = 7 + 1 max_seq_length = 613 num_layers = 2 num_hidden_units = 300 num_epochs = 10 model_file_path = os.path.join(home_dir,'Models','lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.\ format(network_type, num_hidden_units, num_layers, num_epochs, embed_vector_size, window_size, min_count)) if not os.path.exists(model_file_path): print("The neural model file ({}) doesn't exist.".format( model_file_path)) print("Starting the model prediction ...") reader = DataReader(num_classes, max_seq_length=max_seq_length, tag_to_idx_map_file=tag_to_idx_map_file, vector_size=vector_size) entityExtractor = EntityExtractor(reader, embedding_pickle_file) # Load model and load the model from brainscript (3rd index) try: #load the model print("Loading the entity extraction model {}".format(model_file_path)) entityExtractor.load(model_file_path) entityExtractor.print_summary() except: print("can't load the entity extraction model") pass end = t.default_timer() loadTimeMsg = "Model loading time: {0} ms".format( round((end - start) * 1000, 2)) logger.info(loadTimeMsg)