def run_step3(topicConfig=None, test_ratio=0.005, saveModel=True,
              coherence_types=('u_mass', 'c_v', 'c_uci', 'c_npmi')):
    """
    Step 3: LDA topic modeling
    """
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step3', 'true')

    logger = logging.getLogger(__name__)
    logger.info("========= Run Step 3: LDA topic modeling")
    run_logger = get_azureml_logger()

    df = loadProcessedTextData(numPhrases=MAX_NUM_PHRASE)
    if df is None or len(df) == 0:
        raise ValueError("Failed to load the processed text data")

    docs = list(df['ProcessedText'])
    if test_ratio >= 1.0 or test_ratio < 0.0:
        test_ratio = 0.005

    topicmodeler = TopicModeler(docs,
                                stopWordFile=FUNCTION_WORDS_FILE,
                                minWordCount=MIN_WORD_COUNT,
                                minDocCount=MIN_DOC_COUNT,
                                maxDocFreq=MAX_DOC_FREQ,
                                workers=cpu_count() - 1,
                                numTopics=NUM_TOPICS,
                                numIterations=NUM_ITERATIONS,
                                passes=NUM_PASSES,
                                chunksize=CHUNK_SIZE,
                                random_state=RANDOM_STATE,
                                test_ratio=test_ratio)

    if topicConfig is None or len(topicConfig) == 0:
        logger.info("Only need to learn %d topics" % NUM_TOPICS)
        lda = topicmodeler.TrainLDA(saveModel=saveModel)

        coherence = topicmodeler.EvaluateCoherence(lda, coherence_types)
        perplex = topicmodeler.EvaluatePerplexity(lda)

        run_logger.log("Perplexity", perplex['perplexity'])
        run_logger.log("Per Word Bound", perplex['per_word_bound'])
        for ctype in coherence:
            run_logger.log(ctype + " Coherence", coherence[ctype])
        run_logger.log("Topic Number", NUM_TOPICS)
        return lda
    else:
        for i in topicConfig:
            logger.info("Learning %d topics, from the list of topic configurations: %s"
                        % (i, str(topicConfig)))
            # IMPORTANT: update the number of topics to learn
            topicmodeler.numTopics = i
            # Train an LDA model
            lda = topicmodeler.TrainLDA(saveModel=saveModel)
            topicmodeler.EvaluateCoherence(lda, coherence_types)
            topicmodeler.EvaluatePerplexity(lda)
        topicmodeler.CollectRunLog()
        topicmodeler.PlotRunLog()
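A minimal usage sketch (the topic counts below are illustrative, not from the original sample): passing a list sweeps several topic counts so CollectRunLog and PlotRunLog can compare coherence and perplexity across configurations, while the default call trains a single model and returns it.

# Hypothetical sweep: train LDA models with 20..60 topics in steps of 10,
# then compare the runs via CollectRunLog/PlotRunLog
run_step3(topicConfig=list(range(20, 61, 10)), saveModel=False)

# Single-model path: learns NUM_TOPICS topics and returns the trained LDA model
lda = run_step3()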
def main():
    global mnist, scores, sess, x, graph, btch_sz
    mnist = learn.datasets.mnist.read_data_sets('MNIST_data', one_hot=True)
    btch_sz = 5
    init()
    predict_all_test_data()

    y = graph.get_tensor_by_name("y:0")
    eval_op = evaluateModel(scores, y)
    test_feed_dict = {
        x: mnist.test.images,
        y: mnist.test.labels,
    }
    print(scores.shape)
    _netacc = sess.run(eval_op, feed_dict=test_feed_dict)
    print("Net Accuracy:", _netacc)
    print(scores[0:5, :],
          " predicted value = ", np.argmax(scores[0:5, :], axis=1),
          " actual value", np.argmax(mnist.test.labels[0:5, :], axis=1))

    run_logger = get_azureml_logger()
    run_logger.log("Accuracy", _netacc)

    print("Calling prepare schema")
    # 'retdf' (the predictions DataFrame) and 'run' (the scoring function)
    # are assumed to be defined at module scope
    inputs = {"nparr": SampleDefinition(DataTypes.NUMPY, mnist.test.images[0:btch_sz])}
    outputs = {"probs_and_class_category_df": SampleDefinition(DataTypes.PANDAS, retdf)}
    amlo16n.generate_schema(inputs=inputs,
                            outputs=outputs,
                            filepath="outputs/mnistschema.json",
                            run_func=run)
    # point generate_main at the schema generated above
    amlo16n.generate_main(user_file="mnistscore.py",
                          schema_file="outputs/mnistschema.json",
                          main_file_name="outputs/main.py")
    print("End of prepare schema")
def main(pretrained_model_type, retraining_type, config_filename,
         output_model_name, num_epochs):
    ''' Coordinate all activities for Batch AI training '''
    # Log the parameters used for this run
    run_logger = get_azureml_logger()
    run_logger.log('amlrealworld.aerial_image_classification.run_batch_ai', 'true')
    run_logger.log('pretrained_model_type', pretrained_model_type)
    run_logger.log('config_filename', config_filename)
    run_logger.log('retraining_type', retraining_type)
    run_logger.log('output_model_name', output_model_name)

    # Load the configuration file and save relevant info
    config = ConfigFile(config_filename)
    write_model_summary_to_blob(config, output_model_name,
                                pretrained_model_type, retraining_type)

    # Create a cluster (if necessary) and wait till it's ready
    get_cluster(config)
    check_for_steady_cluster_status(config)

    # Submit the job and wait until it completes
    job_name = submit_job(config, pretrained_model_type, retraining_type,
                          output_model_name, num_epochs)
    print('Job submitted: checking for job completion')
    check_for_job_completion(config, job_name)
    print('Job complete: retrieving output files')

    # Download the output files and store metrics to Vienna
    retrieve_outputs(config, job_name, output_model_name)
    print('Parsing output logs')
    parse_stdout(run_logger)

    return
def getAmlLogger():
    try:
        from azureml.logging import get_azureml_logger
        run_logger = get_azureml_logger()
    except ImportError:
        # Fall back to an empty (falsy) placeholder so callers can
        # guard logging calls with `if run_logger:`
        print("Azure ML logger not found.")
        run_logger = []
    return run_logger
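Because the fallback value is an empty list rather than a logger, call sites need to test for truthiness before logging. A minimal sketch of that guard (the metric name and value are illustrative):

run_logger = getAmlLogger()
if run_logger:
    # Only logs when running under Azure ML; a no-op otherwise
    run_logger.log("Accuracy", 0.93)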
def run_step2(cleanedDataFrame, config=(0, 0, 0), numPhrase=MAX_NUM_PHRASE,
              maxPhrasePerIter=MAX_PHRASE_PER_ITER,
              maxPhraseLength=MAX_PHRASE_LENGTH,
              minInstanceCount=MIN_INSTANCE_COUNT):
    """
    Step 2: phrase learning
    """
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step2', 'true')

    logger = logging.getLogger(__name__)
    logger.info("========= Run Step 2: learn phrases from data")

    minPhrase, maxPhrase, step = config
    if minPhrase == 0 and maxPhrase == 0 and step == 0:
        logger.info("Only need to learn %d phrases" % numPhrase)
        # Instantiate a PhraseLearner and run a configuration
        # We need to put this code under '__main__' to run multiprocessing
        phraseLearner = PhraseLearner(cleanedDataFrame, "CleanedText", numPhrase,
                                      maxPhrasePerIter, maxPhraseLength,
                                      minInstanceCount)
        textData = list(phraseLearner.textFrame['LowercaseText'])
        phraseLearner.RunConfiguration(textData,
                                       phraseLearner.learnedPhrases,
                                       addSpace=True,
                                       writeFile=True,
                                       num_workers=cpu_count() - 1)
        phraseLearner.textFrame['TextWithPhrases'] = textData
        phraseLearner.MapVocabToSurfaceForms('CleanedText', 'TextWithPhrases', True)
        newDocsFrame = phraseLearner.ReconstituteDocsFromChunks('DocID', 'TextWithPhrases', True)
    else:
        # Make sure the inputs are valid and make sense
        minPhrase = max(10, minPhrase)
        maxPhrase = max(10, maxPhrase)
        step = max(1, step)

        # Instantiate a phrase learner with minPhrase set
        phraseLearner = PhraseLearner(cleanedDataFrame, "CleanedText", minPhrase,
                                      maxPhrasePerIter, maxPhraseLength,
                                      minInstanceCount)
        # Get the lowercase text data
        textData = list(phraseLearner.textFrame['LowercaseText'])

        # Incrementally learn phrases
        for i in range(minPhrase, maxPhrase + 1, step):
            logger.info("Learning %d phrases, based on %d previously learned phrases"
                        % (i, len(phraseLearner.learnedPhrases)))
            # Need to update this number before each run
            phraseLearner.maxNumPhrases = i
            phraseLearner.RunConfiguration(textData,
                                           phraseLearner.learnedPhrases,
                                           addSpace=True,
                                           writeFile=True,
                                           num_workers=cpu_count() - 1)
            phraseLearner.textFrame['TextWithPhrases'] = textData
            phraseLearner.MapVocabToSurfaceForms('CleanedText', 'TextWithPhrases', True)
            newDocsFrame = phraseLearner.ReconstituteDocsFromChunks('DocID', 'TextWithPhrases', True)
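A hedged usage sketch of the two paths: `config` unpacks as `(minPhrase, maxPhrase, step)`, so the first call below would learn 50, 100, 150, and 200 phrases in stages (the values are illustrative, not from the original sample).

# Incremental path: learn phrases in stages 50 -> 100 -> 150 -> 200
run_step2(cleanedDataFrame, config=(50, 200, 50))

# Default path: learn MAX_NUM_PHRASE phrases in a single configuration
run_step2(cleanedDataFrame)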
def addmodelcomparison(X_test, y_test, model):
    # initialize the logger
    from azureml.logging import get_azureml_logger
    run_logger = get_azureml_logger()

    # log accuracy, which is a single numerical value
    accuracy = model.score(X_test, y_test)
    run_logger.log("Accuracy", accuracy)
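A minimal sketch of a call site, assuming azureml.logging is available and a scikit-learn style estimator with a `score` method; the dataset, split, and model below are illustrative.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Logs the held-out accuracy to the AML run history
addmodelcomparison(X_test, y_test, clf)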
def __init__(self, textFrame=None, textCol="", maxNumPhrases=25000,
             maxPhrasesPerIter=500, maxPhraseLength=7, minInstanceCount=5):
    logger = logging.getLogger(__name__)

    # initialize the run logger
    self.run_logger = get_azureml_logger()
    self.run_logger.log('amlrealworld.document-collection-analysis.phraseLearning', 'true')

    self.textFrame = textFrame
    self.textCol = textCol

    # Load the black list of words
    # This is a pre-created hash table containing the list
    # of black-listed words to be ignored during phrase learning
    self.black_list = get_shared_file_path(BLACK_LIST_FILE)
    self.blacklistHash = LoadListAsHash(self.black_list)

    # Load the function words
    # This is a pre-created hash table containing the list
    # of function words used during phrase learning
    self.function_words = get_shared_file_path(FUNCTION_WORDS_FILE)
    self.functionwordHash = LoadListAsHash(self.function_words)

    # Maximum number of phrases to learn
    # If you want to test the code out quickly, set this to a small
    # value (e.g. 100) and set verbose to true when running the quick test
    self.maxNumPhrases = maxNumPhrases

    # Maximum number of phrases to learn per iteration
    # Increasing this number may speed up processing, but it affects the
    # ordering of the learned phrases, and good phrases could be bypassed
    # if maxNumPhrases is set to a small number
    self.maxPhrasesPerIter = maxPhrasesPerIter

    # Maximum number of words allowed in the learned phrases
    self.maxPhraseLength = maxPhraseLength

    # Minimum number of times a phrase must occur in the data to
    # be considered during the phrase learning process
    self.minInstanceCount = minInstanceCount

    # The learned phrases
    self.learnedPhrases = []

    # Lowercase the raw text column and save it in a new column
    if self.textFrame is not None and self.textCol != '':
        self.LowerText(self.textFrame, self.textCol)
    else:
        logger.error("Instance created with a null text DataFrame; "
                     "call self.LowerText() to convert the text to lowercase.")
def __init__(self, textData=None, stopWordFile='', minWordCount=5, minDocCount=2,
             maxDocFreq=0.25, workers=1, numTopics=50, numIterations=100,
             passes=1, chunksize=2000, random_state=None, test_ratio=0.005):
    logger = logging.getLogger(__name__)

    # initialize the run logger
    self.run_logger = get_azureml_logger()
    self.run_logger.log('amlrealworld.document-collection-analysis.topicModeling', 'true')

    if not textData or not isinstance(textData, list):
        raise ValueError("Text data should be a non-empty list.")

    # The minimum word count over all documents
    self.minWordCount = minWordCount
    # The minimum number of documents that contain a specific word
    self.minDocCount = minDocCount
    # The maximum document frequency of a specific word
    self.maxDocFreq = maxDocFreq

    if workers > cpu_count() or workers <= 0:
        logger.warning("Worker count %d is invalid or exceeds the %d available cores; "
                       "using all cores instead" % (workers, cpu_count()))
        self.workers = cpu_count()
    else:
        self.workers = workers

    self.numTopics = numTopics
    self.numIterations = numIterations
    self.passes = passes
    self.chunksize = chunksize
    self.random_state = random_state
    self.test_ratio = test_ratio

    if not stopWordFile:
        raise ValueError("Need to provide the file name of the stop word list")
    stopWordPath = get_shared_file_path(stopWordFile)
    if not os.path.exists(stopWordPath):
        download_file_from_blob(stopWordFile)
    self.stopWordHash = LoadListAsHash(stopWordPath)

    self.vocabHash = self.CreateVocabForTopicModeling(textData, self.stopWordHash)
    self.tokenizedDocs = self.TokenizeText(textData)

    self.id2token = None
    self.token2id = None
    self.BuildDictionary(self.tokenizedDocs)
    self.corpus = self.BuildCorpus(self.tokenizedDocs)

    # global state for the run log
    self.topics_list = []
    self.u_mass_list = []
    self.c_v_list = []
    self.c_uci_list = []
    self.c_npmi_list = []
    self.perplexity_list = []
    self.word_bound_list = []
def main(pretrained_model_type, mmlspark_model_type, config_filename,
         output_model_name, sample_frac):
    # Load the configuration file
    config = ConfigFile(config_filename, pretrained_model_type,
                        mmlspark_model_type, output_model_name)
    write_model_summary_to_blob(config, mmlspark_model_type)

    # Log the parameters of the run
    run_logger = get_azureml_logger()
    run_logger.log('amlrealworld.aerial_image_classification.run_mmlspark', 'true')
    run_logger.log('pretrained_model_type', pretrained_model_type)
    run_logger.log('mmlspark_model_type', mmlspark_model_type)
    run_logger.log('config_filename', config_filename)
    run_logger.log('output_model_name', output_model_name)
    run_logger.log('sample_frac', sample_frac)

    # Train and save the MMLSpark model
    train_df = load_data(config.train_uri, config, sample_frac)
    mmlspark_model = mmlspark.TrainClassifier(
        model=config.mmlspark_model_type, labelCol='label').fit(train_df)
    mmlspark_model.write().overwrite().save(config.output_uri)

    # Apply the MMLSpark model to the test set and save the accuracy metric
    test_df = load_data(config.test_uri, config, sample_frac)
    predictions = mmlspark_model.transform(test_df)
    metrics = mmlspark.ComputeModelStatistics(evaluationMetric='accuracy') \
        .transform(predictions)
    metrics.show()
    run_logger.log('accuracy_on_test_set', metrics.first()['accuracy'])

    # Save the predictions
    tf = mmlspark.IndexToValue().setInputCol('scored_labels') \
        .setOutputCol('pred_label')
    predictions = tf.transform(predictions).select(
        'filepath', 'label', 'pred_label')
    output_str = predictions.toPandas().to_csv(index=False)
    blob_service = BlockBlobService(config.storage_account_name,
                                    config.storage_account_key)
    blob_service.create_container(config.container_prediction_results)
    blob_service.create_blob_from_text(config.container_prediction_results,
                                       config.predictions_filename,
                                       output_str)
    return
def record_results(eval_path):
    print("Starting to log results, using eval dir {0}".format(eval_path))
    ea = event_accumulator.EventAccumulator(
        eval_path,
        size_guidance={  # how many events of each type to keep (0 = keep all)
            event_accumulator.COMPRESSED_HISTOGRAMS: 500,
            event_accumulator.IMAGES: 30,
            event_accumulator.AUDIO: 4,
            event_accumulator.SCALARS: 0,
            event_accumulator.HISTOGRAMS: 1
        })
    ea.Reload()

    # Plot mAP vs. iterations
    df = pd.DataFrame(ea.Scalars('Precision/mAP@0.5IOU'))
    max_vals = df.loc[df["value"].idxmax()]  # max value of mAP

    fig = plt.figure(figsize=(6, 5), dpi=75)
    plt.plot(df["step"], df["value"])
    plt.plot(max_vals["step"], max_vals["value"], "g+", mew=2, ms=10)
    plt.title("Precision")
    plt.ylabel("mAP")
    plt.xlabel("iterations")
    fig.savefig("./outputs/mAP.png", bbox_inches='tight')

    # Plot detection results for the model with max mAP
    IMAGE_ID = "image-1"  # use a blurry image as a baseline across runs
    IMG_OUTPUT = "./outputs/kittiwake.png"
    match = [x for x in ea.Images(IMAGE_ID) if x.step == max_vals["step"]]
    if len(match) > 0:
        img_encoded = match[0].encoded_image_string
        img_file = BytesIO(img_encoded)
        img = Image.open(img_file)
        print(img.size)
        img.save(IMG_OUTPUT, "PNG")
    else:
        message = ("Did not find an image summary for step {0} with max mAP {1}. "
                   "Do you need to increase event_accumulator.IMAGES?")
        print(message.format(max_vals["step"], max_vals["value"]))

    run_logger = get_azureml_logger()
    run_logger.log("max_mAP", max_vals["value"])
    run_logger.log("max_mAP_iteration#", max_vals["step"])
    print("Done logging results")
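A minimal usage sketch, assuming `eval_path` points at an evaluation directory of TensorBoard event files written by the TF Object Detection API (the path below is illustrative). Note that `SCALARS: 0` in `size_guidance` keeps every scalar point, which is what makes plotting the full mAP curve possible.

# Point at the eval directory containing the event files
record_results("./models/kittiwake/eval")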
def run_step1(saveFile=True):
    """
    Step 1: data preprocessing
    """
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step1', 'true')

    logger = logging.getLogger(__name__)
    fpath = get_shared_file_path(CLEANED_DATA_FILE_NAME)
    logger.info("========= Run Step 1: preprocessing text data")

    # Read raw data into a Pandas DataFrame
    textDF = getData()
    # Write frame with preprocessed text out to TSV file
    cleanedDataFrame = CleanAndSplitText(textDF, idColumnName='ID',
                                         textColumnName='Text', saveDF=saveFile)
    return cleanedDataFrame
import createfeautures as cf
import numpy as np
import pandas as pd
import pyspark
import os
import urllib
import sys
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from azureml.logging import get_azureml_logger
from azureml.dataprep import datasource

# initialize logger
run_logger = get_azureml_logger()

# start Spark session
spark = pyspark.sql.SparkSession.builder.appName('classification').getOrCreate()

# print runtime versions
print('****************')
print('Python version: {}'.format(sys.version))
print('Spark version: {}'.format(spark.version))
print('****************')

print('***Prepare Input Data to get required attributes***')
inputdata = datasource.load_datasource('POLines.dsource')
data = inputdata.dropna(subset=['Category'])

print('***Filtering Training + Testing + Validation records***')
def CleanAndSplitText(textDataFrame, idColumnName='ID', textColumnName='Text', saveDF=False):
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.preprocessText', 'true')

    logger = logging.getLogger(__name__)

    # Need to download the 'punkt' model for breaking text
    # strings into individual sentences
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logger.debug("Need to download the 'punkt' model from NLTK")
        nltk.download('punkt')
        logger.debug("Downloading 'punkt' model done.")

    logger.info("Clean and split the raw text into sentences")
    textDataOut = []

    # This regular expression is for section headers in the bill summaries that we wish to ignore
    reHeaders = re.compile(r" *TABLE OF CONTENTS:? *"
                           r"| *Title [IVXLC]+:? *"
                           r"| *Subtitle [A-Z]+:? *"
                           r"| *\(Sec\. \d+\) *")

    # This regular expression is for punctuation that we wish to clean out
    # We also will split sentences into smaller phrase-like units using this expression
    rePhraseBreaks = re.compile(r"[\"\!\?\)\]\}\,\:\;\*\-]*\s+\([0-9]+\)\s+[\(\[\{\"\*\-]*"
                                r"|[\"\!\?\)\]\}\,\:\;\*\-]+\s+[\(\[\{\"\*\-]*"
                                r"|\.\.+"
                                r"|\s*\-\-+\s*"
                                r"|\s+\-\s+"
                                r"|\:\:+"
                                r"|\s+[\/\(\[\{\"\-\*]+\s*"
                                r"|[\,!\?\"\)\(\]\[\}\{\:\;\*](?=[a-zA-Z])"
                                r"|[\"\!\?\)\]\}\,\:\;]+[\.]*$")

    # Regex for underbars
    regexUnderbar = re.compile('_')
    # Regex for space
    regexSpace = re.compile(' +')
    # Regex for sentence-final period
    regexPeriod = re.compile(r"\.$")

    # Iterate through each document and do:
    # (1) Split documents into sections based on section headers and remove section headers
    # (2) Split the sections into sentences using the NLTK sentence tokenizer
    # (3) Further split sentences into phrasal units based on punctuation and remove punctuation
    # (4) Remove sentence-final periods when they are not part of an abbreviation
    for i in range(len(textDataFrame)):
        # Extract one document from the frame
        docID = textDataFrame[idColumnName][i]
        docText = textDataFrame[textColumnName][i]

        # Set counter for output line count for this document
        lineIndex = 0

        # Split the document into sections by finding section headers and splitting on them
        sections = reHeaders.split(str(docText))
        for section in sections:
            # Split the section into sentences using the NLTK tokenizer
            sentences = tokenize.sent_tokenize(section)
            for sentence in sentences:
                # Split each sentence into phrase-level chunks based on punctuation
                textSegs = rePhraseBreaks.split(sentence)
                numSegs = len(textSegs)
                for j in range(0, numSegs):
                    if len(textSegs[j]) > 0:
                        # Convert underbars to spaces
                        # Underbars are reserved for building the compound word phrases
                        textSegs[j] = regexUnderbar.sub(" ", textSegs[j])

                        # Split out the words so we can specially handle the last word
                        words = regexSpace.split(textSegs[j])

                        # If the last word ends in a period then remove the period
                        words[-1] = regexPeriod.sub("", words[-1])
                        # If the last word is an abbreviation like "U.S."
                        # then add the word-final period back on
                        if "." in words[-1]:
                            words[-1] += "."

                        phraseOut = " ".join(words)
                        textDataOut.append([docID, lineIndex, phraseOut])
                        lineIndex += 1

    # Convert to a Pandas DataFrame
    frameOut = pd.DataFrame(textDataOut, columns=['DocID', 'DocLine', 'CleanedText'])
    logger.debug("Returned clean DataFrame shape: %d, %d" % (frameOut.shape[0], frameOut.shape[1]))

    if saveDF:
        logger.info("Saving the cleaned DataFrame in file: %s" % CLEANED_DATA_FILE_NAME)
        cleanedDataFile = get_shared_file_path(CLEANED_DATA_FILE_NAME)
        frameOut.to_csv(cleanedDataFile, sep='\t', index=False)
    else:
        logger.info("The cleaned and sentenced text data is not being saved.")

    return frameOut
def main():
    #########################################
    # Accept One Argument as Input
    #########################################
    try:
        topN = int(sys.argv[1])
    except IndexError:
        print("This script takes one argument. Please enter a valid non-negative integer.\n")
        raise

    #########################################
    # Access trainQ and testQ from Part 2
    #########################################
    workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')

    # paths to trainQ and testQ
    trainQ_path = os.path.join(workfolder, 'trainQ_part2')
    testQ_path = os.path.join(workfolder, 'testQ_part2')

    # load the training and test data
    trainQ = pd.read_csv(trainQ_path, sep='\t', index_col='Id', encoding='latin1')
    testQ = pd.read_csv(testQ_path, sep='\t', index_col='Id', encoding='latin1')

    #########################################
    # Extract Features
    #########################################
    token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)

    # get the unique AnswerIds in ascending order
    uniqueAnswerId = list(np.unique(trainQ['AnswerId']))

    # calculate the count matrix of all training questions
    N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId)

    P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)
    P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)

    # select the top N most important tokens per answer class
    featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN)
    token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)

    N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)

    alpha = 0.0001
    P_w = featureWeights(N_wA, alpha)

    beta = 0.0001
    P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)
    P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)

    #########################################
    # Train Naive Bayes Classifier
    #########################################
    NBWeights = np.log(P_wA / P_wNotA)

    #########################################
    # Predict Probabilities on Test
    #########################################
    beta_A = 0
    x_wTest = normalizeTF(testQ, token2IdHash)
    Y_test_prob = softmax(-beta_A + np.dot(x_wTest.T, NBWeights))

    #########################################
    # Evaluate Model Performance
    #########################################
    # We use two evaluation metrics (Average Rank and Top 3 Percentage) to test model performance.
    # Average Rank is the position, on average, at which the correct answer is found
    # among all available answers for a given question.
    # Top 3 Percentage is the percentage of new questions whose correct answers
    # appear among the first 3 ranked choices.

    # sort the similarity scores in descending order and map them to the
    # corresponding AnswerId in the answer set
    testQ = rank(testQ, Y_test_prob, uniqueAnswerId)

    AR = np.floor(testQ['Rank'].mean())
    top3 = round(len(testQ.query('Rank <= 3')) / len(testQ), 3)

    print('Top %d important tokens selected per class.' % topN)
    print('Average rank: ' + str(AR))
    print('Percentage of questions with their answers in the top 3 choices: ' + str(top3))

    #########################################
    # Log Parameters and Performance
    #########################################
    # initialize the logger
    run_logger = get_azureml_logger()

    # log the parameter and performance metrics
    run_logger.log("Top N Tokens Selected", topN)
    run_logger.log("Average Rank", AR)
    run_logger.log("Top 3 Percentage", top3)
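Since the two metrics do all the evaluation work here, a tiny self-contained illustration may help; it computes Average Rank and Top 3 Percentage directly from a toy rank column (the numbers are made up):

import numpy as np
import pandas as pd

# Hypothetical ranks: the position of the correct answer for five test questions
toy = pd.DataFrame({'Rank': [1, 2, 5, 3, 1]})

AR = np.floor(toy['Rank'].mean())                        # mean 2.4 -> 2.0
top3 = round(len(toy.query('Rank <= 3')) / len(toy), 3)  # 4 of 5 -> 0.8

print('Average rank:', AR)
print('Top 3 percentage:', top3)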
fp = cm[0][1]
fn = cm[1][0]

accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy => {}".format(accuracy))

precision = tp / (tp + fp)  # measures exactness
recall = tp / (tp + fn)     # measures completeness
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean of precision and recall
print("F1 Score => {}".format(f1))

# ========================= LOGGING MODEL EVALUATION =========================
# initialize the logger
from azureml.logging import get_azureml_logger
run_logger = get_azureml_logger()

accuracy = classifier.score(X_test, y_test)
run_logger.log("Accuracy", accuracy)
# print("Accuracy is {}".format(accuracy))

# ========================= SAVE MODEL =========================
import pickle
import os

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

# serialize the model ('classifier' is assumed to be the trained
# model from earlier in the script)
with open('./outputs/socialads.pkl', 'wb') as f:
    pickle.dump(classifier, f)
print("Saved the model => socialads.pkl")
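A quick sketch of reading the pickled model back for scoring, assuming the same outputs path; `X_test` and `y_test` reuse the variables from above.

import pickle

with open('./outputs/socialads.pkl', 'rb') as f:
    restored = pickle.load(f)
print(restored.score(X_test, y_test))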
def sessionrun(num_epochs):
    global mnist, serialized_tf_example, prediction_classes, values
    global tensor_info_x, tensor_info_y, sessinfo
    global train_x, train_y

    downloaddata()
    batch_size = 100

    x = tf.placeholder(tf.float32, [None, 784], name='x')
    y = tf.placeholder(tf.float32, [None, 10], name='y')
    phase_train = tf.placeholder(tf.bool, name='phase_train')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    batch_norm = tf.placeholder(tf.bool, name='batch_norm')

    pred_op = inference(x, keep_prob, batch_norm=True, phase_train=phase_train)
    loss_op = sploss(pred_op, y)
    ts_op = train(loss_op)
    eval_op = evaluateModel(pred_op, y)
    values, indices = tf.nn.top_k(pred_op, 10)

    loss_list = []
    acc_list = []
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('outputs/tflogs/train', sess.graph)
        test_writer = tf.summary.FileWriter('outputs/tflogs/test')
        sess.run(tf.global_variables_initializer())
        saver0 = tf.train.Saver()

        for epoch in range(num_epochs):
            avgloss = 0.
            avgacc = 0.
            total_batch = int(mnist.train.num_examples / batch_size)
            for i in range(total_batch):
                mx, my = mnist.train.next_batch(batch_size)
                # nx = 1 - mx  -- this is for training images on a white background
                feed_dict = {
                    x: mx,
                    y: my,
                    phase_train: True,
                    batch_norm: True,
                    keep_prob: 0.4
                }
                _trsumm, _totloss, _trainstep, _predseriescc = sess.run(
                    [merged, loss_op, ts_op, pred_op], feed_dict=feed_dict)
                avgloss += _totloss / total_batch

                # this is for training images on a white background
                # feed_dict = {x: nx, y: my, phase_train: True, batch_norm: True, keep_prob: 0.4}
                # _totloss, _trainstep, _predseriescc = sess.run(
                #     [loss_op, ts_op, pred_op], feed_dict=feed_dict)
                # avgloss += _totloss / total_batch

                loss_list.append(avgloss)
                if i % 10 == 0:
                    train_writer.add_summary(_trsumm, i)

            val_feed_dict = {
                x: mnist.validation.images,
                y: mnist.validation.labels,
                phase_train: False,
                batch_norm: True,
                keep_prob: 1
            }
            _valsumm, _acc = sess.run([merged, eval_op], feed_dict=val_feed_dict)
            avgacc = _acc
            acc_list.append(_acc)
            print("In Epoch ", epoch, " with loss ", avgloss, " and with accuracy ", avgacc)
            train_writer.add_summary(_trsumm, epoch * batch_size)
            test_writer.add_summary(_valsumm, epoch)

        test_feed_dict = {
            x: mnist.test.images,
            y: mnist.test.labels,
            phase_train: False,
            batch_norm: True,
            keep_prob: 1
        }
        _tstsumm, _netacc = sess.run([merged, eval_op], feed_dict=test_feed_dict)
        print("Net accuracy: ", _netacc)

        tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
        tensor_info_y = tf.saved_model.utils.build_tensor_info(pred_op)

        run_logger = get_azureml_logger()
        run_logger.log("Accuracy", _netacc)
        run_logger.log("Number of Epochs", num_epochs)
        run_logger.log("Data Size", mnist.train.num_examples)

        # export model
        export_path_base = 'outputs/mnist'
        print('export_path_base:', export_path_base)
        if os.path.exists(export_path_base):
            print("model path already exists, removing model path files and directory")
            shutil.rmtree(export_path_base)
        os.mkdir(export_path_base)
        saver0.save(sess, 'outputs/mnist/mnistmodel')
        print('Done exporting!')
def main():
    run_logger = get_azureml_logger()

    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='outputs',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=1000,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    model = L.Classifier(train_mnist.MLP(args.unit, 10))
    if args.gpu >= 0:
        # Make the specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Set up an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()

    train_count = len(train)
    test_count = len(test)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    sum_accuracy = 0
    sum_loss = 0

    # Keep training until the requested number of epochs is reached
    metrics = []
    losses = []
    while train_iter.epoch < args.epoch:
        batch = train_iter.next()
        x_array, t_array = convert.concat_examples(batch, args.gpu)
        x = chainer.Variable(x_array)
        t = chainer.Variable(t_array)
        optimizer.update(model, x, t)
        sum_loss += float(model.loss.data) * len(t.data)
        sum_accuracy += float(model.accuracy.data) * len(t.data)

        if train_iter.is_new_epoch:
            print('epoch: ', train_iter.epoch)
            print('train mean loss: {}, accuracy: {}'.format(
                sum_loss / train_count, sum_accuracy / train_count))
            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for batch in test_iter:
                x_array, t_array = convert.concat_examples(batch, args.gpu)
                x = chainer.Variable(x_array)
                t = chainer.Variable(t_array)
                loss = model(x, t)
                sum_loss += float(loss.data) * len(t.data)
                sum_accuracy += float(model.accuracy.data) * len(t.data)
            test_iter.reset()
            print('test mean loss: {}, accuracy: {}'.format(
                sum_loss / test_count, sum_accuracy / test_count))
            metrics.append(float(sum_accuracy / test_count))
            losses.append(float(sum_loss))
            sum_accuracy = 0
            sum_loss = 0

    run_logger.log("Accuracy", metrics)
    run_logger.log("Loss", losses)

    # Save the model and the optimizer
    print('save the model')
    serializers.save_npz('mlp.model', model)
    print('save the optimizer')
    serializers.save_npz('mlp.state', optimizer)
from documentAnalysis import *
import logging
import pandas as pd
import os

from multiprocessing import cpu_count
from step1 import run_step1
from step2 import run_step2
from step3 import run_step3, copyFigures, visualizeTopic, saveModel
from azureml.logging import get_azureml_logger


if __name__ == '__main__':
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.runme', 'true')

    logging.basicConfig(format='%(asctime)s : %(name)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    """
    By default, this script will use the entire Congressional dataset.
    If you just need to try it on a smaller dataset, choose the right
    'DATASET_FILE' setting in the documentAnalysis/configs.py file.
    """

    # Step 1: Data preprocessing
    cleanedDataFrame = run_step1(saveFile=False)

    # Step 2: Phrase learning
    run_step2(cleanedDataFrame=cleanedDataFrame,
              numPhrase=MAX_NUM_PHRASE,
              maxPhrasePerIter=MAX_PHRASE_PER_ITER,
              maxPhraseLength=MAX_PHRASE_LENGTH,
              minInstanceCount=MIN_INSTANCE_COUNT)