def run_step3(topicConfig=[], test_ratio=0.005, saveModel=True, coherence_types=['u_mass', 'c_v', 'c_uci', 'c_npmi']):
    """
    Step 3: LDA topic modeling
    """
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step3', 'true')

    logger = logging.getLogger(__name__)
    logger.info("=========  Run Step 3: LDA topic modeling")

    run_logger = get_azureml_logger()

    df = loadProcessedTextData(numPhrases=MAX_NUM_PHRASE)
    if df is None or len(df) == 0:
        raise ValueError("Failed to load the processed text data")

    docs = list(df['ProcessedText'])
    if test_ratio >= 1.0 or test_ratio < 0.0:
        test_ratio = 0.005

    topicmodeler = TopicModeler(docs, 
            stopWordFile=FUNCTION_WORDS_FILE,
            minWordCount=MIN_WORD_COUNT, 
            minDocCount=MIN_DOC_COUNT, 
            maxDocFreq=MAX_DOC_FREQ, 
            workers=cpu_count()-1, 
            numTopics=NUM_TOPICS, 
            numIterations=NUM_ITERATIONS, 
            passes=NUM_PASSES, 
            chunksize=CHUNK_SIZE, 
            random_state=RANDOM_STATE,
            test_ratio=test_ratio)

    if topicConfig is None or len(topicConfig) == 0:
        logger.info("Only need to learn %d topics" % NUM_TOPICS)
        
        lda = topicmodeler.TrainLDA(saveModel=saveModel)
        coherence = topicmodeler.EvaluateCoherence(lda, coherence_types)
        perplex = topicmodeler.EvaluatePerplexity(lda)
        run_logger.log("Perplexity", perplex['perplexity'])
        run_logger.log("Per Word Bound", perplex['per_word_bound'])
        for ctype in coherence:
            run_logger.log(ctype + " Coherence", coherence[ctype])
        run_logger.log("Topic Number", NUM_TOPICS)

        return lda
    else:
        for i in topicConfig:
            logger.info("Learning %d topics, from list of topic configuration: %s" % (i, str(topicConfig)))

            # IMPORTANT: update the number of topics need to learn
            topicmodeler.numTopics = i
            
            # train an LDA model
            lda = topicmodeler.TrainLDA(saveModel=saveModel)

            topicmodeler.EvaluateCoherence(lda, coherence_types)
            topicmodeler.EvaluatePerplexity(lda)
        topicmodeler.CollectRunLog()
        topicmodeler.PlotRunLog()
def main():
    global mnist, scores, sess, x, graph, btch_sz
    mnist = learn.datasets.mnist.read_data_sets('MNIST_data', one_hot=True)
    btch_sz = 5
    init()
    predict_all_test_data()
    y = graph.get_tensor_by_name("y:0")
    eval_op = evaluateModel(scores, y)
    test_feed_dict = {
        x: mnist.test.images,
        y: mnist.test.labels,
    }
    print(scores.shape)
    _netacc = sess.run(eval_op, feed_dict=test_feed_dict)
    print("Net Accuracy:", _netacc)
    print(scores[0:5, :], " predicted value = ", np.argmax(scores[0:5, :], axis=1),
          " actual value ", np.argmax(mnist.test.labels[0:5, :], axis=1))
    run_logger = get_azureml_logger() 
    run_logger.log("Accuracy",_netacc)
    
    print("Calling prepare schema")
    inputs = {"nparr": SampleDefinition(DataTypes.NUMPY, mnist.test.images[0:btch_sz])}
    outputs = {"probs_and_class_category_df": SampleDefinition(DataTypes.PANDAS, retdf)}
    amlo16n.generate_schema(inputs=inputs,
                            outputs=outputs,
                            filepath="outputs/mnistschema.json",
                            run_func=run
                            )

    amlo16n.generate_main(user_file="mnistscore.py", schema_file="outputs/mnistschema.json",
                          main_file_name="outputs/main.py")
    print("End of prepare schema")
Example #3
def main(pretrained_model_type, retraining_type, config_filename,
         output_model_name, num_epochs):
    ''' Coordinate all activities for Batch AI training '''

    # Log the parameters used for this run
    run_logger = get_azureml_logger()
    run_logger.log('amlrealworld.aerial_image_classification.run_batch_ai',
                   'true')
    run_logger.log('pretrained_model_type', pretrained_model_type)
    run_logger.log('config_filename', config_filename)
    run_logger.log('retraining_type', retraining_type)
    run_logger.log('output_model_name', output_model_name)

    # Load the configuration file and save relevant info
    config = ConfigFile(config_filename)
    write_model_summary_to_blob(config, output_model_name,
                                pretrained_model_type, retraining_type)

    # Create a cluster (if necessary) and wait till it's ready
    get_cluster(config)
    check_for_steady_cluster_status(config)

    # Submit the job and wait until it completes
    job_name = submit_job(config, pretrained_model_type, retraining_type,
                          output_model_name, num_epochs)
    print('Job submitted: checking for job completion')
    check_for_job_completion(config, job_name)
    print('Job complete: retrieving output files')

    # Download the output files and store metrics to Vienna
    retrieve_outputs(config, job_name, output_model_name)
    print('Parsing output logs')
    parse_stdout(run_logger)

    return
Example #4
def getAmlLogger():
    try:
        from azureml.logging import get_azureml_logger
        run_logger = get_azureml_logger()
    except Exception:
        print("Azure ML logger not found.")
        run_logger = []
    return run_logger
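
A minimal caller-side sketch (the 0.95 metric value is a stand-in) showing how the empty-list fallback above lets the same script run unchanged outside Azure ML:

run_logger = getAmlLogger()
if run_logger != []:  # the [] fallback means logging is simply skipped outside Azure ML
    run_logger.log("Accuracy", 0.95)  # placeholder value, not a real result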
Example #5
def run_step3(topicConfig=[], test_ratio=0.005, saveModel=True, coherence_types=['u_mass', 'c_v', 'c_uci', 'c_npmi']):
    """
    Step 3: LDA topic modeling
    """
    logger = logging.getLogger(__name__)
    logger.info("=========  Run Step 3: LDA topic modeling")

    run_logger = get_azureml_logger()

    df = loadProcessedTextData(numPhrases=MAX_NUM_PHRASE)
    if df is None or len(df) == 0:
        raise ValueError("Failed to load the processed text data")

    docs = list(df['ProcessedText'])
    if test_ratio >= 1.0 or test_ratio < 0.0:
        test_ratio = 0.005

    topicmodeler = TopicModeler(docs, 
            stopWordFile=FUNCTION_WORDS_FILE,
            minWordCount=MIN_WORD_COUNT, 
            minDocCount=MIN_DOC_COUNT, 
            maxDocFreq=MAX_DOC_FREQ, 
            workers=cpu_count()-1, 
            numTopics=NUM_TOPICS, 
            numIterations=NUM_ITERATIONS, 
            passes=NUM_PASSES, 
            chunksize=CHUNK_SIZE, 
            random_state=RANDOM_STATE,
            test_ratio=test_ratio)

    if topicConfig is None or len(topicConfig) == 0:
        logger.info("Only need to learn %d topics" % NUM_TOPICS)
        
        lda = topicmodeler.TrainLDA(saveModel=saveModel)
        coherence = topicmodeler.EvaluateCoherence(lda, coherence_types)
        perplex = topicmodeler.EvaluatePerplexity(lda)
        run_logger.log("Perplexity", perplex['perplexity'])
        run_logger.log("Per Word Bound", perplex['per_word_bound'])
        for ctype in coherence:
            run_logger.log(ctype + " Coherence", coherence[ctype])
        run_logger.log("Topic Number", NUM_TOPICS)

        return lda
    else:
        for i in topicConfig:
            logger.info("Learning %d topics, from list of topic configuration: %s" % (i, str(topicConfig)))

            # IMPORTANT: update the number of topics need to learn
            topicmodeler.numTopics = i
            
            # train an LDA model
            lda = topicmodeler.TrainLDA(saveModel=saveModel)

            topicmodeler.EvaluateCoherence(lda, coherence_types)
            topicmodeler.EvaluatePerplexity(lda)
        topicmodeler.CollectRunLog()
        topicmodeler.PlotRunLog()
Example #6
def run_step2(cleanedDataFrame, config=(0, 0, 0), numPhrase=MAX_NUM_PHRASE, maxPhrasePerIter=MAX_PHRASE_PER_ITER,
                maxPhraseLength=MAX_PHRASE_LENGTH, minInstanceCount=MIN_INSTANCE_COUNT):
    """
    Step 2: phrase learning
    """
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step2', 'true')
    
    logger = logging.getLogger(__name__)
    logger.info("=========  Run Step 2: learn phrases from data")

    minPhrase, maxPhrase, step = config

    if minPhrase == 0 and maxPhrase == 0 and step == 0:
        logger.info("Only need to learn %d phrases" % numPhrase)
        # Instantiate a PhraseLearner and run a configuration
        # We need to put this code under '__main__' to run multiprocessing
        phraseLearner = PhraseLearner(cleanedDataFrame, "CleanedText", numPhrase,
                                maxPhrasePerIter, maxPhraseLength, minInstanceCount)

        textData = list(phraseLearner.textFrame['LowercaseText'])
        phraseLearner.RunConfiguration(textData,
                    phraseLearner.learnedPhrases,
                    addSpace=True,
                    writeFile=True,
                    num_workers=cpu_count()-1)

        phraseLearner.textFrame['TextWithPhrases'] = textData
        phraseLearner.MapVocabToSurfaceForms('CleanedText', 'TextWithPhrases', True)
        newDocsFrame = phraseLearner.ReconstituteDocsFromChunks('DocID', 'TextWithPhrases', True)
    else:
        # make sure the inputs are valid and make sense
        minPhrase = max(10, minPhrase)
        maxPhrase = max(10, maxPhrase)
        step = max(1, step)

        # Instantiate a phrase learner with minPhrase set
        phraseLearner = PhraseLearner(cleanedDataFrame, "CleanedText", minPhrase,
                                maxPhrasePerIter, maxPhraseLength, minInstanceCount)
        # Get the lower case text data
        textData = list(phraseLearner.textFrame['LowercaseText'])

        # Incrementally learn phrases
        for i in range(minPhrase, maxPhrase + 1, step):
            logger.info("Learning %d phrases, based on previous leaned %d phrases" % (i, len(phraseLearner.learnedPhrases)))

            # IMPORTANT: update the target number of phrases to learn
            phraseLearner.maxNumPhrases = i
            phraseLearner.RunConfiguration(textData,
                        phraseLearner.learnedPhrases,
                        addSpace=True,
                        writeFile=True,
                        num_workers=cpu_count()-1)

            phraseLearner.textFrame['TextWithPhrases'] = textData
            phraseLearner.MapVocabToSurfaceForms('CleanedText', 'TextWithPhrases', True)
            newDocsFrame = phraseLearner.ReconstituteDocsFromChunks('DocID', 'TextWithPhrases', True)
Example #7
def addmodelcomparison(X_test, y_test, model):

    # initialize the logger
    from azureml.logging import get_azureml_logger
    run_logger = get_azureml_logger() 

    # log accuracy which is a single numerical value
    accuracy = model.score(X_test, y_test)
    run_logger.log("Accuracy", accuracy)
    def __init__(self,
                 textFrame=None,
                 textCol="",
                 maxNumPhrases=25000,
                 maxPhrasesPerIter=500,
                 maxPhraseLength=7,
                 minInstanceCount=5):
        logger = logging.getLogger(__name__)

        # initialize the run logger
        self.run_logger = get_azureml_logger()
        self.run_logger.log(
            'amlrealworld.document-collection-analysis.phraseLearning', 'true')

        self.textFrame = textFrame
        self.textCol = textCol

        # Load the black list of words
        # This is a precreated hash table containing the list
        # of black list words to be ignored during phrase learning
        self.black_list = get_shared_file_path(BLACK_LIST_FILE)
        self.blacklistHash = LoadListAsHash(self.black_list)

        # Load the function words
        # This is a precreated hash table containing the list
        # of function words used during phrase learning
        self.function_words = get_shared_file_path(FUNCTION_WORDS_FILE)
        self.functionwordHash = LoadListAsHash(self.function_words)

        # Maximum number of phrases to learn
        # If you want to test the code out quickly then set this to a small
        # value (e.g. 100) and set verbose to true when running the quick test
        self.maxNumPhrases = maxNumPhrases

        # Maximum number of phrases to learn per iteration
        # Increasing this number may speed up processing but will affect the ordering of the phrases
        # learned and good phrases could be by-passed if the maxNumPhrases is set to a small number
        self.maxPhrasesPerIter = maxPhrasesPerIter

        # Maximum number of words allowed in the learned phrases
        self.maxPhraseLength = maxPhraseLength

        # Minimum number of times a phrase must occur in the data to
        # be considered during the phrase learning process
        self.minInstanceCount = minInstanceCount

        # The learned phrases
        self.learnedPhrases = []

        # Lower case the raw text column and save in a new column
        if self.textFrame is not None and self.textCol != '':
            self.LowerText(self.textFrame, self.textCol)
        else:
            logger.error(
                "Instance created with a null text DataFrame; please call self.LowerText() to convert the text to lowercase."
            )
    def __init__(self, textData=None, stopWordFile='', minWordCount=5, minDocCount=2, 
                    maxDocFreq=0.25, workers=1, numTopics=50, numIterations=100, passes=1, 
                    chunksize=2000, random_state=None, test_ratio=0.005):
        logger = logging.getLogger(__name__)

        # initialize the run logger
        self.run_logger = get_azureml_logger()
        self.run_logger.log('amlrealworld.document-collection-analysis.topicModeling', 'true')

        if not textData or not isinstance(textData, list):
            raise ValueError("Text data should be non-empty and in the format of list.")

        # The minimum word count in all documents
        self.minWordCount = minWordCount
        # The minimum count of documents that contain a specific word
        self.minDocCount = minDocCount
        # The maximum document frequency that contain a specific word
        self.maxDocFreq = maxDocFreq

        if workers > cpu_count() or workers <= 0:
            logger.warning("Worker number %d is greater than number of cores: %d, reduced it to the number of cores" % (workers, cpu_count()))
            self.workers = cpu_count()
        else:
            self.workers = workers

        self.numTopics = numTopics
        self.numIterations = numIterations
        self.passes = passes
        self.chunksize = chunksize
        self.random_state = random_state
        self.test_ratio = test_ratio

        if not stopWordFile:
            raise ValueError("Need to provide the file name of the stop word list")
        
        stopWordPath = get_shared_file_path(stopWordFile)
        if not os.path.exists(stopWordPath):
            download_file_from_blob(stopWordFile)
        
        self.stopWordHash = LoadListAsHash(stopWordPath)
        self.vocabHash = self.CreateVocabForTopicModeling(textData, self.stopWordHash)
        self.tokenizedDocs = self.TokenizeText(textData)
        self.id2token = None
        self.token2id = None
        self.BuildDictionary(self.tokenizedDocs)
        self.corpus = self.BuildCorpus(self.tokenizedDocs)

        # global variable for run log
        self.topics_list = []
        self.u_mass_list = []
        self.c_v_list = []
        self.c_uci_list = []
        self.c_npmi_list = []
        self.perplexity_list = []
        self.word_bound_list = []
    def __init__(self, textFrame=None, textCol="", maxNumPhrases=25000, 
                    maxPhrasesPerIter=500, maxPhraseLength=7, minInstanceCount=5):
        logger = logging.getLogger(__name__)

        # initialize the run logger
        self.run_logger = get_azureml_logger()
        self.run_logger.log('amlrealworld.document-collection-analysis.phraseLearning', 'true')

        self.textFrame = textFrame
        self.textCol = textCol

        # Load the black list of words
        # This is a precreated hash table containing the list 
        # of black list words to be ignored during phrase learning
        self.black_list = get_shared_file_path(BLACK_LIST_FILE)
        self.blacklistHash = LoadListAsHash(self.black_list)

        # Load the function words
        # This is a precreated hash table containing the list 
        # of function words used during phrase learning
        self.function_words = get_shared_file_path(FUNCTION_WORDS_FILE)
        self.functionwordHash = LoadListAsHash(self.function_words)

        # Maximum number of phrases to learn
        # If you want to test the code out quickly then set this to a small
        # value (e.g. 100) and set verbose to true when running the quick test
        self.maxNumPhrases = maxNumPhrases

        # Maximum number of phrases to learn per iteration 
        # Increasing this number may speed up processing but will affect the ordering of the phrases 
        # learned and good phrases could be by-passed if the maxNumPhrases is set to a small number
        self.maxPhrasesPerIter = maxPhrasesPerIter

        # Maximum number of words allowed in the learned phrases 
        self.maxPhraseLength = maxPhraseLength

        # Minimum number of times a phrase must occur in the data to 
        # be considered during the phrase learning process
        self.minInstanceCount = minInstanceCount

        # The learned phrases
        self.learnedPhrases = []

        # Lower case the raw text column and save in a new column
        if self.textFrame is not None and self.textCol != '':
            self.LowerText(self.textFrame, self.textCol)
        else:
            logger.error("Create an instance with Null text DataFrame, please call self.LowerText() to convert text to lowercase.")
def main(pretrained_model_type, mmlspark_model_type, config_filename,
		 output_model_name, sample_frac):
	# Load the configuration file
	config = ConfigFile(config_filename, pretrained_model_type,
		mmlspark_model_type, output_model_name)
	write_model_summary_to_blob(config, mmlspark_model_type)

	# Log the parameters of the run
	run_logger = get_azureml_logger()
	run_logger.log('amlrealworld.aerial_image_classification.run_mmlspark','true')
	run_logger.log('pretrained_model_type', pretrained_model_type)
	run_logger.log('mmlspark_model_type', mmlspark_model_type)
	run_logger.log('config_filename', config_filename)
	run_logger.log('output_model_name', output_model_name)
	run_logger.log('sample_frac', sample_frac)

	# Train and save the MMLSpark model
	train_df = load_data(config.train_uri, config, sample_frac)
	mmlspark_model = mmlspark.TrainClassifier(
		model=config.mmlspark_model_type, labelCol='label').fit(train_df)
	mmlspark_model.write().overwrite().save(config.output_uri)

	# Apply the MMLSpark model to the test set and save the accuracy metric
	test_df = load_data(config.test_uri, config, sample_frac)
	predictions = mmlspark_model.transform(test_df)
	metrics = mmlspark.ComputeModelStatistics(evaluationMetric='accuracy') \
		.transform(predictions)
	metrics.show()
	run_logger.log('accuracy_on_test_set', metrics.first()['accuracy'])
	
	# Save the predictions
	tf = mmlspark.IndexToValue().setInputCol('scored_labels') \
		.setOutputCol('pred_label')
	predictions = tf.transform(predictions).select(
		'filepath', 'label', 'pred_label')
	output_str = predictions.toPandas().to_csv(index=False)
	blob_service = BlockBlobService(config.storage_account_name,
									config.storage_account_key)
	blob_service.create_container(config.container_prediction_results)
	blob_service.create_blob_from_text(
			config.container_prediction_results,
			config.predictions_filename,
			output_str)

	return
def record_results(eval_path):
    print("Starting logging results, using eval dir {0}".format(eval_path))
    ea = event_accumulator.EventAccumulator(
        eval_path,
        size_guidance={  # how many events of each type to retain (0 = keep all)
            event_accumulator.COMPRESSED_HISTOGRAMS: 500,
            event_accumulator.IMAGES: 30,
            event_accumulator.AUDIO: 4,
            event_accumulator.SCALARS: 0,
            event_accumulator.HISTOGRAMS: 1
        })
    ea.Reload()

    # Plot mAP vs Iterations
    df = pd.DataFrame(ea.Scalars('Precision/mAP@0.5IOU'))
    max_vals = df.loc[df["value"].idxmax()]  # max value of mAP

    fig = plt.figure(figsize=(6, 5), dpi=75)
    plt.plot(df["step"], df["value"])
    plt.plot(max_vals["step"], max_vals["value"], "g+", mew=2, ms=10)
    plt.title("Precision")
    plt.ylabel("mAP")
    plt.xlabel("interations")
    fig.savefig("./outputs/mAP.png", bbox_inches='tight')

    # Plot detection results for the model with max mAP
    IMAGE_ID = "image-1"  # Use blurry image as a baseline across runs
    IMG_OUTPUT = "./outputs/kittiwake.png"
    match = [x for x in ea.Images(IMAGE_ID) if x.step == max_vals["step"]]
    if (len(match) > 0):
        img_encoded = match[0].encoded_image_string
        img_file = BytesIO(img_encoded)
        img = Image.open(img_file)
        print(img.size)
        img.save(IMG_OUTPUT, "PNG")
    else:
        message = "Did not find images summary for step {0} with max mAP {1}. Need increase event_accumulator.IMAGES?"
        print(message.format(max_vals["step"], max_vals["value"]))

    run_logger = get_azureml_logger()
    run_logger.log("max_mAP", max_vals["value"])
    run_logger.log("max_mAP_interation#", max_vals["step"])
    print("Done logging resuts")
Example #13
def run_step1(saveFile=True):
    """
    Step 1: data preprocessing
    """
    aml_logger = get_azureml_logger(
    )  # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.step1', 'true')

    logger = logging.getLogger(__name__)

    fpath = get_shared_file_path(CLEANED_DATA_FILE_NAME)
    logger.info("=========  Run Step 1: preprocessing text data")

    # Read raw data into a Pandas DataFrame
    textDF = getData()

    # Write frame with preprocessed text out to TSV file
    cleanedDataFrame = CleanAndSplitText(textDF,
                                         idColumnName='ID',
                                         textColumnName='Text',
                                         saveDF=saveFile)

    return cleanedDataFrame
Example #14
import createfeautures as cf
import numpy as np
import pandas as pd
import pyspark
import os
import urllib
import sys
from pyspark.sql.functions import *
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from azureml.logging import get_azureml_logger

# initialize logger
run_logger = get_azureml_logger()

from azureml.dataprep import datasource

# start Spark session
spark = pyspark.sql.SparkSession.builder.appName(
    'classification').getOrCreate()
# print runtime versions
print('****************')
print('Python version: {}'.format(sys.version))
print('Spark version: {}'.format(spark.version))
print('****************')
print('***Prepare Input Data to get required attributes***')
inputdata = datasource.load_datasource('POLines.dsource')
data = inputdata.dropna(subset=['Category'])

print('***Filtering Training + Testing + Validation records***')
    def __init__(self,
                 textData=None,
                 stopWordFile='',
                 minWordCount=5,
                 minDocCount=2,
                 maxDocFreq=0.25,
                 workers=1,
                 numTopics=50,
                 numIterations=100,
                 passes=1,
                 chunksize=2000,
                 random_state=None,
                 test_ratio=0.005):
        logger = logging.getLogger(__name__)

        # initialize the run logger
        self.run_logger = get_azureml_logger()

        if not textData or not isinstance(textData, list):
            raise ValueError("Text data should be a non-empty list.")

        # The minimum word count in all documents
        self.minWordCount = minWordCount
        # The minimum count of documents that contain a specific word
        self.minDocCount = minDocCount
        # The maximum document frequency that contain a specific word
        self.maxDocFreq = maxDocFreq

        if workers > cpu_count() or workers <= 0:
            logger.warning(
                "Worker count %d is out of range (available cores: %d); using all cores instead"
                % (workers, cpu_count()))
            self.workers = cpu_count()
        else:
            self.workers = workers

        self.numTopics = numTopics
        self.numIterations = numIterations
        self.passes = passes
        self.chunksize = chunksize
        self.random_state = random_state
        self.test_ratio = test_ratio

        if not stopWordFile:
            raise ValueError(
                "Need to provide the file name of the stop word list")

        stopWordPath = get_shared_file_path(stopWordFile)
        if not os.path.exists(stopWordPath):
            download_file_from_blob(stopWordFile)

        self.stopWordHash = LoadListAsHash(stopWordPath)
        self.vocabHash = self.CreateVocabForTopicModeling(
            textData, self.stopWordHash)
        self.tokenizedDocs = self.TokenizeText(textData)
        self.id2token = None
        self.token2id = None
        self.BuildDictionary(self.tokenizedDocs)
        self.corpus = self.BuildCorpus(self.tokenizedDocs)

        # global variable for run log
        self.topics_list = []
        self.u_mass_list = []
        self.c_v_list = []
        self.c_uci_list = []
        self.c_npmi_list = []
        self.perplexity_list = []
        self.word_bound_list = []
def CleanAndSplitText(textDataFrame, idColumnName='ID', textColumnName='Text', saveDF=False):
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.preprocessText', 'true')

    logger = logging.getLogger(__name__)

    # Need to download the 'punkt' model for breaking text
    # strings into individual sentences
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logger.debug("Need to download the 'punkt' model from NLTK")
        nltk.download('punkt')
        logger.debug("Downloading 'punkt' model done.")
    
    logger.info("Clean and split the raw text into sentences")

    textDataOut = [] 
    # This regular expression is for section headers in the bill summaries that we wish to ignore
    reHeaders = re.compile(r" *TABLE OF CONTENTS:? *"
                           "| *Title [IVXLC]+:? *"
                           "| *Subtitle [A-Z]+:? *"
                           "| *\(Sec\. \d+\) *")

    # This regular expression is for punctuation that we wish to clean out
    # We also will split sentences into smaller phrase like units using this expression
    rePhraseBreaks = re.compile(r"[\"\!\?\)\]\}\,\:\;\*\-]*\s+\([0-9]+\)\s+[\(\[\{\"\*\-]*"
                                r"|[\"\!\?\)\]\}\,\:\;\*\-]+\s+[\(\[\{\"\*\-]*"
                                r"|\.\.+"
                                r"|\s*\-\-+\s*"
                                r"|\s+\-\s+"
                                r"|\:\:+"
                                r"|\s+[\/\(\[\{\"\-\*]+\s*"
                                r"|[\,!\?\"\)\(\]\[\}\{\:\;\*](?=[a-zA-Z])"
                                r"|[\"\!\?\)\]\}\,\:\;]+[\.]*$"
                                )
    
    # Regex for underbars
    regexUnderbar = re.compile('_')
    
    # Regex for space
    regexSpace = re.compile(' +')
 
    # Regex for sentence final period
    regexPeriod = re.compile(r"\.$")

    # Iterate through each document and do:
    #    (1) Split documents into sections based on section headers and remove section headers
    #    (2) Split the sections into sentences using NLTK sentence tokenizer
    #    (3) Further split sentences into phrasal units based on punctuation and remove punctuation
    #    (4) Remove sentence-final periods when not part of an abbreviation

    for i in range(len(textDataFrame)):
        # Extract one document from frame
        docID = textDataFrame[idColumnName][i]
        docText = textDataFrame[textColumnName][i] 

        # Set counter for output line count for this document
        lineIndex = 0

        # Split document into sections by finding sections headers and splitting on them 
        sections = reHeaders.split(str(docText))
        
        for section in sections:
            # Split section into sentence using NLTK tokenizer 
            sentences = tokenize.sent_tokenize(section)
            
            for sentence in sentences:
                # Split each sentence into phrase level chunks based on punctuation
                textSegs = rePhraseBreaks.split(sentence)
                numSegs = len(textSegs)
                
                for j in range(0, numSegs):
                    if len(textSegs[j]) > 0:
                        # Convert underbars to spaces 
                        # Underbars are reserved for building the compound word phrases                   
                        textSegs[j] = regexUnderbar.sub(" ", textSegs[j])
                    
                        # Split out the words so we can specially handle the last word
                        words = regexSpace.split(textSegs[j])
                        phraseOut = ""
                        # If the last word ends in a period then remove the period
                        words[-1] = regexPeriod.sub("", words[-1])
                        # If the last word is an abbreviation like "U.S."
                        # then add the word-final period back on
                        # (a "." remaining after stripping the final period marks an abbreviation)
                        if "." in words[-1]:
                            words[-1] += "."
                        phraseOut = " ".join(words)  

                        textDataOut.append([docID, lineIndex, phraseOut])
                        lineIndex += 1
    # Convert to Pandas DataFrame 
    frameOut = pd.DataFrame(textDataOut, columns=['DocID', 'DocLine', 'CleanedText'])
    logger.debug("Returned clean DataFrame shape: %d, %d" % (frameOut.shape[0], frameOut.shape[1]))

    if saveDF:
        logger.info("Saving the cleaned DataFrame in file: %s" % CLEANED_DATA_FILE_NAME)
        cleanedDataFile = get_shared_file_path(CLEANED_DATA_FILE_NAME)
        frameOut.to_csv(cleanedDataFile, sep='\t', index=False)
    else:
        logger.info("The cleaned and sentenced text data is not being saved.")

    return frameOut
Example #17
def main():

    #########################################
    # Accept One Argument as Input
    #########################################

    try:
        topN = int(sys.argv[1])
    except IndexError:
        print(
            "This script takes one argument. Please enter a valid non-negative integer number.\n"
        )
        raise

    #########################################
    # Access trainQ and testQ from Part 2
    #########################################

    workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')

    # paths to trainQ and testQ.
    trainQ_path = os.path.join(workfolder, 'trainQ_part2')
    testQ_path = os.path.join(workfolder, 'testQ_part2')

    # load the training and test data.
    trainQ = pd.read_csv(trainQ_path,
                         sep='\t',
                         index_col='Id',
                         encoding='latin1')
    testQ = pd.read_csv(testQ_path,
                        sep='\t',
                        index_col='Id',
                        encoding='latin1')

    #########################################
    # Extract Features
    #########################################

    token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)

    # get unique answerId in ascending order
    uniqueAnswerId = list(np.unique(trainQ['AnswerId']))

    # calculate the count matrix of all training questions.
    N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId',
                           uniqueAnswerId)

    P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)
    P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)

    # select top N important tokens per answer class.
    featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN)
    token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)

    N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)

    alpha = 0.0001
    P_w = featureWeights(N_wA, alpha)

    beta = 0.0001
    P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)
    P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)

    #########################################
    # Train Naive Bayes Classifier
    #########################################

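    # Log-odds feature weights: a token much more likely to occur inside an
    # answer class than outside it receives a large positive weight.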
    NBWeights = np.log(P_wA / P_wNotA)

    #########################################
    # Predict Probabilities on Test
    #########################################

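    # Score each test question: the softmax of x_wTest.T dot NBWeights yields a
    # probability distribution over answer classes; beta_A acts as a bias (zero here).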
    beta_A = 0
    x_wTest = normalizeTF(testQ, token2IdHash)
    Y_test_prob = softmax(-beta_A + np.dot(x_wTest.T, NBWeights))

    #########################################
    # Evaluate Model Performance
    #########################################
    # We use two evaluation metrics (Average Rank and Top 3 Percentage) to test our model performance.
    # The Average Rank is the average position at which the correct answer is found among all available answers for a given question.
    # The Top 3 Percentage is the percentage of new questions whose correct answer is found within the first 3 choices.
    # sort the similarity scores in descending order and map them to the corresponding AnswerId in Answer set

    testQ = rank(testQ, Y_test_prob, uniqueAnswerId)

    AR = np.floor(testQ['Rank'].mean())
    top3 = round(len(testQ.query('Rank <= 3')) / len(testQ), 3)

    print('Top %d important tokens selected per Class.' % topN)
    print('Average of rank: ' + str(AR))
    print('Percentage of questions whose correct answers appear in the first 3 choices: ' +
          str(top3))

    #########################################
    # Log Parameters and Performance
    #########################################

    # initialize the logger
    run_logger = get_azureml_logger()

    # log performance.
    run_logger.log("Top N Tokens Selected", topN)
    run_logger.log("Average Rank", AR)
    run_logger.log("Top 3 Percentage", top3)
Example #18
fp = cm[0][1]
fn = cm[1][0]

accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy => {}".format(accuracy))
precision = tp / (tp + fp)    # measuring exactness
recall = tp / (tp + fn)    # measuring completeness
f1 = 2 * precision * recall / (precision + recall)   # compromise between precision and recall
print("F1 Score => {}".format(f1))

#========================= LOGGING MODEL EVALUATION =========================

# initialize the logger
from azureml.logging import get_azureml_logger
run_logger = get_azureml_logger() 

accuracy = classifier.score(X_test, y_test)
run_logger.log("Accuracy", accuracy)
#print ("Accuracy is {}".format(accuracy))


#========================= SAVE MODEL =========================
import pickle
import sys, os

# create the outputs folder
os.makedirs('./outputs', exist_ok=True)

# serialize the model to the outputs folder
with open('./outputs/socialads.pkl', 'wb') as f:
    pickle.dump(classifier, f)
print("Saved the model => socialads.pkl")
def sessionrun(num_epochs):
    global mnist, serialized_tf_example, prediction_classes, values
    global tensor_info_x, tensor_info_y, sessinfo
    global train_x, train_y
    downloaddata()

    batch_size = 100

    x = tf.placeholder(tf.float32, [None, 784], name='x')
    y = tf.placeholder(tf.float32, [None, 10], name='y')
    phase_train = tf.placeholder(tf.bool, name='phase_train')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    batch_norm = tf.placeholder(tf.bool, name='batch_norm')
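    # phase_train is fed True during training and False for validation/test (see
    # the feed dicts below); it switches batch normalization between per-batch
    # statistics and accumulated moving averages.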

    pred_op = inference(x, keep_prob, batch_norm=True, phase_train=phase_train)
    loss_op = sploss(pred_op, y)
    ts_op = train(loss_op)
    eval_op = evaluateModel(pred_op, y)
    values, indices = tf.nn.top_k(pred_op, 10)

    loss_list = []
    acc_list = []
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('outputs/tflogs/train',
                                             sess.graph)
        test_writer = tf.summary.FileWriter('outputs/tflogs/test')

        sess.run(tf.global_variables_initializer())
        saver0 = tf.train.Saver()
        for epoch in range(num_epochs):
            avgloss = 0.
            avgacc = 0.

            total_batch = int(mnist.train.num_examples / batch_size)
            for i in range(total_batch):
                mx, my = mnist.train.next_batch(batch_size)
                #nx = 1-mx - this is for training images on a white background

                feed_dict = {
                    x: mx,
                    y: my,
                    phase_train: True,
                    batch_norm: True,
                    keep_prob: 0.4
                }
                _trsumm, _totloss, _trainstep, _predseriescc = sess.run(
                    [merged, loss_op, ts_op, pred_op], feed_dict=feed_dict)
                avgloss += _totloss / total_batch
                #this is for training images on a white background
                #feed_dict = {x: nx, y: my, phase_train: True, batch_norm: True, keep_prob: 0.4}
                #_totloss, _trainstep, _predseriescc = sess.run(
                #    [loss_op, ts_op, pred_op],
                #    feed_dict=feed_dict)
                #avgloss += _totloss / total_batch
                loss_list.append(avgloss)
                if (i % 10 == 0):
                    train_writer.add_summary(_trsumm, i)
            val_feed_dict = {
                x: mnist.validation.images,
                y: mnist.validation.labels,
                phase_train: False,
                batch_norm: True,
                keep_prob: 1
            }
            _valsumm, _acc = sess.run([merged, eval_op],
                                      feed_dict=val_feed_dict)
            avgacc = _acc
            acc_list.append(_acc)
            print("In Epoch ", epoch, " with loss ", avgloss,
                  " and with accuracy ", avgacc)
            train_writer.add_summary(_trsumm, epoch * batch_size)
            test_writer.add_summary(_valsumm, epoch)

        test_feed_dict = {
            x: mnist.test.images,
            y: mnist.test.labels,
            phase_train: False,
            batch_norm: True,
            keep_prob: 1
        }

        _tstsumm, _netacc = sess.run([merged, eval_op],
                                     feed_dict=test_feed_dict)
        print("Net accuracy: ", _netacc)
        tensor_info_x = tf.saved_model.utils.build_tensor_info(x)
        tensor_info_y = tf.saved_model.utils.build_tensor_info(pred_op)
        run_logger = get_azureml_logger()
        run_logger.log("Accuracy", _netacc)
        run_logger.log("Number of Epochs", num_epochs)
        run_logger.log("Data Size", mnist.train.num_examples)

        # export model
        export_path_base = 'outputs/mnist'
        print('export_path_base:', export_path_base)
        if os.path.exists(export_path_base):
            print(
                "model path already exists; removing model files and directory"
            )
            shutil.rmtree(export_path_base)
        os.mkdir(export_path_base)
        saver0.save(sess, 'outputs/mnist/mnistmodel')
        print('Done exporting!')
def main():
    run_logger = get_azureml_logger()

    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize',
                        '-b',
                        type=int,
                        default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out',
                        '-o',
                        default='outputs',
                        help='Directory to output the result')
    parser.add_argument('--resume',
                        '-r',
                        default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit',
                        '-u',
                        type=int,
                        default=1000,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # Set up a neural network to train
    model = L.Classifier(train_mnist.MLP(args.unit, 10))
    if args.gpu >= 0:
        # Make the specified GPU current
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Load the MNIST dataset
    train, test = chainer.datasets.get_mnist()

    train_count = len(train)
    test_count = len(test)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    sum_accuracy = 0
    sum_loss = 0
    # Keep training until the requested number of epochs is reached
    metrics = []
    losses = []

    while train_iter.epoch < args.epoch:
        batch = train_iter.next()
        x_array, t_array = convert.concat_examples(batch, args.gpu)
        x = chainer.Variable(x_array)
        t = chainer.Variable(t_array)
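        # optimizer.update(model, x, t) runs the forward pass, computes the
        # classifier loss, backpropagates, and applies one Adam step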
        optimizer.update(model, x, t)
        sum_loss += float(model.loss.data) * len(t.data)
        sum_accuracy += float(model.accuracy.data) * len(t.data)

        if train_iter.is_new_epoch:
            print('epoch: ', train_iter.epoch)
            print('train mean loss: {}, accuracy: {}'.format(
                sum_loss / train_count, sum_accuracy / train_count))
            # evaluation
            sum_accuracy = 0
            sum_loss = 0
            for batch in test_iter:
                x_array, t_array = convert.concat_examples(batch, args.gpu)
                x = chainer.Variable(x_array)
                t = chainer.Variable(t_array)
                loss = model(x, t)
                sum_loss += float(loss.data) * len(t.data)
                sum_accuracy += float(model.accuracy.data) * len(t.data)

            test_iter.reset()
            print('test mean  loss: {}, accuracy: {}'.format(
                sum_loss / test_count, sum_accuracy / test_count))

            metrics.append(float(sum_accuracy / test_count))
            losses.append(float(sum_loss))
            sum_accuracy = 0
            sum_loss = 0

    run_logger.log("Accuracy", metrics)
    run_logger.log("Loss", losses)

    # Save the model and the optimizer
    print('save the model')
    serializers.save_npz('mlp.model', model)
    print('save the optimizer')
    serializers.save_npz('mlp.state', optimizer)
def CleanAndSplitText(textDataFrame,
                      idColumnName='ID',
                      textColumnName='Text',
                      saveDF=False):
    aml_logger = get_azureml_logger(
    )  # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.preprocessText',
                   'true')

    logger = logging.getLogger(__name__)

    # Need to download the 'punkt' model for breaking text
    # strings into individual sentences
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        logger.debug("Need to download the 'punkt' model from NLTK")
        nltk.download('punkt')
        logger.debug("Downloading 'punkt' model done.")

    logger.info("Clean and split the raw text into sentences")

    textDataOut = []
    # This regular expression is for section headers in the bill summaries that we wish to ignore
    reHeaders = re.compile(r" *TABLE OF CONTENTS:? *"
                           "| *Title [IVXLC]+:? *"
                           "| *Subtitle [A-Z]+:? *"
                           "| *\(Sec\. \d+\) *")

    # This regular expression is for punctuation that we wish to clean out
    # We also will split sentences into smaller phrase like units using this expression
    rePhraseBreaks = re.compile(
        r"[\"\!\?\)\]\}\,\:\;\*\-]*\s+\([0-9]+\)\s+[\(\[\{\"\*\-]*"
        r"|[\"\!\?\)\]\}\,\:\;\*\-]+\s+[\(\[\{\"\*\-]*"
        r"|\.\.+"
        r"|\s*\-\-+\s*"
        r"|\s+\-\s+"
        r"|\:\:+"
        r"|\s+[\/\(\[\{\"\-\*]+\s*"
        r"|[\,!\?\"\)\(\]\[\}\{\:\;\*](?=[a-zA-Z])"
        r"|[\"\!\?\)\]\}\,\:\;]+[\.]*$")

    # Regex for underbars
    regexUnderbar = re.compile('_')

    # Regex for space
    regexSpace = re.compile(' +')

    # Regex for sentence final period
    regexPeriod = re.compile(r"\.$")

    # Iterate through each document and do:
    #    (1) Split documents into sections based on section headers and remove section headers
    #    (2) Split the sections into sentences using NLTK sentence tokenizer
    #    (3) Further split sentences into phrasal units based on punctuation and remove punctuation
    #    (4) Remove sentence-final periods when not part of an abbreviation

    for i in range(len(textDataFrame)):
        # Extract one document from frame
        docID = textDataFrame[idColumnName][i]
        docText = textDataFrame[textColumnName][i]

        # Set counter for output line count for this document
        lineIndex = 0

        # Split document into sections by finding sections headers and splitting on them
        sections = reHeaders.split(str(docText))

        for section in sections:
            # Split section into sentence using NLTK tokenizer
            sentences = tokenize.sent_tokenize(section)

            for sentence in sentences:
                # Split each sentence into phrase level chunks based on punctuation
                textSegs = rePhraseBreaks.split(sentence)
                numSegs = len(textSegs)

                for j in range(0, numSegs):
                    if len(textSegs[j]) > 0:
                        # Convert underbars to spaces
                        # Underbars are reserved for building the compound word phrases
                        textSegs[j] = regexUnderbar.sub(" ", textSegs[j])

                        # Split out the words so we can specially handle the last word
                        words = regexSpace.split(textSegs[j])
                        phraseOut = ""
                        # If the last word ends in a period then remove the period
                        words[-1] = regexPeriod.sub("", words[-1])
                        # If the last word is an abbreviation like "U.S."
                        # then add the word final perios back on
                        if "\." in words[-1]:
                            words[-1] += "."
                        phraseOut = " ".join(words)

                        textDataOut.append([docID, lineIndex, phraseOut])
                        lineIndex += 1
    # Convert to Pandas DataFrame
    frameOut = pd.DataFrame(textDataOut,
                            columns=['DocID', 'DocLine', 'CleanedText'])
    logger.debug("Returned clean DataFrame shape: %d, %d" %
                 (frameOut.shape[0], frameOut.shape[1]))

    if saveDF:
        logger.info("Saving the cleaned DataFrame in file: %s" %
                    CLEANED_DATA_FILE_NAME)
        cleanedDataFile = get_shared_file_path(CLEANED_DATA_FILE_NAME)
        frameOut.to_csv(cleanedDataFile, sep='\t', index=False)
    else:
        logger.info("The cleaned and sentenced text data is not being saved.")

    return frameOut
from documentAnalysis import *
import logging
import pandas as pd
import os

from multiprocessing import cpu_count
from step1 import run_step1
from step2 import run_step2
from step3 import run_step3, copyFigures, visualizeTopic, saveModel
from azureml.logging import get_azureml_logger



if __name__ == '__main__':
    aml_logger = get_azureml_logger()   # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.runme', 'true')

    logging.basicConfig(format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', level=logging.INFO)

    """
    By default, this script will use the entire Congressional dataset.
    If you just need to try it on a smaller dataset, choose the right 'DATASET_FILE'
    setting in documentAnalysis/configs.py file.
    """
    # Step 1: Data preprocessing
    cleanedDataFrame = run_step1(saveFile=False)

    # Step 2: Phrase learning
    run_step2(cleanedDataFrame=cleanedDataFrame, numPhrase=MAX_NUM_PHRASE, maxPhrasePerIter=MAX_PHRASE_PER_ITER,
               maxPhraseLength=MAX_PHRASE_LENGTH, minInstanceCount=MIN_INSTANCE_COUNT)
Example #23
from documentAnalysis import *
import logging
import pandas as pd
import os

from multiprocessing import cpu_count
from step1 import run_step1
from step2 import run_step2
from step3 import run_step3, copyFigures, visualizeTopic, saveModel
from azureml.logging import get_azureml_logger

if __name__ == '__main__':
    aml_logger = get_azureml_logger(
    )  # logger writes to AMLWorkbench runtime view
    aml_logger.log('amlrealworld.document-collection-analysis.runme', 'true')

    logging.basicConfig(
        format='%(asctime)s : %(name)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    """
    By default, this script will use the entire Congressional dataset.
    If you just need to try it on a smaller dataset, choose the right 'DATASET_FILE'
    setting in documentAnalysis/configs.py file.
    """
    # Step 1: Data preprocessing
    cleanedDataFrame = run_step1(saveFile=False)

    # Step 2: Phrase learning
    run_step2(cleanedDataFrame=cleanedDataFrame,
              numPhrase=MAX_NUM_PHRASE,
              maxPhrasePerIter=MAX_PHRASE_PER_ITER,
              maxPhraseLength=MAX_PHRASE_LENGTH,
              minInstanceCount=MIN_INSTANCE_COUNT)