class TraceabilityRunner:
    """Wires up the preprocessing pipelines and tokenizer for a traceability dataset.

    English and Italian variants of the requirement/code pipelines are built
    once as class attributes and selected per-dataset in ``__init__``.
    """

    # Individual preprocessing steps, shared across all pipelines.
    CAMEL = CamelCaseSplitter()
    LOWER = LowerCaseTransformer()
    LETTER = NonLetterFilter()
    URL = UrlRemover()
    SEP = Separator()
    JAVASTOP = JavaCodeStopWordRemover()
    JAVASTOP_IT = JavaCodeStopWordRemover(True)
    STOP_IT = StopWordRemover(True)
    STOP = StopWordRemover()
    LEMMA_IT = Lemmatizer(Lemmatizer.LemmatizerType.italian_spacy)
    LEMMA = Lemmatizer(Lemmatizer.LemmatizerType.english_spacy)
    W_LENGTH = WordLengthFilter(2)

    # Pipelines: step order matters (e.g. camel-case split before lowercasing).
    REQ_PREPROCESSOR = Preprocessor(
        [URL, SEP, LETTER, CAMEL, LOWER, LEMMA, STOP, W_LENGTH])
    CODE_PREPROCESSOR = Preprocessor(
        [URL, SEP, LETTER, CAMEL, JAVASTOP, LOWER, LEMMA, STOP, W_LENGTH])
    REQ_PREPROCESSOR_IT = Preprocessor(
        [URL, SEP, LETTER, CAMEL, LOWER, LEMMA_IT, STOP_IT, W_LENGTH])
    CODE_PREPROCESSOR_IT = Preprocessor(
        [URL, SEP, LETTER, CAMEL, JAVASTOP_IT, LOWER, LEMMA_IT, STOP_IT,
         W_LENGTH])

    def __init__(self, dataset):
        """Select language-appropriate pipelines and build the code tokenizer.

        :param dataset: project dataset object; must expose ``is_english()``
        """
        self.dataset = dataset
        english = dataset.is_english()
        if english:
            self.req_preprocessor = self.REQ_PREPROCESSOR
            self.code_preprocessor = self.CODE_PREPROCESSOR
        else:
            self.req_preprocessor = self.REQ_PREPROCESSOR_IT
            self.code_preprocessor = self.CODE_PREPROCESSOR_IT
        # The javadoc tokenizer flags the non-English case.
        self.code_tokenizer = JavaCodeASTTokenizer(
            dataset, JavaDocDescriptionOnlyTokenizer(dataset, not english))
def preprocessing(self, overwrite_cropped=False, analyse=False, run_preprocessing=False):
    """Prepare the dataset: crop, analyse, and preprocess raw data if needed.

    :param overwrite_cropped: forwarded to ``self.crop``
    :param analyse: forwarded to ``self.dataset_analyse``
    :param run_preprocessing: force preprocessing even when output already exists
    """
    self.crop(overwrite_cropped)
    self.dataset_analyse(analyse)
    # Preprocess when forced, or when the output directory is still empty.
    # (Directory listing is only checked if not forced, as before.)
    if run_preprocessing or not os.listdir(self.preprocessed_dir):
        pipeline = Preprocessor(self.raw_data_dir, self.preprocessed_dir)
        pipeline.run()
def preprocess(self, inputList):
    """
    This function parses date time values into correct format.
    :param inputList: list of input data
    :return: parsed list of date time values
    """
    preprocessor = Preprocessor(inputList)
    # Keep a reference for later inspection, as before.
    self.preprocessor = preprocessor
    return preprocessor.parseDateTime()
class BorderAnalytics:
    """Orchestrates border-crossing analysis: parse input, preprocess dates,
    compute totals and running averages, and save the merged result to CSV."""

    def __init__(self, inputFile, outputFilePath):
        """
        :param inputFile: path/name of the input data file
        :param outputFilePath: path of the CSV file to write
        """
        self.input = inputFile
        self.outputFilePath = outputFilePath
        self.inputHandler = None
        self.outputHandler = None
        self.preprocessor = None
        self.borderCrossingCompute = None

    def inputFile(self, name):
        """Parse the named input file into a list of data rows."""
        self.inputHandler = InputHandler(name)
        return self.inputHandler.parse()

    def preprocess(self, inputList):
        """Parse date/time values of the input rows into the correct format."""
        self.preprocessor = Preprocessor(inputList)
        return self.preprocessor.parseDateTime()

    def computeTotalCrossing(self):
        """Compute total crossings; must run before computeAverageCrossing,
        since it creates ``self.borderCrossingCompute``."""
        store = self.inputFile(self.input)
        print("Store size:" + str(len(store)))
        processedList = self.preprocess(store)
        # FIX: previously printed len(store) here, mislabelling the raw store
        # size as the processed-list size.
        print("List processed:" + str(len(processedList)))
        self.borderCrossingCompute = BorderCrossingComputation(processedList)
        print("Starting computation for total crossing")
        self.borderCrossingCompute.computeTotalCross()
        print("Finished")
        return self.borderCrossingCompute.get_total_cross()

    def computeAverageCrossing(self):
        """Compute the running monthly averages.

        Requires computeTotalCrossing() to have been called first.
        """
        print("Starting avg cross computation")
        self.borderCrossingCompute.calculate_running_avg()
        print("Finished")
        return self.borderCrossingCompute.get_monthly_avg()

    def startAnalysis(self):
        """Run the full pipeline: totals, averages, merge, and save."""
        totalCrossing = self.computeTotalCrossing()
        # Called for its side effect of populating the running averages.
        avgCrossing = self.computeAverageCrossing()
        print("Merging results")
        output = self.borderCrossingCompute.getTotalCrossAndAverage()
        print("Finished")
        print("Saving output")
        self.saveOutput(output)

    def saveOutput(self, output):
        """Write the output rows to CSV under a fixed header and print status."""
        self.outputHandler = OutputHandler(self.outputFilePath)
        header = ["Border", "Date", "Measure", "Value", "Average"]
        status = self.outputHandler.save_to_csv(header, output)
        print(status)
# Preprocessing driver script for the PATREC 2012-2015 dataset.
import os

from preprocessing.Preprocessor import Preprocessor
import helpers.constants as constantsPATREC
# FIX: `import os` was missing although os.path.join is used below.
# NOTE(review): DatasetOptions is used below but not imported in this chunk —
# confirm it is imported elsewhere in the file.

# dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/';
dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
dirData = os.path.join(dirProject, 'data')

dict_dataset_options = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    # 'subgroups': ['DK'],
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
    'featurereduction': None,
    'filtering': None,
}

options = DatasetOptions(dict_dataset_options)
preproc = Preprocessor(options)
# Pipeline stages run in this fixed order.
preproc.splitColumns()
preproc.clean()
preproc.group()
preproc.createFeatureSet()
preproc.encodeFeatures()
preproc.fuse()
class BorderAnalytics:
    """Orchestrates border-crossing analysis: parse input, preprocess dates,
    compute totals and running averages, and save the merged result to CSV."""

    def __init__(self, inputFile, outputFilePath):
        """
        :param inputFile: path/name of the input data file
        :param outputFilePath: path of the CSV file to write
        """
        self.input = inputFile
        self.outputFilePath = outputFilePath
        self.inputHandler = None
        self.outputHandler = None
        self.preprocessor = None
        self.borderCrossingCompute = None

    def inputFile(self, name):
        """
        This function fetches the input file from the name given and redirects
        it to the InputHandler.py file to get input data rows in a list.

        :param string name: name of the input file
        :returns: input file rows as a list
        """
        self.inputHandler = InputHandler(name)
        return self.inputHandler.parse()

    def preprocess(self, inputList):
        """
        This function parses date time values into correct format.

        :param inputList: list of input data
        :return: parsed list of date time values
        """
        self.preprocessor = Preprocessor(inputList)
        return self.preprocessor.parseDateTime()

    def computeTotalCrossing(self):
        """
        This function computes the total crossing for each defined condition.
        Must be called before computeAverageCrossing(), since it creates
        ``self.borderCrossingCompute``.

        :return: a sorted array of data with total crossings value
        """
        store = self.inputFile(self.input)
        print("Store size:" + str(len(store)))
        processedList = self.preprocess(store)
        # FIX: previously printed len(store) here, mislabelling the raw store
        # size as the processed-list size.
        print("List processed:" + str(len(processedList)))
        self.borderCrossingCompute = BorderCrossingComputation(processedList)
        print("Starting computation for total crossing")
        self.borderCrossingCompute.computeTotalCross()
        print("Finished")
        return self.borderCrossingCompute.get_total_cross()

    def computeAverageCrossing(self):
        """
        This function computes the average for each category as required.

        :return: a dictionary of computed averages
        """
        print("Starting avg cross computation")
        self.borderCrossingCompute.calculate_running_avg()
        print("Finished")
        return self.borderCrossingCompute.get_monthly_avg()

    def startAnalysis(self):
        """
        This function starts the core computation process of the project.

        :return: None; the merged result is written via saveOutput
        """
        totalCrossing = self.computeTotalCrossing()
        # Called for its side effect of populating the running averages.
        avgCrossing = self.computeAverageCrossing()
        print("Merging results")
        output = self.borderCrossingCompute.getTotalCrossAndAverage()
        print("Finished")
        print("Saving output")
        self.saveOutput(output)

    def saveOutput(self, output):
        """
        This function saves the output list data structure into a csv file.

        :param output: list of output rows
        :return: save status from the output handler (previously discarded)
        """
        self.outputHandler = OutputHandler(self.outputFilePath)
        header = ["Border", "Date", "Measure", "Value", "Average"]
        # FIX: the status was assigned and dropped; return it so callers can
        # check the result (backward-compatible: callers ignoring it see no change).
        return self.outputHandler.save_to_csv(header, output)
from preprocessing.Preprocessor import Preprocessor
#from neuralnetworks.deprecated.MLNWithoutKeras import MLNWithoutKeras
from neuralnetworks.factory.AbstractMLNCreator import *
from neuralnetworks.LayerFactory import LayerFactory
from neuralnetworks.Builder import Builder
from neuralnetworks.optimizer.optimizers import *
from neuralnetworks.lossfunctions.LossFunctions import *
from neuralnetworks.activations.ActivationFunctions import *
from datasource.DB import Dataset
from pandas import DataFrame

# Sonar dataset location and scaling strategy.
path = 'C:/Users/Giuseppe/Downloads/sonar_csv.csv'
scaline = 'standard'  # standard

dataset = Dataset(path)
preprocessor = Preprocessor()

# Convert the class labels to numeric: 'R' -> 0.0, 'M' -> 1.0.
labels = preprocessor.replace(DataFrame(dataset.getLabels()), 'R', 0.0)
labels = preprocessor.replace(labels, 'M', 1.0)
dataset.setLabels(labels)

# Network dimensions: one input per feature column, two output classes.
n_inputs = len(dataset.getInstances()[0])
n_outputs = 2

print("Cv", dataset.cross_validation(3))
print("llov", dataset.llov())
def preprocess(self, inputList):
    """Parse date/time values of the input rows into the correct format.

    :param inputList: list of input data rows
    :return: list returned by the preprocessor's parseDateTime()
    """
    # Keep the preprocessor on the instance so it can be inspected later.
    self.preprocessor = Preprocessor(inputList)
    return self.preprocessor.parseDateTime()