class TraceabilityRunner:
    """Wires up the preprocessing pipelines and tokenizer for a traceability dataset.

    English and Italian variants of the requirement/code pipelines are built
    once as class attributes and selected per-dataset in ``__init__``.
    """

    # Individual preprocessing steps, shared across all pipelines.
    CAMEL = CamelCaseSplitter()
    LOWER = LowerCaseTransformer()
    LETTER = NonLetterFilter()
    URL = UrlRemover()
    SEP = Separator()
    JAVASTOP = JavaCodeStopWordRemover()
    JAVASTOP_IT = JavaCodeStopWordRemover(True)
    STOP_IT = StopWordRemover(True)
    STOP = StopWordRemover()
    LEMMA_IT = Lemmatizer(Lemmatizer.LemmatizerType.italian_spacy)
    LEMMA = Lemmatizer(Lemmatizer.LemmatizerType.english_spacy)
    W_LENGTH = WordLengthFilter(2)

    # Pipelines: step order matters (e.g. camel-case split before lowercasing).
    REQ_PREPROCESSOR = Preprocessor(
        [URL, SEP, LETTER, CAMEL, LOWER, LEMMA, STOP, W_LENGTH])
    CODE_PREPROCESSOR = Preprocessor(
        [URL, SEP, LETTER, CAMEL, JAVASTOP, LOWER, LEMMA, STOP, W_LENGTH])
    REQ_PREPROCESSOR_IT = Preprocessor(
        [URL, SEP, LETTER, CAMEL, LOWER, LEMMA_IT, STOP_IT, W_LENGTH])
    CODE_PREPROCESSOR_IT = Preprocessor(
        [URL, SEP, LETTER, CAMEL, JAVASTOP_IT, LOWER, LEMMA_IT, STOP_IT,
         W_LENGTH])

    def __init__(self, dataset):
        """Select language-appropriate pipelines and build the code tokenizer.

        :param dataset: project dataset object; must expose ``is_english()``
        """
        self.dataset = dataset
        english = dataset.is_english()
        if english:
            self.req_preprocessor = self.REQ_PREPROCESSOR
            self.code_preprocessor = self.CODE_PREPROCESSOR
        else:
            self.req_preprocessor = self.REQ_PREPROCESSOR_IT
            self.code_preprocessor = self.CODE_PREPROCESSOR_IT
        # The javadoc tokenizer flags the non-English case.
        self.code_tokenizer = JavaCodeASTTokenizer(
            dataset, JavaDocDescriptionOnlyTokenizer(dataset, not english))
def preprocessing(self, overwrite_cropped=False, analyse=False, run_preprocessing=False):
    """Prepare the dataset: crop, analyse, and preprocess raw data if needed.

    :param overwrite_cropped: forwarded to ``self.crop``
    :param analyse: forwarded to ``self.dataset_analyse``
    :param run_preprocessing: force preprocessing even when output already exists
    """
    self.crop(overwrite_cropped)
    self.dataset_analyse(analyse)
    # Preprocess when forced, or when the output directory is still empty.
    # (Directory listing is only checked if not forced, as before.)
    if run_preprocessing or not os.listdir(self.preprocessed_dir):
        pipeline = Preprocessor(self.raw_data_dir, self.preprocessed_dir)
        pipeline.run()
def preprocess(self, inputList):
    """
    This function parses date time values into correct format.
    :param inputList: list of input data
    :return: parsed list of date time values
    """
    preprocessor = Preprocessor(inputList)
    # Keep a reference for later inspection, as before.
    self.preprocessor = preprocessor
    return preprocessor.parseDateTime()
class BorderAnalytics:
    """Orchestrates border-crossing analysis: parse input, preprocess dates,
    compute totals and running averages, and save the merged result to CSV."""

    def __init__(self, inputFile, outputFilePath):
        """
        :param inputFile: path/name of the input data file
        :param outputFilePath: path of the CSV file to write
        """
        self.input = inputFile
        self.outputFilePath = outputFilePath
        self.inputHandler = None
        self.outputHandler = None
        self.preprocessor = None
        self.borderCrossingCompute = None

    def inputFile(self, name):
        """Parse the named input file into a list of data rows."""
        self.inputHandler = InputHandler(name)
        return self.inputHandler.parse()

    def preprocess(self, inputList):
        """Parse date/time values of the input rows into the correct format."""
        self.preprocessor = Preprocessor(inputList)
        return self.preprocessor.parseDateTime()

    def computeTotalCrossing(self):
        """Compute total crossings; must run before computeAverageCrossing,
        since it creates ``self.borderCrossingCompute``."""
        store = self.inputFile(self.input)
        print("Store size:" + str(len(store)))
        processedList = self.preprocess(store)
        # FIX: previously printed len(store) here, mislabelling the raw store
        # size as the processed-list size.
        print("List processed:" + str(len(processedList)))
        self.borderCrossingCompute = BorderCrossingComputation(processedList)
        print("Starting computation for total crossing")
        self.borderCrossingCompute.computeTotalCross()
        print("Finished")
        return self.borderCrossingCompute.get_total_cross()

    def computeAverageCrossing(self):
        """Compute the running monthly averages.

        Requires computeTotalCrossing() to have been called first.
        """
        print("Starting avg cross computation")
        self.borderCrossingCompute.calculate_running_avg()
        print("Finished")
        return self.borderCrossingCompute.get_monthly_avg()

    def startAnalysis(self):
        """Run the full pipeline: totals, averages, merge, and save."""
        totalCrossing = self.computeTotalCrossing()
        # Called for its side effect of populating the running averages.
        avgCrossing = self.computeAverageCrossing()
        print("Merging results")
        output = self.borderCrossingCompute.getTotalCrossAndAverage()
        print("Finished")
        print("Saving output")
        self.saveOutput(output)

    def saveOutput(self, output):
        """Write the output rows to CSV under a fixed header and print status."""
        self.outputHandler = OutputHandler(self.outputFilePath)
        header = ["Border", "Date", "Measure", "Value", "Average"]
        status = self.outputHandler.save_to_csv(header, output)
        print(status)
# Preprocessing driver script for the PATREC 2012-2015 dataset.
import os

from preprocessing.Preprocessor import Preprocessor
import helpers.constants as constantsPATREC
# FIX: `import os` was missing although os.path.join is used below.
# NOTE(review): DatasetOptions is used below but not imported in this chunk —
# confirm it is imported elsewhere in the file.

# dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/';
dirProject = '/home/thomas/fusessh/scicore/projects/patrec'
dirData = os.path.join(dirProject, 'data')

dict_dataset_options = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    # 'subgroups': ['DK'],
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {'names': constantsPATREC.NEW_FEATURES},
    'featurereduction': None,
    'filtering': None,
}

options = DatasetOptions(dict_dataset_options)
preproc = Preprocessor(options)
# Pipeline stages run in this fixed order.
preproc.splitColumns()
preproc.clean()
preproc.group()
preproc.createFeatureSet()
preproc.encodeFeatures()
preproc.fuse()
class BorderAnalytics:
    """Orchestrates border-crossing analysis: parse input, preprocess dates,
    compute totals and running averages, and save the merged result to CSV."""

    def __init__(self, inputFile, outputFilePath):
        """
        :param inputFile: path/name of the input data file
        :param outputFilePath: path of the CSV file to write
        """
        self.input = inputFile
        self.outputFilePath = outputFilePath
        self.inputHandler = None
        self.outputHandler = None
        self.preprocessor = None
        self.borderCrossingCompute = None

    def inputFile(self, name):
        """
        This function fetches the input file from the name given and redirects
        it to the InputHandler.py file to get input data rows in a list.

        :param string name: name of the input file
        :returns: input file rows as a list
        """
        self.inputHandler = InputHandler(name)
        return self.inputHandler.parse()

    def preprocess(self, inputList):
        """
        This function parses date time values into correct format.

        :param inputList: list of input data
        :return: parsed list of date time values
        """
        self.preprocessor = Preprocessor(inputList)
        return self.preprocessor.parseDateTime()

    def computeTotalCrossing(self):
        """
        This function computes the total crossing for each defined condition.
        Must be called before computeAverageCrossing(), since it creates
        ``self.borderCrossingCompute``.

        :return: a sorted array of data with total crossings value
        """
        store = self.inputFile(self.input)
        print("Store size:" + str(len(store)))
        processedList = self.preprocess(store)
        # FIX: previously printed len(store) here, mislabelling the raw store
        # size as the processed-list size.
        print("List processed:" + str(len(processedList)))
        self.borderCrossingCompute = BorderCrossingComputation(processedList)
        print("Starting computation for total crossing")
        self.borderCrossingCompute.computeTotalCross()
        print("Finished")
        return self.borderCrossingCompute.get_total_cross()

    def computeAverageCrossing(self):
        """
        This function computes the average for each category as required.

        :return: a dictionary of computed averages
        """
        print("Starting avg cross computation")
        self.borderCrossingCompute.calculate_running_avg()
        print("Finished")
        return self.borderCrossingCompute.get_monthly_avg()

    def startAnalysis(self):
        """
        This function starts the core computation process of the project.

        :return: None; the merged result is written via saveOutput
        """
        totalCrossing = self.computeTotalCrossing()
        # Called for its side effect of populating the running averages.
        avgCrossing = self.computeAverageCrossing()
        print("Merging results")
        output = self.borderCrossingCompute.getTotalCrossAndAverage()
        print("Finished")
        print("Saving output")
        self.saveOutput(output)

    def saveOutput(self, output):
        """
        This function saves the output list data structure into a csv file.

        :param output: list of output rows
        :return: save status from the output handler (previously discarded)
        """
        self.outputHandler = OutputHandler(self.outputFilePath)
        header = ["Border", "Date", "Measure", "Value", "Average"]
        # FIX: the status was assigned and dropped; return it so callers can
        # check the result (backward-compatible: callers ignoring it see no change).
        return self.outputHandler.save_to_csv(header, output)
from preprocessing.Preprocessor import Preprocessor
#from neuralnetworks.deprecated.MLNWithoutKeras import MLNWithoutKeras
from neuralnetworks.factory.AbstractMLNCreator import *
from neuralnetworks.LayerFactory import LayerFactory
from neuralnetworks.Builder import Builder
from neuralnetworks.optimizer.optimizers import *
from neuralnetworks.lossfunctions.LossFunctions import *
from neuralnetworks.activations.ActivationFunctions import *
from datasource.DB import Dataset
from pandas import DataFrame

# Sonar dataset location and scaling strategy.
path = 'C:/Users/Giuseppe/Downloads/sonar_csv.csv'
scaline = 'standard'  # standard

dataset = Dataset(path)
preprocessor = Preprocessor()

# Convert the class labels to numeric: 'R' -> 0.0, 'M' -> 1.0.
labels = preprocessor.replace(DataFrame(dataset.getLabels()), 'R', 0.0)
labels = preprocessor.replace(labels, 'M', 1.0)
dataset.setLabels(labels)

# Network dimensions: one input per feature column, two output classes.
n_inputs = len(dataset.getInstances()[0])
n_outputs = 2

print("Cv", dataset.cross_validation(3))
print("llov", dataset.llov())
def preprocess(self, inputList):
    """Parse date/time values of the input rows into the correct format.

    :param inputList: list of input data rows
    :return: list returned by the preprocessor's parseDateTime()
    """
    # Keep the preprocessor on the instance so it can be inspected later.
    self.preprocessor = Preprocessor(inputList)
    return self.preprocessor.parseDateTime()