def loadData(tarball, params):
    sourceFile, formattedFile = extractFile(tarball)

    #open and format input file
    print "<DataLoader> formatting file: %s" % sourceFile
    try:
        inFile = open(sourceFile)
    except IOError:
        print "<DataLoader> Could not retrieve data from file: %s" % sourceFile
        return

    DataFormatter.formatFile(inFile, formattedFile)

    inFile.close()

    print "<DataLoader> Successfully formatted file: %s" % sourceFile

    #attempt to upload file
    uploaded = DataUploader.upload(formattedFile, params["bucket"], **params)

    if not uploaded:
        print "<DataLoader> Failed to upload file, exiting"
        return

    #create bigquery object
    try:
        bigquery = Google_Service_Builder.buildBigQuery(**params)
    except Exception as e:
        print "<DataLoader>", e
        return

    source = "gs://" + params["bucket"] + "/" + formattedFile
    BigQuery_Append.appendData(bigquery, source, **params)
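A minimal invocation sketch. Only the "bucket" key is read directly above; the other keys are assumptions about what the helper modules consume via **params:

# Hypothetical call to loadData; every key except "bucket" is an assumption.
params = {
    "bucket": "my-gcs-bucket",    # used for the upload and the gs:// source path
    "projectId": "my-project",    # assumed: consumed by Google_Service_Builder
    "datasetId": "my_dataset",    # assumed: consumed by BigQuery_Append
}
loadData("data_dump.tar.gz", params)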
    def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
        expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file, ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
        step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
        selector = RFE(self.model,int(target_features),step=step_length)
        selector.fit(scikit_data,scikit_target)
        return [expression_frame.index[i] for i in xrange(0,len(expression_frame.index)) if selector.support_[i]]
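The same RFE pattern, standalone and runnable against scikit-learn directly (toy data; recent scikit-learn versions expect n_features_to_select as a keyword argument):

import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X = np.random.rand(30, 50)                  # 30 samples x 50 features
y = np.random.randint(0, 2, 30)             # binary labels
step_length = int(X.shape[1] / 100) + 1     # same step heuristic as above

selector = RFE(SVC(kernel='linear'), n_features_to_select=5, step=step_length)
selector.fit(X, y)
top_feature_indices = np.where(selector.support_)[0]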
    def get_model_coefficients_threshold(self,expression_file,ic50_file,threshold,drug):
        if self.model_type == 'svm' and self.kernel == 'linear':
            expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file, ic50_file,drug,normalized=True,trimmed=True,threshold=threshold)
            scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
            self.model.fit(scikit_data,scikit_target)
            return expression_frame.index, self.model.coef_[0]
        else:
            raise Exception("Method only defined for the SVM linear model")
    def get_model_accuracy_filter_feature_size(self,expression_file, ic50_file,feature_size,num_permutations,drug):
        scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        for i in range(0,num_permutations):
            try:
                shuffled_data,shuffled_target = dfm.shuffle_scikit_data_target(scikit_data,scikit_target)
                accuracy = cv.cross_val_score_filter_feature_selection(self.model,cv.trim_X_num_features,feature_size,shuffled_data,shuffled_target,cv=5)
                yield accuracy.mean()
            except:
                yield 0.0
    def get_predictions_full_CCLE_dataset_threshold(self,expression_file,ic50_file,threshold,drug):
        training_frame,training_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=threshold)
        training_data,training_target = dfm.get_scikit_data_and_target(training_frame,training_series)

        cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(expression_file,training_frame.index)

        self.model.fit(training_data,training_target)
        predictions = self.model.predict(testing_data)

        return cell_lines, predictions
Example #7
def main(argv=None):
    #Importer.loadFromRaw(argv[1], numFiles = 2)
    DataFormatter.formatData('.', argv[2])
    tmp = h5py.File("{0}.hdf5".format(argv[2]), "r+")
    data = []
    for dataset in tmp["raw_data"].keys():
        data.append(tmp["raw_data"][dataset])
        print tmp["raw_data"][dataset][0]
    datavisualization.analyze8x8data(data=data, samprate=20000, time=2)
    tmp.close()
    def get_predictions_full_CCLE_dataset_top_features(self,expression_file,ic50_file,num_features,drug):
        expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True)
        top_features = dfm.get_pval_top_n_features(expression_frame,ic50_series,num_features)
        expression_frame = expression_frame.ix[top_features]
        scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)

        cell_lines, testing_data = dfm.get_normalized_full_expression_identifiers_and_data(expression_file,expression_frame.index)
        self.model.fit(scikit_data,scikit_target)
        predictions = self.model.predict(testing_data)

        return cell_lines,predictions,list(top_features)
    def get_cross_validation_time(self,expression_file, ic50_file,feature_size,num_permutations,drug):
        scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        for i in range(0,num_permutations):
            try:
                shuffled_data,shuffled_target = dfm.shuffle_scikit_data_target(scikit_data,scikit_target)
                start_time = datetime.datetime.now()
                cv.cross_val_score_filter_feature_selection(self.model,cv.trim_X_num_features,feature_size,shuffled_data,shuffled_target,cv=5)
                end_time = datetime.datetime.now()
                # .microseconds drops whole seconds; total_seconds() gives the real duration
                yield (end_time - start_time).total_seconds()
            except:
                yield 0.0
    def get_model_accuracy_RFE(self,expression_file,ic50_file,target_features,num_permutations,drug):

        scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
        for i in xrange(0,num_permutations):
            try:
                shuffled_data,shuffled_target = dfm.shuffle_scikit_data_target(scikit_data,scikit_target)
                selector = RFE(self.model,target_features,step=step_length)
                yield cross_val_score(selector,shuffled_data,shuffled_target,cv=5).mean()
            except:
                yield 0.0
    def get_patient_predictions_rfe(self,expression_file,ic50_file,patient_directory,target_features,drug):

        e_data,e_target,p_identifiers,p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(expression_file,ic50_file,patient_directory,1.0,drug)
        step_length = int(len(e_data.tolist()[0]) / 100) + 1

        model = RFE(self.model,target_features,step=step_length)

        model.fit(e_data,e_target)
        predictions = model.predict(p_data)

        all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(dfm.get_cell_line_expression_frame(expression_file),dfm.get_patients_expression_frame(patient_directory))[0]
        top_features = [all_features[i] for i in xrange(0,len(all_features)) if model.support_[i]]
        return p_identifiers, predictions, top_features
Example #12
def _get_data():
    try:
        global training_input
        global training_output

        training_input = []
        training_output = []

        log.info('Fetching training data')

        try:
            number_of_files = len(os.listdir('trainingData'))
        except OSError:
            log.error('Directory trainingData does not exist')
            return False

        for file_num in range(0, int(number_of_files / 2)):

            state_file = 'trainingData/ExportedState{}.txt'.format(file_num)
            move_file = 'trainingData/ExportedMove{}.txt'.format(file_num)

            state_data = df._format_array_v2(state_file)
            move_data = df._format_array_v2(move_file)

            if not state_data:
                log.error('Failed to load board state data')
                return False

            if not move_data:
                log.error('Failed to load move data')
                return False

            state_data = np.array(state_data)
            move_data = np.array(move_data)

            training_input.append(state_data)
            training_output.append(move_data)

        training_input = np.array(training_input)
        training_output = np.array(training_output)

        log.info('\tData fetched')
        log.info('\t\tTraining_input length: {}\tShape: {}'.format(
            len(training_input), training_input.shape))
        log.info('\t\tTraining_out length: {}\tShape: {}\n'.format(
            len(training_output), training_output.shape))
        return True

    except:
        log.error('\tUnknown error in NeuralNetwork._get_data\n')
        return False
def _get_data():
    try:
        global training_input
        global training_output

        training_input = []
        training_output = []

        log.info('Fetching training data')

        number_of_files = int(len(os.listdir('trainingData')))

        for file_num in range(0, int(number_of_files / 2)):

            state_file = 'trainingData/ExportedState{}.txt'.format(file_num)
            move_file = 'trainingData/ExportedMove{}.txt'.format(file_num)

            data = df._format_array_v2(state_file)

            if not data:
                log.error('Failed to load data')
                return False

            data = np.array(data)
            training_input.append(data)

            data = df._format_array_v2(move_file)

            if not data:
                log.error('Failed to load data')
                return False

            data = np.array(data)
            training_output.append(data)

        training_input = np.array(training_input)
        training_output = np.array(training_output)

        log.info("Input array shape: {}".format(training_input.shape))
        log.info("Input data array shape: {}".format(training_input[0].shape))

        log.info('\tData fetched')
        log.info('\t\tTraining_input length: {}'.format(len(training_input)))
        log.info('\t\tTraining_out length: {}\n'.format(len(training_output)))
        return True

    except:
        log.error('\tUnknown error in NeuralNetwork._get_data\n')
        return False
Example #14
        def click():
            print("Event Triggered")

            # Initialise elasticsearch request handler passing Elasticsearch Address
            requestHandler = ElasticRequest.elasticRequest(
                'http://10.2.5.21:9200')

            requestHandler.add_search_term(entry_box.get())
            try:
                r = requestHandler.send_request()
            except elasticsearch.exceptions.ConnectionError:
                print("Exception Catched!")
                messagebox.showerror(
                    "Error", "Connection Error, Unable to connect to server")
                return
            except Exceptions.noDataFoundError:
                print("No search results found!")
                messagebox.showerror("Error", "No Results found!")
                return
            # except Exception as e:
            #     messagebox.showerror("Error", "UNKNOWN ERROR")
            #     print(e)
            #     return
            data_formatter = DataFormatter.DataFormatter(r)
            to_display = data_formatter.get_display()
            self.generate_table(to_display)
Example #15
        def click(event=None):

            # Initialise elasticsearch request handler passing Elasticsearch Address
            a = StaticValues.StaticValues()
            requestHandler = ElasticRequest.elasticRequest(
                a.get_elasticsearch_server())

            # Check if queries are empty else add the queries to request body
            if self.check_if_empty(entry_box1.get(), entry_box2.get(),
                                   entry_box3.get(), entry_box4.get()):
                requestHandler.set_minimum_matches(0)
            else:
                requestHandler.add_search_term(entry_box1.get())
                requestHandler.add_search_term(entry_box2.get())
                requestHandler.add_search_term(entry_box3.get())
                requestHandler.add_search_term(entry_box4.get())

            # Checking logger checkbuttons
            is_adapter_selected = False
            for i in var:
                if var[i].get() != 0:
                    is_adapter_selected = True
                    requestHandler.add_logger_name(str(i))

            if not is_adapter_selected:
                print("Please select at least one adapter")
                messagebox.showerror("Error",
                                     "Please check at least one Program")
                return

            # Checking Max Results entry
            if self.contains_value(request_entry_box.get()):
                requestHandler.change_max_results(request_entry_box.get())

            if self.contains_value(datepicker1.get()):
                requestHandler.set_from_date(datepicker1.get())

            if self.contains_value(datepicker2.get()):
                requestHandler.set_to_date(datepicker2.get())

            try:
                r = requestHandler.send_request()
            except elasticsearch.exceptions.ConnectionError:
                print("Exception Caught!")
                messagebox.showerror(
                    "Error", "Connection Error, Unable to connect to server")
                return
            except Exceptions.noDataFoundError:
                print("No search results found!")
                messagebox.showerror("Error", "No Results found!")
                return
            except Exception as e:
                messagebox.showerror("Error", "UNKNOWN ERROR")
                print(e)
                return

            data_formatter = DataFormatter.DataFormatter(r)
            to_display = data_formatter.get_display()
            self.generate_table(to_display)
Example #16
    def _reshape_input(input, abs_max):
        return DataFormatter.reshape_recurrent_input(input,
                                                     rows=rows,
                                                     columns=columns,
                                                     global_columns=global_columns,
                                                     global_column_forecast_timesteps=global_column_forecast_timesteps,
                                                     abs_max=abs_max)
def backward_step(model,expression_frame,ic50_series,backward_step_size,forward_features_selected,backward_features_removed):
    removable_features = set(expression_frame.index) - set(forward_features_selected) - set(backward_features_removed)
    expression_frame = dfm.get_expression_frame_with_features(expression_frame,set(expression_frame.index) - set(backward_features_removed))
    scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
    model.fit(scikit_data,scikit_target)
    coefs = model.coef_[0]
    feature_names = list(expression_frame.index)
    coefs,feature_names = zip(*sorted(zip(coefs,feature_names),key=lambda x : math.fabs(x[0])))
    num_features_removed = 0
    feature_index = 0
    while(num_features_removed < backward_step_size and feature_index < len(feature_names)):
        if(feature_names[feature_index] in removable_features):
            backward_features_removed.append(feature_names[feature_index])
            num_features_removed += 1
        feature_index += 1
    return backward_features_removed
    def get_patient_predictions_top_features(self,expression_file,ic50_file,patient_directory,num_features,drug):

        e_data,e_target,p_identifiers,p_data,top_features = dfm.get_cell_line_and_patient_expression_data_target_top_features_for_drug(expression_file,ic50_file,patient_directory,num_features,drug)

        self.model.fit(e_data,e_target)
        predictions = self.model.predict(p_data)

        return p_identifiers,predictions,top_features
def forward_step(model,expression_frame,ic50_series,forward_features_selected,backward_features_removed):
    potential_features = set(expression_frame.index) - set(forward_features_selected) - set(backward_features_removed)
    max_score = -1
    best_feature = None
    for feature in potential_features:
        candidate_model = copy.copy(model)
        # Evaluate the currently selected features plus this candidate; the
        # original intersected with set(feature) (the feature name's characters),
        # which could never include the candidate.
        model_features = set(forward_features_selected) | {feature}
        candidate_frame = dfm.get_expression_frame_with_features(expression_frame,model_features)
        scikit_data,scikit_target = dfm.get_scikit_data_and_target(candidate_frame,ic50_series)
        score = cv.cross_val_score(candidate_model,scikit_data,scikit_target,cv=5).mean()
        if(score > max_score):
            max_score = score
            best_feature = feature
    if(best_feature):
        forward_features_selected.append(best_feature)
        return forward_features_selected
    else:
        return forward_features_selected
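Neither snippet shows the loop that drives these two steps; a sketch of one plausible way to alternate them (the fixed round count and this pairing are assumptions, not taken from the source):

def stepwise_selection(model, expression_frame, ic50_series, num_rounds, backward_step_size):
    # Alternate one forward step (add the best-scoring candidate feature)
    # with one backward step (drop the weakest removable coefficients).
    forward_features_selected = []
    backward_features_removed = []
    for _ in range(num_rounds):
        forward_features_selected = forward_step(model, expression_frame, ic50_series,
                                                 forward_features_selected, backward_features_removed)
        backward_features_removed = backward_step(model, expression_frame, ic50_series,
                                                  backward_step_size, forward_features_selected,
                                                  backward_features_removed)
    return forward_features_selected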
def get_accuracy_and_runtime_vs_num_generations(expression_file,ic50_file,num_features,generation_range,num_permutations,num_threads):
    drug = "SMAP"

    scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
    p = Pool(num_threads)
    scores = p.map(wrap, [(g,scikit_data,scikit_target,num_features,num_permutations) for g in generation_range])
    scores = {g : scores[i] for i,g in enumerate(generation_range)}
    print(scores)
    return scores
    def get_predictions_full_CCLE_dataset_rfe(self,expression_file,ic50_file,target_features,drug):

        scikit_data,scikit_target = dfm.get_expression_scikit_data_target_for_drug(expression_file,ic50_file,drug,normalized=True,trimmed=True,threshold=None)
        step_length = int(len(scikit_data.tolist()[0]) / 100) + 1

        model = RFE(self.model,target_features,step=step_length)
        model.fit(scikit_data,scikit_target)

        expression_frame = dfm.normalize_expression_frame(dfm.get_cell_line_expression_frame(expression_file))
        cell_lines = expression_frame.columns
        testing_data = dfm.get_scikit_data(expression_frame)

        predictions = model.predict(testing_data)

        top_features = [expression_frame.index[i] for i in xrange(0,len(expression_frame.index)) if model.support_[i]]


        return cell_lines,predictions,top_features
Example #22
def Main():

    config = Configuration('configuration.json')

    if config.isProcessingDataFirst:
        ProcessData.process_data_files()

    if config.isGeneratingGraph:
        GraphData.create_plots()

    if config.isGeneratingClocklabFiles:
        FormatDataForClockLab.create_clock_lab_formatted_bulked_out_with_zeros_text_file(
        )

    if config.isGeneratingChronosFitFile:
        FormatDataForChronosFit.create_cronos_fit_formatted_file()

    if config.isShowingIndividualFish and config.isGeneratingDistanceSums:
        DataFormatter.generate_distance_sums_for_individual_fish()
    def get_patient_predictions_threshold(self,expression_file,ic50_file,patient_directory,threshold,drug):
        """
        Returns the predictions for which patients are likely to be sensitive to SMAPs and which are likely to be resistant.
        First trains a given SVM model on expression data, and then uses the trained model to predict patient outcome.

        Returns a list of patient identifiers, and a list of predictions about the patients response to a given drug.
        """
        e_data,e_target,p_identifiers,p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(expression_file,ic50_file,patient_directory,threshold,drug)

        self.model.fit(e_data,e_target)
        predictions = self.model.predict(p_data)

        return p_identifiers,predictions
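The train-on-cell-lines, predict-on-patients pattern above can be reproduced with any scikit-learn estimator; a minimal runnable sketch with toy arrays (the linear SVC mirrors the SVM model named in the docstring; the data is made up):

import numpy as np
from sklearn.svm import SVC

e_data = np.random.rand(20, 5)              # 20 cell lines x 5 genes (training)
e_target = np.random.randint(0, 2, 20)      # sensitive / resistant labels
p_data = np.random.rand(4, 5)               # 4 patients, same 5 genes (testing)

model = SVC(kernel='linear')
model.fit(e_data, e_target)
predictions = model.predict(p_data)         # one prediction per patient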
def trim_X_num_features(X,y,train,num_features):
    """
    Do calculations to trim X by taking the top num_features features based on p-value rank
    """
    all_samples = pd.DataFrame(X)
    all_labels = pd.Series(y)

    train_samples,train_labels = get_training_samples_labels(all_samples,all_labels,train)
    features = dfm.get_pval_top_n_features(train_samples.T,train_labels,num_features)

    trimmed_all_samples = all_samples[features]

    return np.array([list(trimmed_all_samples.ix[row]) for row in trimmed_all_samples.index])
def trim_X_threshold(X,y,train,threshold):
    """
    Do calculations to trim X based on a p-value threshold
    """
    all_samples = pd.DataFrame(X)
    all_labels = pd.Series(y)

    train_samples,train_labels = get_training_samples_labels(all_samples,all_labels,train)
    features = dfm.get_features_below_pval_threshold(train_samples.T,train_labels,threshold)

    trimmed_all_samples = all_samples[features]

    return np.array([list(trimmed_all_samples.ix[row]) for row in trimmed_all_samples.index])
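cv.cross_val_score_filter_feature_selection itself is not shown in these examples; given the trim_X_* signatures (X, y, train, parameter), a plausible reading is that it re-trims the feature set inside each fold. A hedged sketch of that idea, with the wrapper name invented here:

import numpy as np
from sklearn.model_selection import KFold

def cross_val_score_with_trim(model, trim_function, trim_parameter, X, y, cv=5):
    # Re-select features inside each fold using only the training indices,
    # so the held-out fold never influences feature selection.
    y = np.asarray(y)
    scores = []
    for train, test in KFold(n_splits=cv).split(X):
        X_trimmed = trim_function(X, y, train, trim_parameter)
        model.fit(X_trimmed[train], y[train])
        scores.append(model.score(X_trimmed[test], y[test]))
    return np.array(scores)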
Example #26
def hello_world():
    url = request.args.get('url')
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # content = DataFormatter.strip_html_tags(soup)

    headline = soup.find("h1", {"class": "entry-title"})

    body = ''
    for paragraph in soup.find("div", {"class": "entry-content"}).findAll("p"):
        body += DataFormatter.strip_html_tags(paragraph)

    content = {
        'headline': DataFormatter.strip_html_tags(headline),
        'body': body
    }
    # content_df = pandas.DataFrame(columns=['Body ID', 'Headline', 'articleBody'])
    # content_df.append(['0', DataFormatter.normalize_string(content['headline'])],
    #                   DataFormatter.normalize_string(content['body']))
    # content_df.loc[1]['Body ID'] =
    # content_df.loc[1]['Headline'] =
    # content_df.loc[1]['articleBody'] =
    # print(content_df)
    X_prediction = numpy.array([
        DataFormatter.normalize_string(content['headline'] + content['body'])
    ])

    # print(X_prediction.reshape(-1,1))
    # print(X_prediction.reshape(-1,1).shape)

    prediction = model.predict(X_prediction)
    prob = model.predict_proba(X_prediction)
    # train_set = read_dataframe_train()
    # test_dataframe(train_set)
    # data_frame_to_csv(train_set)
    # create_distribution(train_set)
    return jsonify(prediction, prob)
def acc_and_run(g,scikit_data,scikit_target,num_features,num_permutations):
    results = []

    for perm in xrange(0,num_permutations):
        try:
            start_time = datetime.datetime.now()
            model = n.NeatClassifier(max_generations=g)
            shuffled_data,shuffled_target = dfm.shuffle_scikit_data_target(scikit_data,scikit_target)
            acc = cv.cross_val_score_filter_feature_selection(model,cv.trim_X_num_features,num_features,shuffled_data,shuffled_target,cv=5)
            end_time = datetime.datetime.now()
            # .microseconds drops whole seconds; total_seconds() gives the real runtime
            results.append((acc.mean(), (end_time - start_time).total_seconds()))
        except:
            results.append((0.0, 1000.0))
            print(sys.exc_info()[0])
    return results
Example #28
def read_police_call_weather_data(csv_file, rows, columns, train_test_split=0.80, backward_timesteps=28, forward_timesteps=4, forecast_timesteps=4, regression=True):
    """
    Read police-call and weather time-series data from csv_file and format it for a recurrent network.
    :param csv_file: raw data file
    :param rows: number of rows of the map grid
    :param columns: number of columns of the map grid
    :param train_test_split: float designating the train/test split percentage (0.80 yields 80% train and 20% test)
    :param backward_timesteps: designates how many timesteps back the network will analyze
    :param forward_timesteps: designates the number of timesteps forward the network will predict
    :param forecast_timesteps: how many timesteps forward to forecast global variables such as temperature/relative humidity/precipitation
    :param regression: if True, regression is performed, else classification is performed.
            NOTE: classification does not work well currently because inputs are not diffed
    :return: shape of input, (training input data, training output data), (testing input data, testing output data)
    """
    from pandas import read_csv
    import DataFormatter
    raw = read_csv(csv_file)
    global_columns = ["Fahrenheit", "Precipitation", "Relative Humidity"]
    raw.drop(["Start Date", "End Date", "Num of Calls"], axis=1, inplace=True)
    global_column_forecast_timesteps = dict(zip(global_columns, [forecast_timesteps] * len(global_columns)))
    # raw.drop(global_columns, axis=1, inplace=True)
    recurrent = DataFormatter.make_recurrent(raw, backward_timesteps, forward_timesteps)
    training_set, testing_set = DataFormatter.partition(recurrent, train_test_split)
    if regression:
        train_input, train_output = DataFormatter.split_recurrent_data(training_set)
        test_input, test_output = DataFormatter.split_recurrent_data(testing_set)
    else:
        format_regression_to_classification = lambda x: DataFormatter.recurrent_regression_to_classification(*DataFormatter.split_recurrent_data(x),
                                                                                                                minimum_delta=2,
                                                                                                                minimum_delta_percentage=0.10,
                                                                                                                enforce_both_minimums=True)
        train_input, train_output = format_regression_to_classification(training_set)
        test_input, test_output = format_regression_to_classification(testing_set)

    def _reshape_input(input, abs_max):
        return DataFormatter.reshape_recurrent_input(input,
                                                     rows=rows,
                                                     columns=columns,
                                                     global_columns=global_columns,
                                                     global_column_forecast_timesteps=global_column_forecast_timesteps,
                                                     abs_max=abs_max)
    train_input, abs_max = _reshape_input(train_input, 0)
    test_input, _ = _reshape_input(test_input, abs_max)
    remove_global_output_columns = lambda x: DataFormatter.remove_global_output_columns(x, global_columns)
    train_output = remove_global_output_columns(train_output).values.astype(float)[:train_input.shape[0]]
    test_output = remove_global_output_columns(test_output).values.astype(float)[:test_input.shape[0]]
    train_output /= max(train_output.max(), abs(train_output.min()))
    test_output /= max(test_output.max(), abs(test_output.min()))

    return train_input.shape, (train_input, train_output), (test_input, test_output)
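A hedged usage sketch; the CSV path and the 8x8 grid below are placeholders, not values from the source:

# Hypothetical call with default split and timestep settings.
input_shape, (train_in, train_out), (test_in, test_out) = read_police_call_weather_data(
    "police_calls_weather.csv", rows=8, columns=8)
print(input_shape)  # shape of the reshaped training input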
Example #29
def main(argv=None):
    if argv is None:
        argv = sys.argv
    run_importer = True
    run_formatter = True
    run_analysis = True
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "h",
                ["help", "skip-importer", "skip-formatter", "skip-analysis"])
            print "got args"
        except getopt.error, msg:
            raise Usage(msg)
        for option, data in opts:
            if ('-h' == option or '--help' == option):
                print "LSCE test script. Usage: \"python testscript.py [--skip-importer] [--skip-formatter] [--skip-analysis]" +\
                    " mat_source hdf5_dest" +\
                    "\"\n\nSupply the following arguments to run pipeline:\n\n\tmat_source: " +\
                    "The path to the raw .mat files to be imported.\n\thdf5_dest: the name to save hdf5 output file under" +\
                    "\n\nAvailable modes:\n\t--skip-importer: skip importation step. Formatter wil still run using" +\
                    " mat_source as src directory." +\
                    "\n\t--skip-formatter: skip formatting step. Importer will use mat_source as usual. \n\t\t\t  Analysis will" +\
                    " use hdf5_dest if it exists." + \
                    "\n\t--skip-analysis: skip computation of analysis data. Formatter will still output to hdf5_dest. "
                return
            if ('--skip-importer' == option):
                run_importer = False
            if ('--skip-formatter' == option):
                run_formatter = False
            if ('--skip-analysis' == option):
                run_analysis = False
        if (len(args) < 2):
            raise Usage("Insufficient arguments supplied.")
        else:
            print args.__repr__()
        print "Welcome to LSCE test script.\nThis script will perform a " + \
            "complete iteration of our pipeline, starting with the data importer."
        if (run_importer):
            print "Importing data from directory " + args[0]
            Importer.loadFromMat(args[0])
        else:
            print "Skipped importing data."
        if (run_formatter):
            print "Formatting data as hdf5 in " + args[1] + ".hdf5"
            DataFormatter.formatData(args[0], args[1])
        else:
            print "Skipped formatting data."
        os.system("PAUSE")
        testing = None
        raw_data = None
        if (run_analysis):
            dtool = DataAnalysis.data_analysis()
            dtool.load_hdf5(args[1],
                            dataset_name="Electrode_12_master",
                            group_name="raw_data")
            dtool.sampling_rate = 1000
            testing = dtool.high_demo_filter(20)
            raw_data = dtool.f["raw_data"]["Electrode_12_master"]
        else:
            print "Skipped data analysis.\nPlaceholder groups " + \
                "\"/data_analysis/demo_filter_results\" and \"/raw_data/Electrode_12_master\" will be used."
            hdfile = h5py.File(args[1] + ".hdf5", "r+")
            if ("data_analysis" not in hdfile
                    or "demo_filter_results" not in hdfile["data_analysis"]):
                print "Skipping graphs..."
                return
            testing = hdfile["data_analysis"]["demo_filter_results"]
            raw_data = hdfile["raw_data"]["Electrode_12_master"]
        os.system("PAUSE")
        plt.subplot(2, 1, 1)
        plt.plot(testing)
        plt.subplot(2, 1, 2)
        plt.plot(raw_data)
        plt.show()
        if (run_analysis):
            dtool.close()
def predictData(data):
    data = df.getFormattedData(data)
    return model.predict(data)
Example #32
def get_data(keyword, keyword2):

    # # Generate sub table from json in "_source" column
    # def generate_subTable(data):
    #     for i in range(len(data.index)):
    #         if i == 0:
    #             subtable = pd.DataFrame(actualData.iloc[i, 3])
    #             subtable.loc[i, "timestamp"] = dateutil.parser.parse(subtable["@timestamp"][i])
    #             subtable.loc[i, "date"] = (subtable["timestamp"][i] + datetime.timedelta(hours=8)).strftime("%H:%M:%S %d/%m/%Y")
    #             if isinstance(subtable.iloc[i]['response_body'], str):
    #                 subtable.loc[i, "response_message"] = json.loads(subtable.iloc[i]['response_body'])['message']
    #         else:
    #             subtable = subtable.append(pd.DataFrame(actualData.iloc[i, 3]), ignore_index=True)
    #             subtable.loc[i, 'timestamp'] = dateutil.parser.parse(subtable["@timestamp"][i])
    #             subtable.loc[i, "date"] = (subtable["timestamp"][i] + datetime.timedelta(hours=8)).strftime("%H:%M:%S %d/%m/%Y")
    #             if isinstance(subtable.iloc[i]['response_body'], str):
    #                 subtable.loc[i, "response_message"] = json.loads(subtable.iloc[i]['response_body'])['message']
    #     return subtable
    #
    # # Copies values of desired columns to be displayed
    # def form_displayTable(mainDataFrame):
    #     displayTable = pd.DataFrame()
    #     displayTable['timestamp'] = mainDataFrame['date']
    #     displayTable['logger_name'] = mainDataFrame['logger_name']
    #     displayTable['path'] = mainDataFrame['path']
    #     displayTable['response_status'] = mainDataFrame['response_status']
    #     displayTable['response_message'] = mainDataFrame['response_message']
    #     displayTable['request_body'] = mainDataFrame['request_body']
    #     displayTable['response_body'] = mainDataFrame['response_body']
    #     return displayTable

    print("Sendrequest Module Initiated")
    requestHandler = ElasticRequest.elasticRequest('http://10.2.5.21:9200')
    r = 0
    try:
        requestHandler.add_search_term(keyword)
        requestHandler.add_search_term(keyword2)
        r = requestHandler.send_request()
    except elasticsearch.exceptions.ConnectionError:
        print("Except block entered")
        raise
    # Checking for any search results
    if (r['hits']['total'] == 0):
        print("No Hits!")
        raise Exceptions.noDataFoundError
    print("Finished log extraction from ELK")
    # Generate dataframe based on contents of r
    # data = pd.DataFrame(r)
    # # pull out desired data ( search results from the json <cell 1,3>
    # actualData = pd.DataFrame(data.iloc[1,3])
    #
    # main = generate_subTable(actualData)
    #
    # # Join the subtable back with original dataFrame
    # result = pd.concat([actualData, main], axis=1, join_axes=[actualData.index])
    # result = result.sort_values(by=['timestamp'], ascending=False)
    # result = result.reset_index(drop=True)
    #
    # # Remove '_source' column from original Dataframe
    # result = result.drop('_source', 1)
    #
    # # Copy desired fields to separate new df
    # to_display = form_displayTable(result)

    data_formatter = DataFormatter.DataFormatter(r)
    to_display = data_formatter.get_display()

    return to_display
Example #33
"""
A sample program that predicts MSFT stock data using linear regression and Matplotlib
"""

import DataFormatter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from math import ceil
from sklearn import linear_model, preprocessing, model_selection
import datetime

AV_key = ""  #AVKey here
DataFormatter.set_av_key(AV_key)

if __name__ == "__main__":
    df = DataFormatter.get_alpha_vantage_data('MSFT', AV_key)
    df.index = pd.to_datetime(df.index)
    projection_out = int(ceil(0.05 * len(df)))
    df['label'] = df['adjusted_close'].shift(projection_out)
    df.dropna(inplace=True)
    X = preprocessing.scale(np.array(df.drop(['label'], 1)))
    y = np.array(df['label'])

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)
    clf = linear_model.LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    projection_set = clf.predict(X[:projection_out])[::-1]
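    # The original snippet stops here with matplotlib and datetime imported but
    # unused, so a plotting step presumably followed. A hedged sketch of that
    # continuation (assumed, not from the original source):
    last_date = df.index[-1]
    projection_dates = [last_date + datetime.timedelta(days=i + 1)
                        for i in range(projection_out)]
    plt.plot(df.index, df['adjusted_close'], label='adjusted close')
    plt.plot(projection_dates, projection_set, label='projection')
    plt.legend()
    plt.show()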