def create_mono_gram(READ_DIR):
    """Tokenize every file in READ_DIR into a flat list of token objects."""
    one_gram = []
    dc.clean_data()
    os.chdir(READ_DIR)
    index_count = 0
    for filename in os.listdir(os.getcwd()):
        # Read the whole file and split it on single spaces.
        with open(filename, "r") as file:
            contents = file.read()
        tokens = contents.split(" ")
        for token in tokens:
            token = token.strip()
            if len(token) <= 0:
                continue
            value = token
            is_open = 0
            is_close = 0
            is_name = 0
            index = -1
            if token[0] == '<':
                is_open = 1
            if token[-1] == '>':
                is_close = 1
            if is_open == 1 and is_close == 1:
                # A self-contained <...> token is a name and gets a running index.
                is_name = 1
                index = index_count
                index_count += 1
            if is_open == 1 or is_close == 1:
                # Strip all non-word characters from tag-like tokens.
                value = re.sub(r'\W+', '', token)
            tk = ti.token(value, is_open, is_close, is_name, index)
            one_gram.append(tk)
    os.chdir('..')
    return one_gram
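# A minimal usage sketch for the function above. Assumptions: `dc`, `ti`, `os`,
# and `re` are the modules the enclosing file already imports, and "./corpus"
# is a hypothetical directory of space-separated token files.
if __name__ == "__main__":
    grams = create_mono_gram("./corpus")
    print(len(grams), "tokens loaded")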
def load_mails_info(self):
    """Load user labels and mails."""
    self.load_labels()
    self.loader = Loader(self.service, self.directory, self.label_ids)
    self.load_mails()
    # Sort label names by how many mails carry each label, most frequent first.
    self.counts = self.mails['label'].value_counts()
    self.label_names.sort(key=lambda label: self.counts[label], reverse=True)
    DataCleaner.pack(self.mails)
def __init__(self, num_outputs, num_cells=42, sequence_length=30, dropout=0.0,
             epochs_per_fold=10, activation_function='relu', optimizer='adam'):
    self.num_outputs = num_outputs
    self.cleaner = DataCleaner.DataCleaner()
    self.sequence_length = sequence_length
    self.epochs = epochs_per_fold
    self.batch_size = 1
    # Single-feature LSTM followed by dropout and a dense output layer.
    # Note: the output layer is hard-coded to 2 units; num_outputs is stored
    # but not used here.
    self.model = Sequential()
    self.model.add(LSTM(num_cells, input_shape=(None, 1),
                        activation=activation_function))
    self.model.add(Dropout(dropout))
    self.model.add(Dense(2))
    self.model.compile(optimizer=optimizer, loss='mse')
    self.test_data = []
    self.predictions = []
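# A minimal construction sketch. The snippet above does not name its enclosing
# class, so `LSTMModel` here is purely an illustrative placeholder.
model = LSTMModel(num_outputs=2, num_cells=64, sequence_length=30,
                  dropout=0.2, epochs_per_fold=5)
model.model.summary()  # inspect the Keras model built in __init__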
def get_average_profit():
    assets = 1000  # investing with $1000
    base = Baseline(assets)
    cleaner = DataCleaner.DataCleaner()
    profits = []
    invested = False
    num_folds = 4
    window_size = 30
    for k in range(1, num_folds + 1):
        training_data, cv_data = cleaner.get_clean_data(k)
        for index in range(0, len(training_data), window_size):
            window_data = training_data[index:index + window_size]
            open_values = []
            close_values = []
            for day in window_data:
                open_values.append(day[0])
                close_values.append(day[1])
            # Interleave open/close values into one flat window; the loop
            # variable must not shadow the outer window index used below.
            flat_window_data = []
            for i in range(len(open_values)):
                flat_window_data.append(open_values[i])
                flat_window_data.append(close_values[i])
            if len(flat_window_data) != 60:
                continue
            base.train(list(range(index, index + base.window_size)),
                       flat_window_data)
            base.predict(list(range(index + base.window_size,
                                    index + (2 * base.window_size))))
            if invested and (base.classify() == 4):
                # Class 4 signals a sell.
                invested = False
                profit = (base.y_train[-1] - base.purchase_value) * base.assets
                profits.append(profit)
                print("sold for ${} profit".format(profit))
            elif (not invested) and ((base.classify() == 0) or (base.classify() == 1)):
                # Classes 0 and 1 signal a buy.
                base.purchase_value = base.y_train[-1]
                invested = True
                print("Invested")
    print("Profits: {}".format(profits))
    print("Total profits = ${}".format(sum(profits)))
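# Illustration of the flattened window layout used above, with a hypothetical
# 2-day window: each day's (open, close) pair lands adjacently in the flat list.
window = [(10.0, 10.5), (10.4, 10.2)]   # [(open, close), ...]
flat = []
for o, c in window:
    flat.extend([o, c])
print(flat)  # [10.0, 10.5, 10.4, 10.2]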
import DataCleaner as dc
import pandas as pd
import os

os.chdir("./data")
data_train_class = dc.DataCleaner("application_test.csv")
data_train = data_train_class.get_data()
profile_train = data_train_class.get_profile(
    "/Users/FangzhouYu/Desktop/home-credit-default-risk/test_profile.html")
# Replace the "XNA" placeholder in ORGANIZATION_TYPE with "NA".
data_train_class.replace_missing_data("ORGANIZATION_TYPE", "XNA", "NA")
def runDataCleaner(test_file1, test_file2, type):
    if type == 'c':
        d = DataCleaner("tmdb_5000_movies_classification.csv", "tmdb_5000_credits.csv",
                        test_file1, test_file2, type)
        d.movies = pd.read_csv('trainC.csv')  # trainC.csv holds the raw, non-numeric data
        # loop over the two output files only (2 iterations)
        for i in tqdm(['trainC_data.csv', 'testC_data.csv']):
            if not os.path.exists(i):  # only build the dataset if it doesn't exist yet
                d.movies = d.movies.iloc[:, 1:]  # drop column 0, which is the id
                d.defineCategories()
                d.normalaize()
                d.movies.to_csv(i)
            d.movies = pd.read_csv('testC.csv')
            #print(d.movies.shape[0])
            # d.movies holds the train set on the first pass and the test set on the second
        train = handelMissingValues(pd.read_csv('trainC_data.csv').iloc[:, 1:])
        test = handelMissingValues(
            pd.read_csv('testC_data.csv').iloc[:, 1:])  # column 0 is just the row index 0,1,2,...
    else:
        d = DataCleaner("tmdb_5000_movies_train.csv", "tmdb_5000_credits_train.csv",
                        test_file1, test_file2, type)
        d.movies = pd.read_csv('trainR.csv')  # trainR.csv holds the raw, non-numeric data
        # loop over the two output files only (2 iterations)
        for i in tqdm(['trainR_data.csv', 'testR_data.csv']):
            if not os.path.exists(i):  # only build the dataset if it doesn't exist yet
                d.movies = d.movies.iloc[:, 1:]  # drop column 0, which is the id
                d.defineCategories()
                d.normalaize()
                d.movies.to_csv(i)
            d.movies = pd.read_csv('testR.csv')
            # d.movies holds the train set on the first pass and the test set on the second
        train = handelMissingValues(pd.read_csv('trainR_data.csv').iloc[:, 1:])
        test = handelMissingValues(
            pd.read_csv('testR_data.csv').iloc[:, 1:])  # column 0 is just the row index 0,1,2,...
    return train, test
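# A minimal call sketch for runDataCleaner. The two test CSV names below are
# hypothetical; pass whichever movie/credit test files you actually have.
train, test = runDataCleaner("tmdb_test_movies.csv", "tmdb_test_credits.csv", "c")
print(train.shape, test.shape)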
class NaiveBayesClassifier:
    # data members
    master = None
    filePath = ""
    numOfBins = 0
    train = None
    test = None
    structureFile = None
    wasBuilt = False
    structureDic = {}
    fileHandler = None
    classifier = None
    dataCleaner = None

    # Initialize the GUI
    def __init__(self, master):
        self.master = master
        master.title("Naive Bayes Classifier")
        master.geometry("650x300")

        # <editor-fold desc="init buttons, labels and entries">
        self.labelPath = Label(master, text="Directory Path:")
        self.entryPath = Entry(master, width=70)
        self.browse_button = Button(master, text="Browse", width=10, command=self.askopenfile)
        self.browse_button.pack()
        self.labelDiscBins = Label(master, text="Discretization Bins:")
        self.entryDiscBins = Entry(master, width=20, validate="key")
        self.build_button = Button(master, text="Build", width=20, command=self.build)
        self.build_button.pack()
        self.labelErr = Label(master, text="", fg="red", font="Verdana 10 bold")
        self.classify_button = Button(master, text="Classify", width=20, command=self.classify)
        self.classify_button.pack()
        self.close_button = Button(master, text="Exit", width=10, command=master.quit)
        self.close_button.pack()
        # </editor-fold>

        # Define grid
        self.gridDefinition(master)
        # layout the controls in the grid
        self.controlsLayout()

    # <editor-fold desc="Gui Functions">
    def controlsLayout(self):
        self.labelPath.grid(row=1, column=0, sticky=E)
        self.entryPath.grid(row=1, column=1, columnspan=2, sticky=W)
        self.browse_button.grid(row=1, column=3, sticky=W)
        self.labelDiscBins.grid(row=2, column=0, sticky=E)
        self.entryDiscBins.grid(row=2, column=1, sticky=W)
        self.labelErr.grid(row=4, column=0, columnspan=4, sticky=W)
        self.build_button.grid(row=5, column=1, columnspan=2)
        self.classify_button.grid(row=6, column=1, columnspan=2)
        self.close_button.grid(row=7, column=1, columnspan=2)

    def gridDefinition(self, master):
        master.grid_rowconfigure(0, weight=2)
        master.grid_rowconfigure(1, weight=1)
        master.grid_rowconfigure(2, weight=1)
        master.grid_rowconfigure(3, weight=1)
        master.grid_rowconfigure(4, weight=1)
        master.grid_rowconfigure(5, weight=1)
        master.grid_rowconfigure(6, weight=1)
        master.grid_rowconfigure(7, weight=1)
        master.grid_columnconfigure(0, weight=1)
        master.grid_columnconfigure(1, weight=1)
        master.grid_columnconfigure(2, weight=1)
        master.grid_columnconfigure(3, weight=2)
    # </editor-fold>

    # Build button was clicked
    def build(self):
        #try:
        self.train = pd.read_csv(self.entryPath.get() + "/train.csv")
        if self.validate(self.entryDiscBins.get()):
            # load train file, test file and structure file
            if os.path.getsize(self.entryPath.get() + "/Structure.txt") == 0:
                raise Exception("The structure file is empty")
            self.structureFile = open(self.entryPath.get() + "/Structure.txt")
            self.fileHandler = FilesHandler()
            self.structureDic = self.fileHandler.createStstructureDic(self.structureFile)
            self.dataCleaner = DataCleaner(self.structureDic, self.numOfBins)
            self.toLowerCase("train")
            self.train = self.dataCleaner.trainCleaning(self.train)
            self.classifier = Classifier(self.train, self.entryPath.get(),
                                         self.structureDic, self.numOfBins)
            self.wasBuilt = True
            tkMessageBox.showinfo("Build Message", "Building classifier using train-set is done!")
        #except Exception as e:
        #    tkMessageBox.showinfo("Error Message", "Something went wrong:\n" + str(e))

    # Classify button was clicked
    def classify(self):
        try:
            if self.wasBuilt:
                self.test = pd.read_csv(self.entryPath.get() + "/test.csv")
                self.toLowerCase("test")
                self.test = self.dataCleaner.testCleaning(self.test)
                self.classifier.classify(self.test)
                tkMessageBox.showinfo("Classify Message",
                                      "Classifying the test-set to the chosen path is done!")
                sys.exit(0)
            else:
                tkMessageBox.showinfo("Error Message", "Please build before Classifying")
        except Exception as e:
            tkMessageBox.showinfo("Error Message", "Something went wrong:\n" + str(e))

    # Transfer the dataset to lowercase
    def toLowerCase(self, file):
        for attribute in self.structureDic:
            if self.structureDic[attribute] != "NUMERIC" and attribute != "class":
                if file == "train":
                    self.train[attribute] = self.train[attribute].str.lower()
                else:
                    self.test[attribute] = self.test[attribute].str.lower()

    # Open a directory-chooser dialog
    def askopenfile(self):
        self.filePath = tkFileDialog.askdirectory()
        self.entryPath.delete(0, END)
        self.entryPath.insert(0, self.filePath)

    # Validate input
    def validate(self, new_text):
        if not new_text:  # the field is being cleared
            self.labelErr['text'] = "Please enter a number"
            return False
        try:
            self.numOfBins = int(new_text)
            # check that the number is valid
            if self.numOfBins < 1:
                self.labelErr['text'] = "The number for \"Discretization Bins\" should be bigger than 0"
                return False
            elif self.numOfBins > self.train.count()[0]:
                self.labelErr['text'] = "\"Discretization Bins\" shouldn't be higher than the number of records"
                return False
            # the number is valid
            else:
                self.labelErr['text'] = ""
                return True
        except ValueError:
            self.labelErr['text'] = "Invalid input - Please enter a number"
            return False
if mainin == "1":
    catimage = CattoPoster.catgetimgfile()
    catfact = CattoPoster.catgettextfile()
    # Log the used image so it is not posted again.
    with open('UsedImages.txt', 'a') as filer:
        filer.write("\n" + catimage)
    tweetMe.goTweet(catfact, catimage)
elif mainin == "2":
    dogimage = DoggoPoster.doggetimgfile()
    doggofacto = DoggoPoster.doggettextfile()
    with open('UsedImages.txt', 'a') as filer:
        filer.write("\n" + dogimage)
    tweetMe.goTweet(doggofacto, dogimage)
elif mainin == "3":
    print("Loading the cleaner...")
    print("THERE HAS TO BE AT LEAST 2 FILES IN EACH .txt FILE !\n")
    DataCleaner.cleaner()
elif mainin == "4":
    print("Downloading doggos...")
    dogsDownloader.dog()
elif mainin == "5":
    print("Downloading cattos...")
    catsDownloader.cat()
elif mainin == "6":
    print("Loading...")
    # Imported lazily: the module pulls in too many dependencies anyway,
    # and we don't want to bloat startup.
    import imageChecker
    imageChecker.checkmypics()
else:
    print("No. Don't be rude.")
# used to remove car names from the data array
cols_rmv = [8]

# represents the data split (training, validation)
size = [.75, .25]
split_selection = list()
runs = 15
for x in range(0, runs):
    split_selection.append(size)

print('Number of Runs: ', runs)
print("Data Split: ", split_selection[0])

# get the data using the data cleaner;
# returns a 2D array where rows are observations and columns
# are the attributes of a specific observation
data_array = DataCleaner.data_cleaner("CarData.txt")

# used to do Linear Regression. Arguments are:
#   data_array: the data array created with DataCleaner
#   imputation: the user's choice of imputation
#   cont_dis: array marking which columns/attributes are continuous or discrete (0/1)
#   cols_rmv: columns to remove from the data set; here it is car_name
#   bad data signal ('?'): used to determine if and which data points are missing
#   split_selection: array controlling how many tests are run and the split
#                    between test and validation sets
Regression.perform_regression(list(data_array), imputation, cont_dis, cols_rmv,
                              '?', 0, split_selection)
midfield = (
    fifaCleaner.getAllPlayersInPosition(midfielderPositions, fifaData))
defenders = fifaCleaner.convertHeightAndWeight(defenders)
midfield = fifaCleaner.convertHeightAndWeight(midfield)
defenders = fifaCleaner.ConvertMonetaryValue(defenders)
midfield = fifaCleaner.ConvertMonetaryValue(midfield)
# Note: filling a frame's NaNs from the same frame leaves them unchanged.
defenders = defenders.fillna(defenders)
midfield = midfield.fillna(midfield)
missing_Defender_Data = util.Missing(defenders)
missing_midfielder_Data = util.Missing(midfield)
util.SideSide(missing_Defender_Data, missing_midfielder_Data)
print('Missing Data')
X_train, X_test, y_train, y_test = dc.Spliting(defenders, 'BP')
RF_Model = RandomForestClassifier(max_features='sqrt', max_leaf_nodes=5)
analysis.ApplyModel(X_train, y_train, RF_Model)
for position in arrayOfPositions:
    players = fifaCleaner.getAllPlayersInPosition(position, fifaData)
    playerDataFrame = pd.DataFrame(players)
    print(playerDataFrame.info())
    analysis.CorrelationMatrix('Overall', playerDataFrame)
    index = arrayOfPositions.index(position)
    if index == 0:
        fileName = 'GoalKeepers'
    elif index == 1:
        fileName = 'Defenders'
rf = RandomForestClassifier(n_estimators=10, max_features=n_features, max_depth=None,
                            min_samples_split=2, bootstrap=True)
print('Starting RandomForest training process....')
rf.fit(trainData_X, trainData_Y)
# rf.score reports mean accuracy
print('Accuracy of training data: ' + str(rf.score(trainData_X, trainData_Y)))
print('Accuracy of testing data: ' + str(rf.score(testData_X, testData_Y)))

# In[52]:

data, label = DataCleaner.loadData()
trainData_X, trainData_Y, testData_X, testData_Y = seperateData(data, label)
KNN(trainData_X, trainData_Y, testData_X, testData_Y)

# In[53]:

DecisionTree(trainData_X, trainData_Y, testData_X, testData_Y)

# In[54]:

logisticRegression(trainData_X, trainData_Y, testData_X, testData_Y)

# In[66]:

SVM(trainData_X, trainData_Y, testData_X, testData_Y)
'highway:track', 'highway:trunk', 'highway:trunk_link', 'highway:unclassified'

# Color edges by highway type (this works, but is static)
ec = ['blue' if data['highway'] == 'tertiary_link'
      else 'green' if data['highway'] == 'residential'
      else 'yellow' if data['highway'] == 'tertiary'
      else 'orange' if data['highway'] == 'secondary'
      else 'red' if data['highway'] == 'primary'
      else 'blue'
      for u, v, key, data in G.edges(keys=True, data=True)]
ox.plot_graph(G, fig_height=8, fig_width=8, node_size=0, edge_color=ec)

# Run cleaning methods
df_bike_accidents_in_region = DataCleaner.clean_crashes(df_all_traffic_accidents, polygon,
                                                        north, south, east, west)
df_traffic_studies_in_region = DataCleaner.clean_traffic(df_all_traffic_studies, polygon,
                                                         north, south, east, west)
df_undirected_edges_with_features = DataCleaner.edge_featurizer(df_edges_undirected,
                                                                column_name_list)
# quick sanity checks on the cleaned data
df_undirected_edges_with_features.columns
len(df_traffic_studies_in_region)
len(df_bike_accidents_in_region)

## some plots
fig, ax = ox.plot_graph(G_undirected, node_zorder=2, node_size=0.03, node_alpha=0.1,
                        node_color='k', bgcolor='k', edge_linewidth=0.4, use_geom=True,
                        axis_off=False, show=False, close=False)
ax = df_bike_accidents_in_region.plot(kind='scatter', x='DEC_LONG', y='DEC_LAT', s=1,
                                      fig=fig, label='Bike Accident', ax=ax, color='r')
ax = df_traffic_studies_in_region.plot(title='Plotting the 3 Main Datasets', kind='scatter',
                                       x='X', y='Y', s=3, c='y', label='Traffic Study',
                                       fig=fig, ax=ax)
df_traffic_studies_in_region.setyear.describe()
df_traffic_studies_in_region.plot.scatter(x='setyear', y='aadb')
from DataCleaner import *

# dir format: /Users/mac/Desktop/CityWander/
cleaner = DataCleaner("Shanghai", "/Users/mac/Desktop/CityWander/")
cleaner.get_file_name()
cleaner.error_point_name()
cleaner.deleter_name()
cleaner.get_info()
cleaner.error_point_info()
cleaner.deleter_info()
cleaner.error_info_to_name()
cleaner.deleter_info_to_name()
from DataCleaner import *

# dir format: /Users/mac/Desktop/CityWander/
cleaner = DataCleaner("Guangzhou", "/data/yyh/CityWander/")
print("City:", cleaner.city_name)
cleaner.get_file_name()         # list every file in the street-view directory
cleaner.error_point_name()      # flag bad street-view points identified from file names
cleaner.deleter_name()          # delete the files flagged by error_point_name
cleaner.get_info()              # collect the info of the remaining street-view files
cleaner.error_point_info()      # using the info from get_info, find points with bad info
cleaner.deleter_info()          # delete the bad points found by error_point_info
cleaner.error_info_to_name()    # find points whose file name and info do not match
cleaner.deleter_info_to_name()  # delete the mismatched points
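# A minimal convenience wrapper (an assumption, not part of the original code):
# it runs the same cleaning stages as above, in order, for any city.
def run_full_clean(city, base_dir):
    cleaner = DataCleaner(city, base_dir)
    for step in (cleaner.get_file_name, cleaner.error_point_name, cleaner.deleter_name,
                 cleaner.get_info, cleaner.error_point_info, cleaner.deleter_info,
                 cleaner.error_info_to_name, cleaner.deleter_info_to_name):
        step()

run_full_clean("Guangzhou", "/data/yyh/CityWander/")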