def clean_data(self):
        dataset_dir = self.config['dataset_dir']
        work_dir = self.config['work_dir']

        # set log file and timer
        log_file = os.path.join(work_dir, 'logs/log_clean_data.txt')
        self.logger.set_log_file(log_file)
        # create a local timer
        local_timer = Timer('Data cleaning Module')
        local_timer.start()

        # clean data
        cleaned_data_dir = os.path.join(work_dir, 'cleaned_data')
        if os.path.exists(cleaned_data_dir):  # remove cleaned_data_dir
            shutil.rmtree(cleaned_data_dir)
        os.mkdir(cleaned_data_dir)

        # check if dataset_dir is a list or tuple
        if not isinstance(dataset_dir, (list, tuple)):
            dataset_dir = [dataset_dir]
        clean_data(dataset_dir, cleaned_data_dir)

        # stop local timer
        local_timer.mark('Data cleaning done')
        logging.info(local_timer.summary())
Example No. 2
def show_results():
    '''
    Main function: runs all of the functions above. Originally named
    "main()", but that name could be confused with other scripts' main functions.
    '''
    # Import other scripts' functions
    from clean_data import clean_data
    from process_data import match_error, match_samples
    # Clean data
    trollWords = clean_data(sample_size=300, path='tweets.csv')
    normalWords = clean_data(sample_size=300, path='election_day_tweets.csv')
    # Analyze
    matched = match_error(normalWords, trollWords, ['vote'])
    comparison_words = [
        'vote', 'trump', 'hillary', 'hillari', 'clinton', 'donald', 'amp'
    ]
    comparisons = match_samples(normalWords, trollWords, comparison_words)
    # Display results
    show_comparison(comparisons)
    list1 = ['hillari', 'clinton', 'hillary']
    list2 = ['donald', 'trump']
    show_cumulative_comparison(comparisons, list1, list2)
    list1 = ['hillari', 'hillary']
    list2 = ['hillari', 'hillary', 'clinton', 'donald', 'trump']
    show_special_comparison(comparisons, list1, list2)
    show_wordcloud(words=trollWords)  # 'words' was undefined in the source; trollWords is assumed here
    show_histogram(trollWords, title='Russian Twitter Troll Word Frequency')
    show_histogram(normalWords, title='User Political Tweet Word Frequency')
    show_histogram(matched, title='Word Comparison')
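
A minimal entry-point sketch, assuming show_results is meant to be run as a script (per the docstring note that it replaces a conventional main()):

if __name__ == '__main__':
    show_results()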
def get_data(filename=None):
    '''Load raw data from a file and return training data and responses.

    Parameters
    ----------
    filename: The path to a json file containing the raw data and response.

    Returns
    -------
    X: A numpy array containing the independent data used for training.
    y: A numpy array containing labels, used for model response.
    '''
    df = pd.DataFrame()
    if filename is None:
        df = pd.read_csv('data/clean_data.csv')
        del df['Unnamed: 0']
    else:
        df = pd.read_json(filename)
        df = clean_data(df, save=True)

    # These columns are only used in nlp
    del df['description']
    del df['org_desc']

    y = df.pop('fraud_target')
    X = df.values
    return X, y
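
A short usage sketch for get_data, based on its docstring (the JSON path below is hypothetical):

X, y = get_data()                           # falls back to data/clean_data.csv
X2, y2 = get_data('data/raw_events.json')   # hypothetical raw JSON export, cleaned via clean_data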
Example No. 4
    def update(self):
        content = self.__retrieve_content__('', self.link)
        title = BeautifulSoup(content).find('title').contents[0]
        content, comments_in_content = clean_data(content, '')
        for i in range(0, len(comments_in_content)):
            self.entries.append(
                FeedEntry.FeedEntry(self.link + '#comment' + str(i),
                                    None,
                                    comments_in_content[i],
                                    comments_in_content[i],
                                    title,
                                    self.link,
                                    None,
                                    None,
                                    None))

        self.entries.append(
                FeedEntry.FeedEntry(self.link,
                                    None,
                                    content,
                                    content,
                                    title,
                                    self.link,
                                    None,
                                    None,
                                    None))
Example No. 5
def main(args):
    if not check_files():
        verify_download()
        print("\nDownloads verified.")
        print("Files Downloaded.\nCleaning Datasets...")
    clean_data()
    # human_ppi, yeast_ppi, human_complex, yeast_complex
    data_set = args["-d"].split(",")
    if len(data_set) > 1:
        raise Exception("Only one data set at a time allowed")

    methods = args["-m"].split(",")

    if data_set[0][-3:] == "ppi":  # data_set is a list; check the suffix of the single allowed entry
        graph.get_metrics(methods, data_set)
    else:
        scn.generate_metrics(methods, data_set)
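
A hypothetical invocation sketch for main above; it reads docopt-style keys, and the dataset names come from the comment inside the function (the method names here are placeholders):

if __name__ == "__main__":
    main({"-d": "human_ppi", "-m": "method_a,method_b"})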
Example No. 6
def init():
    teams = TEAMS.copy()
    players = clean_data(PLAYERS)
    balanced_teams = balance_teams(teams, players)
    intro_msg()
    while True:
        team_index = get_user_selected_team(balanced_teams)
        display_team_stats(balanced_teams[team_index])
Example No. 7
def predict(filename):
    df = pd.read_json('data/' + filename)
    df = clean_data(df, training=False)

    with open('data/model.pkl', 'rb') as infile:
        model = pickle.load(infile)

    return model.predict(df.values)
Example No. 8
def predict(new_data):
    df = pd.DataFrame([new_data])
    # df = df.from_dict([new_data])
    df = clean_data(df, training=False)
    
    with open('data/model.pkl', 'rb') as infile:
        model = pickle.load(infile)
    
    return model.predict(df.values)[0]
Example No. 9
def check_news_type(news_article):
    news_article = [
        ' '.join([
            Word(word).lemmatize()
            for word in clean_data(news_article).split()
        ])
    ]
    features = vect.transform(news_article)

    return str(model.predict(features)[0])
Example No. 10
def clean_data(html_list):
    html_list_cleaned = []

    for html in html_list:
        # clean program language
        html_str = cd.clean_data(html)

        if html_str != '':
            html_list_cleaned.append(html_str)

    return html_list_cleaned
Example No. 11
def append_data():
    df = load_data.collect_data_spkr('full', online=True)
    baseline_path = load_data.get_newest_download('C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Umsatz_complete')
    de = pd.read_csv(baseline_path)
 
    df = clean_data.clean_data(df)
    de = clean_data.clean_data(de) 

    df.set_index(['Valutadatum', 'Verwendungszweck'], verify_integrity=True, inplace=True)
    de.set_index(['Valutadatum', 'Verwendungszweck'], verify_integrity=True, inplace=True)

    dc = de.combine_first(df)
    dc['Tags'] = dc['Tags'].fillna(' ')
    dc['Category'] = dc['Category'].fillna(' ')

    dc = dc.reset_index()

    dc.to_csv("C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Updated_Umsatz_"+ str(time.strftime("%d%m%Y")) +".csv")

    return dc
Example No. 12
def run_previous_prediction(temp):
    print(int(temp))
    results = tab.find({'prediction': int(temp)})
    r = random.randint(0, results.count()-1)
    result = results[r]
    df_all = pd.DataFrame.from_dict(result, orient='index').transpose()
    # df_all['object_id'] = 0
    df = clean_data(df_all.copy(), training=False)
    prediction = df['prediction'][0]
    prediction_proba = df['prediction_proba'][0]

    return prediction, prediction_proba, df_all
Example No. 13
    def update(self):
        content = self.__retrieve_content__('', self.link)
        title = BeautifulSoup(content).find('title').contents[0]
        content, comments_in_content = clean_data(content, '')
        for i in range(0, len(comments_in_content)):
            self.entries.append(
                FeedEntry.FeedEntry(self.link + '#comment' + str(i), None,
                                    comments_in_content[i],
                                    comments_in_content[i], title, self.link,
                                    None, None, None))

        self.entries.append(
            FeedEntry.FeedEntry(self.link, None, content, content, title,
                                self.link, None, None, None))
Example No. 14
File: main.py Project: mmmlung/MFP
def init():
    df = load_data.collect_data_spkr('full', True)
    print("Success load")
    dc = clean_data.clean_data(df)
    print("Succes clean")
    dd = clean_data.tag_data(dc)
    print('Succsess tagging')
    de = clean_data.categorize_data(dd)
    print('Succes categorizing')
    de = update_data.tag_updated_data_ui(de)
    de = update_data.categorize_updated_data_ui(de)

    de.to_csv('C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Umsatz_complete/Umsatz_init'+str(time.strftime("%d%m%Y")) +".csv")

    return de
def get_results(id):
    tweets = get_related_tweets(id)
    tweets['cleaned_text'] = tweets['tweet_text'].apply(
        lambda x: clean_data(x))
    tweets['length'] = tweets['cleaned_text'].apply(lambda x: len(x))
    tweets = tweets[tweets['length'] < 200]
    text_sequences_to_predict = tokenize(tweets['cleaned_text'])
    preds = model.predict(text_sequences_to_predict)
    preds = list(preds)
    preds = make_prediction(preds)
    tweets['prediction'] = preds
    tweets['prediction'] = tweets['prediction'].replace({
        0: 'negative',
        1: 'positive'
    })
    tweets['entities'] = tweets['tweet_text'].apply(lambda x: find_entities(x))
    return tweets
Example No. 16
def parse_data(city, labels):

    path = "datasets/" + "-".join(city.split()).lower() + "/" + "_".join(
        city.split()).lower() + ".csv"

    with open(path) as f:
        reader = csv.reader(f)
        contents = [row for row in reader]

    exhibitions = set()
    auction_houses = set()

    def get_sales_for_exhibition(exhibition, contents):
        return [row for row in contents if row[2] == exhibition]

    for row in contents:
        auction_houses.add(row[1])
        exhibitions.add(row[2])

    size = len(labels)

    city_data = [labels]

    while len(exhibitions) != 0:

        exhibition = exhibitions.pop()

        sales = get_sales_for_exhibition(exhibition, contents)

        exhibition_data = clean_data(sales, labels)

        # print (len(sales) - len(exhibition_data))

        city_data += exhibition_data

    with open("datasets/" + "-".join(city.split()).lower() + "/" + "data.csv",
              "wb") as my_file:
        wr = csv.writer(my_file)
        wr.writerows(city_data)
Example No. 17
def run_prediction():
    with open('../models/model.pkl', 'rb') as f:
        model = pickle.load(f)
    url = 'http://galvanize-case-study-on-fraud.herokuapp.com/data_point'
    result = requests.get(url).json()
    df_all = pd.DataFrame.from_dict(result, orient='index').transpose()
    df = clean_data(df_all.copy(), training=False)

    del df['description']
    del df['org_desc']

    X = df.values
    print(X)
    prediction = int(model.predict(X)[0])
    prediction_proba = model.predict_proba(X)[0][1]

    insert = df_all.to_dict(orient='records')[0]
    insert['prediction'] = prediction
    insert['prediction_proba'] = prediction_proba

    if not bool(tab.find({'object_id': df_all['object_id'][0]}).count()):
        tab.insert_one(insert)
    return prediction, prediction_proba, df_all
Example No. 18
def extract_naver_map():
    TITLE = []
    ADDRESS = []
    PHONE = []
    URL = []
    query = loc.get()+" "+keyword.get()
    browser = open_browser(query)
    wait = WebDriverWait(browser, 30)
    by_xpath = By.XPATH, "//object[@id='searchIframe']"
    wait.until(EC.presence_of_element_located(by_xpath))
    time.sleep(3)
    search_frame = browser.find_element_by_xpath("//object[@id='searchIframe']")
    browser.switch_to.frame(search_frame)
    last_page = int(get_pages(browser))
    get_browser(browser, query)
    wait.until(EC.presence_of_element_located(by_xpath))
    search_frame = browser.find_element_by_xpath("//object[@id='searchIframe']")
    browser.switch_to.frame(search_frame)
    for p in range(last_page):
        print(f"----------------------------------------------------\n\nextracting page{p+1}/{last_page}\n\n----------------------------------------------------\n\n")
        time.sleep(1)
        while True:
            atags_1 = browser.find_elements_by_class_name('_2aE-_')
            if len(atags_1) == 0:
                atags_1 = browser.find_elements_by_class_name('Tx7az') 
            browser.execute_script("document.querySelector('._1Az1K').scrollTo(document.querySelector('._1Az1K').scrollTop, document.querySelector('._1Az1K').scrollHeight);")
            atags = browser.find_elements_by_class_name('_2aE-_')
            if len(atags) == 0:
                atags = browser.find_elements_by_class_name('Tx7az')
            if len(atags_1) == len(atags):
                break
        print(f"현 페이지 총 아이템 수: {len(atags)}\n\n")
        #extract
        by_xpath = By.XPATH, '//object[@id="entryIframe"]'
        for a in atags:
            a.click()
            time.sleep(1)
            browser.switch_to_default_content()
            wait.until(EC.presence_of_element_located(by_xpath))
            url = browser.find_elements_by_tag_name('object')[1].get_attribute('data')
            browser.execute_script("window.open('');")
            browser.switch_to_window(browser.window_handles[-1])
            browser.get(url)
            try:
                html = browser.execute_script('return document.body.outerHTML')
                soup = BeautifulSoup(html,'html.parser')
            except WebDriverException:
                browser.close()
                browser.switch_to_window(browser.window_handles[0])
                entry_frame = browser.find_element_by_xpath('//object[@id="entryIframe"]')
                browser.switch_to_frame(entry_frame)
                html = browser.execute_script('return document.body.outerHTML')
                soup = BeautifulSoup(html,'html.parser')
                browser.switch_to_default_content()
            title = soup.find('span', {'class': '_3XamX'}).text
            address = soup.find('span',{'class': '_2yqUQ'}).text
            phone = soup.find('li', {'class': '_3xPmJ'})
            if phone:
                phone = phone.text.split('안내')[0]
            else:
                phone = None
            if len(browser.window_handles) >1 :
                browser.close()
                browser.switch_to_window(browser.window_handles[0])
            else:
                pass
            browser.switch_to.frame(search_frame)
            TITLE.append(title)
            ADDRESS.append(address)
            PHONE.append(phone)
            URL.append(url)
            df = pd.DataFrame({'상호명': TITLE, '주소': ADDRESS, '전화번호': PHONE, '링크': URL})
            df.to_csv(f'{query}.csv', encoding='utf-8')
        # click next page
        next_btn = browser.find_elements_by_class_name('_3pA6R')[1]
        next_btn.click()
    print('finished!')
    clean_data(query)
    messagebox.showinfo('info', 'Done')
"""Creates the Ancient_Greek_ML dataset and then prepares the train, dev and test sets for the character-level BERT."""
from clean_data import clean_data
from sentence_tokenization import sentence_tokenize_corpus
from split_data import split_data
import os

os.chdir("../data")
clean_data()
sentence_tokenize_corpus()
split_data()
Example No. 20
def test_clean_data(path="../data/train.json"):
    # Renamed from clean_data to avoid shadowing the imported clean_data this test exercises.
    df = clean_data(path)
    assert type(df) is pd.core.frame.DataFrame
    assert type(df['ingredients_string'][0]) is str
Example No. 21
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from clean_data import clean_data, impute_matrix, normalize_data
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation

X, y = clean_data('AusOpen-men-2013.csv', 'AusOpen-women-2013.csv', 'FrenchOpen-men-2013.csv',
                  'FrenchOpen-women-2013.csv', 'USOpen-men-2013.csv', 'Wimbledon-men-2013.csv',
                  'Wimbledon-women-2013.csv')

# 4762 NA values; we remove these through mean imputation
X = impute_matrix(X)

# mean normalize and feature scale X
X = normalize_data(X)

clf = SGDClassifier(loss='hinge', alpha=.0001)
clf.fit(X, y)
scores = cross_validation.cross_val_score(clf, X, y, cv=5)
print "Accuracy: %0.2f (+/- %02f)" % (scores.mean(), scores.std()*2)



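The comments in this example mention mean imputation and mean normalization with feature scaling; a minimal NumPy sketch of what impute_matrix and normalize_data could look like (hypothetical implementations, not the project's own code):

import numpy as np

def impute_matrix(X):
    # Replace each NaN with the mean of its column.
    col_means = np.nanmean(X, axis=0)
    rows, cols = np.where(np.isnan(X))
    X[rows, cols] = col_means[cols]
    return X

def normalize_data(X):
    # Subtract the column mean and divide by the column standard deviation.
    return (X - X.mean(axis=0)) / X.std(axis=0)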
Example No. 22
def clean_dat():
    clean_data.clean_data(io_files.shuffle_training_path, io_files.clean_path)
Example No. 23
    x_values = list(range(len(importances)))
    # Make a bar chart
    plt.bar(x_values, importances, orientation='vertical')
    # Tick labels for x axis
    plt.xticks(x_values, feature_list, rotation='vertical')
    # Axis labels and title
    plt.ylabel('Importance')
    plt.xlabel('Variable')
    plt.title('Variable Importances')


#df = pd.read_csv('JEOPARDY_CSV.csv')
df = pd.read_csv("jeopardy_cats.csv")

# clean the data
df = clean_data(df, classification=False)

# create basic features from the data
#df = featurize(df)

y = df['Value']

X = df.drop([
    'Value', 'Show Number', 'Air Date', 'Round', 'Category', 'Question',
    'Answer'
],
            axis=1,
            inplace=False)

feature_list = list(X.columns)
Example No. 24
def clean_dfs(dfs):
    import clean_data as cld
    cleaned = []
    for df in dfs:
        cleaned.append(cld.clean_data(df))
    return cleaned
Example No. 25
    def update(self):
        self.feedEntries = []
        self.lastUpdated = time.time()

        if re.search('reddit', self.url) or re.search('imbd', self.url):
            return

        print 'updating ' + self.url
        feed = feedparser.parse(self.url)
        if len(feed.entries) == 0:
            return
        if len(feed.entries) > 1000:
            print 'More than 1000 entries in feed: ' + self.url
            
        firstEntry = feed['entries'][0].link
        for entry in feed.entries:
            author = None
            comments = []
            guid = None
            updated = None
            summary = ""

            if entry.link == self.__lastEntry__:
                self.__lastEntry__ = firstEntry
                return
            try:
                author = entry.author
            except AttributeError:
                pass

            try:
                updated = entry.updated
            except AttributeError:
                pass

            try:
                summary = entry.summary
            except AttributeError:
                pass

            summary = summary.encode('utf-8')
            
            content = self.__retrieve_content__(summary, entry.link)
            content, comments_in_content = clean_data(content, summary)

            for i in range(0, len(comments_in_content)):
                comments.append(
                    FeedEntry.FeedEntry(entry.link + '#comment' + str(i),
                                        feed.url,
                                        comments_in_content[i].encode('utf-8'),
                                        comments_in_content[i].encode('utf-8'),
                                        (entry.title + ' Comment ' + str(i)).encode('utf-8'),
                                        (entry.link + '#comment' + str(i)).encode('utf-8'),
                                        '',
                                        None,
                                        updated))

            self.feedEntries.append(
                FeedEntry.FeedEntry(entry.link.encode('utf-8'),
                                    feed.url,
                                    content.encode('utf-8'),
                                    content.encode('utf-8'),
                                    entry.title.encode('utf-8'),
                                    entry.link.encode('utf-8'),
                                    author.encode('utf-8') if author else None,
                                    comments,
                                    updated))
            
        self.__lastEntry__ = firstEntry
Example No. 26
        if 'module' in str(globals()[var]): continue


clear_all()

generate_visualization = True

# load the tweets dataset
df = pd.read_csv('./input/Tweets_Mg.csv', encoding='utf-8')

visualization.distr_qtd_carac(df)

################################################################
# Preliminary cleanup of the tweet text column
################################################################
dataset = clean.clean_data(dataset=df, shuffle=False)

dataset, stops = clean.apply_text_processing(dataset)
######################################
# Splitting the data into its classes
######################################
tweets = dataset["Text"].values
classificacao = dataset["Classificacao"].values

######################################
# Split the dataset:
#   80% for training
#   20% for testing
######################################
SEED = 8188
x_train, x_test, y_train, y_test = train_test_split(tweets,
Example No. 27
# drop the "id" column. There is no use for patient ids in this anlysis.
X.drop("id", axis=1, inplace=True)

# print(X.gender.value_counts()) # checking different values for gender
# Only 1 instance with gender=Other. The rest are either male or female
# Removing the instance with gender=Other
X = X[X.gender != "Other"]
# print(X.gender.value_counts()) # rechecking gender values

visualize.visualize(X)

# extracting the response variable
y = X.pop("stroke")

X = clean_data.clean_data(X)
print(20 * "*" + " Data cleaning ended successfully!")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=23)
rf = rfc.rfc(X_train, X_test, y_train, y_test)
print(20 * "*" + " Machine learning modeling ended successfully!")

# save model using joblib
FILENAME = "saved_model.sav"
joblib.dump(rf, FILENAME)

# load the model from disk
loaded_model = joblib.load(FILENAME)
print("The optimized model: \n", loaded_model)
Example No. 28
import os
import pickle
import numpy as np
import pandas as pd
from textblob import Word
from clean_data import clean_data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.model_selection import train_test_split

data = pd.read_csv("csv/data.csv", encoding='cp1252')
article_text = data['article'].tolist()
article_category = data['category'].tolist()

#print(data.head())

for i, value in enumerate(article_text):
    print("cleaning data:", i)
    article_text[i] = ' '.join(
        [Word(word).lemmatize() for word in clean_data(value).split()])

vect = TfidfVectorizer(stop_words='english', min_df=10)
#vect = CountVectorizer()
X = vect.fit_transform(article_text)
Y = np.array(article_category)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=50)

print("training size", X_train.shape)
print("Testing size:", X_test.shape)
from sklearn.linear_model import PassiveAggressiveClassifier
Example No. 29
import math
import os

import numpy as np
import tensorflow as tf

from clean_data import clean_data
from config import configs

config = configs()
clean_data = clean_data()

batch_size = config.batch_size
buckets = config.buckets
hidden_size = config.hidden_size
steps_per_checkpoint = config.steps_per_checkpoint
learning_rate = config.learning_rate
max_gradient_norm = config.max_gradient_norm

x_train = clean_data.x_train
Example No. 30
import sklearn.linear_model as lm
from matplotlib.pyplot import figure, boxplot, xlabel, ylabel, show
import numpy as np
from scipy.io import loadmat
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
import torch
from __init__ import train_neural_net, draw_neural_net
from scipy import stats
from clean_data import clean_data, transform_data

#-----------------------LOADING DATA----------------------------

data = clean_data('Datasets/**videos.csv')
data = transform_data(
    data, ['likes', 'dislikes', 'views', 'comment_count', 'trending_time'])
np.random.seed(180820)
data = data.head(100000)
X = np.array(
    data[['likes', 'dislikes', 'views', 'comment_count', 'trending_time']])
#y = np.array(data['views']).squeeze()
data['class'] = np.where(data["trending_time"] <= 3., 1, 0.)
y = np.where(data["trending_time"] <= 3., 1, 0.)
#X = np.array(data)
#y = X[:,[4]]
#X = X[:,0:4]
attributeNames = [
    'likes', 'dislikes', 'views', 'comment_count', 'trending_time'
]
Example No. 31
import pandas as pd
import os
os.chdir('/home/tomas/Kaggle/Santander')

from clean_data import clean_data
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

#Import samples
df = pd.read_csv('data/sample.csv', low_memory=False)

df_clean = clean_data(df)
#Split columns into train data and labels
data = df_clean.ix[:, :12]
labels = df_clean.ix[:, 12:]

#Split into training, validation and test sets
x_train, x_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=0)

#Normalize data
classifier = Pipeline([
    #Normalizer
    ('clf', OneVsRestClassifier(KNeighborsClassifier()))
])
Example No. 32
# thingspeak-read.py

# use Python to read a set of datapoints off of ThingSpeak.com and
# then plot the data with matplotlib.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from user_input import user_input
from call_api import call_api
from clean_data import clean_data
from plot_data import plot_data

w_type, n_data_pts = user_input()
df = call_api(w_type, n_data_pts)
data = clean_data(df)
plot_data(data, w_type)
Example No. 33
def gen_data(args):
    """
    Clean raw data so it can be processed.
    
    :param args: args for gen_data
    :type args: Namespace
    """
    # pull args out
    length = args.len
    split = args.split
    clean_dir = args.output_data
    unclean_dir = args.target_data
    spy_data = args.spy_data

    # do some checks
    try:
        assert length > 0, "len must be positive"
        assert split <= 1.0 and split > 0, "split must be between 0-1"
        assert os.path.exists(clean_dir), "output_data dir must exist"
        assert os.path.exists(unclean_dir), "target_data dir must exist"
    except AssertionError as err:
        logger.error("Failed check: {}".format(err))
        return

    # set the directories
    if spy_data == False:
        metadata_file = clean_dir + "METADATA.json"
        train_file = clean_dir + "Train.csv"
        eval_file = clean_dir + "Eval.csv"
    else:
        test_file = clean_dir + "Spy.csv"

    # get list of files
    list_of_files = common.file_list(unclean_dir)

    names = []
    for i in list_of_files:
        names.append(i['name'])

    match = ['02_SESSION_INFO', '03_CPU_INFO']
    # pull out matches
    meta_files = [
        s for s in list_of_files if any(m == s['name'] for m in match)
    ]
    for m in meta_files:
        list_of_files.remove(m)

    for m in meta_files:
        if m['name'] == '02_SESSION_INFO':
            session_file = m['path']
        elif m['name'] == '03_CPU_INFO':
            cpu_file = m['path']
        else:
            pass

    # process the data
    if spy_data == True:
        data, metadata = clean_data.clean_data(list_of_files, spy_data=True)
        logging.debug("DATA: {}".format(data))
        clean_data.write_to_csv(data,
                                metadata,
                                clean_dir,
                                test_file,
                                length,
                                spy_data=True)
    else:
        data, metadata = clean_data.clean_data(list_of_files)
        clean_data.write_to_csv(data, metadata, clean_dir, train_file, length,
                                split, eval_file, metadata_file)
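
gen_data reads its options from an argparse-style Namespace (len, split, output_data, target_data, spy_data); a hypothetical call could look like this (the paths are placeholders and must exist, with trailing separators since the function concatenates them directly):

from argparse import Namespace

args = Namespace(len=128,
                 split=0.8,
                 output_data="data/clean/",   # hypothetical, must exist
                 target_data="data/raw/",     # hypothetical, must exist
                 spy_data=False)
gen_data(args)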