def clean_data(self):
    dataset_dir = self.config['dataset_dir']
    work_dir = self.config['work_dir']

    # set log file and timer
    log_file = os.path.join(work_dir, 'logs/log_clean_data.txt')
    self.logger.set_log_file(log_file)
    # create a local timer
    local_timer = Timer('Data cleaning Module')
    local_timer.start()

    # clean data
    cleaned_data_dir = os.path.join(work_dir, 'cleaned_data')
    if os.path.exists(cleaned_data_dir):
        # remove any previous cleaned_data_dir
        shutil.rmtree(cleaned_data_dir)
    os.mkdir(cleaned_data_dir)

    # check if dataset_dir is a list or tuple
    if not isinstance(dataset_dir, (list, tuple)):
        dataset_dir = [dataset_dir, ]

    clean_data(dataset_dir, cleaned_data_dir)

    # stop local timer
    local_timer.mark('Data cleaning done')
    logging.info(local_timer.summary())
def show_results():
    '''Main entry point: runs all of the functions above.

    Originally named "main()", but that could be confused with other
    scripts' main functions.
    '''
    # Import other scripts' functions
    from clean_data import clean_data
    from process_data import match_error, match_samples

    # Clean data
    trollWords = clean_data(sample_size=300, path='tweets.csv')
    normalWords = clean_data(sample_size=300, path='election_day_tweets.csv')

    # Analyze
    matched = match_error(normalWords, trollWords, ['vote'])
    comparison_words = [
        'vote', 'trump', 'hillary', 'hillari', 'clinton', 'donald', 'amp'
    ]
    comparisons = match_samples(normalWords, trollWords, comparison_words)

    # Display results
    show_comparison(comparisons)

    list1 = ['hillari', 'clinton', 'hillary']
    list2 = ['donald', 'trump']
    show_cumulative_comparison(comparisons, list1, list2)

    list1 = ['hillari', 'hillary']
    list2 = ['hillari', 'hillary', 'clinton', 'donald', 'trump']
    show_special_comparison(comparisons, list1, list2)

    # word cloud of the troll-tweet vocabulary
    show_wordcloud(words=trollWords)

    show_histogram(trollWords, title='Russian Twitter Troll Word Frequency')
    show_histogram(normalWords, title='User Political Tweet Word Frequency')
    show_histogram(matched, title='Word Comparison')
def get_data(filename=None):
    '''Load raw data from a file and return training data and responses.

    Parameters
    ----------
    filename: The path to a json file containing the raw data and response.

    Returns
    -------
    X: A numpy array containing the independent data used for training.
    y: A numpy array containing labels, used for the model response.
    '''
    if filename is None:
        df = pd.read_csv('data/clean_data.csv')
        del df['Unnamed: 0']
    else:
        df = pd.read_json(filename)
        df = clean_data(df, save=True)

    # These columns are only used in NLP
    del df['description']
    del df['org_desc']

    y = df.pop('fraud_target')
    X = df.values
    return X, y
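# A minimal usage sketch for get_data() (the JSON path below is hypothetical):
# with no argument it reads data/clean_data.csv, with a filename it runs
# clean_data() on the raw JSON first.
X, y = get_data()
X_new, y_new = get_data('data/new_events.json')
print(X.shape, y.shape)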
def update(self):
    content = self.__retrieve_content__('', self.link)
    title = BeautifulSoup(content).find('title').contents[0]
    content, comments_in_content = clean_data(content, '')

    for i in range(len(comments_in_content)):
        self.entries.append(
            FeedEntry.FeedEntry(self.link + '#comment' + str(i), None,
                                comments_in_content[i], comments_in_content[i],
                                title, self.link, None, None, None))

    self.entries.append(
        FeedEntry.FeedEntry(self.link, None, content, content, title,
                            self.link, None, None, None))
def main(args):
    if not check_files():
        verify_download()
        print("\nDownloads verified.")

    print("Files Downloaded.\nCleaning Datasets...")
    clean_data()

    # human_ppi, yeast_ppi, human_complex, yeast_complex
    data_set = args["-d"].split(",")
    if len(data_set) > 1:
        raise Exception("Only one data set at a time is allowed")

    methods = args["-m"].split(",")
    if data_set[0][-3:] == "ppi":
        graph.get_metrics(methods, data_set)
    else:
        scn.generate_metrics(methods, data_set)
def init():
    teams = TEAMS.copy()
    players = clean_data(PLAYERS)
    balanced_teams = balance_teams(teams, players)

    intro_msg()
    while True:
        team_index = get_user_selected_team(balanced_teams)
        display_team_stats(balanced_teams[team_index])
def predict(filename):
    df = pd.read_json('data/' + filename)
    df = clean_data(df, training=False)
    with open('data/model.pkl', 'rb') as infile:
        model = pickle.load(infile)
    return model.predict(df.values)
def predict(new_data):
    df = pd.DataFrame([new_data])
    # df = df.from_dict([new_data])
    df = clean_data(df, training=False)
    with open('data/model.pkl', 'rb') as infile:
        model = pickle.load(infile)
    return model.predict(df.values)[0]
def check_news_type(news_article):
    news_article = [
        ' '.join([
            Word(word).lemmatize()
            for word in clean_data(news_article).split()
        ])
    ]
    features = vect.transform(news_article)
    return str(model.predict(features)[0])
def clean_data(html_list):
    html_list_cleaned = []
    for html in html_list:
        # clean program language
        html_str = cd.clean_data(html)
        if html_str != '':
            html_list_cleaned.append(html_str)
    return html_list_cleaned
def append_data():
    df = load_data.collect_data_spkr('full', online=True)
    baseline_path = load_data.get_newest_download(
        'C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Umsatz_complete')
    de = pd.read_csv(baseline_path)

    df = clean_data.clean_data(df)
    de = clean_data.clean_data(de)

    df.set_index(['Valutadatum', 'Verwendungszweck'],
                 verify_integrity=True, inplace=True)
    de.set_index(['Valutadatum', 'Verwendungszweck'],
                 verify_integrity=True, inplace=True)

    dc = de.combine_first(df)
    dc['Tags'] = dc['Tags'].fillna(' ')
    dc['Category'] = dc['Category'].fillna(' ')
    dc = dc.reset_index()
    dc.to_csv('C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Updated_Umsatz_'
              + str(time.strftime("%d%m%Y")) + '.csv')
    return dc
def run_previous_prediction(temp):
    print(int(temp))
    results = tab.find({'prediction': int(temp)})
    r = random.randint(0, results.count() - 1)
    result = results[r]

    df_all = pd.DataFrame.from_dict(result, orient='index').transpose()
    # df_all['object_id'] = 0
    df = clean_data(df_all.copy(), training=False)

    prediction = df['prediction'][0]
    prediction_proba = df['prediction_proba'][0]
    return prediction, prediction_proba, df_all
def init():
    df = load_data.collect_data_spkr('full', True)
    print("Success: load")
    dc = clean_data.clean_data(df)
    print("Success: clean")
    dd = clean_data.tag_data(dc)
    print("Success: tagging")
    de = clean_data.categorize_data(dd)
    print("Success: categorizing")

    de = update_data.tag_updated_data_ui(de)
    de = update_data.categorize_updated_data_ui(de)

    de.to_csv('C:/Users/adm-mlung/Desktop/Projekte/Secrets/data/Umsatz_complete/Umsatz_init'
              + str(time.strftime("%d%m%Y")) + '.csv')
    return de
def get_results(id):
    tweets = get_related_tweets(id)
    tweets['cleaned_text'] = tweets['tweet_text'].apply(lambda x: clean_data(x))
    tweets['length'] = tweets['cleaned_text'].apply(lambda x: len(x))
    tweets = tweets[tweets['length'] < 200]

    text_sequences_to_predict = tokenize(tweets['cleaned_text'])
    preds = model.predict(text_sequences_to_predict)
    preds = list(preds)
    preds = make_prediction(preds)

    tweets['prediction'] = preds
    tweets['prediction'] = tweets['prediction'].replace({
        0: 'negative',
        1: 'positive'
    })
    tweets['entities'] = tweets['tweet_text'].apply(lambda x: find_entities(x))
    return tweets
def parse_data(city, labels):
    path = ("datasets/" + "-".join(city.split()).lower() + "/"
            + "_".join(city.split()).lower() + ".csv")
    with open(path) as f:
        reader = csv.reader(f)
        contents = [row for row in reader]

    exhibitions = set()
    auction_houses = set()

    def get_sales_for_exhibition(exhibition, contents):
        return [row for row in contents if row[2] == exhibition]

    for row in contents:
        auction_houses.add(row[1])
        exhibitions.add(row[2])

    size = len(labels)
    city_data = [labels]
    while len(exhibitions) != 0:
        exhibition = exhibitions.pop()
        sales = get_sales_for_exhibition(exhibition, contents)
        exhibition_data = clean_data(sales, labels)
        # print(len(sales) - len(exhibition_data))
        city_data += exhibition_data

    with open("datasets/" + "-".join(city.split()).lower() + "/" + "data.csv",
              "wb") as my_file:
        wr = csv.writer(my_file)
        wr.writerows(city_data)
def run_prediction():
    with open('../models/model.pkl', 'rb') as f:
        model = pickle.load(f)

    url = 'http://galvanize-case-study-on-fraud.herokuapp.com/data_point'
    result = requests.get(url).json()
    df_all = pd.DataFrame.from_dict(result, orient='index').transpose()

    df = clean_data(df_all.copy(), training=False)
    del df['description']
    del df['org_desc']

    X = df.values
    print(X)
    prediction = int(model.predict(X)[0])
    prediction_proba = model.predict_proba(X)[0][1]

    insert = df_all.to_dict(orient='records')[0]
    insert['prediction'] = prediction
    insert['prediction_proba'] = prediction_proba
    if not bool(tab.find({'object_id': df_all['object_id'][0]}).count()):
        tab.insert_one(insert)

    return prediction, prediction_proba, df_all
def extract_naver_map():
    TITLE = []
    ADDRESS = []
    PHONE = []
    URL = []

    query = loc.get() + " " + keyword.get()
    browser = open_browser(query)
    wait = WebDriverWait(browser, 30)
    by_xpath = By.XPATH, "//object[@id='searchIframe']"
    wait.until(EC.presence_of_element_located(by_xpath))
    time.sleep(3)

    search_frame = browser.find_element_by_xpath("//object[@id='searchIframe']")
    browser.switch_to.frame(search_frame)
    last_page = int(get_pages(browser))

    get_browser(browser, query)
    wait.until(EC.presence_of_element_located(by_xpath))
    search_frame = browser.find_element_by_xpath("//object[@id='searchIframe']")
    browser.switch_to.frame(search_frame)

    for p in range(last_page):
        print(f"----------------------------------------------------\n\n"
              f"extracting page {p + 1}/{last_page}\n\n"
              f"----------------------------------------------------\n\n")
        time.sleep(1)

        # scroll the result list until no new items are loaded
        while True:
            atags_1 = browser.find_elements_by_class_name('_2aE-_')
            if len(atags_1) == 0:
                atags_1 = browser.find_elements_by_class_name('Tx7az')
            browser.execute_script(
                "document.querySelector('._1Az1K').scrollTo("
                "document.querySelector('._1Az1K').scrollTop, "
                "document.querySelector('._1Az1K').scrollHeight);")
            atags = browser.find_elements_by_class_name('_2aE-_')
            if len(atags) == 0:
                atags = browser.find_elements_by_class_name('Tx7az')
            if len(atags_1) == len(atags):
                break

        print(f"현 페이지 총 아이템 수: {len(atags)}\n\n")  # total items on this page

        # extract each entry
        by_xpath = By.XPATH, '//object[@id="entryIframe"]'
        for a in atags:
            a.click()
            time.sleep(1)
            browser.switch_to_default_content()
            wait.until(EC.presence_of_element_located(by_xpath))
            url = browser.find_elements_by_tag_name('object')[1].get_attribute('data')

            browser.execute_script("window.open('');")
            browser.switch_to_window(browser.window_handles[-1])
            browser.get(url)
            try:
                html = browser.execute_script('return document.body.outerHTML')
                soup = BeautifulSoup(html, 'html.parser')
            except WebDriverException:
                browser.close()
                browser.switch_to_window(browser.window_handles[0])
                entry_frame = browser.find_element_by_xpath('//object[@id="entryIframe"]')
                browser.switch_to_frame(entry_frame)
                html = browser.execute_script('return document.body.outerHTML')
                soup = BeautifulSoup(html, 'html.parser')
                browser.switch_to_default_content()

            title = soup.find('span', {'class': '_3XamX'}).text
            address = soup.find('span', {'class': '_2yqUQ'}).text
            phone = soup.find('li', {'class': '_3xPmJ'})
            if phone:
                phone = phone.text.split('안내')[0]
            else:
                phone = None

            if len(browser.window_handles) > 1:
                browser.close()
                browser.switch_to_window(browser.window_handles[0])
            browser.switch_to.frame(search_frame)

            TITLE.append(title)
            ADDRESS.append(address)
            PHONE.append(phone)
            URL.append(url)

        df = pd.DataFrame({'상호명': TITLE, '주소': ADDRESS,
                           '전화번호': PHONE, '링크': URL})
        df.to_csv(f'{query}.csv', encoding='utf-8')

        # click next page
        next_btn = browser.find_elements_by_class_name('_3pA6R')[1]
        next_btn.click()

    print('finished!')
    clean_data(query)
    messagebox.showinfo('info', '완료')
"""Creates the Ancient_Greek_ML dataset and then prepares the train, dev and test sets for the character-level BERT.""" from clean_data import clean_data from sentence_tokenization import sentence_tokenize_corpus from split_data import split_data import os os.chdir("../data") clean_data() sentence_tokenize_corpus() split_data()
def test_clean_data(path="../data/train.json"):
    df = clean_data(path)
    assert type(df) is pd.core.frame.DataFrame
    assert type(df['ingredients_string'][0]) is str
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from clean_data import clean_data, impute_matrix, normalize_data
from sklearn.linear_model import SGDClassifier
from sklearn import cross_validation

X, y = clean_data('AusOpen-men-2013.csv', 'AusOpen-women-2013.csv',
                  'FrenchOpen-men-2013.csv', 'FrenchOpen-women-2013.csv',
                  'USOpen-men-2013.csv', 'Wimbledon-men-2013.csv',
                  'Wimbledon-women-2013.csv')

# 4762 NA values; we remove these through mean imputation
X = impute_matrix(X)

# mean normalize and feature scale X
X = normalize_data(X)

clf = SGDClassifier(loss='hinge', alpha=.0001)
clf.fit(X, y)

scores = cross_validation.cross_val_score(clf, X, y, cv=5)
print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
def clean_dat():
    clean_data.clean_data(io_files.shuffle_training_path, io_files.clean_path)
# `importances` and the first use of `feature_list` are expected to come from
# earlier in the original script, e.g. a fitted random forest's
# feature_importances_ and its column list.
x_values = list(range(len(importances)))

# Make a bar chart
plt.bar(x_values, importances, orientation='vertical')

# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance')
plt.xlabel('Variable')
plt.title('Variable Importances')

# df = pd.read_csv('JEOPARDY_CSV.csv')
df = pd.read_csv("jeopardy_cats.csv")

# clean the data
df = clean_data(df, classification=False)

# create basic features from the data
# df = featurize(df)

y = df['Value']
X = df.drop([
    'Value', 'Show Number', 'Air Date', 'Round', 'Category', 'Question',
    'Answer'
], axis=1, inplace=False)
feature_list = list(X.columns)
def clean_dfs(dfs):
    import clean_data as cld

    clean_dfs = []
    for df in dfs:
        clean_dfs.append(cld.clean_data(df))
    return clean_dfs
def update(self):
    self.feedEntries = []
    self.lastUpdated = time.time()

    if re.search('reddit', self.url) or re.search('imbd', self.url):
        return

    print 'updating ' + self.url
    feed = feedparser.parse(self.url)
    if len(feed.entries) == 0:
        return
    if len(feed.entries) > 1000:
        print 'More than 1000 entries in feed: ' + self.url

    firstEntry = feed['entries'][0].link
    for entry in feed.entries:
        author = None
        comments = []
        guid = None
        updated = None
        summary = ""

        if entry.link == self.__lastEntry__:
            self.__lastEntry__ = firstEntry
            return

        try:
            author = entry.author
        except AttributeError:
            pass
        try:
            updated = entry.updated
        except AttributeError:
            pass
        try:
            summary = entry.summary
        except AttributeError:
            pass

        summary = summary.encode('utf-8')
        content = self.__retrieve_content__(summary, entry.link)
        content, comments_in_content = clean_data(content, summary)

        for i in range(0, len(comments_in_content)):
            comments.append(
                FeedEntry.FeedEntry(entry.link + '#comment' + str(i), feed.url,
                                    comments_in_content[i].encode('utf-8'),
                                    comments_in_content[i].encode('utf-8'),
                                    (entry.title + ' Comment ' + str(i)).encode('utf-8'),
                                    (entry.link + '#comment' + str(i)).encode('utf-8'),
                                    '', None, updated))

        self.feedEntries.append(
            FeedEntry.FeedEntry(entry.link.encode('utf-8'), feed.url,
                                content.encode('utf-8'), content.encode('utf-8'),
                                entry.title.encode('utf-8'),
                                entry.link.encode('utf-8'),
                                author.encode('utf-8'), comments, updated))

    self.__lastEntry__ = firstEntry
    if 'module' in str(globals()[var]):
        continue

clear_all()

generate_visualization = True

# load the tweets dataset
df = pd.read_csv('./input/Tweets_Mg.csv', encoding='utf-8')

visualization.distr_qtd_carac(df)

################################################################
# Do a preliminary cleaning pass on the tweet text column
################################################################
dataset = clean.clean_data(dataset=df, shuffle=False)
dataset, stops = clean.apply_text_processing(dataset)

######################################
# Splitting the data into its classes
######################################
tweets = dataset["Text"].values
classificacao = dataset["Classificacao"].values

######################################
# Split the dataset:
# 80% for training
# 20% for testing
######################################
SEED = 8188
x_train, x_test, y_train, y_test = train_test_split(tweets,
# drop the "id" column. There is no use for patient ids in this anlysis. X.drop("id", axis=1, inplace=True) # print(X.gender.value_counts()) # checking different values for gender # Only 1 instance with gender=Other. The rest are either male or female # Removing the instance with gender=Other X = X[X.gender != "Other"] # print(X.gender.value_counts()) # rechecking gender values visualize.visualize(X) # extracting the response variable y = X.pop("stroke") X = clean_data.clean_data(X) print(20 * "*" + " Data cleaning ended successfully!") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23) rf = rfc.rfc(X_train, X_test, y_train, y_test) print(20 * "*" + " Machine learning modeling ended successfully!") # save model using joblib FILENAME = "saved_model.sav" joblib.dump(rf, FILENAME) # load the model form disk loaded_model = joblib.load(FILENAME) print("The optimized model: \n", loaded_model)
import pandas as pd
import numpy as np
from textblob import Word
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pickle
import os
from clean_data import clean_data
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

data = pd.read_csv("csv/data.csv", encoding='cp1252')
article_text = data['article'].tolist()
article_category = data['category'].tolist()
# print(data.head())

for i, value in enumerate(article_text):
    print("cleaning data:", i)
    article_text[i] = ' '.join(
        [Word(word).lemmatize() for word in clean_data(value).split()])

vect = TfidfVectorizer(stop_words='english', min_df=10)
# vect = CountVectorizer()
X = vect.fit_transform(article_text)
Y = np.array(article_category)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20,
                                                    random_state=50)
print("training size:", X_train.shape)
print("Testing size:", X_test.shape)

from sklearn.linear_model import PassiveAggressiveClassifier
import math
import os

import tensorflow as tf
import numpy as np

from clean_data import clean_data
from config import configs

config = configs()
clean_data = clean_data()

batch_size = config.batch_size
buckets = config.buckets
hidden_size = config.hidden_size
steps_per_checkpoint = config.steps_per_checkpoint
learning_rate = config.learning_rate
max_gradient_norm = config.max_gradient_norm

x_train = clean_data.x_train
import sklearn.linear_model as lm
from matplotlib.pyplot import figure, boxplot, xlabel, ylabel, show
import numpy as np
from scipy.io import loadmat
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
import torch

from __init__ import train_neural_net, draw_neural_net
from clean_data import clean_data, transform_data

# ----------------------- LOADING DATA ----------------------------
data = clean_data('Datasets/**videos.csv')
data = transform_data(
    data, ['likes', 'dislikes', 'views', 'comment_count', 'trending_time'])

np.random.seed(180820)
data = data.head(100000)

X = np.array(
    data[['likes', 'dislikes', 'views', 'comment_count', 'trending_time']])
# y = np.array(data['views']).squeeze()
data['class'] = np.where(data["trending_time"] <= 3., 1, 0.)
y = np.where(data["trending_time"] <= 3., 1, 0.)
# X = np.array(data)
# y = X[:,[4]]
# X = X[:,0:4]

attributeNames = [
    'likes', 'dislikes', 'views', 'comment_count', 'trending_time'
]
import pandas as pd
import os

os.chdir('/home/tomas/Kaggle/Santander')

from clean_data import clean_data
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

# Import samples
df = pd.read_csv('data/sample.csv', low_memory=False)
df_clean = clean_data(df)

# Split columns into train data and labels
data = df_clean.ix[:, :12]
labels = df_clean.ix[:, 12:]

# Split into training, validation and test sets
x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                     test_size=0.2,
                                                     random_state=0)

# Normalize data
classifier = Pipeline([
    # Normalizer
    ('clf', OneVsRestClassifier(KNeighborsClassifier()))
])
# thingspeak-read.py
#
# Use Python to read a set of datapoints off of ThingSpeak.com and
# then plot the data with matplotlib.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from user_input import user_input
from call_api import call_api
from clean_data import clean_data
from plot_data import plot_data

w_type, n_data_pts = user_input()
df = call_api(w_type, n_data_pts)
data = clean_data(df)
plot_data(data, w_type)
def gen_data(args):
    """
    Clean raw data so it can be processed.

    :param args: args for gen_data
    :type args: Namespace
    """
    # pull args out
    length = args.len
    split = args.split
    clean_dir = args.output_data
    unclean_dir = args.target_data
    spy_data = args.spy_data

    # do some checks
    try:
        assert length > 0, "len must be positive"
        assert split <= 1.0 and split > 0, "split must be between 0-1"
        assert os.path.exists(clean_dir), "output_data dir must exist"
        assert os.path.exists(unclean_dir), "target_data dir must exist"
    except AssertionError as err:
        logger.error("Failed check: {}".format(err))
        return

    # set the output files
    if spy_data == False:
        metadata_file = clean_dir + "METADATA.json"
        train_file = clean_dir + "Train.csv"
        eval_file = clean_dir + "Eval.csv"
    else:
        test_file = clean_dir + "Spy.csv"

    # get list of files
    list_of_files = common.file_list(unclean_dir)
    names = []
    for i in list_of_files:
        names.append(i['name'])

    match = ['02_SESSION_INFO', '03_CPU_INFO']
    # pull out matches
    meta_files = [
        s for s in list_of_files if any(m == s['name'] for m in match)
    ]
    for m in meta_files:
        list_of_files.remove(m)
    for m in meta_files:
        if m['name'] == '02_SESSION_INFO':
            session_file = m['path']
        elif m['name'] == '03_CPU_INFO':
            cpu_file = m['path']
        else:
            pass

    # process the data
    if spy_data == True:
        data, metadata = clean_data.clean_data(list_of_files, spy_data=True)
        logging.debug("DATA: {}".format(data))
        clean_data.write_to_csv(data, metadata, clean_dir, test_file, length,
                                spy_data=True)
    else:
        data, metadata = clean_data.clean_data(list_of_files)
        clean_data.write_to_csv(data, metadata, clean_dir, train_file, length,
                                split, eval_file, metadata_file)
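# A minimal usage sketch of gen_data() (values and directory names are
# hypothetical): it only needs an object exposing the five attributes read at
# the top of the function, and both directories must already exist.
from argparse import Namespace

args = Namespace(len=256,               # sequence length, must be > 0
                 split=0.8,             # train/eval split, in (0, 1]
                 output_data="clean/",  # where Train.csv / Eval.csv are written
                 target_data="raw/",    # unclean input files
                 spy_data=False)
gen_data(args)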