def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config = ConfigParser.RawConfigParser(default_cfg)
  bind_signal_handlers()

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  try:
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)

  server_url = 'https://' + config.get('server', 'hostname') + ':' + config.get('server', 'url_port')
  print("Connecting to the server at " + server_url + "...")
  logger.info('Connecting to the server at: ' + server_url)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config = ConfigParser.RawConfigParser(default_cfg)
  bind_signal_handlers()

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  ping_port_listener = PingPortListener(config)
  ping_port_listener.start()

  update_log_level(config)

  server_url = 'https://' + config.get('server', 'hostname') + ':' + config.get('server', 'url_port')
  print("Connecting to the server at " + server_url + "...")
  logger.info('Connecting to the server at: ' + server_url)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
def __cleanSingleFile(self, source_name, file_path):
    # print(f"[{type(self).__name__}]Running ---> __cleanSingleFile: {file_path}")
    jsnData = CommonUtilities.loadJsonFile(f"{self.src_dir_name}/{file_path}")
    dt_cleaner = DataCleaner(jsnData, self.gtAttrNames)
    dt_cleaner.cleanKeys()
    dt_cleaner.cleanValues()
    jsnDataCl = dt_cleaner.getSignificantData()
    empty_keys_d, empty_value_d, composite_value_d = dt_cleaner.getEmptyDataKeys()

    if len(empty_keys_d.keys()) + len(empty_value_d.keys()) > 0:
        self.discarded_info_pool[file_path] = {
            "key_empty": empty_keys_d,
            "value_empty": empty_value_d
        }
    if len(composite_value_d.keys()) > 0:
        self.composite_value_pool[file_path] = composite_value_d

    CommonUtilities.writeDictToJson(jsnDataCl, f"{self.dst_dir_name}/{file_path}.json")
def find_best_coefficient():
    data = read_csv_and_format("CountyData.csv")
    coefficient = 1.0
    data_cleaner = DataCleaner(data)
    while coefficient < 2.1:
        start = time.time()
        data_before_clean = data_cleaner.data
        data_after_clean, data_throw_away = data_cleaner.clean_data(coefficient)
        end = time.time()
        print("Used time: ", end - start)
        print("If coefficient is set to {0}, {1}% of data is kept."
              .format(coefficient, len(data_after_clean) / len(data_before_clean) * 100))
        coefficient += 0.1
def clean_data(self, data):
    cleaner = DataCleaner(data)
    cleaner.clean()
    cleaner.save("clean_data/clean_" + self.bankname + ".csv")
    return cleaner.data
def collectTrainData(urlfile=urlfile, pickle_file=TRAIN_PICKLE_PATH):
    # Going to have 32,000 total samples. UPDATE: Finished collecting.
    t0 = time()
    f = open(urlfile, 'r')
    num_unacceptable = 0
    total_learner_data = unpickleFile(TRAIN_PICKLE_PATH)
    total_in_train = len(total_learner_data.keys())
    for index, url in enumerate(f.readlines()):
        if index <= 42748 or total_in_train > 60000:
            continue
        try:
            speaking, studying, entry, incorrect, correct = mineLearnerData(url)
            dc = DataCleaner(correct, speaking, studying)
            corrections = dc.cleanCorrections()
            studying = dc.astrip(studying).split()[0]
            if studying not in POSSIBLE_LANGUAGES:
                num_unacceptable += 1
                continue
            if speaking not in POSSIBLE_LANGUAGES:
                num_unacceptable += 1
                continue
            else:
                learner_data = {'Speaking': speaking, 'Studying': studying, 'Entry': entry,
                                'Incorrect': incorrect, 'Corrections': corrections}
                total_learner_data[index] = learner_data
                createPickledDatasets(total_learner_data, pickle_file)
                total_in_train = len(total_learner_data.keys())
                print("Total found: %s. Total in set: %s. Time elapsed: %s."
                      % (str(index), str(total_in_train), time() - t0))
        except IOError as e:
            num_unacceptable += 1
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
        except ValueError:
            num_unacceptable += 1
            print("Conversion or value error.")
        except:
            num_unacceptable += 1
            print("Unexpected error found: ", sys.exc_info()[0])
            continue
    print("Done collecting training data after %s seconds!" % (time() - t0))
    print("There were %s unacceptable URLs in this run" % num_unacceptable)
def main():
    date_time = time.strftime("%c")
    logging.basicConfig(
        filename='/NAS/PipelineReports/DataCleaner/Logs/DataCleanerLog-' + date_time + '.log',
        level=logging.DEBUG)
    while True:
        logging.info("Waiting for 5 minutes...")
        print "Waiting for 5 minutes..."
        time.sleep(300)
        try:
            get_run_url = "http://genapsys-services.appspot.com/listOfRunsToBeProcessed?stateInclude=6&keywork=&intervalDays=1&stateExclude=8"
            run_list = requests.get(get_run_url)
            run_list = json.loads(run_list.content)
        except:
            logging.info("Error in getting the run list! Trying again ...")
            print "Error in getting the run list! Trying again ..."
        for run_id in run_list:
            logging.info(run_id)
            print run_id
            try:
                # Starting the analysis
                store_experiment_state(run_id, 8, 1, 88)
                print "Hack Waiting for 10 minutes..."
                time.sleep(600)

                # Data Object
                DataObj = DataCleaner(run_id)
                logging.info("Defining the Data Object OKAY")
                print "Defining the Data Object OKAY"

                # Getting config information
                DataObj.getConfig(DataObj._config_file)
                logging.info("Getting config information OKAY")
                print "Getting config information OKAY"

                # Getting basic experiment info from the cloud database
                DataObj.getBasicInfo()
                logging.info("Getting basic experiment info from database OKAY")
                print "Getting basic experiment info from database OKAY"

                # Sets load and save paths
                DataObj.setPaths()
                logging.info("Set load and save paths OKAY")
                print "Set load and save paths OKAY"

                # Calculating the read configuration
                DataObj.calcReadConfig()
                logging.info("Calculating the read configuration OKAY")
                print "Calculating the read configuration OKAY"

                # Dumping the experiment basic info as json file
                DataObj.dumpRunInfo()
                logging.info("Dumping the experiment basic info as json file OKAY")
                print "Dumping the experiment basic info as json file OKAY"

                # Dumping the Experiment basic info successful
                store_experiment_state(run_id, 8, 2, 88)

                # Calculating the location of sensors and no_magnet flag
                DataObj.getChipInfo()
                logging.info("Calculating the location of sensors and no_magnet flag OKAY")
                print "Calculating the location of sensors and no_magnet flag OKAY"

                # Extracting the instrument states
                DataObj.createMask()
                logging.info("Extracting the instrument states OKAY")
                print "Extracting the instrument states OKAY"

                # Exporting the instrument data
                DataObj.instrumentData()
                logging.info("Exporting the instrument data OKAY")
                print "Exporting the instrument data OKAY"

                # Exporting the instrument data successful
                store_experiment_state(run_id, 8, 3, 88)

                # Exporting the sensor reads data
                DataObj.sensorReads()
                logging.info("Exporting the sensor reads data OKAY")
                print "Exporting the sensor reads data OKAY"

                # Exporting the sensor data successful
                store_experiment_state(run_id, 8, 4, 88)
            except:
                print "DataCleaner failed for the run:"
                print run_id
                logging.info("DataCleaner failed for the run:")
                logging.info(run_id)
class TwitterAPI:
    tweets = None
    query = None
    number_of_tweets = None
    date = None
    consumer_key = ""
    consumer_secret = ""
    access_token = ""
    access_secret = ""
    data_clean = DataCleaner()
    tweets_classifier = TweetClassifier()

    def __init__(self):
        return

    def Auth(self):
        auth = tweepy.OAuthHandler(self.consumer_key, self.consumer_secret)
        auth.set_access_token(self.access_token, self.access_secret)
        api = tweepy.API(auth)
        return api

    # def retrieve_tweets(self, query, api):
    #     tweets = {}
    #     for tweet in tweepy.Cursor(api.search, q=query).items(200):
    #         tweets.update({'{}'.format(tweet.text): None})
    #     return tweets

    def retrieve_tweets(self, query, api):
        tweets = []
        for tweet in tweepy.Cursor(api.search, q=query).items(500):
            if tweet.text not in tweets:
                tweets.append(tweet.text)
        return tweets

    def classify(self, tweets, filename):
        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        classifier = self.tweets_classifier.load_classifier()
        vectorizer = self.tweets_classifier.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet, label in zip(tweets, result):
            data_predicted.append({'tweets': tweet, 'class': label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\historical_files', filename)
        if filename.lower().endswith(('.xlsx', '.xls')):
            data_predicted.to_excel(path, index=False)
        elif filename.lower().endswith(('.csv')):
            data_predicted.to_csv(path, index=False, encoding='utf-8')

    def classify_real_time(self, tweets, filename):
        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        classifier = self.tweets_classifier.load_classifier()
        vectorizer = self.tweets_classifier.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet, label in zip(tweets, result):
            data_predicted.append({'tweets': tweet, 'class': label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\real_files', filename)
        data_predicted.to_excel(path, index=False)

    def read_real_time(self, filename):
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\real_files', filename)
        dataframe = pd.read_excel(path, names=['class', 'tweets'])
        return dataframe

    def read_file(self, filename):
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\historical_files', filename)
        if filename.lower().endswith(('.xlsx', '.xls')):
            dataframe = pd.read_excel(path, names=['class', 'tweets'])
        elif filename.lower().endswith(('.csv')):
            dataframe = pd.read_csv(path, names=['class', 'tweets'])
        return dataframe
from DataCleaner import DataCleaner
from TfIdfMaker import TfIdfMaker
from KMeansClusterer import KMeansClusterer

if __name__ == "__main__":
    # create the instance that takes in the file name to read data and clean from
    data_cleaner = DataCleaner("plot_summaries.txt")
    data_cleaner.clean_data()

    tf_idf_maker = TfIdfMaker("limited_id_name_summary.pkl")
    tf_idf_maker.create_frequencies()
    tf_idf_maker.create_tf_idf()

    k_means_clusterer = KMeansClusterer("tf_idf.pkl")
    k_means_clusterer.find_clusters()
class TweetClassifier:
    final_file = None
    word_features = None
    data_clean = DataCleaner()

    def __init__(self):
        return

    # def classify_real_time_tweets(self, tweet):
    #     tweeter_api = TwitterAPI()
    #     return

    def get_tweets(self, file_name):
        # file = open_workbook(file_name)
        # Text = []
        # Polarity = []
        # for sheet in file.sheets():
        #     if sheet.name == "Sheet1":
        #         for row in range(sheet.nrows):
        #             for col in range(sheet.ncols):
        #                 data = sheet.cell(row, col).value
        #                 if (col == 0):
        #                     Text.append(data)
        #                 elif (col == 1):
        #                     Polarity.append(data)
        # dic = dict(zip(Text, Polarity))
        dataframe = pd.read_excel(file_name)
        Text = list(dataframe['tweets'])
        # Polarity = list(dataframe['class'])
        # dic = dict(zip(Text, Polarity))
        return Text

    # def get_words(self, tweets):
    #     all_words = []
    #     for (word, sentiment) in tweets:
    #         all_words.extend(word)
    #     return all_words

    # def get_word_freq(self, wordlist):
    #     wordlist = nltk.FreqDist(wordlist)
    #     word_features = wordlist.keys()
    #     return word_features

    # def extract_features(self, document):
    #     document_word = set(document)
    #     features = {}
    #     for word in self.word_features:
    #         features['Word(%s)' % word] = (word in document_word)
    #     return features

    # training_set = nltk.classify.apply_features(extract_features, train_data)
    # test_set = nltk.classify.apply_features(extract_features, test_data)
    # tel_classifier = nltk.NaiveBayesClassifier.train(training_set)

    def save_classifier(self, classifier):
        UPLOADS_PATH = join(dirname(realpath(__file__)), 'Classifiers/tel_classifier2.pickle')
        f = open(UPLOADS_PATH, 'wb')
        pickle.dump(classifier, f, -1)
        f.close()

    def save_classifier_dev(self, classifier, NameOfAlgo):
        UPLOADS_PATH = join(dirname(realpath(__file__)), 'dev_folder/' + NameOfAlgo + '.pickle')
        f = open(UPLOADS_PATH, 'wb')
        pickle.dump(classifier, f, -1)
        f.close()

    def load_classifier_dev(self, NameOfAlgo):
        UPLOADS_PATH = join(dirname(realpath(__file__)), 'dev_folder/' + NameOfAlgo + '.pickle')
        f = open(UPLOADS_PATH, 'rb')
        classifier = pickle.load(f)
        f.close()
        return classifier

    def load_classifier(self):
        UPLOADS_PATH = join(dirname(realpath(__file__)), 'Classifiers/tel_classifier2.pickle')
        f = open(UPLOADS_PATH, 'rb')
        classifier = pickle.load(f)
        f.close()
        return classifier

    # def put_word_features(self):
    #     training = self.get_tweets(r'C:\Users\Mohammed\Desktop\train_data.xlsx')
    #     train_data = self.data_clean.prepare_data_set(training)
    #     self.word_features = self.get_word_freq(self.get_words(train_data))

    def feature_extraction(self, data):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        tfidf_data = vectorizer.fit_transform(data)
        return tfidf_data

    # def data_transform(self, data):
    #     vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
    #     tfidf_data = vectorizer.transform(data)
    #     return tfidf_data

    def fit_data(self, filename):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        training = self.get_tweets(filename)
        train_data = self.data_clean.prepare_data_list(training)
        vectorizer.fit_transform(train_data)
        return vectorizer

    def build_pickle(self, train, train_label, algo):
        x_train, y_train = train, train_label
        classifier = algo
        classifier.fit(x_train, y_train)
        # classifier = twitterCal.load_classifier()
        self.save_classifier(classifier)
        # predict = cross_val_predict(classifier, x_test, y_test, cv=10)
        # scores = cross_val_score(classifier, x_test, y_test, cv=10)
        # print(scores)
        # print('Accuracy of %s %0.2f (+/- %0.2f)' % (classifier, scores.mean(), scores.std() * 2))
        # print(classification_report(y_test, predict))
        return classifier

    def learining(self, train, train_label, algo, NameOfAlgo):
        x_train, y_train = train, train_label
        classifier = algo
        classifier.fit(x_train, y_train)
        # classifier = twitterCal.load_classifier()
        self.save_classifier_dev(classifier, NameOfAlgo)
        # predict = cross_val_predict(classifier, x_test, y_test, cv=10)
        # scores = cross_val_score(classifier, x_test, y_test, cv=10)
        # print(scores)
        # print('Accuracy of %s %0.2f (+/- %0.2f)' % (classifier, scores.mean(), scores.std() * 2))
        # print(classification_report(y_test, predict))
        return classifier

    def put_word_features(self):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        training = self.get_tweets(r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers\train_data.xlsx')
        train_data = self.data_clean.prepare_data_list(training)
        vectorizer.fit_transform(train_data)
        return vectorizer

    def predict(self, tweets, filename, NameOfAlgo):
        classifier = self.load_classifier_dev(NameOfAlgo)
        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        vectorizer = self.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet, label in zip(tweets, result):
            data_predicted.append({'tweets': tweet, 'class': label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder\predicted', filename)
        data_predicted.to_excel(path, index=False)

    def read_file_dev(self, filename):
        try:
            path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder', filename)
            if filename.lower().endswith(('.xlsx', '.xls')):
                dataframe = pd.read_excel(path, names=['tweets'])
            elif filename.lower().endswith(('.csv')):
                dataframe = pd.read_csv(path, names=['tweets'])
            return dataframe
        except Exception as e:
            return False
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname
  current_user = getpass.getuser()

  setup_logging(options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  bind_signal_handlers(agentPid)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    # This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)

  server_hostname = config.get('server', 'hostname')
  server_url = config.get_api_url()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil(heartbeat_stop_callback)
  retries, connected = netutil.try_to_connect(server_url, -1, logger)
  # Ambari Agent was stopped using stop event
  if connected:
    # Launch Controller communication
    controller = Controller(config, heartbeat_stop_callback)
    controller.start()
    controller.join()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    stop_agent()
  logger.info("finished")
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  logging_level = logging.DEBUG if options.verbose else logging.INFO

  setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(), logging_level)
  global is_logger_setup
  is_logger_setup = True
  setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(), logging_level)
  Logger.initialize_logger('resource_management', logging_level=logging_level)

  # use the host's locale for numeric formatting
  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error as ex:
    logger.warning("Cannot set locale for ambari-agent. Please check your systemwide locale settings. Failed due to: {0}.".format(str(ex)))

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()

  # Add syslog handler based on ambari config file
  add_syslog_handler(logger)

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    # This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  #
  # Iterate through the list of server hostnames and connect to the first active server
  #

  active_server = None
  server_hostnames = hostname.server_hostnames(config)

  connected = False
  stopped = False

  # Keep trying to connect to a server or bail out if ambari-agent was stopped
  while not connected and not stopped:
    for server_hostname in server_hostnames:
      try:
        server_ip = socket.gethostbyname(server_hostname)
        server_url = config.get_api_url(server_hostname)
        logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
      except socket.error:
        logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

      # Wait until MAX_RETRIES to see if server is reachable
      netutil = NetUtil(config, heartbeat_stop_callback)
      (retries, connected, stopped) = netutil.try_to_connect(server_url, MAX_RETRIES, logger)

      # if connected, launch controller
      if connected:
        logger.info('Connected to Ambari server %s', server_hostname)
        # Set the active server
        active_server = server_hostname
        # Launch Controller communication
        controller = Controller(config, server_hostname, heartbeat_stop_callback)
        controller.start()
        while controller.is_alive():
          time.sleep(0.1)

      #
      # If Ambari Agent connected to the server or
      # Ambari Agent was stopped using stop event
      # Clean up if not Windows OS
      #
      if connected or stopped:
        ExitHelper().exit(0)
        logger.info("finished")
        break
    pass  # for server_hostname in server_hostnames
  pass  # while not (connected or stopped)

  return active_server
class User:
    result = Result()
    api = TwitterAPI()
    datacleaner = DataCleaner()
    tweet_classifier = TweetClassifier()
    tk = Tk()
    email = None
    file = NONE

    def __init__(self):
        return

    def search(self, query):
        auth = self.api.Auth()
        tweets = self.api.retrieve_tweets(query, auth)
        return tweets

    def upload(self, filename):
        try:
            name = filename
            fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\uploads'
            filename = os.path.join(fname, filename)
            # filename = filedialog.askopenfilename(filetypes=[('excel file', '.xlsx'), ('excel file', '.xls'), ('excel file', '.csv')])
            if filename:
                if filename.lower().endswith(('.xlsx', '.xls')):
                    dataframe = pd.read_excel(filename, names=['tweets'])
                    if len(dataframe.columns) == 1:
                        self.api.classify(list(dataframe['tweets']), name)
                        prediction_file = self.api.read_file(name)
                        prediction_file.columns = ['class', 'tweets']
                        self.file = prediction_file
                    else:
                        return False
                elif filename.lower().endswith(('.csv')):
                    dataframe = pd.read_csv(filename, names=['tweets'])
                    if len(dataframe.columns) == 1:
                        self.api.classify(list(dataframe['tweets']), name)
                        prediction_file = self.api.read_file(name)
                        prediction_file.columns = ['class', 'tweets']
                        self.file = prediction_file
                    else:
                        return False
                else:
                    return False
            else:
                return False
            return True
        except Exception as e:
            return False

    def get_train_file_for_build_algo(self, train_file, algo):
        fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers'
        train_filename = os.path.join(fname, train_file)
        dataframe = pd.read_excel(train_filename, names=['tweets', 'class'])
        dataframe.tweets = self.datacleaner.prepare_data_list(list(dataframe.tweets))
        data, label = list(dataframe['tweets']), list(dataframe['class'])
        tfidf = self.tweet_classifier.feature_extraction(data)
        classifier = self.tweet_classifier.build_pickle(tfidf, label, algo)
        return classifier

    def get_train_file(self, train_file, algo, NameOfAlgo):
        fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder'
        train_filename = os.path.join(fname, train_file)
        dataframe = pd.read_excel(train_filename, names=['tweets', 'class'])
        dataframe.tweets = self.datacleaner.prepare_data_list(list(dataframe.tweets))
        data, label = list(dataframe['tweets']), list(dataframe['class'])
        tfidf = self.tweet_classifier.feature_extraction(data)
        classifier = self.tweet_classifier.learining(tfidf, label, algo, NameOfAlgo)
        return classifier

    def get_test_file(self, test_file):
        try:
            fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder'
            test_filename = os.path.join(fname, test_file)
            dataframe = pd.read_excel(test_filename, names=['tweets', 'class'])
            dataframe.tweets = self.datacleaner.prepare_data_list(list(dataframe.tweets))
            data, label = list(dataframe['tweets']), list(dataframe['class'])
            vectorizer = self.tweet_classifier.fit_data(test_filename)
            tfidf = vectorizer.transform(data)
            return tfidf, label
        except Exception as e:
            return False

    def accuracy(self, train_file, test_file, algo, NameOfAlgo):
        try:
            classifier = self.get_train_file(train_file, algo, NameOfAlgo)
            tfidf, label = self.get_test_file(test_file)
            scores = cross_val_score(classifier, tfidf, label)
            label_predict = cross_val_predict(classifier, tfidf, label)
            matrix = classification_report(label, label_predict)
            result = [scores.mean() * 100, matrix]
            return result
        except Exception as e:
            return False
from DataCleaner import DataCleaner
import os
import csv

# Save excels as csvs to create data cleaner instances
DataCleaner.saveExcelFileAsCsv("Data/Enforcement_Policies_Data.xlsx")
DataCleaner.saveExcelFileAsCsv("Data/Integration_Policies_Data.xlsx")
DataCleaner.saveExcelFileAsCsv("Data/Public_Benefits_Data.xlsx")

"""
Each variable has its own sheet within the excel file; after using the data cleaner
method, each sheet is now its own csv.

These are the variables we need and how to recode them:

Enforcement:
    287(g) task force          - 0 = pro immigration, 1 and 2 = anti immigration
    Limited co-op w/ detainers - 0 = anti immigration, 1 and 2 = pro immigration
    E-verify                   - 0 = pro immigration, 1 and 2 = anti immigration

Integration:
    English as official language - 0 = pro immigration, 1 = anti immigration
    Driver's license policies    - 0 = anti immigration, 1 = pro immigration

Public Benefits:
    Public ins unauth kids  - 0 = anti immigration, 1 = pro immigration
    Medicaid unauth adults  - 0 = anti immigration, 1 = pro immigration
    Food asst. LPR kids     - 0 = anti immigration, 1 = pro immigration
    Food asst. LPR adults   - 0 = anti immigration, 1 = pro immigration

Each variable will be recoded to have 0 as anti immigration and 1 as pro immigration.
"""
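A minimal sketch of the recoding step described in the docstring above, assuming a hypothetical per-variable CSV path and a hypothetical 'Value' column; the exact layout produced by DataCleaner.saveExcelFileAsCsv may differ.

import pandas as pd

# 287(g) task force is originally coded 0 = pro, 1 and 2 = anti immigration,
# so the mapping inverts and collapses it onto the common 0 = anti / 1 = pro scheme.
RECODE_287G_TASK_FORCE = {0: 1, 1: 0, 2: 0}

def recode_column(csv_path, column, mapping):
    # Load one cleaned per-variable CSV and map its raw codes onto 0/1.
    df = pd.read_csv(csv_path)
    df[column] = df[column].map(mapping)
    return df

# Usage sketch (file and column names are hypothetical):
# recoded = recode_column("Data/287g_task_force.csv", "Value", RECODE_287G_TASK_FORCE)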
class Result:
    bar_chart1 = False
    pie_chart1 = False
    all = []
    tweets_num = 0
    positive_tweets_num = 0
    negative_tweets_num = 0
    neutral_tweets_num = 0
    most_words = []
    dataclean = DataCleaner()

    def __init__(self):
        return

    def report(self, dataframe):
        dataframe_class, dataframe_tweets = dataframe['class'], dataframe['tweets']
        dataframe_value_count = dataframe_class.value_counts()
        dictionary = dict(zip(dataframe_value_count.index, dataframe_value_count.values))
        for key, value in dictionary.items():
            if key == "neg":
                self.negative_tweets_num = value
            elif key == "pos":
                self.positive_tweets_num = value
            else:
                self.neutral_tweets_num = value
        self.tweets_num = dataframe_class.count()
        dataframe_tweets = self.dataclean.prepare_data_set_without_stem(list(dataframe_tweets))
        self.most_words = Counter(dataframe_tweets).most_common(15)
        self.all = dataframe_tweets
        self.pie_chart1 = True
        self.bar_chart1 = True
        valuable_words = pd.read_excel(r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers\words.xlsx')
        valuable_words = valuable_words.Term
        most = []
        # for word in words:
        #     if word in list(valuable_words):
        #         most.append(word)
        # print(most)
        # print(self.all)
        for word in self.all:
            if word in list(valuable_words):
                most.append(word)
        self.most_words = Counter(most).most_common(15)
        return dataframe_value_count

    def pie_chart(self):
        data = {
            'Class': ['Negative', 'Positive', 'Neutral'],
            'Numbers': [self.negative_tweets_num, self.positive_tweets_num, self.neutral_tweets_num]
        }
        dataframe = pd.DataFrame(data)
        pie = Donut(dataframe, 'Class', values='Numbers', title="Pie Chart For Tweets",
                    color=['#75D9EF', '#4B96D8', '#33D59F'])
        script_pie, div_pie = components(pie)
        script_pie = Markup(script_pie)
        div_pie = Markup(div_pie)
        page = [script_pie, div_pie]
        return page

    def pie_chart_most_words(self):
        most_words = pd.DataFrame(sorted(self.most_words), columns=['Word', 'Freq'])
        # words = list(most_words.Word)
        # words = self.dataclean.remove_stops(words)
        pie = Donut(most_words, 'Word', values='Freq', title="Pie Chart For Tweets",
                    color=['#f20f32', '#39f20f', '#76dfe7', '#af0132', '#f39f29',
                           '#3FC0CF', '#75D9EF', '#4B96D8', '#33D59F', '#33A9D5',
                           '#94A2F3', '#687EFC', '#68FCCF', '#68FCEC', '#75D3F9'])
        script_pie, div_pie = components(pie)
        script_pie = Markup(script_pie)
        div_pie = Markup(div_pie)
        page = [script_pie, div_pie]
        return page

    def bar_chart(self):
        data = {
            'Class': ['Negative', 'Positive', 'Neutral'],
            'Numbers': [self.negative_tweets_num, self.positive_tweets_num, self.neutral_tweets_num]
        }
        dataframe = pd.DataFrame(data)
        bar = Bar(dataframe, 'Class', values='Numbers', title="Bar Chart For Tweets",
                  legend='top_right', agg='median', color="#33A9D5")
        script, div = components(bar)
        script = Markup(script)
        div = Markup(div)
        page = [script, div]
        return page

    def bar_chart_most_words(self):
        most_words = pd.DataFrame(self.most_words, columns=['Words', "Freq"])
        bar = Bar(most_words, "Words", values="Freq", title="Bar Chart For Tweets",
                  legend='top_right', agg='median', color=["#75D3F9", "#75F9F7", "#75D9F9"])
        script, div = components(bar)
        script = Markup(script)
        div = Markup(div)
        page = [script, div]
        return page
def extract_save_features(inputFileNameSensor):
    # blockPrint()
    print('Filename: ', inputFileNameSensor)
    # these two objects are used below, so they must be constructed here
    dataCleaner = DataCleaner()
    extFeatures = ExtractFeatures()

    # Query data
    sensorStream = open_csv(inputFileNameSensor)

    # singleCharListLA = dataCleaner.get_single_char_list(sensorStream, 'LA')
    singleCharListLA = dataCleaner. \
        get_single_char_list_rm_rj(sensorStream, 'LA', None)
    singleCharListGY = dataCleaner. \
        get_single_char_list_rm_rj(sensorStream, 'GY', None)

    singleCharListLA[:] = [
        item for item in singleCharListLA
        if not (item['Char'].iloc[0] == 'Do some PenDowns & PenUps'
                or item['Char'].iloc[0] == 'Press Accept!'
                or item['Char'].iloc[0] == 'Sync Validation. Press Accept.'
                or (item['Char'].iloc[0]) != (item['Char'].iloc[0]))
    ]
    singleCharListGY[:] = [
        item for item in singleCharListGY
        if not (item['Char'].iloc[0] == 'Do some PenDowns & PenUps'
                or item['Char'].iloc[0] == 'Press Accept!'
                or item['Char'].iloc[0] == 'Sync Validation. Press Accept.'
                or (item['Char'].iloc[0]) != (item['Char'].iloc[0]))
    ]

    totalFeatureSet = []
    # singleCharListLA, singleCharListGY = dataCleaner.sync_LA_GY(singleCharListLA, singleCharListGY)
    # print("No of CharsLA", len(singleCharListLA))
    # print("No of CharsGY", len(singleCharListGY))
    totalFeatureSet.extend(get_features(extFeatures, singleCharListLA, singleCharListGY))
    # totalFeatureSet.extend(get_features(extFeatures, singleCharListGY, 'GY'))

    # suffix = inputFileNameSensor[-10:]
    # print("Suffix: ", suffix)
    # filename = os.path.join('Data/FW/', inputFileNameSensor[-27:-4])
    # filename = inputFileNameSensor[:-4]
    # filename = os.path.join('Data/TSI-Char-FeatureSets/', inputFileNameSensor[-10:-4])
    # save_features_to_csv(filename + 'CharFeatureSet' + '.csv', totalFeatureSet)
    filename = inputFileNameSensor[-17:-4]
    path = './Data/Results/Finger/FeatureSets/'
    filename = os.path.join(path, filename)
    save_features_to_csv(filename + 'CharFeatureSet' + '.csv', totalFeatureSet)
    # print('FeatureSets saved')
    enablePrint()
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(), options.verbose)
  global is_logger_setup
  is_logger_setup = True
  setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(), options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()

  # Add syslog handler based on ambari config file
  add_syslog_handler(logger)

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    # This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)

  server_hostname = hostname.server_hostname(config)
  server_url = config.get_api_url()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil(heartbeat_stop_callback)
  retries, connected = netutil.try_to_connect(server_url, -1, logger)
  # Ambari Agent was stopped using stop event
  if connected:
    # Launch Controller communication
    controller = Controller(config, heartbeat_stop_callback)
    controller.start()
    controller.join()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    ExitHelper.execute_cleanup()
    stop_agent()
  logger.info("finished")
def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  bind_signal_handlers()

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  try:
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)

  server_hostname = config.get('server', 'hostname')
  server_url = config.get_api_url()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
def main(heartbeat_stop_callback=None):
  global config
  global home_dir
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                    help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  parser.add_option("--home", dest="home_dir", action="store",
                    help="Home directory", default="")
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname
  home_dir = options.home_dir

  logging_level = logging.DEBUG if options.verbose else logging.INFO

  setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(), logging_level)
  global is_logger_setup
  is_logger_setup = True
  setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(), logging_level)
  Logger.initialize_logger('resource_management', logging_level=logging_level)

  if home_dir != "":
    # When running multiple Ambari Agents on this host for simulation, each one will use a unique home directory.
    Logger.info("Agent is using Home Dir: %s" % str(home_dir))

  # use the host's locale for numeric formatting
  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error as ex:
    logger.warning("Cannot set locale for ambari-agent. Please check your systemwide locale settings. Failed due to: {0}.".format(str(ex)))

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()

  # Add syslog handler based on ambari config file
  add_syslog_handler(logger)

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    # This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)

  ping_port_listener.start()

  update_log_level(config)
  update_open_files_ulimit(config)

  if not config.use_system_proxy_setting():
    logger.info('Agent is configured to ignore system proxy settings')
    reconfigure_urllib2_opener(ignore_system_proxy=True)

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  #
  # Iterate through the list of server hostnames and connect to the first active server
  #

  active_server = None
  server_hostnames = hostname.server_hostnames(config)

  connected = False
  stopped = False

  # Keep trying to connect to a server or bail out if ambari-agent was stopped
  while not connected and not stopped:
    for server_hostname in server_hostnames:
      server_url = config.get_api_url(server_hostname)
      try:
        server_ip = socket.gethostbyname(server_hostname)
        logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
      except socket.error:
        logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

      # Wait until MAX_RETRIES to see if server is reachable
      netutil = NetUtil(config, heartbeat_stop_callback)
      (retries, connected, stopped) = netutil.try_to_connect(server_url, MAX_RETRIES, logger)

      # if connected, launch controller
      if connected:
        logger.info('Connected to Ambari server %s', server_hostname)
        # Set the active server
        active_server = server_hostname
        # Launch Controller communication
        run_threads(server_hostname, heartbeat_stop_callback)

      #
      # If Ambari Agent connected to the server or
      # Ambari Agent was stopped using stop event
      # Clean up if not Windows OS
      #
      if connected or stopped:
        ExitHelper().exit(0)
        logger.info("finished")
        break
    pass  # for server_hostname in server_hostnames
  pass  # while not (connected or stopped)

  return active_server
def clean(self, df):
    dc = DataCleaner(df)
    return dc.clean_gadya()