Example #1
def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = { 'agent' : { 'prefix' : '/home/ambari' } }
  config = ConfigParser.RawConfigParser(default_cfg)
  bind_signal_handlers()

  if (len(sys.argv) >1) and sys.argv[1]=='stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent','data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  try:
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)
  ping_port_listener.start()

  update_log_level(config)

  server_url = 'https://' + config.get('server', 'hostname') + ':' + config.get('server', 'url_port')
  print("Connecting to the server at " + server_url + "...")
  logger.info('Connecting to the server at: ' + server_url)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
Example #2
def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = { 'agent' : { 'prefix' : '/home/ambari' } }
  config = ConfigParser.RawConfigParser(default_cfg)
  bind_signal_handlers()

  if (len(sys.argv) >1) and sys.argv[1]=='stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent','data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  ping_port_listener = PingPortListener(config)
  ping_port_listener.start()

  update_log_level(config)

  server_url = 'https://' + config.get('server', 'hostname') + ':' + config.get('server', 'url_port')
  print("Connecting to the server at " + server_url + "...")
  logger.info('Connecting to the server at: ' + server_url)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
Example #3
    def __cleanSingleFile(self, source_name, file_path):

        #print(f"[{type(self).__name__}]Running ---> __cleanSingleFile: {file_path}")

        jsnData = CommonUtilities.loadJsonFile(
            f"{self.src_dir_name}/{file_path}")
        dt_cleaner = DataCleaner(jsnData, self.gtAttrNames)
        dt_cleaner.cleanKeys()
        dt_cleaner.cleanValues()

        jsnDataCl = dt_cleaner.getSignificantData()

        empty_keys_d, empty_value_d, composite_value_d = dt_cleaner.getEmptyDataKeys()

        if len(empty_keys_d.keys()) + len(empty_value_d.keys()) > 0:
            self.discarded_info_pool[file_path] = {
                "key_empty": empty_keys_d,
                "value_empty": empty_value_d
            }

        if len(composite_value_d.keys()) > 0:
            self.composite_value_pool[file_path] = composite_value_d

        CommonUtilities.writeDictToJson(
            jsnDataCl, f"{self.dst_dir_name}/{file_path}.json")
Example #4
def find_best_coefficient():
    data = read_csv_and_format("CountyData.csv")
    coefficient = 1.0
    data_cleaner = DataCleaner(data)

    while coefficient < 2.1:
        start = time.time()
        data_before_clean = data_cleaner.data
        data_after_clean, data_throw_away = data_cleaner.clean_data(coefficient)
        end = time.time()
        print("Used time: ", end - start)

        print("If coefficient is set to {0}, {1}% of data is kept."
              .format(coefficient, len(data_after_clean) / len(data_before_clean) * 100))

        coefficient += 0.1
Example #5
    def clean_data(self, data):
        cleaner = DataCleaner(data)
        cleaner.clean()

        cleaner.save("clean_data/clean_" + self.bankname + ".csv")

        return cleaner.data
Example #6
def collectTrainData(urlfile=urlfile, pickle_file=TRAIN_PICKLE_PATH):
	# Going to have 32,000 total samples. UPDATE: Finished collecting.
	t0 = time()
	f = open(urlfile, 'r')
	num_unacceptable = 0
	total_learner_data = unpickleFile(TRAIN_PICKLE_PATH)
	total_in_train = len(total_learner_data.keys())
	for index, url in enumerate(f.readlines()):
		if index <= 42748 or total_in_train > 60000: continue
		try:
			speaking, studying, entry, incorrect, correct = mineLearnerData(url)
			dc = DataCleaner(correct, speaking, studying)
			corrections = dc.cleanCorrections()
			studying = dc.astrip(studying).split()[0]
			if studying not in POSSIBLE_LANGUAGES:
				num_unacceptable += 1 
				continue
			if speaking not in POSSIBLE_LANGUAGES:
				num_unacceptable += 1 
				continue
			else: 
				learner_data = {'Speaking': speaking, 'Studying': studying, 'Entry': entry, 'Incorrect': incorrect, 'Corrections': corrections}
				total_learner_data[index] = learner_data
				createPickledDatasets(total_learner_data, pickle_file)
				total_in_train = len(total_learner_data.keys())
				print("Total found: %s. Total in set: %s. Time elapsed: %s." % (str(index), str(total_in_train), time()-t0))
		except IOError as e:
			num_unacceptable += 1
			print("I/O error({0}): {1}".format(e.errno, e.strerror))
		except ValueError:
			num_unacceptable += 1
			print("Conversion or value error.")
		except:
			num_unacceptable += 1
			print("Unexpected error found: ", sys.exc_info()[0])
			continue
	print("Done collecting training data after %s seconds!" % (time()-t0))
	print("There were %s unacceptable URLs in this run" % num_unacceptable)
Example #7
def main():

    date_time = time.strftime("%c")
    logging.basicConfig(
        filename='/NAS/PipelineReports/DataCleaner/Logs/DataCleanerLog-' +
        date_time + '.log',
        level=logging.DEBUG)
    while True:
        logging.info("Waiting for 5 minutes...")
        print "Waiting for 5 minutes..."
        time.sleep(300)
        try:
            get_run_url = "http://genapsys-services.appspot.com/listOfRunsToBeProcessed?stateInclude=6&keywork=&intervalDays=1&stateExclude=8"
            run_list = requests.get(get_run_url)
            run_list = json.loads(run_list.content)
        except:
            logging.info("Error in getting the run list! Trying again ...")
            print "Error in getting the run list! Trying again ..."
            # run_list may be undefined or stale here, so skip ahead and retry
            continue

        for run_id in run_list:
            logging.info(run_id)
            print run_id
            try:

                # Starting the analysis
                store_experiment_state(run_id, 8, 1, 88)

                print "Hack Waiting for 10 minutes..."
                time.sleep(600)
                # Data Object
                DataObj = DataCleaner(run_id)
                logging.info("Defining the Data Object OKAY")
                print "Defining the Data Object OKAY"

                # Getting config information
                DataObj.getConfig(DataObj._config_file)
                logging.info("Getting config information OKAY")
                print "Getting config information OKAY"

                # Getting basic experiment info from the cloud database
                DataObj.getBasicInfo()
                logging.info(
                    "Getting basic experiment info from database OKAY")
                print "Getting basic experiment info from database OKAY"

                # Sets load and save paths
                DataObj.setPaths()
                logging.info("Set load and save paths OKAY")
                print "Set load and save paths OKAY"

                # Calculating the read configuration
                DataObj.calcReadConfig()
                logging.info("Calculating the read configuration OKAY")
                print "Calculating the read configuration OKAY"

                # Dumping the experiment basic info as json file
                DataObj.dumpRunInfo()
                logging.info(
                    "Dumping the experiment basic info as json file OKAY")
                print "Dumping the experiment basic info as json file OKAY"

                # Dumping the Experiment basic info successful
                store_experiment_state(run_id, 8, 2, 88)

                # Calculating the location of sensors and no_magnet flag
                DataObj.getChipInfo()
                logging.info(
                    "Calculating the location of sensors and no_magnet flag OKAY"
                )
                print "Calculating the location of sensors and no_magnet flag OKAY"

                # Extracting the instrument states
                DataObj.createMask()
                logging.info("Extracting the instrument states OKAY")
                print "Extracting the instrument states OKAY"

                # Exporting the instrument data
                DataObj.instrumentData()
                logging.info("Exporting the instrument data OKAY")
                print "Exporting the instrument data OKAY"

                # Exporting the instrument data successful
                store_experiment_state(run_id, 8, 3, 88)

                # Exporting the sensor reads data
                DataObj.sensorReads()
                logging.info("Exporting the sensor reads data OKAY")
                print "Exporting the sensor reads data OKAY"

                # Exporting the sensor data successful
                store_experiment_state(run_id, 8, 4, 88)

            except:
                print "DataCleaner failed for the run:"
                print run_id
                logging.info("DataCleaner failed for the run:")
                logging.info(run_id)
Example #8
class TwitterAPI:

    tweets = None
    query = None
    number_of_tweets = None
    date = None
    consumer_key = ""
    consumer_secret = ""
    access_token = ""
    access_secret = ""
    data_clean = DataCleaner()
    tweets_classifier = TweetClassifier()

    def __init__(self):

        return



    def Auth(self):

        auth = tweepy.OAuthHandler(self.consumer_key, self.consumer_secret)
        auth.set_access_token(self.access_token, self.access_secret)
        api = tweepy.API(auth)
        return api


    # def retrieve_tweets(self , query , api):
    #
    #     tweets = {}
    #     for tweet in tweepy.Cursor(api.search, q=query).items(200):
    #         tweets.update({'{}'.format(tweet.text): None})
    #     return tweets

    def retrieve_tweets(self, query, api):
        tweets = []
        for tweet in tweepy.Cursor(api.search, q=query).items(500):
            if tweet.text not in tweets:
               tweets.append(tweet.text)
        return tweets

    def classify(self, tweets, filename):
        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        classifier = self.tweets_classifier.load_classifier()
        vectorizer = self.tweets_classifier.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet ,label in zip(tweets,result):
            data_predicted.append({'tweets':tweet ,'class':label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\historical_files', filename)
        if filename.lower().endswith(('.xlsx', '.xls')):
            data_predicted.to_excel(path, index=False)
        elif filename.lower().endswith('.csv'):
            data_predicted.to_csv(path, index=False, encoding='utf-8')

    def classify_real_time(self, tweets, filename):

        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        classifier = self.tweets_classifier.load_classifier()
        vectorizer = self.tweets_classifier.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet, label in zip(tweets, result):
            data_predicted.append({'tweets': tweet, 'class': label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\real_files', filename)
        data_predicted.to_excel(path, index=False)


    def read_real_time(self,filename):
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\real_files',filename)
        dataframe = pd.read_excel(path, names=['class', 'tweets'])

        return dataframe

    def read_file(self,filename):
        path = os.path.join(r'C:\Users\Mohammed\PycharmProjects\new_GP\historical_files',filename)
        if filename.lower().endswith(('.xlsx', '.xls')):
          dataframe = pd.read_excel(path, names=['class', 'tweets'])
        elif filename.lower().endswith(('.csv')):
            dataframe = pd.read_csv(path ,names=['class' ,'tweets'])

        return dataframe
Example #9
from DataCleaner import DataCleaner
from TfIdfMaker import TfIdfMaker
from KMeansClusterer import KMeansClusterer

if __name__ == "__main__":
    # create the instance that takes in the file name to read data and clean from
    data_cleaner = DataCleaner("plot_summaries.txt")
    data_cleaner.clean_data()

    tf_idf_maker = TfIdfMaker("limited_id_name_summary.pkl")
    tf_idf_maker.create_frequencies()
    tf_idf_maker.create_tf_idf()

    k_means_clusterer = KMeansClusterer("tf_idf.pkl")
    k_means_clusterer.find_clusters()
Example #10
class TweetClassifier:

    final_file = None
    word_features = None
    data_clean = DataCleaner()

    def __init__(self):

        return

    # def classify_real_time_tweets(self, tweet):
    #
    #     tweeter_api = TwitterAPI()
    #
    #
    #
    #     return

    def get_tweets(self, file_name):
        # file = open_workbook(file_name)
        #Text = []
        # Polarity = []
        # for sheet in file.sheets():
        #     if sheet.name == "Sheet1":
        #         for row in range(sheet.nrows):
        #             for col in range(sheet.ncols):
        #                 data = sheet.cell(row,col).value
        #                 if(col == 0):
        #                     Text.append(data)
        #                 elif(col == 1):
        #                     Polarity.append(data)
        #
        # dic = dict(zip(Text,Polarity))
        dataframe = pd.read_excel(file_name)
        Text = list(dataframe['tweets'])
        #Polarity = list(dataframe['class'])
        #dic = dict(zip(Text, Polarity))
        return Text

    #
    # def get_words(self,tweets):
    #     all_words = []
    #     for (word ,sentiment)in tweets:
    #       all_words.extend(word)
    #
    #     return all_words

    # def get_word_freq(self, wordlist):
    #     wordlist =nltk.FreqDist(wordlist)
    #     word_features = wordlist.keys()
    #     return word_features

    # def extract_features(self,document):
    #     document_word = set(document)
    #     features = {}
    #     for word in self.word_features:
    #         features['Word(%s)' %word]= (word in document_word)
    #     return features

    # training_set = nltk.classify.apply_features(extract_features ,train_data)
    # #test_set = nltk.classify.apply_features(extract_features ,test_data)
    # tel_classifier = nltk.NaiveBayesClassifier.train(training_set)
    #
    #
    def save_classifier(self, classifier):
        UPLOADS_PATH = join(dirname(realpath(__file__)),
                            'Classifiers/tel_classifier2.pickle')
        f = open(UPLOADS_PATH, 'wb')
        pickle.dump(classifier, f, -1)
        f.close()

    def save_classifier_dev(self, classifier, NameOfAlgo):
        UPLOADS_PATH = join(dirname(realpath(__file__)),
                            'dev_folder/' + NameOfAlgo + '.pickle')
        f = open(UPLOADS_PATH, 'wb')
        pickle.dump(classifier, f, -1)
        f.close()

    def load_classifier_dev(self, NameOfAlgo):
        UPLOADS_PATH = join(dirname(realpath(__file__)),
                            'dev_folder/' + NameOfAlgo + '.pickle')
        f = open(UPLOADS_PATH, 'rb')
        classifier = pickle.load(f)
        f.close()
        return classifier

    def load_classifier(self):
        UPLOADS_PATH = join(dirname(realpath(__file__)),
                            'Classifiers/tel_classifier2.pickle')
        f = open(UPLOADS_PATH, 'rb')
        classifier = pickle.load(f)
        f.close()
        return classifier

    # def put_word_features(self):
    #
    #     training = self.get_tweets(r'C:\Users\Mohammed\Desktop\train_data.xlsx')
    #     train_data = self.data_clean.prepare_data_set(training)
    #     self.word_features = self.get_word_freq(self.get_words(train_data))

    def feature_extraction(self, data):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        tfidf_data = vectorizer.fit_transform(data)
        return tfidf_data

    # def data_transform(self,data):
    #
    #     vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
    #     tfidf_data = vectorizer.transform(data)
    #     return tfidf_data

    def fit_data(self, filename):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        training = self.get_tweets(filename)
        train_data = self.data_clean.prepare_data_list(training)
        vectorizer.fit_transform(train_data)
        return vectorizer

    def build_pickle(self, train, train_label, algo):
        x_train, y_train = train, train_label
        classifier = algo
        classifier.fit(x_train, y_train)
        # classifier = twitterCal.load_classifier()
        self.save_classifier(classifier)
        # predict = cross_val_predict(classifier,x_test,y_test,cv=10)
        # scores = cross_val_score(classifier ,x_test , y_test,cv=10)
        # print(scores)
        # print('Accuracy of %s %0.2f (+/- %0.2f)'%( classifier ,scores.mean(),scores.std()*2))
        # print(classification_report(y_test ,predict))

        return classifier

    def learining(self, train, train_label, algo, NameOfAlgo):
        x_train, y_train = train, train_label
        classifier = algo
        classifier.fit(x_train, y_train)
        # classifier = twitterCal.load_classifier()
        self.save_classifier_dev(classifier, NameOfAlgo)
        # predict = cross_val_predict(classifier,x_test,y_test,cv=10)
        # scores = cross_val_score(classifier ,x_test , y_test,cv=10)
        # print(scores)
        # print('Accuracy of %s %0.2f (+/- %0.2f)'%( classifier ,scores.mean(),scores.std()*2))
        # print(classification_report(y_test ,predict))

        return classifier

    def put_word_features(self):
        vectorizer = TfidfVectorizer(min_df=3, max_df=0.85, ngram_range=(1, 3))
        training = self.get_tweets(
            r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers\train_data.xlsx'
        )
        train_data = self.data_clean.prepare_data_list(training)
        vectorizer.fit_transform(train_data)
        return vectorizer

    def predict(self, tweets, filename, NameOfAlgo):

        classifier = self.load_classifier_dev(NameOfAlgo)
        clean_tweets = self.data_clean.prepare_data_list(list(tweets))
        vectorizer = self.put_word_features()
        tfidf = vectorizer.transform(clean_tweets)
        result = classifier.predict(tfidf)
        data_predicted = []
        for tweet, label in zip(tweets, result):
            data_predicted.append({'tweets': tweet, 'class': label})
        data_predicted = pd.DataFrame(data_predicted)
        path = os.path.join(
            r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder\predicted',
            filename)
        data_predicted.to_excel(path, index=False)

    def read_file_dev(self, filename):
        try:
            path = os.path.join(
                r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder',
                filename)
            if filename.lower().endswith(('.xlsx', '.xls')):
                dataframe = pd.read_excel(path, names=['tweets'])
            elif filename.lower().endswith(('.csv')):
                dataframe = pd.read_csv(path, names=['tweets'])

            return dataframe
        except Exception as e:
            return False
Example #11
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  current_user = getpass.getuser()

  setup_logging(options.verbose)
  
  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  bind_signal_handlers(agentPid)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent','data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    #This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)
  ping_port_listener.start()

  update_log_level(config)

  server_hostname = config.get('server', 'hostname')
  server_url = config.get_api_url()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil(heartbeat_stop_callback)
  retries, connected = netutil.try_to_connect(server_url, -1, logger)
  # Ambari Agent was stopped using stop event
  if connected:
    # Launch Controller communication
    controller = Controller(config, heartbeat_stop_callback)
    controller.start()
    controller.join()
  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    stop_agent()
  logger.info("finished")
Example #12
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  logging_level = logging.DEBUG if options.verbose else logging.INFO

  setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(), logging_level)
  global is_logger_setup
  is_logger_setup = True
  setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(), logging_level)
  Logger.initialize_logger('resource_management', logging_level=logging_level)

  # use the host's locale for numeric formatting
  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error as ex:
    logger.warning("Cannot set locale for ambari-agent. Please check your systemwide locale settings. Failed due to: {0}.".format(str(ex)))

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()
  
  # Add syslog handler based on ambari config file
  add_syslog_handler(logger)

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent','data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    #This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)
  ping_port_listener.start()

  update_log_level(config)

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  #
  # Iterate through the list of server hostnames and connect to the first active server
  #

  active_server = None
  server_hostnames = hostname.server_hostnames(config)

  connected = False
  stopped = False

  # Keep trying to connect to a server or bail out if ambari-agent was stopped
  while not connected and not stopped:
    for server_hostname in server_hostnames:
      try:
        server_ip = socket.gethostbyname(server_hostname)
        server_url = config.get_api_url(server_hostname)
        logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
      except socket.error:
        logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

      # Wait until MAX_RETRIES to see if server is reachable
      netutil = NetUtil(config, heartbeat_stop_callback)
      (retries, connected, stopped) = netutil.try_to_connect(server_url, MAX_RETRIES, logger)

      # if connected, launch controller
      if connected:
        logger.info('Connected to Ambari server %s', server_hostname)
        # Set the active server
        active_server = server_hostname
        # Launch Controller communication
        controller = Controller(config, server_hostname, heartbeat_stop_callback)
        controller.start()
        while controller.is_alive():
          time.sleep(0.1)

      #
      # If Ambari Agent connected to the server or
      # Ambari Agent was stopped using stop event
      # Clean up if not Windows OS
      #
      if connected or stopped:
        ExitHelper().exit(0)
        logger.info("finished")
        break
    pass # for server_hostname in server_hostnames
  pass # while not (connected or stopped)

  return active_server
Example #13
class User:

    result = Result()
    api = TwitterAPI()
    datacleaner = DataCleaner()
    tweet_classifier = TweetClassifier()
    tk = Tk()
    email = None
    file = NONE

    def __init__(self):
        return

    def search(self, query):

        auth = self.api.Auth()
        tweets = self.api.retrieve_tweets(query, auth)
        return tweets

    def upload(self, filename):
        try:
            name = filename
            fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\uploads'
            filename = os.path.join(fname, filename)
            #filename = filedialog.askopenfilename(filetypes=[('excel file', '.xlsx'),('excel file', '.xls'),('excel file', '.csv')])
            if filename:
                if filename.lower().endswith(('.xlsx', '.xls')):
                    dataframe = pd.read_excel(filename, names=['tweets'])
                    if len(dataframe.columns) == 1:
                        self.api.classify(list(dataframe['tweets']), name)
                        prediction_file = self.api.read_file(name)
                        prediction_file.columns = ['class', 'tweets']
                        self.file = prediction_file
                    else:
                        return False

                elif filename.lower().endswith(('.csv')):
                    dataframe = pd.read_csv(filename, names=['tweets'])
                    if len(dataframe.columns) == 1:
                        self.api.classify(list(dataframe['tweets']), name)
                        prediction_file = self.api.read_file(name)
                        prediction_file.columns = ['class', 'tweets']
                        self.file = prediction_file
                    else:
                        return False
                else:
                    return False
            else:
                return False

            return True
        except Exception as e:
            return False

    def get_train_file_for_build_algo(self, train_file, algo):

        fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers'
        train_filename = os.path.join(fname, train_file)
        dataframe = pd.read_excel(train_filename, names=['tweets', 'class'])

        dataframe.tweets = self.datacleaner.prepare_data_list(
            list(dataframe.tweets))
        data, label = list(dataframe['tweets']), list(dataframe['class'])
        tfidf = self.tweet_classifier.feature_extraction(data)
        classifier = self.tweet_classifier.build_pickle(tfidf, label, algo)

        return classifier

    def get_train_file(self, train_file, algo, NameOfAlgo):

        fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder'
        train_filename = os.path.join(fname, train_file)
        dataframe = pd.read_excel(train_filename, names=['tweets', 'class'])

        dataframe.tweets = self.datacleaner.prepare_data_list(
            list(dataframe.tweets))
        data, label = list(dataframe['tweets']), list(dataframe['class'])
        tfidf = self.tweet_classifier.feature_extraction(data)
        classifier = self.tweet_classifier.learining(tfidf, label, algo,
                                                     NameOfAlgo)

        return classifier

    def get_test_file(self, test_file):
        try:
            fname = r'C:\Users\Mohammed\PycharmProjects\new_GP\dev_folder'
            test_filename = os.path.join(fname, test_file)
            dataframe = pd.read_excel(test_filename, names=['tweets', 'class'])
            dataframe.tweets = self.datacleaner.prepare_data_list(
                list(dataframe.tweets))
            data, label = list(dataframe['tweets']), list(dataframe['class'])
            vectorizer = self.tweet_classifier.fit_data(test_filename)
            tfidf = vectorizer.transform(data)

            return tfidf, label
        except Exception as e:
            return False

    def accuracy(self, train_file, test_file, algo, NameOfAlgo):
        try:

            classifier = self.get_train_file(train_file, algo, NameOfAlgo)

            tfidf, label = self.get_test_file(test_file)

            scores = cross_val_score(classifier, tfidf, label)

            label_predict = cross_val_predict(classifier, tfidf, label)

            matrix = classification_report(label, label_predict)

            result = [scores.mean() * 100, matrix]

            return result

        except Exception as e:
            return False
Example #14
def main():

    date_time = time.strftime("%c")
    logging.basicConfig(filename='/NAS/PipelineReports/DataCleaner/Logs/DataCleanerLog-' + date_time + '.log', level=logging.DEBUG)
    while True:
        logging.info("Waiting for 5 minutes...")
        print "Waiting for 5 minutes..."
        time.sleep(300)
        try:
            get_run_url = "http://genapsys-services.appspot.com/listOfRunsToBeProcessed?stateInclude=6&keywork=&intervalDays=1&stateExclude=8"
            run_list = requests.get(get_run_url)
            run_list = json.loads(run_list.content)
        except:
            logging.info("Error in getting the run list! Trying again ...")
            print "Error in getting the run list! Trying again ..."
            # run_list may be undefined or stale here, so skip ahead and retry
            continue

        for run_id in run_list:
            logging.info(run_id)
            print run_id
            try:

                # Starting the analysis
                store_experiment_state(run_id, 8, 1, 88)

                print "Hack Waiting for 10 minutes..."
                time.sleep(600)
                # Data Object
                DataObj = DataCleaner(run_id)
                logging.info("Defining the Data Object OKAY")
                print "Defining the Data Object OKAY"

                # Getting config information
                DataObj.getConfig(DataObj._config_file)
                logging.info("Getting config information OKAY")
                print "Getting config information OKAY"

                # Getting basic experiment info from the cloud database
                DataObj.getBasicInfo()
                logging.info("Getting basic experiment info from database OKAY")
                print "Getting basic experiment info from database OKAY"

                # Sets load and save paths
                DataObj.setPaths()
                logging.info("Set load and save paths OKAY")
                print "Set load and save paths OKAY"

                # Calculating the read configuration
                DataObj.calcReadConfig()
                logging.info("Calculating the read configuration OKAY")
                print "Calculating the read configuration OKAY"

                # Dumping the experiment basic info as json file
                DataObj.dumpRunInfo()
                logging.info("Dumping the experiment basic info as json file OKAY")
                print "Dumping the experiment basic info as json file OKAY"

                # Dumping the Experiment basic info successful
                store_experiment_state(run_id, 8, 2, 88)

                # Calculating the location of sensors and no_magnet flag
                DataObj.getChipInfo()
                logging.info("Calculating the location of sensors and no_magnet flag OKAY")
                print "Calculating the location of sensors and no_magnet flag OKAY"

                # Extracting the instrument states
                DataObj.createMask()
                logging.info("Extracting the instrument states OKAY")
                print "Extracting the instrument states OKAY"

                # Exporting the instrument data
                DataObj.instrumentData()
                logging.info("Exporting the instrument data OKAY")
                print "Exporting the instrument data OKAY"

                # Exporting the instrument data successful
                store_experiment_state(run_id, 8, 3, 88)

                # Exporting the sensor reads data
                DataObj.sensorReads()
                logging.info("Exporting the sensor reads data OKAY")
                print "Exporting the sensor reads data OKAY"

                # Exporting the sensor data successful
                store_experiment_state(run_id, 8, 4, 88)

            except:
                print "DataCleaner failed for the run:"
                print run_id
                logging.info("DataCleaner failed for the run:")
                logging.info(run_id)
Example #15
from DataCleaner import DataCleaner
import os
import csv

# Save Excel files as CSVs to create data cleaner instances
DataCleaner.saveExcelFileAsCsv("Data/Enforcement_Policies_Data.xlsx")
DataCleaner.saveExcelFileAsCsv("Data/Integration_Policies_Data.xlsx")
DataCleaner.saveExcelFileAsCsv("Data/Public_Benefits_Data.xlsx")
"""
Each variable has its own sheet within the Excel file; after using
the data cleaner method, each sheet is now its own CSV.
These are the variables we need and how to recode them:

Enforcement:
287(g) task force- 0 = pro immigration, 1 and 2 = anti immigration
Limited co-op w/ detainers- 0 = anti immigration, 1 and 2 = pro immigration
E-verify- 0= pro immigration, 1 and 2 = anti immigration

Integration:
English as official language- 0 = pro immigration, 1 = anti immigration
Driver's license policies- 0 = anti immigration, 1 = pro immigration

Public Benefits:
Public ins unauth kids- 0 = anti immigration, 1 = pro immigration
Medicaid unauth adults- 0 = anti immigration, 1 = pro immigration
Food asst. LPR kids- 0 = anti immigration, 1 = pro immigration
Food asst. LPR adults- 0 = anti immigration, 1 = pro immigration

Each variable will be recoded to have 0 as anti immigration and 1 as pro immigration.
"""
Example #16
class Result:
    bar_chart1 = False
    pie_chart1 = False
    all = []
    tweets_num = 0
    positive_tweets_num = 0
    negative_tweets_num = 0
    neutral_tweets_num = 0
    most_words = []
    dataclean = DataCleaner()

    def __init__(self):

        return

    def report(self, dataframe):

        dataframe_class, dataframe_tweets = dataframe['class'], dataframe[
            'tweets']

        dataframe_value_count = dataframe_class.value_counts()

        dictionary = dict(
            zip(dataframe_value_count.index, dataframe_value_count.values))

        for key, value in dictionary.items():
            if key == "neg":
                self.negative_tweets_num = value
            elif key == "pos":
                self.positive_tweets_num = value
            else:
                self.neutral_tweets_num = value

        self.tweets_num = dataframe_class.count()
        dataframe_tweets = self.dataclean.prepare_data_set_without_stem(
            list(dataframe_tweets))
        self.most_words = Counter(dataframe_tweets).most_common(15)
        self.all = dataframe_tweets
        self.pie_chart1 = True
        self.bar_chart1 = True
        valuable_words = pd.read_excel(
            r'C:\Users\Mohammed\PycharmProjects\new_GP\Classifiers\words.xlsx')
        valuable_words = valuable_words.Term
        most = []
        # for word in words :
        #     if word in list(valuable_words):
        #         most.append(word)
        # print(most)
        # print(self.all)

        for word in self.all:
            if word in list(valuable_words):
                most.append(word)
        self.most_words = Counter(most).most_common(15)
        return dataframe_value_count

    def pie_chart(self):

        data = {
            'Class': ['Negative', 'Positive', 'Neutral'],
            'Numbers': [
                self.negative_tweets_num, self.positive_tweets_num,
                self.neutral_tweets_num
            ]
        }

        dataframe = pd.DataFrame(data)
        pie = Donut(dataframe,
                    'Class',
                    values='Numbers',
                    title="Pie Chart For Tweets",
                    color=['#75D9EF', '#4B96D8', '#33D59F'])

        script_pie, div_pie = components(pie)
        script_pie = Markup(script_pie)
        div_pie = Markup(div_pie)
        page = [script_pie, div_pie]

        return page

    def pie_chart_most_words(self):

        most_words = pd.DataFrame(sorted(self.most_words),
                                  columns=['Word', 'Freq'])
        # words =list(most_words.Word)
        # words = self.dataclean.remove_stops(words)

        pie = Donut(most_words,
                    'Word',
                    values='Freq',
                    title="Pie Chart For Tweets",
                    color=[
                        '#f20f32', '#39f20f', '#76dfe7', '#af0132', '#f39f29',
                        '#3FC0CF', '#75D9EF', '#4B96D8', '#33D59F', '#33A9D5',
                        '#94A2F3', '#687EFC', '#68FCCF', '#68FCEC', '#75D3F9'
                    ])

        script_pie, div_pie = components(pie)
        script_pie = Markup(script_pie)
        div_pie = Markup(div_pie)
        page = [script_pie, div_pie]
        return page

    def bar_chart(self):

        data = {
            'Class': ['Negative', 'Positive', 'Neutral'],
            'Numbers': [
                self.negative_tweets_num, self.positive_tweets_num,
                self.neutral_tweets_num
            ]
        }
        dataframe = pd.DataFrame(data)
        bar = Bar(dataframe,
                  'Class',
                  values='Numbers',
                  title="Bar Chart For Tweets",
                  legend='top_right',
                  agg='median',
                  color="#33A9D5")

        script, div = components(bar)
        script = Markup(script)
        div = Markup(div)
        page = [script, div]

        return page

    def bar_chart_most_words(self):

        most_words = pd.DataFrame(self.most_words, columns=['Words', "Freq"])
        bar = Bar(most_words,
                  "Words",
                  values="Freq",
                  title="Bar Chart For Tweets",
                  legend='top_right',
                  agg='median',
                  color=["#75D3F9", "#75F9F7", "#75D9F9"])

        script, div = components(bar)
        script = Markup(script)
        div = Markup(div)
        page = [script, div]
        return page
Example #17
def extract_save_features(inputFileNameSensor):
    #blockPrint()
    print('Filename: ', inputFileNameSensor)

    #
    dataCleaner = DataCleaner()

    #
    extFeatures = ExtractFeatures()

    # Query data
    sensorStream = open_csv(inputFileNameSensor)

    #singleCharListLA = dataCleaner.get_single_char_list(sensorStream, 'LA')
    singleCharListLA = dataCleaner. \
         get_single_char_list_rm_rj(sensorStream, 'LA', None)

    singleCharListGY = dataCleaner. \
        get_single_char_list_rm_rj(sensorStream, 'GY', None)

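    # Keep only real character samples: drop the prompt/instruction rows and any
    # rows whose Char label is NaN (a NaN value never compares equal to itself).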
    singleCharListLA[:] = [
        item for item in singleCharListLA
        if not (item['Char'].iloc[0] == 'Do some PenDowns & PenUps'
                or item['Char'].iloc[0] == 'Press Accept!'
                or item['Char'].iloc[0] == 'Sync Validation. Press Accept.' or
                (item['Char'].iloc[0]) != (item['Char'].iloc[0]))
    ]

    singleCharListGY[:] = [
        item for item in singleCharListGY
        if not (item['Char'].iloc[0] == 'Do some PenDowns & PenUps'
                or item['Char'].iloc[0] == 'Press Accept!'
                or item['Char'].iloc[0] == 'Sync Validation. Press Accept.' or
                (item['Char'].iloc[0]) != (item['Char'].iloc[0]))
    ]

    totalFeatureSet = []
    #singleCharListLA,singleCharListGY=dataCleaner.sync_LA_GY(singleCharListLA, singleCharListGY)
    #print("No of CharsLA", len(singleCharListLA))
    #print("No of CharsGY", len(singleCharListGY))

    totalFeatureSet.extend(
        get_features(extFeatures, singleCharListLA, singleCharListGY))
    #totalFeatureSet.extend(get_features(extFeatures, singleCharListGY, 'GY'))

    #suffix = inputFileNameSensor[-10:]
    #print("Suffix: ", suffix)

    #filename = os.path.join('Data/FW/', inputFileNameSensor[-27:-4])
    #filename = inputFileNameSensor[:-4]

    #filename = os.path.join('Data/TSI-Char-FeatureSets/', inputFileNameSensor[-10:-4])
    #save_features_to_csv(filename + 'CharFeatureSet' + '.csv', totalFeatureSet)

    filename = inputFileNameSensor[-17:-4]
    path = './Data/Results/Finger/FeatureSets/'
    filename = os.path.join(path, filename)
    save_features_to_csv(filename + 'CharFeatureSet' + '.csv', totalFeatureSet)

    #print('FeatureSets saved')

    enablePrint()
Example #18
def main(heartbeat_stop_callback=None):
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(), options.verbose)
  global is_logger_setup
  is_logger_setup = True
  setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(), options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
    reset_agent(sys.argv)

  # Check for ambari configuration file.
  resolve_ambari_config()
  
  # Add syslog handler based on ambari config file
  add_syslog_handler(logger)

  # Starting data cleanup daemon
  data_cleaner = None
  if config.has_option('agent', 'data_cleanup_interval') and int(config.get('agent','data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)

  # Starting ping port listener
  try:
    #This acts as a single process machine-wide lock (albeit incomplete, since
    # we still need an extra file to track the Agent PID)
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)
  ping_port_listener.start()

  update_log_level(config)

  server_hostname = hostname.server_hostname(config)
  server_url = config.get_api_url()

  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    daemonize()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil(heartbeat_stop_callback)
  retries, connected = netutil.try_to_connect(server_url, -1, logger)
  # Ambari Agent was stopped using stop event
  if connected:
    # Launch Controller communication
    controller = Controller(config, heartbeat_stop_callback)
    controller.start()
    controller.join()
  if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
    ExitHelper.execute_cleanup()
    stop_agent()
  logger.info("finished")
Example #19
def main():
  global config
  parser = OptionParser()
  parser.add_option("-v", "--verbose", dest="verbose", action="store_true", help="verbose log output", default=False)
  parser.add_option("-e", "--expected-hostname", dest="expected_hostname", action="store",
                    help="expected hostname of current host. If hostname differs, agent will fail", default=None)
  (options, args) = parser.parse_args()

  expected_hostname = options.expected_hostname

  setup_logging(options.verbose)

  default_cfg = {'agent': {'prefix': '/home/ambari'}}
  config.load(default_cfg)

  bind_signal_handlers()

  if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
    stop_agent()

  # Check for ambari configuration file.
  config = resolve_ambari_config()

  # Starting data cleanup daemon
  data_cleaner = None
  if int(config.get('agent', 'data_cleanup_interval')) > 0:
    data_cleaner = DataCleaner(config)
    data_cleaner.start()

  perform_prestart_checks(expected_hostname)
  daemonize()

  # Starting ping port listener
  try:
    ping_port_listener = PingPortListener(config)
  except Exception as ex:
    err_message = "Failed to start ping port listener of: " + str(ex)
    logger.error(err_message)
    sys.stderr.write(err_message)
    sys.exit(1)
  ping_port_listener.start()

  update_log_level(config)

  server_hostname = config.get('server', 'hostname')
  server_url = config.get_api_url()

  try:
    server_ip = socket.gethostbyname(server_hostname)
    logger.info('Connecting to Ambari server at %s (%s)', server_url, server_ip)
  except socket.error:
    logger.warn("Unable to determine the IP address of the Ambari server '%s'", server_hostname)

  # Wait until server is reachable
  netutil = NetUtil()
  netutil.try_to_connect(server_url, -1, logger)

  # Launch Controller communication
  controller = Controller(config)
  controller.start()
  controller.join()
  stop_agent()
  logger.info("finished")
Example #20
def main(heartbeat_stop_callback=None):
    global config
    global home_dir

    parser = OptionParser()
    parser.add_option("-v",
                      "--verbose",
                      dest="verbose",
                      action="store_true",
                      help="verbose log output",
                      default=False)
    parser.add_option(
        "-e",
        "--expected-hostname",
        dest="expected_hostname",
        action="store",
        help=
        "expected hostname of current host. If hostname differs, agent will fail",
        default=None)
    parser.add_option("--home",
                      dest="home_dir",
                      action="store",
                      help="Home directory",
                      default="")
    (options, args) = parser.parse_args()

    expected_hostname = options.expected_hostname
    home_dir = options.home_dir

    logging_level = logging.DEBUG if options.verbose else logging.INFO

    setup_logging(logger, AmbariConfig.AmbariConfig.getLogFile(),
                  logging_level)
    global is_logger_setup
    is_logger_setup = True
    setup_logging(alerts_logger, AmbariConfig.AmbariConfig.getAlertsLogFile(),
                  logging_level)
    Logger.initialize_logger('resource_management',
                             logging_level=logging_level)

    if home_dir != "":
        # When running multiple Ambari Agents on this host for simulation, each one will use a unique home directory.
        Logger.info("Agent is using Home Dir: %s" % str(home_dir))

    # use the host's locale for numeric formatting
    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error as ex:
        logger.warning(
            "Cannot set locale for ambari-agent. Please check your systemwide locale settings. Failed due to: {0}."
            .format(str(ex)))

    default_cfg = {'agent': {'prefix': '/home/ambari'}}
    config.load(default_cfg)

    if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
        stop_agent()

    if (len(sys.argv) > 2) and sys.argv[1] == 'reset':
        reset_agent(sys.argv)

    # Check for ambari configuration file.
    resolve_ambari_config()

    # Add syslog handler based on ambari config file
    add_syslog_handler(logger)

    # Starting data cleanup daemon
    data_cleaner = None
    if config.has_option('agent', 'data_cleanup_interval') and int(
            config.get('agent', 'data_cleanup_interval')) > 0:
        data_cleaner = DataCleaner(config)
        data_cleaner.start()

    perform_prestart_checks(expected_hostname)

    # Starting ping port listener
    try:
        #This acts as a single process machine-wide lock (albeit incomplete, since
        # we still need an extra file to track the Agent PID)
        ping_port_listener = PingPortListener(config)
    except Exception as ex:
        err_message = "Failed to start ping port listener of: " + str(ex)
        logger.error(err_message)
        sys.stderr.write(err_message)
        sys.exit(1)
    ping_port_listener.start()

    update_log_level(config)

    update_open_files_ulimit(config)

    if not config.use_system_proxy_setting():
        logger.info('Agent is configured to ignore system proxy settings')
        reconfigure_urllib2_opener(ignore_system_proxy=True)

    if not OSCheck.get_os_family() == OSConst.WINSRV_FAMILY:
        daemonize()

    #
    # Iterate through the list of server hostnames and connect to the first active server
    #

    active_server = None
    server_hostnames = hostname.server_hostnames(config)

    connected = False
    stopped = False

    # Keep trying to connect to a server or bail out if ambari-agent was stopped
    while not connected and not stopped:
        for server_hostname in server_hostnames:
            server_url = config.get_api_url(server_hostname)
            try:
                server_ip = socket.gethostbyname(server_hostname)
                logger.info('Connecting to Ambari server at %s (%s)',
                            server_url, server_ip)
            except socket.error:
                logger.warn(
                    "Unable to determine the IP address of the Ambari server '%s'",
                    server_hostname)

            # Wait until MAX_RETRIES to see if server is reachable
            netutil = NetUtil(config, heartbeat_stop_callback)
            (retries, connected,
             stopped) = netutil.try_to_connect(server_url, MAX_RETRIES, logger)

            # if connected, launch controller
            if connected:
                logger.info('Connected to Ambari server %s', server_hostname)
                # Set the active server
                active_server = server_hostname
                # Launch Controller communication
                run_threads(server_hostname, heartbeat_stop_callback)

            #
            # If Ambari Agent connected to the server or
            # Ambari Agent was stopped using stop event
            # Clean up if not Windows OS
            #
            if connected or stopped:
                ExitHelper().exit(0)
                logger.info("finished")
                break
        pass  # for server_hostname in server_hostnames
    pass  # while not (connected or stopped)

    return active_server
Example #21
    def clean(self, df):
        dc = DataCleaner(df)
        return dc.clean_gadya()