def main():
    """Entry point: parse command-line options, then compute and perform
    the backup cleanup.

    Expects -p/--root-path (backup root), -o/--options (one or more
    AGE:INTERVAL specs), and optional -f/--force to skip confirmation.
    """
    description = (
        "Cleans up old backups to leave more room on the backup server."
        "\n\nE.g. python cleaner.py -p /path/to/archive -o 3:4 7:7."
        "\n\nThe example provided will keep an archive from every 4th day if it's more than 3 days old"
        " and archive every 7 days if it's more than a week old."
        "\n\nThe format of backups this script takes is BACKUP_SET-VERSION."
    )

    # RawDescriptionHelpFormatter keeps the embedded newlines in the epilog.
    arg_parser = argparse.ArgumentParser(
        description=description,
        formatter_class=RawDescriptionHelpFormatter)
    arg_parser.add_argument('-p', '--root-path', type=str, required=True,
                            help='The root path of your backups.')
    arg_parser.add_argument('-o', '--options', type=str, required=True, nargs='*',
                            help='Your age threshold and desired interval size separated by a colon')
    arg_parser.add_argument('-f', '--force', action='store_true',
                            help='Automatically confirms that you want to delete.')
    args = arg_parser.parse_args()

    # Work out which backups are deletable, then delete them.
    calculator = Calculator(args.root_path, args.options, args.force)
    calculator.calculate()
    Cleaner(calculator).clean()
def run(self): """ Start processing. """ # parse the command line arguments and set logging options try: self.args = self.parser.parse_args() self.configureLogging() self.logger.info("Started with {0}".format(' '.join(sys.argv[1:]))) except Exception as e: self.parser.print_help() sys.exit(e) # load the configuration file try: with open(self.args.config) as f: self.config.readfp(f) except Exception as e: self.logger.critical("Could not load the specified configuration file") sys.exit(e) # set options Cfg.LOG_EXC_INFO = self.args.trace # execute commands with Timer.Timer() as t: if self.args.crawl: import Crawler Crawler.crawl(self.config, self.args.update) if self.args.clean: import Cleaner Cleaner.clean(self.config, self.args.update) if self.args.infer: import Facter Facter.infer(self.config, self.args.update) if self.args.graph: import Grapher Grapher.graph(self.config, self.args.update) if self.args.transform: import Transformer Transformer.transform(self.config) if self.args.post: import Poster Poster.post(self.config) if self.args.analyze: import Analyzer Analyzer.analyze(self.config, self.args.update) self.logger.info("Indexer finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))
def create_training_data():
    """Interactively build a training-set file from ./Articles/**/*.txt.

    Prompts for three options (shuffle, clean, article count), then
    concatenates the selected articles into a uniquely named file under
    ./Training. Prints the resulting filename.

    Fixes over the original:
    - off-by-one: the slice ``[0:min(len, n) - 1]`` dropped one article;
    - the shuffle answer was left as a raw string, so any non-"y" answer
      was truthy and the output file was mislabeled "Shuffle";
    - bare ``except:`` clauses narrowed;
    - the output file is opened once instead of once per article.
    """
    print("Loading articles... This may take a while")
    t_start = time.time()
    articles = []
    for root, dirnames, filenames in os.walk('./Articles'):
        for filename in fnmatch.filter(filenames, '*.txt'):
            articles.append(os.path.join(root, filename))
    print("Loading articles complete. Took {0} seconds...".format(time.time() - t_start))

    # Questions
    # Q1: normalize the answer to a bool immediately
    in_random_articles = input("Use random articles? [y/N]") == "y"
    if in_random_articles:
        random.shuffle(articles)

    # Q2: cleaning defaults to on; only an explicit "n" disables it
    in_clean_file = input("Clean articles [Y/n]") != "n"

    # Q3: article count, defaulting to 10 on blank/invalid input
    in_num_articles = input("Number or articles? [Default: 10]")
    try:
        num_articles = int(in_num_articles)
    except ValueError:
        num_articles = 10

    # A slice past the end is already clamped by Python, so no min() needed;
    # the original's extra "- 1" silently dropped the last requested article.
    selected_articles = articles[:num_articles]

    os.makedirs("./Training", exist_ok=True)

    training_filename = "Training-{0}-{1}-{2}-{3}.txt".format(
        "Clean" if in_clean_file else "Dirty",
        "Shuffle" if in_random_articles else "Iterate",
        num_articles,
        str(uuid.uuid4())[:8])

    # Open the output once; append mode preserved from the original.
    with codecs.open("./Training/" + training_filename, "a+", encoding="utf8") as out:
        for article in selected_articles:
            with codecs.open(article, 'r', encoding="utf8") as f:
                content = f.read()
            if in_clean_file:
                content = Cleaner.clean(content)
            out.write(content)

    print("Created Training set named: {0}".format(training_filename))
# Library imports.
import csv
import Cleaner
import sys

# Raise the field size limit so oversized rows don't overflow the parser.
csv.field_size_limit(sys.maxsize)

# Single-pass accent-folding table, equivalent to chained .replace() calls.
_ACCENT_TABLE = str.maketrans('éòèàù', 'eoeau')

with open('File_Parsered.csv', 'rt', encoding='utf8') as source, \
        open('/Users/robertopenna/Desktop/Archivio/UNIMIB/Stage/JST-master/data/MR.dat', 'wt', encoding='utf8') as target:
    reader = csv.reader(source)
    next(reader)  # skip the header row
    for record in reader:
        tweet_id = record[0]
        # Lowercase, clean, strip stopwords, then fold accented vowels.
        cleaned = Cleaner.clean(record[1].lower())
        without_stopwords = Cleaner.remove_stopW(cleaned)
        folded = without_stopwords.translate(_ACCENT_TABLE)
        # Skip tweets that end up empty after cleaning.
        if folded:
            target.write('Tweet' + tweet_id + ' ' + folded + '\n')
import pandas as pd
import Cleaner
# NOTE(review): cross_validation appears unused in this chunk and was
# removed from modern scikit-learn (replaced by model_selection) —
# verify before upgrading sklearn.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
import FeatureSelector

# read csv
# NOTE(review): ISO-8859-1 suggests the survey export is not UTF-8 —
# confirm against the source file.
star_wars = pd.read_csv("star_wars.csv", encoding="ISO-8859-1")

# clean data
# Project-local Cleaner; presumably renames/recodes survey columns into
# the predictor names used below — verify in Cleaner.py.
star_wars = Cleaner.clean(star_wars)

# split into train and test data: last 200 rows held out for testing
star_wars_train = star_wars[:-200]
star_wars_test = star_wars[-200:]

# Initialize our algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
# random_state fixed for reproducible tree construction
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# Set predictors
# Column names expected to exist after Cleaner.clean — TODO confirm.
predictors = ["SeenSW", "IsStarTrekFan", "Gender", "Age", "Income", "Education", "Location"]

# uncomment to check what features to use
# FeatureSelector.check(star_wars, predictors)