Code example #1
0
    def run(self):
        """Classify every raw dump listed in <data_dir>/features.csv and copy
        each file into <data_dir>/organized/<trash|hash|plain>/ according to a
        KNN model trained on the hand-labelled training set.
        """
        dump_logger = getLogger('dumpscraper')

        # Let's invoke the getscore runner and tell him to work on training data
        dump_logger.info("Calculating dump score...")
        scorer = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
        scorer.run()

        # First of all let's feed the classifier with the training data:
        # columns 0-2 are the features, column -2 holds the target label.
        training_file = self.settings['data_dir'] + "/" + "training/features.csv"
        training = scipy_genfromtxt(training_file, delimiter=",", skip_header=1, usecols=(0, 1, 2))
        target = scipy_genfromtxt(training_file, delimiter=",", skip_header=1, usecols=(-2))

        clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
        clf.fit(training, target)

        # Map each numeric label the classifier can emit onto its folder name.
        folders = {0: 'trash', 1: 'hash', 2: 'plain'}
        counts = {'trash': 0, 'hash': 0, 'plain': 0}
        cleared = []

        with open(self.settings['data_dir'] + "/" + 'features.csv', 'rb') as csvfile:
            reader = csv_reader(csvfile)

            for line in reader:
                # Skip the CSV header row.
                if line[0] == 'Trash score':
                    continue

                features = np_array(line[0:3])
                features = features.reshape(1, -1)
                # predict() returns an array; extract a plain scalar so the
                # dict lookup below is unambiguous (comparing a 1-element
                # ndarray with == relies on deprecated truthiness).
                label = int(clf.predict(features)[0])

                # BUGFIX: an unrecognized label previously left `folder`
                # unbound (NameError) or silently reused the folder chosen in
                # the previous iteration, misfiling the dump. Skip it instead.
                if label not in folders:
                    dump_logger.warning("Unknown label " + str(label) + " for file " + line[-1] + ", skipping")
                    continue

                folder = folders[label]
                counts[folder] += 1

                target_file = self.settings['data_dir'] + "/" + 'organized/' + folder + "/" + line[-1]
                target_dir = path.dirname(target_file)

                # If asked for a clean run, let's delete the entire folder before copying any file
                # (only once per target folder, tracked via `cleared`)
                if self.parentArgs.clean and target_dir not in cleared and path.exists(target_dir):
                    cleared.append(target_dir)
                    shutil_rmtree(target_dir)

                if not path.exists(target_dir):
                    makedirs(target_dir)

                shutil_copyfile(self.settings['data_dir'] + "/" + 'raw/' + line[-1], target_file)

        dump_logger.info("Trash files: " + str(counts['trash']))
        dump_logger.info("Hash files: " + str(counts['hash']))
        dump_logger.info("Plain files: " + str(counts['plain']))
        dump_logger.info("Operation completed")
Code example #2
0
File: classify.py  Project: pking74/dump-scraper
    def run(self):
        """Classify every raw dump listed in data/raw/features.csv and copy
        each file into data/organized/<trash|hash|plain>/ according to a KNN
        model trained on the hand-labelled training set.
        """
        # Let's invoke the getscore runner and tell him to work on training data
        print("Calculating dump score...")
        running = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
        running.run()

        # First of all let's feed the classifier with the training data:
        # columns 0-2 are the features, column -2 holds the target label.
        training = scipy.genfromtxt("data/training/features.csv",
                                    delimiter=",",
                                    skip_header=1,
                                    usecols=(0, 1, 2))
        target = scipy.genfromtxt("data/training/features.csv",
                                  delimiter=",",
                                  skip_header=1,
                                  usecols=(-2))

        clf = sklearn.neighbors.KNeighborsClassifier(10, weights='uniform')
        clf.fit(training, target)

        # Map each numeric label the classifier can emit onto its folder name.
        folders = {0: 'trash', 1: 'hash', 2: 'plain'}
        counts = {'trash': 0, 'hash': 0, 'plain': 0}

        with open('data/raw/features.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile)

            for line in reader:
                # Skip the CSV header row.
                if line[0] == 'Trash score':
                    continue

                # BUGFIX: predict() requires a numeric 2-D sample, not a flat
                # list of strings read from the CSV (the other variant of this
                # method already converts and reshapes before predicting).
                features = [[float(value) for value in line[0:3]]]
                # Extract a plain scalar from the returned array so the dict
                # lookup below is unambiguous.
                label = int(clf.predict(features)[0])

                # BUGFIX: an unrecognized label previously left `folder`
                # unbound (NameError) or silently reused the folder chosen in
                # the previous iteration, misfiling the dump. Skip it instead.
                if label not in folders:
                    continue

                folder = folders[label]
                counts[folder] += 1

                target_dir = 'data/organized/' + folder + "/" + line[-1]

                if not os.path.exists(os.path.dirname(target_dir)):
                    os.makedirs(os.path.dirname(target_dir))

                shutil.copyfile('data/raw/' + line[-1], target_dir)

        print("Trash files: " + str(counts['trash']))
        print("Hash files: " + str(counts['hash']))
        print("Plain files: " + str(counts['plain']))
        print("Operation completed")
Code example #3
0
    def run(self):
        """Entry point: validate the environment, instantiate the runner
        matching the requested command, execute it, and always persist the
        (possibly updated) settings back to settings.json."""
        self.banner()

        # Perform some sanity checks before doing anything else
        try:
            self.checkenv()
        except exceptions.InvalidSettings as error:
            print("")
            print(error)

            return

        def build_runner(command):
            # Instantiate the runner object for the requested command,
            # or return None when the command is unknown.
            if command == 'scrape':
                return scrape.DumpScraperScrape(self.settings, self.args)
            if command == 'scrapeold':
                return scrapeold.DumpScraperScrapeold(self.settings, self.args)
            if command == 'getscore':
                return getscore.DumpScraperGetscore(self.settings, self.args)
            if command == 'training':
                return training.DumpScraperTraining(self.settings, self.args)
            if command == 'classify':
                return classify.DumpScraperClassify(self.settings, self.args)
            if command == 'extract':
                return extract.DumpScraperExtract(self.settings, self.args)
            return None

        runner = build_runner(self.args.command)

        if runner is None:
            print("Unrecognized command")
            return

        # And away we go!
        try:
            runner.check()
            runner.run()
        # Something went wrong while running?
        except exceptions.RunningError as error:
            print("")
            print(error)
        # Always save the updated settings, even after a failure
        finally:
            with open(os.path.realpath("settings.json"),
                      'w+') as update_settings:
                json.dump(self.settings, update_settings, indent=4)
Code example #4
0
File: training.py  Project: zero-code/dump-scraper
 def _getscore(self):
     """Run the getscore runner against the training data set."""
     # Force greedy level 1, then ask getscore to score the training
     # data rather than the live dumps (training=True).
     self.parentArgs.level = 1
     scorer = getscore.DumpScraperGetscore(self.settings, self.parentArgs)
     scorer.run(training=True)
Code example #5
0
    def run(self):
        """Entry point: validate the environment, log the active options,
        lazily build and execute the runner for the requested command, then
        always persist the (possibly updated) settings to settings.json."""
        self.banner()
        self.check_updates()

        log = logging.getLogger('dumpscraper')

        # Perform some sanity checks before doing anything else
        try:
            self.checkenv()
        except exceptions.InvalidSettings as error:
            log.error(error)
            return

        # Report the options that will influence this run
        if getattr(self.args, 'level', 0) > 0:
            log.debug('\tUsing a greedy level of ' + str(self.args.level))

        if getattr(self.args, 'clean', False):
            log.debug(
                "\tClean the target folder before attempting to write inside it"
            )

        if getattr(self.args, 'force', None):
            log.debug("\tForcing the execution only on file " +
                      str(self.args.force))

        def build_runner(command):
            # Lazily import and instantiate the runner for the requested
            # command (imports stay inside each branch so only the module
            # that is actually needed gets loaded). None means unknown.
            if command == 'scrape':
                from lib.runner import scrape
                return scrape.DumpScraperScrape(self.settings, self.args)
            if command == 'scraperaw':
                from lib.runner import scraperaw
                return scraperaw.DumpScraperScraperaw(self.settings, self.args)
            if command == 'scrapeold':
                from lib.runner import scrapeold
                return scrapeold.DumpScraperScrapeold(self.settings, self.args)
            if command == 'getscore':
                from lib.runner import getscore
                return getscore.DumpScraperGetscore(self.settings, self.args)
            if command == 'training':
                from lib.runner import training
                return training.DumpScraperTraining(self.settings, self.args)
            if command == 'classify':
                from lib.runner import classify
                return classify.DumpScraperClassify(self.settings, self.args)
            if command == 'extract':
                from lib.runner import extract
                return extract.DumpScraperExtract(self.settings, self.args)
            if command == 'review':
                from lib.runner import review
                return review.DumpScraperReview(self.settings, self.args)
            return None

        runner = build_runner(self.args.command)

        if runner is None:
            log.error("Unrecognized command " + self.args.command)
            return

        # And away we go!
        try:
            runner.check()
            runner.run()
        # Something went wrong while running?
        except exceptions.RunningError as error:
            log.error(error)
        # Always save the updated settings, even after a failure
        finally:
            with open(os_path.realpath("settings.json"),
                      'w+') as update_settings:
                json.dump(self.settings, update_settings, indent=4)