Example #1
    def check(self):
        # Assumes "from os import path, makedirs" in the host module.
        if not path.exists(self.settings['data_dir'] + "/" + 'training/features.csv'):
            raise RunningError("Training score was not calculated. Do it and then run this command again")

        if not path.exists(self.settings['data_dir'] + "/" + 'organized'):
            makedirs(self.settings['data_dir'] + "/" + 'organized')
        if not path.exists(self.settings['data_dir'] + "/" + 'organized/hash'):
            makedirs(self.settings['data_dir'] + "/" + 'organized/hash')
        if not path.exists(self.settings['data_dir'] + "/" + 'organized/plain'):
            makedirs(self.settings['data_dir'] + "/" + 'organized/plain')
        if not path.exists(self.settings['data_dir'] + "/" + 'organized/trash'):
            makedirs(self.settings['data_dir'] + "/" + 'organized/trash')
Example #2
    def check(self):
        if not os.path.exists('data/organized'):
            raise RunningError(
                colorama.Fore.RED +
                "There aren't any organized dump files to process. Organize them before continuing."
            )

        if not os.path.exists('data/processed'):
            os.makedirs('data/processed')
        if not os.path.exists('data/processed/hash'):
            os.makedirs('data/processed/hash')
        if not os.path.exists('data/processed/plain'):
            os.makedirs('data/processed/plain')
Example #3
    def check(self):
        if not path.exists(self.settings['data_dir'] + "/" + 'organized'):
            raise RunningError(
                "There aren't any organized dump files to process. Organize them before continuing."
            )

        if not path.exists(self.settings['data_dir'] + "/" + 'processed'):
            makedirs(self.settings['data_dir'] + "/" + 'processed')
        if not path.exists(self.settings['data_dir'] + "/" + 'processed/hash'):
            makedirs(self.settings['data_dir'] + "/" + 'processed/hash')
        if not path.exists(self.settings['data_dir'] + "/" + 'processed/plain'):
            makedirs(self.settings['data_dir'] + "/" + 'processed/plain')
Example #4
    def check(self):
        if not path.exists(self.settings['data_dir'] + "/" + 'raw'):
            raise RunningError(
                "There aren't any dump files to process. Scrape them before continuing."
            )

        if not path.exists(self.settings['data_dir'] + "/" + 'training'):
            makedirs(self.settings['data_dir'] + "/" + 'training')
        if not path.exists(self.settings['data_dir'] + "/" + 'training/hash'):
            makedirs(self.settings['data_dir'] + "/" + 'training/hash')
        if not path.exists(self.settings['data_dir'] + "/" + 'training/plain'):
            makedirs(self.settings['data_dir'] + "/" + 'training/plain')
        if not path.exists(self.settings['data_dir'] + "/" + 'training/trash'):
            makedirs(self.settings['data_dir'] + "/" + 'training/trash')
Example #5
    def check(self):
        if not os.path.exists('data/raw'):
            raise RunningError(
                colorama.Fore.RED +
                "There aren't any dump files to process. Scrape them before continuing."
            )

        if not os.path.exists('data/training'):
            os.makedirs('data/training')
        if not os.path.exists('data/training/hash'):
            os.makedirs('data/training/hash')
        if not os.path.exists('data/training/plain'):
            os.makedirs('data/training/plain')
        if not os.path.exists('data/training/trash'):
            os.makedirs('data/training/trash')
Example #6
    def check(self):
        if not os.path.exists('data/training/features.csv'):
            raise RunningError(
                colorama.Fore.RED +
                "Training score was not calculated. Do it and then run this command again"
            )

        if not os.path.exists('data/organized'):
            os.makedirs('data/organized')
        if not os.path.exists('data/organized/hash'):
            os.makedirs('data/organized/hash')
        if not os.path.exists('data/organized/plain'):
            os.makedirs('data/organized/plain')
        if not os.path.exists('data/organized/trash'):
            os.makedirs('data/organized/trash')
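Examples #1 through #6 all follow the same bootstrap pattern: check() fails fast with a RunningError when the previous pipeline stage left nothing behind, then creates the output directory tree for the next stage. On Python 3.2+ the repeated exists()/makedirs() pairs can be collapsed into a race-free call; a minimal sketch, using a helper name of our own (ensure_dirs is not part of the project):

    import os

    def ensure_dirs(base, *subdirs):
        # exist_ok=True tolerates directories that already exist, so no
        # separate exists() check (and no check-then-create race) is needed
        for sub in subdirs:
            os.makedirs(os.path.join(base, sub), exist_ok=True)

    # Usage, mirroring Example #1:
    # ensure_dirs(settings['data_dir'],
    #             'organized/hash', 'organized/plain', 'organized/trash')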
Example #7
    def update(self):
        """update(self) - Fill Queue with new Pastebin IDs"""
        new_pastes = []
        raw = None

        while not raw:
            try:
                raw = requests_get('http://pastebin.com/archive').content
                if "Pastebin.com has blocked your IP" in raw:
                    getLogger('dumpscraper').critical(
                        "Pastebin blocked your IP. Wait a couple of hours and try again"
                    )
                    raise RunningError()
            except ConnectionError:
                # logging's warn() is deprecated in favour of warning()
                getLogger('dumpscraper').warning(
                    'Connection error, trying again in 5 seconds')
                raw = None
                sleep(5)

        results = BeautifulSoup(raw, "lxml").findAll(
            lambda tag: tag.name == 'td' and tag.a
            and '/archive/' not in tag.a['href'] and tag.a['href'][1:])

        for entry in results:
            paste = PastebinPaste(entry.a['href'][1:])
            # Check to see if we found our last checked URL
            if paste.id == self.ref_id:
                break
            new_pastes.append(paste)

        # Don't cry if we don't have any results
        try:
            # Let's save the starting id, so I can skip already processed pastes
            self.ref_id = results[0].a['href'][1:]
        except IndexError:
            dump_logger = getLogger('dumpscraper')
            dump_logger.info("\tArchive links not found")
            dump_logger.debug('\t\tFetched page:')

            for row in results:
                # row is a BeautifulSoup tag; coerce it before concatenating
                dump_logger.debug('\t\t\t' + str(row))

        for entry in new_pastes[::-1]:
            self.put(entry)
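The while-not-raw loop in Example #7 retries the archive fetch forever at a fixed 5-second interval. A bounded variant with exponential backoff is sketched below; the function name and the retry limit are our own, not the project's:

    from time import sleep

    import requests

    def fetch_archive(url='http://pastebin.com/archive', attempts=5, delay=5):
        # hypothetical replacement for the retry loop above: give up after
        # a fixed number of attempts instead of spinning forever
        for _ in range(attempts):
            try:
                return requests.get(url).content
            except requests.exceptions.ConnectionError:
                sleep(delay)
                delay *= 2  # back off: 5s, 10s, 20s, ...
        raise RuntimeError('archive unreachable after %d attempts' % attempts)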
Example #8
    def run(self):
        prev_day = '1970-05-01'
        since_id = self.settings['last_id'] or None
        max_id = self.settings['max_id'] or None
        processed = 0

        connection = twitter.Api(
            consumer_key=self.settings['app_key'],
            consumer_secret=self.settings['app_secret'],
            access_token_key=self.settings['token'],
            access_token_secret=self.settings['token_secret'])

        # Let's check if we really have some valid credentials
        try:
            connection.VerifyCredentials()
        except twitter.error.TwitterError as error:
            raise RunningError(colorama.Fore.RED + 'Twitter error: ' +
                               error.message[0]['message'])

        dump_logger = getLogger('dumpscraper')

        while processed <= self.settings['processing_limit']:

            tweets = connection.GetUserTimeline(screen_name='dumpmon',
                                                max_id=max_id,
                                                exclude_replies=True,
                                                include_rts=False,
                                                count=self.settings['limit'],
                                                since_id=since_id)

            if not tweets:
                break

            removed = 0
            processed += len(tweets)

            for tweet in tweets:
                max_id = tweet.id if not max_id else min(max_id, tweet.id)
                max_id -= 1

                self.settings['last_id'] = max(since_id, tweet.id)

                try:
                    link = tweet.urls[0].expanded_url
                except (IndexError, KeyError):
                    # a tweet without URL entities yields an empty list,
                    # which raises IndexError rather than KeyError
                    continue

                dObject = datetime.datetime.fromtimestamp(
                    tweet.created_at_in_seconds)
                day = dObject.strftime('%Y-%m-%d')

                if day != prev_day:
                    prev_day = day
                    dump_logger.info("Processing day: " + day)

                # Let's create the folder name using year/month/(full-date) structure
                folder = dObject.strftime('%Y') + '/' + dObject.strftime(
                    '%m') + '/' + dObject.strftime('%d')

                target_dir = os.path.realpath(self.settings['data_dir'] +
                                              "/raw/" + folder)

                # If I already have the file, let's skip it
                if os.path.isfile(target_dir + '/' + str(tweet.id) + '.txt'):
                    continue

                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)

                sleep(self.settings['delay'])

                data = requests.get(link)

                if not data.text:
                    continue

                if "Pastebin.com has blocked your IP" in data.text:
                    self.settings['last_id'] = since_id
                    raise RunningError(
                        colorama.Fore.RED +
                        "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets"
                    )

                if "has been removed" in data.text:
                    removed += 1
                    continue

                with open(target_dir + "/" + str(tweet.id) + ".txt",
                          'w+') as dump_file:
                    dump_file.write(data.text.encode('utf-8'))

            dump_logger.info("Processed " + str(processed) + " tweets")
            dump_logger.info("Found " + str(removed) +
                             " removed tweets in this batch")

        dump_logger.info("Total processed tweets: " + str(processed))
Example #9
    def run(self):
        prev_day = '1970-05-01'
        since_id = self.settings['last_id'] or None
        max_id = self.settings['max_id'] or None
        processed = 0

        connection = twitter.Api(
            consumer_key=self.settings['app_key'],
            consumer_secret=self.settings['app_secret'],
            access_token_key=self.settings['token'],
            access_token_secret=self.settings['token_secret'])

        # Let's check if we really have some valid credentials
        try:
            connection.VerifyCredentials()
        except twitter.error.TwitterError as error:
            raise RunningError(colorama.Fore.RED + 'Twitter error: ' +
                               error.message[0]['message'])

        while processed <= self.settings['processing_limit']:

            tweets = connection.GetUserTimeline(screen_name='dumpmon',
                                                max_id=max_id,
                                                exclude_replies=True,
                                                include_rts=False,
                                                count=self.settings['limit'],
                                                since_id=since_id)

            if not tweets:
                break

            removed = 0
            processed += len(tweets)

            for tweet in tweets:
                max_id = tweet.id if not max_id else min(max_id, tweet.id)
                max_id -= 1

                self.settings['last_id'] = max(since_id, tweet.id)

                try:
                    link = tweet.urls[0].expanded_url
                except (IndexError, KeyError):
                    # a tweet without URL entities yields an empty list,
                    # which raises IndexError rather than KeyError
                    continue

                day = datetime.datetime.fromtimestamp(
                    tweet.created_at_in_seconds).strftime('%Y-%m-%d')

                if day != prev_day:
                    prev_day = day
                    print("")
                    print("Processing day: " + day)

                folder = day

                if not os.path.exists(os.path.realpath("data/raw/" + folder)):
                    os.makedirs(os.path.realpath("data/raw/" + folder))

                sleep(self.settings['delay'])

                data = requests.get(link)

                if not data.text:
                    continue

                if "Pastebin.com has blocked your IP" in data.text:
                    self.settings['last_id'] = since_id
                    raise RunningError(
                        colorama.Fore.RED +
                        "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets"
                    )

                if "has been removed" in data.text:
                    removed += 1
                    sys.stdout.write('x')
                    sys.stdout.flush()
                    continue

                sys.stdout.write('.')
                sys.stdout.flush()

                with open(
                        os.path.realpath("data/raw/" + folder + "/" +
                                         str(tweet.id) + ".txt"),
                        'w+') as dump_file:
                    dump_file.write(data.text.encode('utf-8'))

            print("")
            print("\tprocessed " + str(processed) + " tweets")
            print("\tFound " + str(removed) + " removed tweets in this batch")

        print("")
        print("Total processed tweets: " + str(processed))
Example #10
    def run(self):
        # Assumes "import lxml.html as LH" (plus requests, json, urllib)
        # in the host module.
        base_url = 'https://twitter.com/i/search/timeline?f=realtime&q='
        base_query = 'from:dumpmon since:%s until:%s'
        prev_day = '1970-05-01'
        processed = 0

        origurl = base_url + urllib.quote(
            base_query % (self.parentArgs.since, self.parentArgs.until))

        processing = True
        url = origurl
        # We have to pass an user agent, otherwise Twitter will display an empty content
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
        }

        dump_logger = getLogger('dumpscraper')

        while processing:
            r = requests.get(url, headers=headers)
            json_data = json.loads(r.content)
            raw_html = json_data['items_html'].strip()

            if not raw_html:
                processing = False
                continue

            html = LH.fromstring(raw_html)

            removed = 0
            tweets = html.cssselect('.original-tweet')

            if not tweets:
                processing = False

            for tweet in tweets:
                link = tweet.cssselect('.twitter-timeline-link')

                if not link:
                    continue

                link = link[0]
                processed += 1

                paste_link = link.get('data-expanded-url')
                timestamp = tweet.cssselect('.js-short-timestamp')[0].get(
                    'data-time')
                tweetid = tweet.get('data-tweet-id')

                if not paste_link:
                    continue

                dObject = datetime.datetime.fromtimestamp(float(timestamp))
                day = dObject.strftime('%Y-%m-%d')

                if day != prev_day:
                    prev_day = day
                    dump_logger.info("Processing day: " + day)

                # Let's create the folder name using year/month/(full-date) structure
                folder = dObject.strftime('%Y') + '/' + dObject.strftime(
                    '%m') + '/' + dObject.strftime('%d')

                target_dir = os.path.realpath(self.settings['data_dir'] +
                                              "/raw/" + folder)

                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)

                sleep(self.settings['delay'])

                # Sometimes we download virus and the AV drops the connection
                try:
                    data = requests.get(paste_link)
                except requests.exceptions.ConnectionError:
                    continue

                if not data.text:
                    continue

                if "Pastebin.com has blocked your IP" in data.text:
                    raise RunningError(
                        colorama.Fore.RED +
                        "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets"
                    )

                if "has been removed" in data.text:
                    removed += 1
                    continue

                with open(
                        os.path.realpath(target_dir + "/" + str(tweetid) +
                                         ".txt"), 'w+') as dump_file:
                    dump_file.write(data.text.encode('utf-8'))

            # Let's setup the url for the next iteration
            url = origurl + '&scroll_cursor=' + json_data['scroll_cursor']

        dump_logger.info("Total processed tweets: " + str(processed))
Example #11
    def check(self):
        if not os.path.exists('data/raw'):
            raise RunningError(colorama.Fore.RED + "There aren't any dump files to process. Scrape them before continuing.")
Example #12
    def check(self):
        if not os_path.exists(self.settings['data_dir'] + '/organized'):
            raise RunningError(
                "There aren't any organized dump files to process. Organize them before continuing."
            )