コード例 #1
0
    def run(self):
        """Group the latest domains by public suffix and gzip one file per suffix.

        The domain list comes from ``self.latest()``.  Each domain is bucketed
        under the ``suffix`` reported by ``tldextract``; every bucket is then
        written as newline-separated UTF-8 text to ``<suffix>.txt.gz`` inside
        the ``self.save_path`` directory (created on demand).

        Returns:
            dict: Mapping of suffix -> list of domains with that suffix.  An
            empty dict is returned (and nothing is written) when there are no
            domains to group.
        """
        tld_dict = {}
        for domain in self.latest():
            try:
                suffix = tldextract.tldextract.extract(domain).suffix
            except Exception as e:
                # Best effort: one unparsable entry must not abort the run.
                print(e)
                continue
            tld_dict.setdefault(suffix, []).append(domain)

        if not tld_dict:
            # Always hand back a dict so callers can iterate the result
            # without a None check.
            return {}

        # save_path is created as a directory, so join with a real path
        # separator instead of relying on it ending in a slash.
        save_dir = Path(self.save_path)
        save_dir.mkdir(parents=True, exist_ok=True)
        for tld, domains in tld_dict.items():
            payload = '\n'.join(domains).encode('utf-8')
            target = save_dir / "{}.txt.gz".format(tld)
            with gzip.open(str(target), 'wb') as f:
                f.write(payload)
        return tld_dict
コード例 #2
0
 def process_image(self, image_bytes):
     """Check that the bytes open as an image, then save them as a screenshot.

     The bytes are copied into an in-memory buffer and handed to
     ``Image.open``.  If that raises ``IOError`` the error is logged and
     nothing is written.  Otherwise the original bytes are written unchanged
     to ``screenshot_<timestamp>.jpeg`` inside ``self.screenshot_dir``.

     Args:
         image_bytes: Raw bytes of the image to store.

     Returns:
         bool: ``True`` when the file was written, ``False`` when the bytes
         could not be opened as an image.
     """
     stream = BytesIO()
     stream.write(image_bytes)
     stream.seek(0)

     # Opening serves purely as a sanity check; the resulting image object
     # is not used afterwards.
     try:
         Image.open(stream)
     except IOError as err:
         self.logger.error(err)
         return False

     timestamp = str(datetime.datetime.now()).replace(' ', '_')
     target = os.path.join(
         self.screenshot_dir, 'screenshot_{}.jpeg'.format(timestamp))
     with open(target, 'wb') as out:
         out.write(image_bytes)
     self.logger.info('Image written successfully: {}'.format(target))
     return True
コード例 #3
0
ファイル: czds.py プロジェクト: ziqi521/deepdive-domain-data
    def _download_single_zone_file(self, url):
        """Download one zone file and save its unique names as gzip data.

        The response body is treated as a gzip stream of tab-separated
        records.  The first field of every record is taken as the domain
        name, its trailing dot is removed, duplicates are dropped (first
        occurrence wins, so output order is stable) and the names are
        re-compressed as newline-separated UTF-8 text.

        The result is written under ``self.save_path`` (plain string
        concatenation, so ``save_path`` must end with a path separator).
        If the compressed data reaches ``MAX_FILE_SIZE`` it is written as
        consecutive raw byte slices named ``<zone>_01.txt.gz``,
        ``<zone>_02.txt.gz``, ...; the slices are pieces of ONE gzip stream
        and must be concatenated in order before decompressing.

        Args:
            url: Download URL; the zone name is derived from its last path
                segment (the part before the final extension).

        Returns:
            None.  On HTTP 401 ``self.token`` is refreshed through
            ``self.authenticate()`` but the download is not retried here.
            Any other non-200 status is ignored.
        """
        # Function-scope import: only needed to sanitise the remote filename.
        import os

        response = self._get(url)
        status_code = response.status_code

        if status_code == 401:
            self.token = self.authenticate()
            return
        if status_code != 200:
            # 404 and every other status are deliberately ignored.
            return

        # "com.zone" -> "com"; fall back to the whole segment when it has no
        # extension instead of raising IndexError.
        last_segment = url.rsplit('/', 1)[-1]
        name_parts = last_segment.rsplit('.')
        zone_name = name_parts[-2] if len(name_parts) > 1 else last_segment

        # A missing header or missing filename parameter must reach the
        # default-name fallback below rather than raise KeyError.
        # NOTE(review): the cgi module is deprecated (PEP 594) and removed
        # in Python 3.13 - plan a migration to email.message.
        _, option = cgi.parse_header(
            response.headers.get('content-disposition', ''))
        # The filename is server-supplied: keep only its final component so
        # it cannot point outside save_path.
        filename = os.path.basename(option.get('filename', ''))
        if not filename:
            filename = zone_name + '.txt.gz'
        path_filename = "{}{}".format(self.save_path, filename)

        # Dict keys give de-duplication while preserving first-seen order.
        unique_domains = {}
        with gzip.GzipFile(fileobj=BytesIO(response.content),
                           mode='rb') as zone_file:
            for line in zone_file:
                first_field = line.decode('utf-8').split('\t')[0]
                # strip() first so a record without tabs does not keep its
                # newline, which would defeat rstrip('.').
                domain = first_field.strip().rstrip('.')
                if domain:
                    unique_domains[domain] = None

        gzip_object = gzip.compress(
            '\n'.join(unique_domains).encode('utf-8'))
        # len() is the real payload size; __sizeof__() would also count the
        # bytes object's own header.
        gzip_size = len(gzip_object)

        MAX_FILE_SIZE = 1024 * 1024 * 35
        if gzip_size >= MAX_FILE_SIZE:
            offsets = range(0, gzip_size, MAX_FILE_SIZE)
            for chapter, start in enumerate(offsets, 1):
                chapter_filename = "{}_{}{}".format(
                    zone_name, str(chapter).zfill(2), '.txt.gz')
                chapter_path_filename = "{}{}".format(
                    self.save_path, chapter_filename)
                with open(chapter_path_filename, 'wb') as f:
                    f.write(gzip_object[start:start + MAX_FILE_SIZE])
        else:
            with open(path_filename, 'wb') as f:
                f.write(gzip_object)