Example no. 1
 def read_pages(self):
     pages = []
     k = Key(FileInfo.objects.s3_bucket)
     k.key = self.object_key + "/" + self.sha1
     h = k.read(100)
     page_size = int(h[16:18].encode("hex"), 16)
     n_pages = int(h[28:32].encode("hex"), 16)
     page1 = h + k.read(page_size - 100)
     pages.append(page1)
     for i in range(2, n_pages + 1):
         page = k.read(page_size)
         pages.append(page)
     return pages
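The snippet reads the first 100 bytes of an SQLite database stored in S3 and decodes the page size and page count from the file header; the offsets match the SQLite header layout (2-byte big-endian page size at offset 16, 4-byte big-endian page count at offset 28). A minimal sketch of the same header parsing with struct instead of Python 2's str.encode("hex"), assuming header already holds those first 100 bytes:

import struct

def parse_sqlite_header(header):
    # Offset 16: page size, 2-byte big-endian; offset 28: page count, 4-byte big-endian.
    page_size = struct.unpack(">H", header[16:18])[0]
    n_pages = struct.unpack(">L", header[28:32])[0]
    return page_size, n_pages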
Example no. 2
def login():
  error = ''
  file_contents = ''
  global login_failed
  keys = []
  if(login_failed < 2):
    if request.method == 'POST':
        username_form  = request.form['username']
        bucket = conn.get_bucket(config.buck_name, validate=True)
        k = Key(bucket)
        k.key = 'auth_users.txt'
        k.open()
        file_contents = k.read()
        if username_form in file_contents:
            session['logged_in'] = True
            session['username'] = username_form
            #session.permanent = True
            #app.permanent_session_lifetime = timedelta(seconds=300)
            return render_template('upload_db.html', username = session['username'])
        else:
            login_failed = login_failed+1
            error+= "Invalid Username. Login Again"
            return render_template('welcome.html', error = error)
  else:
    login_failed = 0
    error = 'You have exceeded maximum attempts for failed login. Locked Out. Try again after 30 mins'
    return render_template('welcome.html', error = error)
Example no. 3
    def s3_url(self, is_compressed=False):
        k = Key(FileInfo.objects.s3_bucket)
        if self.is_diff_file:
            k.key = self.object_key + "/diff/" + self.diff_name
            if is_compressed:
                k_compressed = FileInfo.objects.s3_bucket.get_key(self.object_key + "/diff/gz/" + self.diff_name)
                if k_compressed:
                    k = k_compressed
                else:
                    zipstream = StringIO.StringIO()
                    gzipper = gzip.GzipFile(mode="w", fileobj=zipstream)
                    gzipper.write(k.read())
                    gzipper.close()
                    k.key = self.object_key + "/diff/gz/" + self.diff_name
                    k.metadata.update(
                        {
                            "Content-Type": str("application/sqlite3-diff"),
                            "Content-Disposition": str("attachment;filename=" + self.name() + "-diff"),
                            "Content-Encoding": str("gzip"),
                        }
                    )
                    k.set_contents_from_string(zipstream.getvalue())
                    k = FileInfo.objects.s3_bucket.get_key(self.object_key + "/diff/gz/" + self.diff_name)

        else:
            k.key = self.object_key + "/" + self.sha1
            if is_compressed:
                k_compressed = FileInfo.objects.s3_bucket.get_key(self.object_key + "/gz/" + self.sha1)
                if k_compressed:
                    k = k_compressed
                else:
                    zipstream = StringIO.StringIO()
                    gzipper = gzip.GzipFile(mode="w", fileobj=zipstream)
                    gzipper.write(k.read())
                    gzipper.close()
                    k.key = self.object_key + "/gz/" + self.sha1
                    k.metadata.update(
                        {
                            "Content-Type": str(self.file_format.mime_type.text),
                            "Content-Disposition": str("attachment;filename=" + self.name()),
                            "Content-Encoding": str("gzip"),
                        }
                    )
                    k.set_contents_from_string(zipstream.getvalue())
                    k = FileInfo.objects.s3_bucket.get_key(self.object_key + "/gz/" + self.sha1)

        return k.generate_url(3600, "GET")
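The method lazily gzips the stored object in memory and caches the compressed copy under a gz/ prefix before signing a URL. A rough Python 3 sketch of just the in-memory compression step, with io.BytesIO standing in for StringIO.StringIO (the function name is illustrative):

import gzip
import io

def gzip_bytes(data):
    # Compress the raw object bytes entirely in memory, mirroring the zipstream/gzipper pattern above.
    buf = io.BytesIO()
    with gzip.GzipFile(mode="wb", fileobj=buf) as gz:
        gz.write(data)
    return buf.getvalue()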
Example no. 4
 def Get_Object_Metatags(self, obj, bucket):
     """
     Returns a dictionary of metatag keys and values for the given object
     
     _obj_: target object
     _bucket_: bucket containing object
     """
     s3bucket = self._conn.get_bucket(bucket)
     try:
         s3obj = Key(s3bucket, obj)
         logger.debug("Read 1 byte of object %s" % obj)
         s3obj.read(size=1)
     except S3ResponseError, err:
         if err.status == 404:
             dirobj = obj + '/'
             logger.debug("Could not find %s, trying %s" % (obj, dirobj))
             s3obj = Key(s3bucket, dirobj)
             logger.debug("Read 1 byte of object %s" % dirobj)
             s3obj.read(size=1)
Example no. 5
 def getChunkyKeyObj(self, chunk_size=512000):
     key_obj = Key(bucket)
     key_obj.key = self.row.file_key
     while True:
         chunk = key_obj.read(chunk_size)
         self.download_session_row.downloaded_size += len(chunk)
         self.download_session_row.save()
         if len(chunk) == 0:
             break
         yield chunk
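The generator streams the key in fixed-size chunks while recording progress on a download-session row. A hedged usage sketch that writes the chunks to a local file; downloader is a hypothetical instance exposing this method:

# downloader and the output path are placeholders for illustration only.
with open("download.bin", "wb") as out:
    for chunk in downloader.getChunkyKeyObj(chunk_size=512000):
        out.write(chunk)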
Example no. 6
def get_index(prefix):
    """
    :param prefix: str
        Prefix to S3 bucket

    :return: Uncompressed warc index
    :rtype: str
    """
    botokey = Key(DATASET, prefix + 'warc.paths.gz')
    return gzip.GzipFile(fileobj=StringIO(botokey.read())).read()
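get_index returns the decompressed warc.paths.gz listing as one newline-separated string of WARC paths. A small usage sketch, assuming DATASET and the prefix are defined as above:

# Split the index into individual WARC keys; the slice is just for illustration.
warc_paths = [line.strip() for line in get_index(prefix).splitlines()]
print(warc_paths[:5])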
Example no. 7
    def get_index(self, prefix):
        """
        :param prefix: str
            Prefix to S3 bucket

        :return: Uncompressed warc index, one path per entry
        :rtype: list
        """
        crawl = self.select_crawl(prefix)
        botokey = Key(self.bucket, crawl + 'warc.paths.gz')
        return [i.strip() for i in GzipFile(fileobj=BytesIO(botokey.read()))]
Example no. 8
    def wait_for(self, bucket, key, timeout, start=None):
        if start is None:
            start = datetime.utcnow()

        log.info("Looking for key with last_modified greater than %s", start)
        for _ in hp.until(timeout=timeout, step=5):
            try:
                bucket_obj = self.get_bucket(bucket)
            except BadS3Bucket as error:
                log.error(error)
                continue

            if key == '/':
                log.info("The bucket exists! and that is all we are looking for")
                return

            k = Key(bucket_obj)
            k.key = key

            try:
                k.read()
            except boto.exception.S3ResponseError as error:
                if error.status == 404:
                    log.info("Key doesn't exist yet\tbucket=%s\tkey=%s", bucket_obj.name, key)
                    continue
                else:
                    log.error(error)
                    continue

            last_modified = k.last_modified
            log.info("Found key in the bucket\tbucket=%s\tkey=%s\tlast_modified=%s", bucket_obj.name, key, last_modified)

            date = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S GMT")
            if date > start:
                log.info("Found key and it's newer than our start time!")
                return
            else:
                log.info("Found key but it's older than our start time, hasn't been updated yet")

        raise BespinError("Couldn't find the s3 key with a newer last modified")
Example no. 9
    def generate_diff(self, original_file_info, latest_file_info):
        # self.set_file_contents("DIFF CONTENT")
        k1 = Key(FileInfo.objects.s3_bucket)
        k1.key = original_file_info.object_key + "/" + original_file_info.sha1

        k2 = Key(FileInfo.objects.s3_bucket)
        k2.key = latest_file_info.object_key + "/" + latest_file_info.sha1

        h1 = k1.read(100)
        h2 = k2.read(100)

        f1_page_size = int(h1[16:18].encode("hex"), 16)
        f2_page_size = int(h2[16:18].encode("hex"), 16)

        if f1_page_size != f2_page_size:
            return False

        f1_n_pages = int(h1[28:32].encode("hex"), 16)
        f2_n_pages = int(h2[28:32].encode("hex"), 16)

        f1_page1 = h1 + k1.read(f1_page_size - 100)
        f2_page1 = h2 + k2.read(f2_page_size - 100)

        diff_file_contents = "SQLITE DIFF FILE"
        diff_file_contents += original_file_info.sha1.decode("hex")
        diff_file_contents += latest_file_info.sha1.decode("hex")
        diff_file_contents += struct.pack(">L", f1_page_size)
        if f1_page1 != f2_page1:
            diff_file_contents += struct.pack(">L", 1)
            diff_file_contents += "!"
            diff_file_contents += f2_page1
        for i in range(2, max(f1_n_pages, f2_n_pages) + 1):
            f1_page = None
            f2_page = None
            if i <= f1_n_pages:
                f1_page = k1.read(f1_page_size)
            if i <= f2_n_pages:
                f2_page = k2.read(f2_page_size)
            if f1_page == f2_page:
                continue
            if f1_page == None:
                diff_file_contents += struct.pack(">L", i)
                diff_file_contents += "+"
                diff_file_contents += f2_page
                continue
            if f2_page == None:
                diff_file_contents += struct.pack(">L", i)
                diff_file_contents += "-"
                continue
            diff_file_contents += struct.pack(">L", i)
            diff_file_contents += "!"
            diff_file_contents += f2_page
        self.set_file_contents(diff_file_contents)
        return True
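For reference, here is a sketch of how such a diff could be applied, inferred only from the record layout generate_diff writes above (16-byte magic, two 20-byte SHA-1 digests, a 4-byte big-endian page size, then records of a 4-byte page number, a one-byte opcode and, for "+" and "!", one full page). It is illustrative and not part of the original project:

import struct

def apply_diff(pages, diff):
    # pages: the original file as a list of page-sized byte strings (page 1 first).
    page_size = struct.unpack(">L", diff[56:60])[0]
    pos = 60
    while pos < len(diff):
        page_no = struct.unpack(">L", diff[pos:pos + 4])[0]
        op = diff[pos + 4:pos + 5]
        pos += 5
        if op == b"-":
            # Trailing page removed in the newer file; truncate if still present.
            pages = pages[:page_no - 1]
            continue
        page = diff[pos:pos + page_size]
        pos += page_size
        if op == b"+":
            pages.append(page)         # page appended at the end of the newer file
        else:
            pages[page_no - 1] = page  # "!": page replaced in place
    return pages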
Example no. 10
def s3_download(output_file_path, s3_bucket, s3_access_key_id, s3_secret_key,
                s3_file_key=None, prefix=None):
    """ Downloads the file matching the provided key, in the provided bucket,
        from Amazon S3.
        
        If s3_file_key is none, it downloads the last file
        from the provided bucket with the .tbz extension, filtering by
        prefix if it is provided. """
    bucket = s3_connect(s3_bucket, s3_access_key_id, s3_secret_key)
    if not s3_file_key:
        keys = s3_list(s3_bucket, s3_access_key_id, s3_secret_key, prefix)
        if not keys:
            raise Exception("Target S3 bucket is empty")
        s3_file_key = keys[-1]
    key = Key(bucket, s3_file_key)
    with open(output_file_path, "w+") as f:
        f.write(key.read())
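A hedged usage sketch for the helper above; every argument value is a placeholder. Leaving s3_file_key unset makes it fall back to the newest key returned by s3_list:

s3_download("/tmp/latest-backup.tbz", "my-backup-bucket",
            "ACCESS_KEY_ID", "SECRET_ACCESS_KEY", prefix="db/")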
Example no. 11
def _do_retrieve(bucket_name, key_path, number_retries=DEFAULT_S3_RETRIES):
    """ Run-logic to do a data retrieval for a file in an S3 bucket."""
    key = Key(_get_bucket(bucket_name), key_path)
    try:
        return key.read()
    except IncompleteRead:
        if number_retries > 0:
            print "s3_retrieve failed with incomplete read, retrying on %s" % key_path
            return _do_retrieve(bucket_name,
                                key_path,
                                number_retries=number_retries - 1)
        raise
    except SSLError as e:
        if 'The read operation timed out' == e.message:
            print "s3_retrieve failed with timeout, retrying on %s" % key_path
            return _do_retrieve(bucket_name,
                                key_path,
                                number_retries=number_retries - 1)
        raise
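Calling the retrying helper is a one-liner; the bucket and key below are placeholders. The retry simply recurses with a decremented counter, so adding a short sleep between attempts would be an easy extension:

data = _do_retrieve("my-backup-bucket", "backups/db.sqlite3", number_retries=3)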
Example no. 12
def register_page():
    error = ''
    file_contents = ''
    try:
            username = request.form['user']
            if (re.match(userreg,username)):
                bucket = conn.get_bucket(config.buck_name, validate=True)
                k = Key(bucket)
                k.key = 'auth_user.txt'
                k.open()
                file_contents = k.read()
                file_contents+=username
                key = bucket.new_key('auth_users.txt')
                key.set_contents_from_string(file_contents)
                key.set_acl('public-read')
                return 'Successfully Registered. Login.'
            else:
                return 'Username: 3-15 characters consisting of letters or digits and optional - or _.'
    except Exception as e:
        return(str(e))
Example no. 13
File: s3.py Project: samuel/gypsy
class S3File(File):
    def __init__(self, bucket, name):
        self._bucket = bucket
        self._name = name
        self._key = Key(bucket=bucket, name=name.encode('utf-8'))
        self._pos = 0
        self._open = False
        self._fake_open = False
        self._mode = 'r'

    @property
    def name(self):
        return self._name

    @property
    def mode(self):
        return self._key.mode

    @property
    def closed(self):
        return not self._fake_open

    def size():
        doc = "The size property."
        def fget(self):
            raise NotImplementedError("S3File doesn't implement size and __len__")
        def fset(self, value):
            raise NotImplementedError("S3File doesn't implement size and __len__")
        return locals()

    def open(self, mode="r"):
        self.close()
        self._mode = (mode or 'r')[0]
        self._fake_open = True

    def close(self):
        if self._open:
            self._pos = 0
            self._key.close()
        self._fake_open = False

    def seek(self, position):
        if position != 0:
            raise NotImplementedError("S3File doesn't implement seek at positions other than 0")
        if self._pos != 0:
            # TODO: This is a bit flakey I imagine
            self._key.resp = None
            self._pos = 0

    def tell(self):
        return self._pos

    def read(self, num_bytes=None):
        if not self._open:
            self._key.open(self._mode)
            self._open = True
        data = self._key.read(num_bytes)
        self._pos += len(data)
        return data

    def write(self, content):
        raise NotImplementedError("S3File doesn't implement write")

    def flush(self):
        raise NotImplementedError("S3File doesn't implement flush")

    def close(self):
        self._key.close()
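A hedged usage sketch for the wrapper above, reading an object in blocks; bucket is assumed to be an already-opened boto bucket and process is a placeholder consumer:

f = S3File(bucket, u"exports/report.csv")
f.open("r")
while True:
    block = f.read(64 * 1024)
    if not block:
        break
    process(block)  # placeholder for whatever consumes the data
f.close()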
Example no. 14
 def valid_segments(self):
     kfile = Key(self.bucket,
                 '/common-crawl/parse-output/valid_segments.txt')
     return [i.strip() for i in kfile.read().splitlines()]
Example no. 15
File: gz.py Project: pie-crust/etl
    gzipped = GzipFile(None, 'rb', fileobj=k)
    reader = csv.reader(io.TextIOWrapper(gzipped, newline="",
                                         encoding="utf-8"),
                        delimiter='^')
    data = []
    for id, line in enumerate(reader):
        data.append(line)
        pprint(line)
        if id > 10: break

    ptitle = kname
    #headers=['Col#%d' % i for i in range(len(data[0]))]
    #print (get_formatted(ptitle,data,headers,join = True))

if 0:
    buffer = io.BytesIO(k.read())
    print(buffer)
    z = zipfile.ZipFile(buffer)
    foo2 = z.open(z.infolist()[0])
    print(sys.getsizeof(foo2))
    line_counter = 0
    for _ in foo2:
        line_counter += 1
    print(line_counter)
    z.close()

if 0:
    #print k.read(10)
    gz_file = gzip.GzipFile(fileobj=k, mode='rb')
    reader = csv.ListReader(io.TextIOWrapper(gz_file,
                                             newline="",
Example no. 16
s3 = boto3.client('s3', region_name=cred['AWS_DEFAULT_REGION'])
s3.put_object(Bucket=datasetF, Key=self.name, Body=self.value)


conn = boto.connect_s3(cred['AWS_DEFAULT_REGION'])
bucket = conn.get_bucket(datasetF)

for line in smart_open.smart_open('s3://mybucket/mykey.txt'):
    print line

bucket = conn.get_bucket(baseUrl + folderUrl + fileUrl)
k = Key(bucket)
k.key = 'filename.txt'
k.open()
k.read(10)


peopleDF = spark.read.json("examples/src/main/resources/people.json")

# DataFrames can be saved as Parquet files, maintaining the schema information.
peopleDF.write.parquet("people.parquet")

# Read in the Parquet file created above.
# Parquet files are self-describing so the schema is preserved.
# The result of loading a parquet file is also a DataFrame.
parquetFile = spark.read.parquet("people.parquet")

# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")
teenagers = spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19")
Example no. 17
	conn=S3Connection(login[1],login[2])
	mybucket = conn.get_bucket('ec2dev')
	print "Connected to S3"

except:
	print "Unable to connect to S3"
	exit()

try:
	for j in mybucket.list():
		if j.name == 'login.txt':
			print j.name
			k = Key(mybucket)
			k.key = j.name
			k.open()
			file_1 = k.read()
			print "Successfully opened login.txt"
except:
	print "Unable to open File on S3"
	exit()

		
login = file_1.split()

try:
	conn = pymysql.connect(host=login[0],user= login[1],password=login[2],db= login[4])
	print "Connected successfully to RDS"
except:
	print "Unable to connect to RDS"
	exit()
cur = conn.cursor()
Example no. 18
 def read(self):
     k = Key(FileInfo.objects.s3_bucket)
     k.key = self.object_key + "/" + self.sha1
     return k.read()
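For comparison, a rough boto3 equivalent of this whole-object read; the bucket name is a placeholder and object_key/sha1 mirror the attributes used above:

import boto3

s3 = boto3.resource("s3")
body = s3.Object("my-bucket", object_key + "/" + sha1).get()["Body"].read()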
Example no. 19
class S3FunctionalityTest():
    '''
  Functionality Test of an S3 Bucket
  Only works with Keystone Auth URL v3
  '''
    options = dict()

    def __init__(self, options):
        # First we try to list the ec2 credentials

        try:
            res = json.loads(
                subprocess.check_output([
                    "openstack", "--os-auth-url", options.auth_url,
                    "--os-username", options.username, "--os-password",
                    options.password, "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3", "ec2", "credentials",
                    "list", "-f", "json"
                ]))
            res[0]['Access']

    # If they don't exist we create some
        except:
            try:
                subprocess.check_output([
                    "openstack", "--os-auth-url", options.auth_url,
                    "--os-username", options.username, "--os-password",
                    options.password, "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3", "ec2", "credentials",
                    "create"
                ],
                                        stderr=subprocess.STDOUT)
            except:
                print "Could not create EC2 credentials"
                sys.exit(NAGIOS_STATE_UNKNOWN)
            res = json.loads(
                subprocess.check_output([
                    "openstack", "--os-auth-url", options.auth_url,
                    "--os-username", options.username, "--os-password",
                    options.password, "--os-project-name", options.tenant,
                    "--os-project-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-user-domain-name", DEFAULT_DOMAIN_NAME,
                    "--os-identity-api-version", "3", "ec2", "credentials",
                    "list", "-f", "json"
                ]))

        if LOCAL_DEBUG:
            print res
        _access_key = res[0]['Access']
        _secret_key = res[0]['Secret']
        _s3_host = options.s3_host

        self.conn = S3Connection(aws_access_key_id=_access_key,
                                 aws_secret_access_key=_secret_key,
                                 host=_s3_host)
        try:
            self.b = self.conn.get_bucket(DEFAULT_BUCKET_NAME)
        except:
            self.b = self.conn.create_bucket(DEFAULT_BUCKET_NAME)
        self.k = Key(self.b)
        self.k.key = 'nagiostest3'

    def s3_create_bucket(self):
        """ create a bucket, does not fail if it exists
    """
        self.conn.create_bucket(DEFAULT_BUCKET_NAME)

    def s3_store_data(self):
        """ store a 3MB object in the bucket
    """

        USERHOMEDIR = os.path.expanduser('~')
        TESTFILEPATH = "%s/3MBFILE" % USERHOMEDIR
        if not os.path.exists(TESTFILEPATH):
            with open(TESTFILEPATH, "wb") as out:
                out.truncate(1024 * 1024 * 3)
        self.k.set_contents_from_filename(TESTFILEPATH)

    def s3_read_data(self):
        """ read object from bucket
    """

        self.k.open()
        self.k.read()

    def s3_delete_data(self):
        """ delete object from bucket
    """

        self.k.delete()

    def execute(self):
        results = dict()
        try:
            self.s3_create_bucket()
            self.s3_store_data()
            self.s3_read_data()
            self.s3_delete_data()
        except:
            raise
        return results