Beispiel #1
0
def dump(connection, collection, database, hdfs, filename, username, hdfsconnection):
    """Stream every document of one collection into a single HDFS file.

    Each document returned by ``find()`` is serialised with ``dumps`` and
    written through the HDFS client while a ``click`` progress bar tracks
    the number of documents processed.

    Args:
        connection: Forwarded to ``getCollection`` to reach the database.
        collection: Name of the collection to export; also shown in the
            progress-bar label.
        database: Forwarded to ``getCollection``.
        hdfs: Passed to ``TokenClient`` as its ``root`` keyword.
        filename: Path of the file opened for writing on the client; also
            shown in the progress-bar label.
        username: Second positional argument of ``TokenClient``.
            NOTE(review): if this is ``hdfs.TokenClient``, that position is
            documented as the delegation token, not a user name -- confirm.
        hdfsconnection: First positional argument of ``TokenClient``.
    """
    client = TokenClient(hdfsconnection, username, root=hdfs)

    # Resolve the collection once and reuse it for both the count and the scan.
    source = getCollection(connection, database, collection)
    try:
        length = source.find().count()
    except AttributeError:
        # Assuming a PyMongo collection: Cursor.count() was removed in
        # PyMongo 4, where count_documents() is the replacement.
        length = source.count_documents({})

    label = 'Writing collection: ' + collection + " to HDFS: " + filename
    with client.write(filename, encoding='utf-8') as writer:
        with click.progressbar(source.find(), length=length, label=label) as bar:
            for document in bar:
                # NOTE(review): documents are written back-to-back with no
                # separator; confirm that downstream readers expect this.
                writer.write(dumps(document))
Beispiel #2
0
    def client(self):
        """Build the WebHDFS client selected by ``self.client_type``.

        Returns:
            A ``KerberosClient``, ``InsecureClient`` or ``TokenClient``
            configured from this object's attributes.

        Raises:
            Exception: If ``self.client_type`` matches none of the
                ``WebHdfsClientType`` members handled here.
        """
        kind = self.client_type

        if kind == WebHdfsClientType.KERBEROS:
            # Imported inside the branch so the kerberos extension is only
            # loaded when this client type is actually requested.
            from hdfs.ext.kerberos import KerberosClient

            kerberos_options = dict(
                url=self.url,
                mutual_authentication=self.mutual_authentication,
                service=self.service,
                delegate=self.delegate,
                force_preemptive=self.force_preemptive,
                principal=self.principal,
                hostname_override=self.hostname_override,
                sanitize_mutual_error_response=self.sanitize_mutual_error_response,
                send_cbt=self.send_cbt,
            )
            return KerberosClient(**kerberos_options)

        if kind == WebHdfsClientType.INSECURE:
            from hdfs import InsecureClient

            return InsecureClient(url=self.url, user=self.user)

        if kind == WebHdfsClientType.TOKEN:
            from hdfs import TokenClient

            return TokenClient(url=self.url, token=self.token)

        # No known client type matched.
        raise Exception("WebHdfs client type %s does not exist" % kind)
Beispiel #3
0
from hdfs import TokenClient

# Open a WebHDFS client against the name node with '/' as the root path.
# NOTE(review): 'hdfs' sits in the position hdfs.TokenClient documents as the
# delegation token -- confirm that a user name is really intended here.
client = TokenClient('http://10.91.228.145:50070', 'hdfs', root='/')

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the former print statement is a syntax error on Python 3.
print(client.list('/user/hdfs/test'))
print(client.content('/user/hdfs/test'))

# Copy one remote file to the local f: drive.
client.download('/user/hdfs/aichuche/t_reportData101/20140909/1234.txt', 'f:\\12345.txt')
import psycopg2
import requests
from hdfs import TokenClient


# Export every not-yet-classified article from Postgres into HDFS, one file
# per article, named after the article id.
# Single-argument print(...) works identically on Python 2 and Python 3.
print("Fetching data from the database")
conn = psycopg2.connect(database="NewsSource", user="******", password="******", host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com", port="5432")
HDFS_URL = "http://localhost:50070"
try:
    cur = conn.cursor()
    cur.execute("select id,articletext from articlestable where classifiedcategory IS NULL")
    rows = cur.fetchall()

    # One client serves all uploads; it does not depend on the row.
    client = TokenClient(HDFS_URL, 'crawled_data', root='/user/root')
    for count, (article_id, article_text) in enumerate(rows, 1):
        print("Storing file " + str(count) + " in HDFS")
        client.write(str(article_id), article_text)
finally:
    # Release the database connection even if the query or an upload fails.
    conn.close()