def dump(connection, collection, database, hdfs, filename, username, hdfsconnection):
    """Dump every document of a MongoDB collection to one HDFS file.

    Streams each document through ``dumps`` (presumably BSON/JSON
    serialization -- confirm against the import) into a single text file
    on HDFS, with a click progress bar sized by the collection count.

    :param connection: open MongoDB connection handle passed to ``getCollection``.
    :param collection: name of the collection to export.
    :param database: name of the database holding the collection.
    :param hdfs: HDFS root path for the client.
    :param filename: target file path (relative to ``hdfs``) on HDFS.
    :param username: HDFS token/user passed to ``TokenClient``.
    :param hdfsconnection: WebHDFS endpoint URL.
    """
    client = TokenClient(hdfsconnection, username, root=hdfs)
    # Resolve the collection handle once; the original re-queried it for
    # both the count and the cursor.
    coll = getCollection(connection, database, collection)
    length = coll.find().count()
    with client.write(filename, encoding='utf-8') as writer:
        with click.progressbar(
                coll.find(),
                length=length,
                label='Writing collection: ' + collection + " to HDFS: " + filename) as bar:
            for j in bar:
                writer.write(dumps(j))
def client(self):
    # type ()-> WebHDFS
    """Return the WebHDFS client instance selected by ``self.client_type``.

    Imports are deliberately deferred to the matching branch so that
    optional extras (e.g. the Kerberos extension) are only required when
    that client type is actually chosen.
    """
    kind = self.client_type
    if kind == WebHdfsClientType.TOKEN:
        from hdfs import TokenClient
        return TokenClient(url=self.url, token=self.token)
    if kind == WebHdfsClientType.INSECURE:
        from hdfs import InsecureClient
        return InsecureClient(url=self.url, user=self.user)
    if kind == WebHdfsClientType.KERBEROS:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(
            url=self.url,
            mutual_authentication=self.mutual_authentication,
            service=self.service,
            delegate=self.delegate,
            force_preemptive=self.force_preemptive,
            principal=self.principal,
            hostname_override=self.hostname_override,
            sanitize_mutual_error_response=self.sanitize_mutual_error_response,
            send_cbt=self.send_cbt,
        )
    raise Exception("WebHdfs client type %s does not exist" % self.client_type)
from hdfs import TokenClient

# NOTE(review): hard-coded NameNode address, token, and paths -- this
# reads as a throwaway smoke-test script; parameterize before any reuse.
client = TokenClient('http://10.91.228.145:50070', 'hdfs', root='/')

# List and summarize a test directory, then pull one file down locally.
# (print() with a single argument is valid under both Python 2 and 3;
# the original used Python 2 print statements.)
print(client.list('/user/hdfs/test'))
print(client.content('/user/hdfs/test'))
client.download('/user/hdfs/aichuche/t_reportData101/20140909/1234.txt', 'f:\\12345.txt')
import psycopg2
import requests
from hdfs import TokenClient

# WebHDFS endpoint for the target cluster.
HDFS_URL = "http://localhost:50070"

print("Fetching data from the database")
# NOTE(review): credentials are redacted placeholders here -- they should
# come from environment/config, never the source file.
conn = psycopg2.connect(database="NewsSource",
                        user="******",
                        password="******",
                        host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com",
                        port="5432")
cur = conn.cursor()
# Pull every article that has not yet been classified.
cur.execute("select id,articletext from articlestable where classifiedcategory IS NULL")
rows = cur.fetchall()
cur.close()

# Build the HDFS client once; the original reconstructed an identical
# TokenClient on every loop iteration.
client = TokenClient(HDFS_URL, 'crawled_data', root='/user/root')
for count, row in enumerate(rows, start=1):
    print("Storing file " + str(count) + " in HDFS")
    # File name is the article id; contents are the article text.
    client.write(str(row[0]), row[1])
conn.close()