def dump(connection, collection, database, hdfs, filename, username, hdfsconnection):
    """Dump every document of a MongoDB collection into one HDFS file.

    Streams each document through ``dumps`` into ``filename`` under the
    HDFS root ``hdfs``, showing a progress bar sized by a preliminary count.

    :param connection:      MongoDB connection passed to ``getCollection``.
    :param collection:      Name of the collection to dump.
    :param database:        Name of the database holding the collection.
    :param hdfs:            HDFS root directory for the ``TokenClient``.
    :param filename:        Target file path (relative to ``hdfs``).
    :param username:        HDFS token/user passed to ``TokenClient``.
    :param hdfsconnection:  HDFS namenode URL.
    """
    client = TokenClient(hdfsconnection, username, root=hdfs)
    # Resolve the collection handle once; the original called
    # getCollection(...) twice (once for the count, once for the scan).
    coll = getCollection(connection, database, collection)
    # The count still requires its own query; the scan below is a second,
    # separate cursor.
    length = coll.find().count()
    with client.write(filename, encoding='utf-8') as writer:
        with click.progressbar(coll.find(), length=length,
                               label='Writing collection: ' + collection + " to HDFS: " + filename) as bar:
            for document in bar:
                # NOTE(review): documents are concatenated with no separator,
                # so the output is not directly parseable; consider appending
                # '\n' (line-delimited JSON) — confirm with downstream readers
                # before changing the format.
                writer.write(dumps(document))
import psycopg2 import requests from hdfs import TokenClient print "Fetching data from the database" conn = psycopg2.connect(database="NewsSource", user="******", password="******", host="newdb.cnceaogjppz8.us-west-2.rds.amazonaws.com", port="5432") HDFS_URL = "http://localhost:50070" cur = conn.cursor() cur.execute("select id,articletext from articlestable where classifiedcategory IS NULL") rows = cur.fetchall() count = 1 for row in rows: print "Storing file " + str(count) + " in HDFS" client = TokenClient(HDFS_URL, 'crawled_data', root='/user/root') client.write(str(row[0]), row[1]) count +=1 conn.close()