from pyhdfs import HdfsClient


def update_csv():
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmp_local = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    client = HdfsClient(hosts='127.0.0.1:9870', user_name='host')
    if client.exists(remote):
        # Download the current remote file, then remove it from HDFS.
        client.copy_to_local(remote, tmp_local)
        client.delete(remote)
        # Append the local rows to the downloaded copy (the original opened
        # the temp file in 'w' mode and rewrote the whole list on every loop
        # iteration, discarding the downloaded data).
        with open(local, 'r') as f_read, open(tmp_local, 'a') as f_write:
            f_write.writelines(f_read.readlines())
        # De-duplicate the merged rows and drop empty lines before re-uploading.
        with open(tmp_local, 'r') as f_read:
            rows = set(f_read.read().split('\n'))
        rows.discard('')
        with open(tmp_local, 'w') as f_write:
            f_write.write('\n'.join(rows))
        client.copy_from_local(tmp_local, remote)
    else:
        # No remote copy yet: upload the local file as-is.
        client.copy_from_local(local, remote)
def Copy_From_Local(file):
    '''Upload a file to Hadoop.'''
    h_file = '/tmp/te/%s' % file
    # HDFS address; connect to HDFS.
    client = HdfsClient(hosts='localhost:50070')
    # If the file already exists on HDFS, delete it first.
    if client.exists(h_file):
        client.delete(h_file)
    client.copy_from_local(file, h_file)
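# A minimal usage sketch for the helper above; the file name 'data.csv' and a
# NameNode reachable at localhost:50070 are assumptions, not from the original.
Copy_From_Local('data.csv')   # lands at /tmp/te/data.csv, replacing any old copy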
import urllib.parse

import requests
from lxml import html
from pyhdfs import HdfsClient


def crawler(word, products_list=None):
    """Crawl product data from yhd.com (1号店)."""
    # Avoid the mutable-default-argument pitfall in the original signature.
    if products_list is None:
        products_list = []
    word = urllib.parse.quote(word)
    url = 'https://search.yhd.com/c0-0/k{0}'.format(word)
    # Fetch the HTML source.
    html_doc = requests.get(url).text
    # Parse it into an lxml document for XPath queries.
    selector = html.fromstring(html_doc)
    # Product list.
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')
    # Extract the fields for each product.
    for li in ul_list:
        # Title
        title = li.xpath('div//p[@class="proName clearfix"]/a/@title')
        # Link
        link = li.xpath('div//p[@class="proName clearfix"]/a/@href')
        # Price
        price = li.xpath('div//p[@class="proPrice"]/em/@yhdprice')
        # Append the prices to a local staging file.
        with open("p_price", "a", encoding="gbk") as f:
            for p in price:
                f.write(p + "\n")
        if len(title) > 0 and len(link) > 0 and len(price) > 0:
            products_list.append({
                'title': title[0],
                'price': price[0],
                'link': 'https:' + link[0],
                'referer': '1号店'
            })
    # Push the staged price file to HDFS.
    client = HdfsClient(hosts='222.27.166.209:50070', user_name='hadoop')
    client.copy_from_local('/home/hadoop/Downloads/PriceCompaer/p_price',
                           '/p_price.txt')
    return products_list
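# Hypothetical invocation of the crawler above; the search keyword is an
# assumption, and the upload step requires the hard-coded NameNode and local
# staging path to exist.
products = crawler('iphone')
print(len(products), 'products scraped')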
def basic():
    client = HdfsClient(hosts='study:50070')
    print(client.list_status('/'))
    # Check whether a given path exists.
    print(client.exists("/test"))
    print(client.exists("/data/gz/thrift-0.9.2.tar.gz"))
    print(client.get_file_checksum("/data/gz/bison-2.5.1.tar.gz"))
    summary = client.get_content_summary("/")
    print(summary)
    # File copy: from HDFS to the local filesystem.
    client.copy_to_local("/data/gz/pip-7.1.2.tar.gz", "/root/data/pip-7.1.2.tar.gz")
    # File copy: from the local filesystem into HDFS.
    client.copy_from_local("/root/data/thrift-0.9.2.tar.gz", "/data/gz/thrift-0.9.2.tar.gz")
    print(client.get_home_directory())
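# Two more everyday calls from the same pyhdfs API, sketched under the same
# assumed 'study:50070' NameNode: directory creation and directory listing.
from pyhdfs import HdfsClient

client = HdfsClient(hosts='study:50070')
client.mkdirs('/data/gz')        # create the directory tree if it is missing
print(client.listdir('/data'))   # names of the entries directly under /data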
# Fragment: the statements below run inside an enclosing try block, after the
# MySQL connection and the sql1..sql7 statements have been set up.
    # db.commit()
    cursor.execute(sql4)
    db.commit()
    cursor.execute(sql5)
    db.commit()
    # cursor.execute(sql6)
    # db.commit()
    mysql_time = time.asctime(time.localtime(time.time()))
    print("Data into Mysql:", mysql_time)
    cursor.execute(sql7)
    db.commit()
    client.copy_from_local('/tmp/data_5q.txt', '/data_5q.txt')
    # results = cursor.fetchall()
    # client.create('/data_1000.txt', '\0')
    # for row in results:
    #     f = row[0]
    #     name = row[1]
    #     score = row[2]
    #     s = str(f) + ',' + name + ',' + str(score) + '\n'
    #     client.append('/data_1000.txt', s)
    endtime = time.asctime(time.localtime(time.time()))
    print("Data into hdfs:", endtime)
except Exception:
    print("Error!")
db.close()
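# The commented-out block above hints at an alternative: stream rows straight
# from the MySQL cursor into HDFS with create()/append() instead of staging a
# local file. A minimal sketch of that idea -- the function name, column
# layout, and hosts value are assumptions, not part of the original script.
from pyhdfs import HdfsClient

def rows_to_hdfs(cursor, hdfs_path, hosts='localhost:50070'):
    """Write (id, name, score) rows from a DB cursor into one HDFS file."""
    client = HdfsClient(hosts=hosts)
    if client.exists(hdfs_path):
        client.delete(hdfs_path)      # create() refuses to overwrite by default
    client.create(hdfs_path, b'')     # start from an empty file
    for row_id, name, score in cursor.fetchall():
        line = '%s,%s,%s\n' % (row_id, name, score)
        client.append(hdfs_path, line.encode('utf-8'))

# Note: append() issues one WebHDFS request per row, which is why the script
# above stages /tmp/data_5q.txt locally and uploads it with a single
# copy_from_local() call instead.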
def csv_to_hdfs(row):
    client = HdfsClient(hosts=HDFS_IP)
    # Absolute path of the local file; the HDFS target path must not already exist.
    client.copy_from_local("%s%s%s.csv" % (local_csv_dir, tablename, row),
                           '/hivecsv-%s%s' % (tablename, row))
# Upload a local file into HDFS storage.
from pyhdfs import HdfsClient

client = HdfsClient(hosts='ghym:50070', user_name='hadoop')
client.copy_from_local(
    'D:/programs/workspace/pythonworks/doubanuser/doubanuser/userdemo.txt',
    '/score.txt')
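# A quick read-back check for the upload above (assumes the same client is
# still connected); pyhdfs' open() returns a file-like HTTP response.
print(client.exists('/score.txt'))
print(client.open('/score.txt').read().decode('utf-8'))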
# Fragment: as in the snippet above, these statements run inside an enclosing
# try block of a MySQL-to-HDFS export script.
    # cursor.execute(sql3)
    # db.commit()
    # cursor.execute(sql4)
    # db.commit()
    cursor.execute(sql5)
    db.commit()
    # cursor.execute(sql6)
    # db.commit()
    mysql_time = time.asctime(time.localtime(time.time()))
    print("Data into Mysql:", mysql_time)
    cursor.execute(sql7)
    db.commit()
    client.copy_from_local('/tmp/data_1000m.txt', '/data_1000m.txt')
    # results = cursor.fetchall()
    # client.create('/data_1000.txt', '\0')
    # for row in results:
    #     f = row[0]
    #     name = row[1]
    #     score = row[2]
    #     s = str(f) + ',' + name + ',' + str(score) + '\n'
    #     client.append('/data_1000.txt', s)
    endtime = time.asctime(time.localtime(time.time()))
    print("Data into hdfs:", endtime)
except Exception:
    print("Error!")
db.close()