Example #1
import datetime

from hdfs import InsecureClient, HdfsError


def save_to_hdfs(self, key, url, title, content):
    # hdfs_dir, hdfs_web and hdfs_user are module-level settings defined elsewhere
    current_date = datetime.datetime.now().strftime("%Y%m%d")
    hdfs_path = hdfs_dir + current_date
    data = "\n" + key + "\n" + url + "\n"
    if title:
        data += title + "\n"
    if content:
        data += content + "\n"
    try:
        client = InsecureClient(hdfs_web, user=hdfs_user)
        # append to today's file if it already exists
        client.write(hdfs_path, data=data, encoding='utf-8', append=True)
    except HdfsError:
        # appending to a missing file raises HdfsError, so create the file instead
        client.write(hdfs_path, data=data, encoding='utf-8')
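For reference, the append-or-create pattern above can be exercised on its own. This is only a sketch: the WebHDFS URL, user, and directory below are placeholders, not values from the original code.

import datetime

from hdfs import InsecureClient, HdfsError

# placeholder settings; the original reads hdfs_web, hdfs_user and hdfs_dir from its own config
hdfs_web = "http://namenode:9870"
hdfs_user = "hadoop"
hdfs_dir = "/data/crawl/"

client = InsecureClient(hdfs_web, user=hdfs_user)
path = hdfs_dir + datetime.datetime.now().strftime("%Y%m%d")
record = "\nsome-key\nhttp://example.com\nExample title\n"

try:
    # append=True fails with HdfsError if the target file does not exist yet
    client.write(path, data=record, encoding="utf-8", append=True)
except HdfsError:
    # first record of the day: create the file instead of appending
    client.write(path, data=record, encoding="utf-8")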
Example #2
import pandas as pd
from hdfs import InsecureClient


def save_pd_DF(df_pd: pd.DataFrame, cli: InsecureClient, file_path):
    """
    Write a pandas DataFrame to HDFS as a CSV file.

    :param df_pd: the pandas DataFrame to write
    :param cli: an hdfs InsecureClient
    :param file_path: HDFS file path, relative to the root configured on the InsecureClient
    """
    with cli.write(hdfs_path=file_path, encoding='utf-8',
                   overwrite=True) as writer:
        df_pd.to_csv(writer)
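A possible call site for the helper above; the WebHDFS URL, user, and paths are made-up placeholders:

import pandas as pd
from hdfs import InsecureClient

# root makes file_path in save_pd_DF relative to this directory
cli = InsecureClient("http://namenode:9870", user="hadoop", root="/user/hadoop")
df = pd.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
save_pd_DF(df, cli, "exports/demo.csv")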
Example #3

    # `function` and `symbol` are set by an enclosing loop over the parsed
    # LLVM module's functions (not shown in this snippet)
    try:
        arg_types = tuple([
            llvm2impala[arg.pointee.name]
            for arg in function.type.pointee.args[1:]
        ])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
url = 'http://{nn_host}:{webhdfs_port}'.format(nn_host=args.nn_host,
                                               webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    cursor.execute('USE %s' % args.db)
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]
    if len(matches) == 0:
        log("Loading types for function %s" % symbol)
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([llvm2impala[arg.pointee.name]
                           for arg in function.type.pointee.args[1:]])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
url = 'http://{nn_host}:{webhdfs_port}'.format(
    nn_host=args.nn_host, webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    cursor.execute('USE %s' % args.db)
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]
    if len(matches) == 0:
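Stripped of the UDF bookkeeping, the HDFS step in the two snippets above is a single InsecureClient.write call that uploads the compiled module. A minimal sketch, with placeholder host, user, and paths (overwrite here plays the role of args.force in the snippets):

from hdfs import InsecureClient

client = InsecureClient("http://namenode:9870", user="hadoop")
with open("udf.ll", "rb") as bc_file:
    # upload the LLVM IR, replacing any existing copy
    client.write("/user/hadoop/udfs/udf.ll", bc_file.read(), overwrite=True)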
Example #5
                'localField': '_id.playerID',
                'foreignField': 'playerID',
                'as': 'playersinfo'
            }
        },
        {
            '$lookup': {
                'from': 'teamsfranchises',
                'localField': 'teams._id.franchID',
                'foreignField': 'franchID',
                'as': 'teamsinfo'
            }
        }
    ])

    return result

results = count_hr()

output_path = "/users/mongo/Wang/output"
# create a connection to the HDFS cluster
hdfs_client = InsecureClient("http://localhost:9870/", user="******")
# create the (empty) output file
hdfs_client.write(output_path, "", overwrite=False, append=False)

# append the result rows to the output file, one line per row
for row in results:
    line = (row['teamsinfo'][0]['franchName'] + " "
            + row['playersinfo'][0]['nameFirst'] + " "
            + row['playersinfo'][0]['nameLast'] + " "
            + str(row['_id']['yearID']))
    hdfs_client.write(output_path, line, overwrite=False, append=True)
    hdfs_client.write(output_path, "\n", overwrite=False, append=True)
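Each append in the loop above is a separate WebHDFS round trip. An alternative sketch (same output, placeholder connection details, user masked as in the original) builds the text locally and uploads it in one call:

from hdfs import InsecureClient

hdfs_client = InsecureClient("http://localhost:9870/", user="******")
output_path = "/users/mongo/Wang/output"

lines = []
for row in results:  # `results` as returned by count_hr() above
    lines.append("%s %s %s %s\n" % (
        row['teamsinfo'][0]['franchName'],
        row['playersinfo'][0]['nameFirst'],
        row['playersinfo'][0]['nameLast'],
        row['_id']['yearID'],
    ))

# a single write call uploads everything at once
hdfs_client.write(output_path, data="".join(lines), encoding="utf-8", overwrite=True)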