Example #1
def hdfs_open_file(path, auth):
    from hdfs3 import HDFileSystem
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
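A minimal, hypothetical call for the opener above; the path and the auth dict values are placeholders, not taken from the source:

# hypothetical usage sketch: auth carries HDFileSystem keyword arguments
f = hdfs_open_file('/data/sample.bin', {'host': 'namenode', 'port': 8020})
try:
    header = f.read(16)  # hdfs.open() returned a file-like object in binary mode
finally:
    f.close()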
Example #2
def handle(q):
    hdfs = HDFileSystem(host='localhost', port=8020)
    q.put(hdfs._handle.contents.filesystem)
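A hedged driver sketch for handle() above, showing how the filesystem handle could be read back from a child process; the multiprocessing usage is an assumption, not part of the source:

# hypothetical driver: run handle() in a separate process and collect the handle value
from multiprocessing import Process, Queue

q = Queue()
p = Process(target=handle, args=(q,))
p.start()
child_fs_handle = q.get(timeout=30)
p.join()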
Example #3
def test_different_handles():
    a = HDFileSystem(host=test_host, port=test_port)
    b = HDFileSystem(host=test_host, port=test_port)
    assert a._handle.contents.filesystem != b._handle.contents.filesystem
Example #4
#!/home/cloudera/anaconda3/bin/python3.7
## import the HDFS3 library
import hdfs3
# import dictionary helpers from the collections library
from collections import defaultdict, Counter
from hdfs3 import HDFileSystem

hdfs = HDFileSystem(host='localhost', port=8020)

# print the list of files available in the directory
filenames = hdfs.glob('/data_in')
print('List of files:')
print(filenames)


## function to count the number of words in a file
def count_words(file):
    word_counts = defaultdict(int)
    for line in file:
        # decode the line to handle the file encoding
        line = line.decode('utf8').strip()
        for word in line.split():
            word_counts[word] += 1
    return word_counts


## count the words across all files available in the data_in directory
all_counts = Counter()
for fn in filenames:
    ## read each file with hdfs
    with hdfs.open(fn) as f:
        all_counts.update(count_words(f))
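The listing is truncated here; a hedged continuation, consistent with the Counter import above, could report the aggregated counts:

# hypothetical continuation: print the ten most frequent words
print('Total distinct words:', len(all_counts))
for word, count in all_counts.most_common(10):
    print(word, count)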
Example #5
import pyspark
from pyspark.sql import SparkSession, Row
sc = SparkSession.builder.master("local[*]").getOrCreate()
import pandas as pd
from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='horton')
hdfsS = HDFileSystem(host='stampy')

data = sc.read.csv('/user/rynguyen/state_info.csv')  # doesn't include the header

data.show()  # columns show up as _c0, _c1, ... because header is not set to True

data.printSchema()  # prints the column schema

data.select("_c0").show()
data.registerTempTable("Data")

sc.sql("Select * from Data where _c0 = 'California'").show()

# Optimal way of reading data files into data frame
df = sc.read.format('csv').options(header='true', inferSchema='true').load('/user/rynguyen/state_info.csv')
df.show() # includes headers


df.createOrReplaceTempView("data")  # registers the DataFrame as a temporary view for SQL queries

Houseseats2 = sc.sql("select * from data where HouseSeats = 2").show()  # DataFrame query; note .show() returns None

df.createGlobalTempView("data")  # temporary view shared across all sessions and kept alive until the Spark application terminates

frame1 = sc.sql("Select * from global_temp.data where HouseSeats = 2").show()
Example #6
    def __init__(self, host, port, user):
        self.hdfs = HDFileSystem(host=host, port=port, user=user)
Example #7
#!/usr/bin/python

#imports
import io
import pandas as pd
import os
import sys
from hdfs3 import HDFileSystem

inputF = sys.argv[1]
outputF = sys.argv[2]

hdfs = HDFileSystem(host='bdhKC', port=9000)


# Required helper methods
def LimpiarBarrio(pBarrio):
    if (pBarrio[0].isdigit()):
        return pBarrio[4:]
    else:
        return pBarrio


# Take a quick look at what we have
with hdfs.open(inputF) as f:
    df = pd.read_csv(f, header=[0, 1], delimiter=';', nrows=22)

# fix the column names
as_list = df.columns.tolist()

as_list[0] = ''
Example #8
def hdfs_open_file(path, auth):
    hdfs = HDFileSystem(**auth)
    return hdfs.open(path, mode='rb')
Example #9
def open_file_write(paths, hdfs=None, **kwargs):
    """ Open list of files using delayed """
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    out = [delayed(hdfs.open)(path, 'wb') for path in paths]
    return out
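A hedged usage sketch for open_file_write(); the dask import, host/port values, and paths are assumptions for illustration only:

# hypothetical usage: materialize the delayed handles, write, then close them
import dask

files = open_file_write(['/tmp/a.bin', '/tmp/b.bin'], host='namenode', port=8020)
handles = dask.compute(*files)
for h in handles:
    h.write(b'payload')
    h.close()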
Example #10
    def setUpClass(cls):
        super().setUpClass()
        # kafka configuration
        kafka_test_utils_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass('org.apache.spark.streaming.kafka.KafkaTestUtils')
        cls._kafkaTestUtils = kafka_test_utils_clz.newInstance()
        cls._kafkaTestUtils.setup()
        ezk = cls._kafkaTestUtils.getClass().getDeclaredField('zookeeper')
        ezk.setAccessible(True)
        zk = ezk.get(
            cls._kafkaTestUtils).getClass().getDeclaredField('zookeeper')
        zk.setAccessible(True)
        zk.get(ezk.get(cls._kafkaTestUtils)).getServerCnxnFactory(
        ).setMaxClientCnxnsPerHost(100)

        # hbase configuration
        hbase_testing_utility_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass('org.apache.hadoop.hbase.HBaseTestingUtility')
        cls._hbaseTestingUtility = hbase_testing_utility_clz.newInstance()
        cls._hbaseTestingUtility.getConfiguration().setBoolean(
            'hbase.table.sanity.checks', False)  # for thrift
        cls._hbaseTestingUtility.getConfiguration().set(
            'hbase.zookeeper.property.clientPort',
            cls._kafkaTestUtils.zkAddress().split(':')[1])

        cls._hbaseTestingUtility.startMiniDFSCluster(1)
        cls._hbaseTestingUtility.startMiniHBaseCluster(1, 1)
        # cls._hbaseTestingUtility.startMiniCluster()

        # thrift server configuration
        thrift_server_clz = cls.sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass('org.apache.hadoop.hbase.thrift.ThriftServer')

        cArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Class, 1)
        cArgs[0] = cls._hbaseTestingUtility.getConfiguration().getClass()
        iArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Object, 1)
        iArgs[0] = cls._hbaseTestingUtility.getConfiguration()

        cls._thriftServer = thrift_server_clz \
            .getDeclaredConstructor(cArgs) \
            .newInstance(iArgs)

        tArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.String, 5)
        port = cls._hbaseTestingUtility.randomFreePort()
        cls.thrift_port = port
        tArgs[0] = "-port"
        tArgs[1] = str(port)
        tArgs[2] = "-infoport"
        info_port = cls._hbaseTestingUtility.randomFreePort()
        tArgs[3] = str(info_port)
        tArgs[4] = "start"

        mArgs = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Class, 1)
        mArgs[0] = tArgs.getClass()
        method = thrift_server_clz.getDeclaredMethod('doMain', mArgs)
        method.setAccessible(True)

        args = cls.sc._gateway.new_array(cls.sc._jvm.java.lang.Object, 1)
        args[0] = tArgs
        cls.thrift_server_thread = threading.Thread(
            target=method.invoke, args=[cls._thriftServer, args])
        cls.thrift_server_thread.setDaemon(True)
        cls.thrift_server_thread.start()
        time.sleep(5)

        cls._hbaseTestingUtility.getMiniHBaseCluster(
        ).waitForActiveAndReadyMaster(60000)

        # test topics
        cls._kafkaTestUtils.createTopic(cls.test_topic_1)
        cls._kafkaTestUtils.createTopic(cls.test_topic_2)
        cls._kafkaTestUtils.createTopic('test_topic1')
        cls._kafkaTestUtils.createTopic('test_topic2')

        # HBase configuration
        connection = happybase.Connection(port=cls.thrift_port)
        connection.create_table(DummyProcessor.TABLE_NAME, {'dummy': dict()})
        connection.create_table('test_' + DummyProcessor.TABLE_NAME,
                                {'dummy': dict()})

        # TODO Add table for coprocessor tests

        # HDFS configuration (used for coprocessors)
        port = cls._hbaseTestingUtility.getDFSCluster().getNameNodePort()
        hdfs = HDFileSystem(host='localhost',
                            port=port,
                            pars={'dfs.client.read.shortcircuit': 'false'})

        # TODO change path to real test co-prc.
        hdfs.put(
            '../../../../hbase-coprocessors/target/scala-2.10/hbase-coprocessors.jar',
            '/hbase-coprocessors.jar')

        # streaming engine test configuration
        cls.engine_config = {
            'socketServer.port':
            4444,  # port of socket server
            'hadoop.dfs.url':
            'hdfs://localhost:{}/'.format(
                cls._hbaseTestingUtility.getDFSCluster().getFileSystem(
                ).getUri().getPort()),
            'hadoop.conf': {
                'hbase.table.sanity.checks':
                False,
                'hbase.zookeeper.property.clientPort':
                cls._kafkaTestUtils.zkAddress().split(':')[1]
            },
            'zookeeper.zkQuorum':
            cls._kafkaTestUtils.zkAddress(),
            'kafka.groupId':
            'test-streaming-consumer',
            'kafka.params': {
                'auto.offset.reset': 'largest'
            },
            'hbase.host':
            'localhost',
            'hbase.port':
            cls.thrift_port,
            'processor.dir':
            'processors',
            'processor.joinWindow':
            20,
            'processor.poolExecutors':
            1
        }
Example #11
from enum import Enum
import pandas as pd
from io import StringIO
import time

from IPython.core.display import display, HTML
from IPython.core.magic import Magics, magics_class, cell_magic, line_magic, \
    needs_local_scope, register_cell_magic
from IPython.core.magic_arguments import argument, magic_arguments, \
    parse_argstring
from hdfs3 import HDFileSystem

horton = HDFileSystem(host='horton')
stampy = HDFileSystem(host='stampy')
tahoe = HDFileSystem(host='tahoe')


@magics_class
class mymagics(Magics):

    @register_cell_magic
    @magic_arguments()
    #     @line_magic
    #     @cell_magic
    @argument("-s", "--save", type=str, help="dataframe to be saved into csv format")
    def cdataframe(line, cell):

        sio = StringIO(cell)
        data = pd.read_csv(sio)
        return data
Example #12
# !/usr/bin/env python
# -*- coding:utf-8 -*-

# https://www.cnblogs.com/shoufengwei/p/5949791.html

from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='XX.XX.XX.XX', port=9000)


def mkdir(remotepath):
    if not exists(remotepath):
        hdfs.mkdir(remotepath)


def get(remotepath, localpath):
    if exists(remotepath):
        hdfs.get(remotepath, localpath)


# def put(localfile, remotefile):
#     dir = getDirPath(remotefile)
#     mkdir(dir)
#     hdfs.put(localfile, remotefile)


def exists(remotepath):
    return hdfs.exists(remotepath)


def delete(remotepath):
    if exists(remotepath):
        hdfs.rm(remotepath)
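A hedged usage sketch of the helper functions above; the paths are placeholders only:

# hypothetical paths for illustration
mkdir('/tmp/demo')
print(exists('/tmp/demo'))
get('/tmp/demo/part-00000', '/tmp/local_copy')  # no-op if the remote file does not exist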
Example #13
#!/bin/env python
# -*- coding: utf-8 -*-


import os.path
from hdfs3 import HDFileSystem
import config

print 'NameNode host:', config.NAMENODE_HOST
print 'NameNode port:', config.NAMENODE_PORT

client = HDFileSystem(host=config.NAMENODE_HOST, port=config.NAMENODE_PORT)
remote_dir = os.path.dirname(config.RFILE_FMT)
if not client.exists(remote_dir):
    client.mkdir(remote_dir)

for day in range(0, config.DAYS):
    src = "".join([config.LFILE_FMT, str(day)])
    dst = "".join([config.RFILE_FMT, str(day)])

    if not os.path.exists(src):
        print 'Skipping:', src, 'file not found!'
        continue

    if client.exists(dst):
        print 'Skipping:', dst, 'file already exists: hadoop fs -rm', dst
        continue

    print 'Uploading', src, '=>', dst
    client.put(src, dst)
Example #14
def test_default_port_and_host(no_conf):
    guess_config()
    hdfs = HDFileSystem(connect=False)
    assert hdfs.host == conf_defaults['host']
    assert hdfs.port == conf_defaults['port']
Example #15
    # read from mrbox.conf
    config = configparser.ConfigParser()
    config.read(config_filepath)

    # local folder properties
    local_folder = customize_path(config['User']['localPath'], 'mrbox')
    local_file_size_limit_MB = config['User']['localFileSizeMB']
    remote_folder = customize_path(config['User']['hdfsPath'], 'mrbox')
    if not os.path.exists(local_folder):
        os.mkdir(local_folder)
    local_file_size_limit_bytes = bytes_to_mb(int(local_file_size_limit_MB))
    local = MRBoxObject(local_folder, local_file_size_limit_bytes,
                        remote_folder)

    # connect to hdfs and create hadoop interface, todo: check how to create list of multiple hadoops
    hdfs_con = HDFileSystem(host=config['User']['hdfsHost'],
                            port=config['User'].getint('hdfsPort'))
    hadoop_path = config['User']['hadoopPath']
    hdfs_con.mkdir(remote_folder)
    hadoop = HadoopInterface(hdfs_con, hadoop_path)

    # create sqlite db
    full_db_path = os.path.join(config['User']['localPath'],
                                config['User']['dbFile'])
    lc = LocalCatalog(full_db_path)

    # todo: run sync thread for initial consistency with local folder

    # create thread to monitor /mrbox directory and log events generated
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
Example #16
def open_file_write_direct(path, hdfs=None, **kwargs):
    if hdfs is None:
        hdfs = HDFileSystem(kwargs.get('host'), kwargs.get('port'))
    return hdfs.open(path, 'wb')
Example #17
def test_default_port_and_host():
    hdfs = HDFileSystem(connect=False)
    assert hdfs.host == DEFAULT_HOST
    assert hdfs.port == DEFAULT_PORT
Example #18
def read_bytes(path,
               client=None,
               hdfs=None,
               lazy=True,
               delimiter=None,
               not_zero=False,
               sample=True,
               blocksize=None,
               compression=None,
               **hdfs_auth):
    """ Convert location in HDFS to a list of distributed futures

    Parameters
    ----------
    path: string
        location in HDFS
    client: Client (optional)
        defaults to most recently created client
    hdfs: HDFileSystem (optional)
    lazy: boolean (optional)
        If True then return lazily evaluated dask Values
    delimiter: bytes
        An optional delimiter, like ``b'\n'`` on which to split blocks of bytes
    not_zero: force seek of start-of-file delimiter, discarding header
    **hdfs_auth: keyword arguments
        Extra keywords to send to ``hdfs3.HDFileSystem``

    Returns
    -------
    List of ``distributed.Future`` objects if ``lazy=False``
    or ``dask.Value`` objects if ``lazy=True``
    """
    if compression:
        raise NotImplementedError("hdfs compression")
    hdfs = hdfs or HDFileSystem(**hdfs_auth)
    client = default_client(client)
    blocks = get_block_locations(hdfs, path)
    filenames = [d['filename'] for d in blocks]
    offsets = [d['offset'] for d in blocks]
    if not_zero:
        offsets = [max([o, 1]) for o in offsets]
    lengths = [d['length'] for d in blocks]
    workers = [[h.decode() for h in d['hosts']] for d in blocks]

    logger.debug("Read %d blocks of binary bytes from %s", len(blocks), path)

    if sample is True:
        sample = 10000
    if sample:
        with hdfs.open(filenames[0], 'rb') as f:
            sample = f.read(sample)
    else:
        sample = b''

    f = delayed(read_block_from_hdfs, pure=True)
    values = [
        f(fn, offset, length, hdfs.host, hdfs.port, delimiter)
        for fn, offset, length in zip(filenames, offsets, lengths)
    ]

    restrictions = {v.key: w for v, w in zip(values, workers)}

    client._send_to_scheduler({
        'op': 'update-graph',
        'tasks': {},
        'dependencies': [],
        'keys': [],
        'restrictions': restrictions,
        'loose_restrictions': list(restrictions),
        'client': client.id
    })

    return sample, values
Example #19
from hdfs3 import HDFileSystem

cli = HDFileSystem(host="192.168.3.140", port=8020)

cli.ls("/")
Example #20
def read_bytes(fn,
               executor=None,
               hdfs=None,
               lazy=True,
               delimiter=None,
               not_zero=False,
               **hdfs_auth):
    """ Convert location in HDFS to a list of distributed futures

    Parameters
    ----------
    fn: string
        location in HDFS
    executor: Executor (optional)
        defaults to most recently created executor
    hdfs: HDFileSystem (optional)
    lazy: boolean (optional)
        If True then return lazily evaluated dask Values
    delimiter: bytes
        An optional delimiter, like ``b'\n'`` on which to split blocks of bytes
    not_zero: force seek of start-of-file delimiter, discarding header
    **hdfs_auth: keyword arguments
        Extra keywords to send to ``hdfs3.HDFileSystem``

    Returns
    -------
    List of ``distributed.Future`` objects if ``lazy=False``
    or ``dask.Value`` objects if ``lazy=True``
    """
    from hdfs3 import HDFileSystem
    hdfs = hdfs or HDFileSystem(**hdfs_auth)
    executor = default_executor(executor)
    blocks = get_block_locations(hdfs, fn)
    filenames = [d['filename'] for d in blocks]
    offsets = [d['offset'] for d in blocks]
    if not_zero:
        offsets = [max([o, 1]) for o in offsets]
    lengths = [d['length'] for d in blocks]
    workers = [[h.decode() for h in d['hosts']] for d in blocks]
    names = [
        'read-binary-hdfs3-%s-%s' %
        (fn, tokenize(offset, length, delimiter, not_zero))
        for fn, offset, length in zip(filenames, offsets, lengths)
    ]

    logger.debug("Read %d blocks of binary bytes from %s", len(blocks), fn)
    if lazy:
        restrictions = dict(zip(names, workers))
        executor._send_to_scheduler({
            'op': 'update-graph',
            'tasks': {},
            'dependencies': [],
            'keys': [],
            'restrictions': restrictions,
            'loose_restrictions': names,
            'client': executor.id
        })
        values = [
            Value(name, [{
                name: (read_block_from_hdfs, fn, offset, length, hdfs.host,
                       hdfs.port, delimiter)
            }]) for name, fn, offset, length in zip(names, filenames, offsets,
                                                    lengths)
        ]
        return values
    else:
        return executor.map(read_block_from_hdfs,
                            filenames,
                            offsets,
                            lengths,
                            host=hdfs.host,
                            port=hdfs.port,
                            delimiter=delimiter,
                            workers=workers,
                            allow_other_workers=True)
Example #21
# In[ ]:

label_vocabulary = ['ad.', 'nonad.']
target = '1558'
cols_categorical = ['0', '1', '2', '3']

default_value = [[0]] + [[""]] * 4 + [[0.]] * 1554 + [[
    'nonad.'
]]  #cols ['0', '1', '2', '3'] treated as categorical

# ### get feature/label column names

# In[ ]:

from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='172.17.0.2', port=9000)  #namenode and port
with hdfs.open(training_data_pandas) as f:
    df = pd.read_csv(f,
                     delimiter=delim,
                     index_col=0,
                     skipinitialspace=True,
                     nrows=0)

feature_cols = {}
idx = 1
for col in df.columns:
    feature_cols[col] = idx
    idx += 1

label_cols = {target: feature_cols.pop(target)}
Example #22
    def test_sync_segment_freshness(self):
        sync.init(self.rethinker)
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.rethinker.table('lock').delete().run()
            self.rethinker.table('assignment').delete().run()
            self.rethinker.table('services').delete().run()
            controller = self.make_fresh_controller()
            controller.local_data = tmp_dir
            assert controller.healthy_service_ids == set()
            # make segment 4 a segment of interest
            with open(os.path.join(tmp_dir, '4.sqlite'), 'wb'):
                pass
            controller.sync()
            assert controller.healthy_service_ids == {'trough-read:test01:4'}

            # create a write lock
            lock = sync.Lock.acquire(self.rethinker, 'trough-write:test01:4',
                                     {'segment': '4'})
            controller.sync()
            assert controller.healthy_service_ids == {
                'trough-read:test01:4', 'trough-write:test01:4'
            }
            locks = list(self.rethinker.table('lock').run())

            assert len(locks) == 1
            assert locks[0]['id'] == 'trough-write:test01:4'

        self.rethinker.table('lock').delete().run()
        self.rethinker.table('assignment').delete().run()

        # clean slate
        with tempfile.TemporaryDirectory() as tmp_dir:
            hdfs = HDFileSystem(host=controller.hdfs_host,
                                port=controller.hdfs_port)
            hdfs.rm(controller.hdfs_path, recursive=True)
            hdfs.mkdir(controller.hdfs_path)
            with hdfs.open(os.path.join(controller.hdfs_path, '5.sqlite'),
                           'wb',
                           replication=1) as f:
                f.write('y' * 1024)
            self.rethinker.table('lock').delete().run()
            self.rethinker.table('assignment').delete().run()
            self.rethinker.table('services').delete().run()
            controller = self.make_fresh_controller()
            controller.local_data = tmp_dir
            # create an assignment without a local segment
            assignment = sync.Assignment(self.rethinker,
                                         d={
                                             'hash_ring':
                                             'a',
                                             'node':
                                             'test01',
                                             'segment':
                                             '5',
                                             'assigned_on':
                                             r.now(),
                                             'bytes':
                                             0,
                                             'remote_path':
                                             os.path.join(
                                                 controller.hdfs_path,
                                                 '5.sqlite')
                                         })
            assignment.save()
            lock = sync.Lock.acquire(self.rethinker, 'write:lock:5',
                                     {'segment': '5'})
            assert len(list(self.rethinker.table('lock').run())) == 1
            controller.healthy_service_ids.add('trough-write:test01:5')
            controller.healthy_service_ids.add('trough-read:test01:5')
            controller.sync()
            assert controller.healthy_service_ids == {'trough-read:test01:5'}
            assert list(self.rethinker.table('lock').run()) == []
            # clean up
            hdfs.rm(controller.hdfs_path, recursive=True)
            hdfs.mkdir(controller.hdfs_path)

        # third case: not assigned, local file exists, is older than hdfs
        # this corresponds to the situation where we have an out-of-date
        # segment on disk that was probably a write segment before it was
        # reassigned when it was pushed upstream
        with tempfile.TemporaryDirectory() as tmp_dir:
            # create a local segment without an assignment
            with open(os.path.join(tmp_dir, '6.sqlite'), 'wb'):
                pass
            time.sleep(2)
            # create file in hdfs with newer timestamp
            hdfs = HDFileSystem(host=controller.hdfs_host,
                                port=controller.hdfs_port)
            hdfs.rm(controller.hdfs_path, recursive=True)
            hdfs.mkdir(controller.hdfs_path)
            with hdfs.open(os.path.join(controller.hdfs_path, '6.sqlite'),
                           'wb',
                           replication=1) as f:
                f.write('z' * 1024)
            self.rethinker.table('lock').delete().run()
            self.rethinker.table('assignment').delete().run()
            self.rethinker.table('services').delete().run()
            controller = self.make_fresh_controller()
            controller.local_data = tmp_dir
            controller.healthy_service_ids.add('trough-write:test01:6')
            controller.healthy_service_ids.add('trough-read:test01:6')
            controller.sync()
            assert controller.healthy_service_ids == set()
            # clean up
            hdfs.rm(controller.hdfs_path, recursive=True)
            hdfs.mkdir(controller.hdfs_path)
Example #23
        f.write(data)
    with hdfs_client.open(file_a, 'rb') as f:
        out = f.read(len(data))

        assert out == data


def hdfs_readlines(hdfs_client):
    file_b = '/tmp/test/file_b'
    with hdfs_client.open(file_b, 'wb', replication=1) as f:
        f.write(b"hello\nhadoop")

    with hdfs_client.open(file_b, 'rb') as f:
        lines = f.readlines()
        assert len(lines) == 2


if __name__ == '__main__':
    hdfs_client = HDFileSystem(host=test_host, port=test_port)

    hdfs_exists(hdfs_client)

    hdfs_write_read(hdfs_client)

    hdfs_readlines(hdfs_client)

    hdfs_client.disconnect()

    print("-" * 20)
    print("hello hadoop")
Example #24
import pandas as pd
import numpy as np
import gc
import gzip
from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='horton')


def entropy1(x):  # use this one!
    # x is pd.Series, and already pd.cut
    counts = x.value_counts(normalize=True, dropna=False).values  # ndarray
    return -counts.dot(np.log(counts + np.e**-100))


# Read the data; every variable in this dataset is of Category type
path = "/user/runyu/data/masked/TSD/join_result_37_smp2_one_final_bins.csv"  # HDFS path
with hdfs.open(path) as f:
    df = pd.read_csv(f,
                     low_memory=False,
                     na_values=['.', ' ', ''],
                     keep_default_na=False,
                     skipinitialspace=True,
                     memory_map=True)
    target = df['mm18_bad']  # already fixed

rtn = pd.DataFrame()
rtn['name'] = df.columns

print("start cal own entropy ...")
# 计算自身的熵
for col_name in df.columns:
Ejemplo n.º 25
0
def test_different_handles():
    a = HDFileSystem(host='localhost', port=8020)
    b = HDFileSystem(host='localhost', port=8020)
    assert a._handle.contents.filesystem != b._handle.contents.filesystem
Example #26
    # lock.acquire()
    return elp.save_as_comments(text, HDFS)
    # lock.release()


# session handling
els = Elong_session()


def get_info(hid, n):
    return els.get_info(hid, n)


# data fetching
elc = Elong_spyder()


def get_data(json, hid):
    return elc.get_data(json, hid)


# handle queued data
def deal_queue(text):
    queue_data.put(text)
    queue_data.task_done()


global HDFS
HDFS = HDFileSystem(host='192.168.100.178', port=8020)
#******************************************************************************************************************
Example #27
from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='trevally.amer.nevint.com', port=9000)
Example #28
"""
'''
    Since `libhdfs3` is a C/C++ library, we will use its Python wrapper, `hdfs3`
    https://hdfs3.readthedocs.io/en/latest/

    The module supports both Python 2 and Python 3, so we can work with the default interpreter version.

    Check whether the `hdfs3` library is installed:
    pydoc modules

    The library turned out to be missing, so install it:
    pip install hdfs3

    Find out the address and port to use for requests:
    hdfs getconf -confKey fs.defaultFS

    Then start a Python terminal and work in it
    (alternatively, you can prepare a script and run it)
'''

from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='manager.novalocal', port=8020)
'''
And here it tells us: "Can not find the shared library: libhdfs3.so"
and suggests installing it following the instructions at http://hdfs3.readthedocs.io/en/latest/install.html.
OK, we try to install it:
yum install libhdfs3 libhdfs3-dev
and it replies: "You need to be root to perform this command."
Finita la commedia.
'''
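A hedged workaround sketch for the missing libhdfs3.so, assuming a user-level conda environment is available (the conda route is an assumption, not from the source):

# hypothetical fix: install the native library from conda-forge without root,
# e.g.  conda install -c conda-forge libhdfs3 hdfs3
# then retry the connection
from hdfs3 import HDFileSystem
hdfs = HDFileSystem(host='manager.novalocal', port=8020)
print(hdfs.ls('/'))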
Example #29
def handle(q):
    hdfs = HDFileSystem(host=test_host, port=test_port)
    q.put(hdfs._handle.contents.filesystem)
Example #30
# READ THE OBJECTS ONE PAGE AT A TIME
for evol in range( 1, count, PAGE_SIZE ):
    this_evol_link = link + "/?limit=" + str(PAGE_SIZE) + "&offset=" + str(evol)
    evol_output.write( requests.get(this_evol_link).text )

contents = evol_output.getvalue()

# WRITE RESULTS TO OUTPUT FILE
local_path  = "/home/jeff/output/pokemon-site.output"
output_file = open( local_path, "w+" )
output_file.write( contents )
output_file.close()

# CLOSE DATA BUFFER
evol_output.close()

# WRITE RESULTS TO HDFS FILE
#   SETUP
from hdfs3 import HDFileSystem
hdfs_host = "localhost"
hdfs_port = 50502
hdfs_path = "/user/pokemon-site.output"

#   FILE MOVE
hdfs_sys = HDFileSystem( host=hdfs_host, port=hdfs_port )
#hdfs_sys.put( local_path, hdfs_path )

with hdfs_sys.open( hdfs_path, 'wb' ) as output_file:
    output_file.write(contents)
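A hedged follow-up check that the write landed, using only hdfs3 calls that appear elsewhere in these examples; purely illustrative:

# hypothetical verification of the upload
print(hdfs_sys.exists(hdfs_path))
print(hdfs_sys.ls("/user"))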