Example #1
    def run(self):
        c = Client(self.host, self.port)

        listing = c.ls([self.log_path], recurse=True)

        for f in listing:
            path = f['path']

            if not path.endswith('.jhist'):
                continue

            ts = arrow.get(f['modification_time'] / 1000)

            if ts <= self.checktime:
                continue

            job_id = job_pattern.match(path.split('/')[-1]).group(0)

            if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp * 1000:
                log.debug('Skipping processed job: ' + job_id)
                continue

            config_path = path[:path.rfind('/')] + '/' + job_id + '_conf.xml'

            event = {
                'inviso.type': 'mr2',
                'job.id': job_id,
                'application.id': job_id.replace('job_', 'application_'),
                'job.type': 'mr2',
                'file.type': ['history', 'config'],
                'jobflow': self.jobflow,
                'cluster.id': self.cluster_id,
                'cluster': self.cluster_name,
                'history.uri': 'hdfs://%s:%s%s' % (self.host, self.port, path),
                'config.uri': 'hdfs://%s:%s%s' % (self.host, self.port, config_path),
                'host': self.host,
                'port': self.port,
                'timestamp': str(ts),
                'epoch': f['modification_time'],
                'mapreduce.version': 'mr2'
            }

            log.info('Publishing event: (%s) %s %s' %
                     (event['cluster'], event['job.id'], ts))
            self.publisher.publish([event])
Example #2
def get_df_pats():
    df_paths = []
    HDFS_CLIENT = Client(CONF.HDFS_HOST, 9000, use_trash=False)
    for file_entry in HDFS_CLIENT.ls(['/user/root']):
        if 'df_joined_df' in file_entry['path']:
            continue
        df_paths.append(file_entry['path'])
    return df_paths
def scan_files(env):
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    input_files = []
    for item in client.ls([env['input']]):
        if item['file_type'] == 'd':
            input_files.append(item['path'])
    return input_files
def get_DB(Hive_Warehouse):
    DB={}
    DB_re=[]
    client = Client('yhbd01',8020,use_trash=False)
    list_hive = list(client.ls([Hive_Warehouse]))
    for x in list_hive:
        DB=x
        DB_re.append(DB['path'])
    return DB_re
Example #5
def test():
    """List the files under /files and print their contents."""
    client = Client("192.168.99.100", 9000)
    for f in client.ls(['/files']):
        print f
        # cat yields one generator per requested path; each yields chunks of file data
        for line in client.cat([f.get('path')]):
            for l in line:
                print l
Example #6
class HDFSClient:
    __client = None

    def __init__(self):
        self.client = Client("localhost", 9000)

    @staticmethod
    def get_instance():
        # assumed singleton accessor; the body was missing in the original
        if HDFSClient.__client is None:
            HDFSClient.__client = HDFSClient()
        return HDFSClient.__client

    def test(self):
        for x in self.client.ls(['/rush/input/']):
            print(x)
Example #7
def get_DB(Hive_Warehouse):
    DB = {}
    DB_re = []
    client = Client('yhbd01', 8020, use_trash=False)
    list_hive = list(
        client.ls([Hive_Warehouse],
                  include_toplevel=False,
                  include_children=True,
                  recurse=True))
    for x in list_hive:
        DB = x
        DB_re.append(DB['path'])
    #print 'Hive table path scan finished!'
    return DB_re
Example #8
def metrics():
    print "Recieved metrics request..."
    metric_prefix = "hdfs_directory_stats"
    metrics = {"the_number_one": "1"}
    c = Client("namenode", 8020)
    filepaths = map(lambda entry: entry['path'], c.ls([sys.argv[1]]))
    lines = reduce(lambda a, b: a + b,
                   [1 for f in c.cat(filepaths) for _ in f])
    metrics['lines_of_text_in_directory'] = lines
    template_kwargs = {
        'metrics': metrics,
        'dir': sys.argv[1],
        'metric_prefix': metric_prefix
    }
    return Response(render_template("metrics", **template_kwargs),
                    mimetype='text/plain')
def scan_event_files(env):
    hdfs = env['hdfs']
    host, port = hdfs.split(':')
    client = Client(host, int(port), use_trash=False, effective_user='******')
    event_files = []

    basename = '_'.join(os.path.basename(env['first_clip']).split('_')[:-1])
    event_dir = os.path.join(env['event_dir'], basename)

    if not client.test(event_dir, exists=True, directory=True):
        return event_files

    for item in client.ls([event_dir]):
        if item['file_type'] == 'f':
            event_files.append(os.path.basename(item['path']))
    return event_files
Example #10
    def run(self):
        c = Client(self.host, self.port)

        listing = c.ls([self.log_path], recurse=True)
        events = []
        for f in listing:
            path = f['path']

            if not path.endswith('.jhist'):
                continue

            ts = arrow.get(f['modification_time']/1000)

            if ts <= self.checktime:
                continue

            job_id = job_pattern.match(path.split('/')[-1]).group(0)

            if job_id in self.jobs and self.jobs[job_id] >= ts.timestamp*1000:
                log.debug('Skipping processed job: ' + job_id)
                continue

            config_path = path[:path.rfind('/')]+'/'+job_id+'_conf.xml'

            event = {
                'inviso.type': 'mr2',
                'job.id': job_id,
                'application.id': job_id.replace('job_', 'application_'),
                'job.type': 'mr2',
                'file.type': ['history', 'config'],
                'jobflow' : self.jobflow,
                'cluster.id': self.cluster_id,
                'cluster': self.cluster_name,
                'history.uri': 'hdfs://%s:%s%s' % (self.host,self.port,path),
                'config.uri':'hdfs://%s:%s%s' % (self.host,self.port,config_path),
                'host': self.host,
                'port': self.port,
                'timestamp': str(ts),
                'epoch': f['modification_time'],
                'mapreduce.version': 'mr2'
            }

            log.info('Publishing event: (%s) %s %s' % (event['cluster'], event['job.id'], ts))
            events.append(event)
        for chunk in [events[i:i + self.chunk_size] for i in xrange(0, len(events), self.chunk_size)]:
            self.publisher.publish(chunk)
def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to local temp directory
    # First clean up and recreate the temp folder
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)
    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from  %s" % modelFilePath
    return joblib.load(modelFilePath)
Example #12
class HDFSStat(object):

    cluster = 'hostname'
    port = 8020
    default_path = '/user/hive/warehouse'

    @staticmethod
    def build_path(table):
        nm = table.split('.')[0]
        tb = table.split('.')[1]
        return HDFSStat.default_path + '/' + nm + '.db/' + tb

    def __init__(self):
        self.client = Client(HDFSStat.cluster, HDFSStat.port, use_trash=False)

    def latest_partition(self, table_name, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        latest_dir = list(self.client.ls([t_path])).pop()
        return path.basename(latest_dir['path']).split('=')[1]

    def poke_partition(self, table_name, partition_name, partition, table_path=None):
        t_path = HDFSStat.build_path(table_name) if table_path is None else table_path
        partition_path = t_path + '/' + partition_name + '=' + partition
        return self.client.test(partition_path, exists=True, directory=True, zero_length=False)
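A minimal usage sketch of the class above, assuming a hypothetical Hive table mydb.mytable partitioned by a ds column (the cluster/port placeholders in the class would need real values):

stat = HDFSStat()
# builds /user/hive/warehouse/mydb.db/mytable and takes the last entry returned by ls
latest = stat.latest_partition('mydb.mytable')
print latest
# checks that the ds=<value> partition directory exists
print stat.poke_partition('mydb.mytable', 'ds', latest)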
Example #13
#!/usr/bin/env python

from snakebite.client import Client
import time

host = '10.118.205.8'
port = 9000
client = Client(host=host, port=port, use_trash=False, effective_user='******')

path = '/tmp'

result = []
for x in client.ls([path]):
    result.append(x)

ordered = sorted(result, key=lambda x: x['path'])

for f in ordered:
    if f['file_type'] == 'd':
        print f['path']
    else:
        print f
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
Example #15
def main(args):
    xml = minidom.parse(
        path.join(os.environ["HADOOP_HOME"], "etc", "hadoop", "hdfs-site.xml"))

    element = [
        x for x in xml.getElementsByTagName("property")
        if (x.getElementsByTagName("name")[0].childNodes[0].nodeValue ==
            "dfs.namenode.http-address")
    ][0]

    namenode = (element.getElementsByTagName("value")
                [0].childNodes[0].nodeValue.split(":")[0])

    fs = HDFS(namenode, 8020)

    path_prefix = "/amplab/text"
    for size in args.sizes:
        timings = {}

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        file_list = None
        if c_rank == 0:
            file_list = [
                entry["path"] for entry in fs.ls(
                    [path.join(path_prefix, size, "uservisits")])
            ]
            file_list = [file_list[i::c_size] for i in range(c_size)]

        file_list = MPI.COMM_WORLD.scatter(file_list, root=0)

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["open-and-register"] = toc()

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        os_results = reduce_data(row_iterator(file_list, fs), 4, "os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-os"] = toc()
        if c_rank == 0: os_results.index = os_results.pop("os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        browser_results = reduce_data(row_iterator(file_list, fs), 6,
                                      "browser")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-browser"] = toc()
        if c_rank == 0: browser_results.index = browser_results.pop("browser")

        if c_rank == 0:
            top_dir = path.join("results", size, "mpi", str(args.nodes))
            mkdir_p(top_dir)
            with open(path.join(top_dir, "timings"), "w") as f:
                for entry in timings.items():
                    f.write("%s, %.18e\n" % entry)
                f.flush()

            browser_results.to_pickle(path.join(top_dir, "browser"))
            os_results.to_pickle(path.join(top_dir, "os"))

    return 0
Example #16
    def __init__(self, sc, doclist,
                 ngram_range = [1,1], vocab = None, stop_words = None, nmin = None, nmax = None,
                 num_partitions = None, features_max = None, tokenizer = alpha_tokenizer,
                 hashing = False, load_path = None, hdfs_namenode = None) :

        self._sc = sc
        self._ngram_range = ngram_range
        self._vocab = vocab
        self._stop_words = stop_words
        self._nmin = nmin
        self._nmax = nmax
        self._num_partitions = num_partitions
        self._doclist = doclist
        self._features_max = features_max if features_max is not None else 2**31
        self._tokenizer = tokenizer

        # initialize the RDDs
        self._doc_rdd = None
        self._ngram_rdd = None
        self._vocab_rdd = None
        self._docvec_rdd = None
        self._vocab_map_rdd = None

        # dictionary of RDDs 
        self.rdds = {}

        # initialize other properties
        self._nfeatures = None
        self._hashing = hashing

        # make the vocabulary a set if it isn't one already
        if type(vocab) is not set and vocab is not None: 
            try: 
                self._vocab = set(vocab)
            except TypeError : 
                raise TypeError("Vocabulary must be an iterable like a list, set, etc.")


        if load_path is not None : 
            if load_path[:4] != 'hdfs' : 
                for rdd_name in os.listdir(load_path) :
                    if rdd_name[-3:] == 'rdd' : 
                        self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)
            
            # we're dealing with HDFS
            else : 
                try : 
                    from snakebite.client import Client
                except ImportError : 
                    raise ImportError("package snakebite is required for working with HDFS: pip install snakebite")
                
                if hdfs_namenode is None : 
                    # get the hadoop configuration files from user's environment and extract namenode 
                    import xml.etree.ElementTree
                    hadoop_conf = '%s/core-site.xml' % os.environ['HADOOP_CONF_DIR']
                    tree = xml.etree.ElementTree.parse(hadoop_conf)
                    for prop in tree.findall('property') : 
                        if prop.find('name').text == 'fs.defaultFS' : 
                            dummy, hdfs_namenode, hdfs_port = prop.find('value').text.split(':')
                            hdfs_namenode = hdfs_namenode[2:]                            
                            break
 
                client = Client(hdfs_namenode, int(hdfs_port))
                for rdd_path_dict in client.ls([load_path[7:]]) : 
                    rdd_name = rdd_path_dict['path'].split('/')[-1]
                    if rdd_name[-3:] == 'rdd': 
                        self.rdds[rdd_name] = sc.pickleFile(load_path + '/' + rdd_name)
                    
            print 'Loaded %d RDDs: '%(len(self.rdds))
            for rdd in self.rdds.keys() :
                print rdd

                    
        # make the vital properties dictionary for pickling
        self.properties = {'ngram_range': ngram_range, 
                           'stop_words': stop_words,
                           'nmin': nmin, 
                           'nmax': nmax,
                           'num_partitions': num_partitions,
                           'doclist': doclist,
                           'features_max': features_max,
                           'hashing': hashing,
                           }
Example #17
def get_locations(filename, name_host, name_port, **kwargs):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, **kwargs)]
Example #18
class Loader:
    """
    The idea of the loader is to provide a convenient interface to create a new table
    based on some input files
    """

    def __init__(self, path, name_node, hive_server,
                 user="******", hive_db="default", password=None, nn_port=8020, hive_port=10000):

        # HDFS Connection
        self._client = Client(name_node, nn_port)

        self._db = hive_db

        # Hive Connection
        self._hive = pyhs2.connect(host=hive_server,
                                   port=hive_port,
                                   authMechanism="PLAIN",
                                   database=hive_db,
                                   user=user,
                                   password=password)
        self._path = path


    def load(self):
        # Check data to see which kind it is
        files = self._client.ls([self._path])

        files = [f for f in files if f['file_type'] == 'f']
        if len(files) == 0:
            raise Exception("Cannot load empty directory")

        # Pick the first file and assume that it has the same content as the others
        data = self.head(files[0]['path'])
        res = self.check_separator(data)
        if res is None:
            # We can't load the data, so abort here
            print("can't load data, cannot find a separator")
            return

        sep = res[0]
        num_cols = res[1]

        # Build table statement
        table_statement, table_name = self._create_table(self._path, sep, num_cols)
        cursor = self._hive.cursor()
        cursor.execute(table_statement)

        return self._db, table_name


    def _create_table(self, path, sep, count):
        buf = """CREATE EXTERNAL TABLE pyxplorer_data (
    %s
    )ROW FORMAT DELIMITED FIELDS TERMINATED BY '%s'
    STORED AS TEXTFILE LOCATION '%s'
    """ % (",".join(["col_%d string" % x for x in range(count)]), sep, path)
        return buf, "pyxplorer_data"

    def check_separator(self, data):
        """
        This method evaluates a list of separators on the input data to check which one
        is correct. This is done by first splitting the input by newline and then
        checking that the split by separator yields the same field count for each input
        row except the last, which might be incomplete due to the limited input data.
        A standalone sketch of this idea follows the class below.

        :param data: input data to check
        :return:
        """

        sep_list = [r'\t', r';', r',', r'\|', r'\s+']  # escape '|' so it is treated as a literal pipe

        data_copy = data
        for sep in sep_list:
            # Check if the count matches each line
            splitted = data_copy.split("\n")
            parts = [len(re.split(sep, line)) for line in splitted]

            # If we did not split anything continue
            if sum(parts) == len(splitted):
                continue

            diff = 0

            for i in range(len(parts[1:-1])):
                diff += abs(parts[i] - parts[i + 1])

            if diff == 0:
                return sep, parts[0]

        # If we reach this point we did not find a separator
        return None


    def head(self, file_path):
        """
        Only read the first packets that come; try to max out at 1 MB.

        :return: up to 1 MB of the first block of the file
        """
        processor = lambda path, node, tail_only=True, append=False: self._handle_head(
            path, node)

        # Find items and go
        for item in self._client._find_items([file_path], processor,
                                             include_toplevel=True,
                                             include_children=False, recurse=False):
            if item:
                return item

    def _handle_head(self, path, node, upper=1024 * 1024):
        data = ''
        for load in self._client._read_file(path, node, tail_only=False,
                                            check_crc=False):
            data += load
            if (len(data) > upper):
                return data

        return data
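A standalone sketch of the separator-detection idea described in check_separator above; the sample string is hypothetical and stands in for the data returned by head():

import re

sample = "a\tb\tc\n1\t2\t3\n4\t5\t6"  # hypothetical tab-separated sample
for sep in [r'\t', r';', r',', r'\|', r'\s+']:
    # count fields per line for this candidate separator
    counts = [len(re.split(sep, line)) for line in sample.split("\n")]
    # skip separators that never split; accept one with a constant field count
    if sum(counts) != len(counts) and len(set(counts)) == 1:
        print "separator %r gives %d columns" % (sep, counts[0])
        break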
from snakebite.client import Client

client = Client('localhost', 9000)
for x in client.ls(['/']):
    print x
Example #20
# does not work as-is: snakebite only supports Python 2

from snakebite.client import Client

client = Client('119.23.182.3', 9000)
for x in client.ls(['/data', '/lookfit/test/logs/user-service/20201205/']):
    print(x)

Example #21
#!/usr/bin/env python
from snakebite.client import Client

# this line creates the client connection to the HDFS NameNode
# NameNode hostname = localhost, NameNode port = 9000
# these parameters are set in hadoop/conf/core-site.xml under fs.defaultFS
client = Client('localhost', 9000)

# list the contents of an HDFS directory
# note that many methods in snakebite return generators
for x in client.ls(['/user/cbohara']):
    print x
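Because ls returns a generator, entries can be filtered lazily before anything is printed; a minimal sketch reusing the client above:

# keep only directory entries while iterating over the listing
dirs = (entry['path'] for entry in client.ls(['/user/cbohara'])
        if entry['file_type'] == 'd')
for d in dirs:
    print d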
Example #22
def display():
	client = Client("study", 9000, use_trash=False)
	for x in client.ls(['/data/gz']):
		print x
Example #23
def get_locations(filename, name_host, name_port, data_root='/data/dfs/dn'):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, data_root)]
Example #24
from snakebite.client import Client

client = Client('localhost', 54310)
for x in client.ls(['/input.txt']):
    print x

def _getCountryByIP(ip):
    # signature reconstructed from the call site below; citydb is the GeoLite2 city reader
    return (citydb.city(ip).country.name or u'Unknown').encode()

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: forgeInternationalAccess <date> <hour>"
        exit(-1)

    spark = SparkContext(appName='ForgeGeoAccess')
    spark.addPyFile('hdfs://digiledap/user/spark/share/lib/accessLogParser.py')
    spark.addFile('hdfs://digiledap/user/spark/share/lib/GeoLite2-City.mmdb')

    from accessLogParser import *
    from snakebite.client import Client

    hdfsHandle = Client('hmaster01')
    hosts = spark.parallelize(hdfsHandle.ls(['/flume/events/apache_access_combined/']))\
                 .filter(lambda dirs: dirs['file_type'] == 'd')\
                 .map(lambda directory: 'hdfs://digiledap%s' % directory['path'])\
                 .collect()

    rdds = {
        item.split('/')[-1]: spark.textFile('%s/%s/%s' % (item, sys.argv[1], sys.argv[2])) for item in hosts
    }

    results = {
        key: rdds[key].map(lambda log: Parser.create(Parser.COMBINED).parse(log))
                      .map(lambda log: (((log['timestamp'] - timedelta(minutes=log['timestamp'].minute % 5))
                                         .replace(second=0),
                                         _getCountryByIP(log['remote_ip'].compressed)),
                                        1))
                      .reduceByKey(add).map(lambda x: (key, x[0][0], x[0][1], x[1])) for key in rdds
Example #26
#!/usr/bin/env python

from snakebite.client import Client
import time

host='10.118.205.8'
port=9000
client = Client(host=host, port=port, use_trash=False, effective_user='******')

path='/tmp'

result=[]
for x in client.ls([path]):
    result.append(x)


ordered=sorted(result, key=lambda x: x['path'])

for f in ordered:
    if f['file_type'] == 'd':
        print f['path']
    else:
        print f



Example #27
	view_map = view_events_arr.map(lambda line: (line[2].split('_')[0], 1))

	# OUTPUT
	ads_bid_count_by_company = bid_map.reduceByKey(lambda a, b: a + b)
	ads_view_count_by_company = view_map.reduceByKey(lambda a, b: a + b)

	
	print "======== Result =========\n"
	print ads_bid_count_by_company.take(2), ads_view_count_by_company.take(20)
	print "======== Result =========\n"
	ads_bid_count_by_company.saveAsTextFile("hdfs://ec2-52-72-23-2.compute-1.amazonaws.com:9000/user/ubuntu/testdan.txt")
	sc.stop()


if __name__ == "__main__":

    client = Client('ec2-52-72-23-2.compute-1.amazonaws.com', 9000, use_trash=False)
    last_modification_time = sys.argv[1]
    list_of_new_files = [dict for dict in client.ls(['/']) if dict['modification_time'] > last_modification_time]

    # CONFIGURE SPARK
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    
    # FILE TO PROCESS
    filename = sys.argv[1]

    # CALLING MAIN
    main(sc, filename)
Example #28
from snakebite.client import Client

client = Client('localhost', 54310)
for x in client.ls(['/user/hduser/']):
    print x
Example #29
def get_locations(filename, name_host, name_port, **kwargs):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, **kwargs)]
Example #30
class HDFS_topic(object):
    def __init__(self, topic, user, server, port, web_port, base, hdfs_tmp):
        self.topic = topic
        self.username = user
        self.server = server
        self.port = port
        self.base = base
        self.path = ["%s/%s" % (base, topic)]
        self.hdfs_tmp = hdfs_tmp

        try:
            self.client = Client(server, port, effective_user=user)
            self.hdfsclient=hdfs.client.InsecureClient(\
                          "http://%s:%d" % (server,web_port),user=user)
            self.daylist = self.check()
        except:
            print "Base path %s does not contain valid structure" % (base)
            raise

    #
    # Check basic hdfs access and that directory format is appropriate
    # also builds datelist structure
    #
    def check(self):
        self.content = self.client.ls(self.path)
        ret = []
        for item in self.content:
            (head, tail) = os.path.split(item['path'])
            try:
                parse(tail, yearfirst=True, dayfirst=True)
                if item['file_type'] == 'd':
                    ret.append(tail)
                else:
                    print("WARNING: %s is not a directory, skipping\n" %
                          (item['path']))
            except:
                print("WARNING: %s is not in date format, skipping\n" % (tail))

        if len(ret) > 0:
            ret.sort(key=lambda x: datetime.strptime(x, "%Y-%m-%d"))
            return ret
        else:
            return False

    #
    # Given a date, check if that date is in the dirlist and return the matching dir entry
    #
    def day_in_topic(self, date):
        for item in self.daylist:
            if parse(date) == parse(item):
                return item
        return False

    #
    # Checks and validates the date_from and date_to arguments
    #
    def check_date_range(self, date_from, date_to):
        if date_from:
            try:
                parse(date_from)
            except:
                raise ValueError("FATAL: start date (%s) invalid date format" %
                                 (date_from))

            if (parse(date_from) < parse(self.daylist[0])) or (
                    parse(date_from) > parse(self.daylist[-1])):
                raise ValueError(
                    "FATAL: start date (%s) not in range (%s ---> %s)" %
                    (date_from, self.daylist[0], self.daylist[-1]))
            else:
                ret_from = parse(date_from).strftime("%Y-%m-%d")
                while not self.day_in_topic(ret_from):
                    print "WARNING: start date %s not in topic %s, trying next day" % (
                        ret_from, self.topic)
                    ret_from = datetime.strftime(
                        (parse(ret_from) + timedelta(days=1)), "%Y-%m-%d")

                ret_from = self.day_in_topic(ret_from)

        else:
            ret_from = self.daylist[0]

        if date_to:
            try:
                parse(date_to)
            except:
                raise ValueError("FATAL: end date (%s) invalid date format" %
                                 (date_to))

            if (parse(date_to) < parse(self.daylist[0])) or (
                    parse(date_to) > parse(self.daylist[-1])):
                raise ValueError(
                    "FATAL: end date (%s) not in range (%s ---> %s)" %
                    (date_to, self.daylist[0], self.daylist[-1]))
            else:
                ret_to = parse(date_to).strftime("%Y-%m-%d")
        else:
            ret_to = self.daylist[-1]

        if (parse(ret_from) > parse(ret_to)):
            raise ValueError(
                "FATAL: start date (%s) must be <= end date (%s)" %
                (ret_from, ret_to))

        return (ret_from, ret_to)

    #
    #  Traverses the list of valid directories and merges each day
    #
    def merge(self, date_from="", date_to=""):
        day = ""
        try:
            (day, date_to) = self.check_date_range(date_from, date_to)
        except Exception as err:
            raise ValueError(err)

        print "INFO: Trying to merge %s from %s to %s\n" % (self.topic, day,
                                                            date_to)

        while (parse(day) <= parse(date_to)):
            if self.day_in_topic(day):
                self.merge_day(day)
            else:
                print "WARNING: %s is not on %s, skipping\n" % (day, self.path)

            day = datetime.strftime((parse(day) + timedelta(days=1)),
                                    "%Y-%m-%d")
            while not self.day_in_topic(day) and parse(day) <= parse(date_to):
                print "WARNING: %s not found in %s, trying next day" % (
                    day, self.topic)
                day = datetime.strftime((parse(day) + timedelta(days=1)),
                                        "%Y-%m-%d")

            day = self.day_in_topic(day)
            if not day:
                return

        return True

    #
    # Given a date, if there are files that are not .snappy, download and remove them, then getmerge, and upload everything
    #
    def merge_day(self, date):
        print "INFO: processing ", date
        daytmp = "%s/snappymerge-%s-tmp" % (self.hdfs_tmp, date)
        daypath = ["%s/%s/%s/" % (self.base, self.topic, date)]
        #mergedfile="./%s-merged.snappy" % (date)
        mergedfile = "./%s-merged.snappy" % (datetime.strftime(
            datetime.now(), "%Y-%d-%m.%f"))
        day_files = [x['path'] for x in self.client.ls(daypath)]
        print "INFO: DAYPATH: ", daypath
        try:
            os.remove(mergedfile)
        except:
            pass

        if len([x for x in day_files if x.endswith('.snappy')]) <= 1:
            print "WARNING: %s does not have enough files to getmerge, skipping" % (
                date)
            return

        if [file for file in day_files if not file.endswith('.snappy')]:
            print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s getmerge there\n" % (
                daypath, file, daytmp)
            self.merge_with_move(daypath[0], daytmp, day_files, mergedfile)
        else:
            print "INFO: MERGING ", daypath[0]
            result = self.client.getmerge(daypath[0], mergedfile)
            print[x for x in result if not x['result']]

            print "INFO: DELETING original files in ", daypath[0]
            for file in day_files:
                print "INFO: Deleting original file ", file
                self.hdfsclient.delete(file)

            print "INFO: UPLOADING merged (%s) to %s" % (mergedfile,
                                                         daypath[0])
            self.hdfsclient.upload(daypath[0], mergedfile, overwrite=True)
            os.remove(mergedfile)

        return


#
# When there are files that do not have the .snappy suffix, merge_with_move first moves everything to an HDFS temp dir, merges there, and uploads
#

    def merge_with_move(self, day_path, day_tmp, dayfiles, merged_file):
        self.hdfsclient.makedirs(day_tmp)

        print "INFO: MOVING files to ", day_tmp
        snap = [x for x in dayfiles if x.endswith(".snappy")]
        result = self.client.rename(snap, day_tmp)
        print[x['path'] for x in result if not x['result']]

        print "INFO: MERGING files in ", day_tmp
        result = self.client.getmerge(day_tmp, merged_file)
        print[x['path'] for x in result if not x['result']]

        print "INFO: UPLOADING merged (%s) to %s" % (merged_file, day_path)
        self.hdfsclient.upload(day_path, merged_file, overwrite=True)
        os.remove(merged_file)

        print "INFO: Deleting files on ", day_tmp
        self.hdfsclient.delete(day_tmp, recursive=True)
Example #31
from snakebite.client import Client

client = Client('localhost', 54310)
for x in client.ls(['/user/hduser/']):
    print x
Example #32
def health_check():
    c = Client("namenode", 8020)
    print "Checking for %s directory..." % sys.argv[1]
    for top_level in c.ls([sys.argv[1]]):
        print "DIR CHILD=%s" % top_level['path']
    print "Ok!"
Example #33
from snakebite.client import Client
client = Client('localhost', 8020)  #port is the RPC port of the namenode.
for i in client.ls(['/user/cloudera/behrouz']):  #takes a list of paths!!
    print i
#get these parameters from /etc/hadoop/conf/core-site.xml under fs.defaultFS
#many of the methods in snakebite return generators

#creating a directory:
#create two directories behrouz, behrouz1/b1 on HDFS:
print '*' * 40
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40
#deleting files and directories: deletes any subdirectories and files a directory contains
#recursively deleting the directories!
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40
# retrieving data from hdfs:
#copying files from HDFS to Local file system:
for f in client.copyToLocal(['/user/cloudera/wordCount.out'],
                            '/home/cloudera/'):
    print f
print '*' * 40
#######
#reading contents of a file
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
#the text method automatically decompresses and displays gzip and bzip2 files.
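A small follow-on sketch: checking that a path exists before deleting it, reusing the client above (the path itself is hypothetical; test and delete are used the same way elsewhere in these examples):

target = '/user/cloudera/old_output'  # hypothetical path
if client.test(target, exists=True, directory=True):
    for p in client.delete([target], recurse=True):
        print p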
Example #34
def main(args):
    xml = minidom.parse(path.join(os.environ["HADOOP_HOME"],
                                  "etc", "hadoop", "hdfs-site.xml"))

    element = [ x for x in xml.getElementsByTagName("property")
                if (x.getElementsByTagName("name")[0]
                     .childNodes[0]
                     .nodeValue == "dfs.namenode.http-address") ][0]

    namenode = (element.getElementsByTagName("value")[0]
                       .childNodes[0]
                       .nodeValue.split(":")[0])

    fs = HDFS(namenode, 8020)

    path_prefix = "/amplab/text"
    for size in args.sizes:
        timings = {}

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        file_list = None
        if c_rank == 0:
            file_list = [
                entry["path"] for entry in fs.ls([
                    path.join(path_prefix, size, "uservisits")])]
            file_list = [file_list[i::c_size] for i in range(c_size)]

        file_list = MPI.COMM_WORLD.scatter(file_list, root=0)

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["open-and-register"] = toc()

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        os_results = reduce_data(row_iterator(file_list, fs), 4, "os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-os"] = toc()
        if c_rank == 0: os_results.index = os_results.pop("os")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: tic()

        browser_results = reduce_data(row_iterator(file_list, fs), 6, "browser")

        MPI.COMM_WORLD.Barrier()
        if c_rank == 0: timings["q-stats-by-browser"] = toc()
        if c_rank == 0: browser_results.index = browser_results.pop("browser")

        if c_rank == 0:
            top_dir = path.join("results", size, "mpi", str(args.nodes))
            mkdir_p(top_dir)
            with open(path.join(top_dir, "timings"), "w") as f:
                for entry in timings.items():
                    f.write("%s, %.18e\n" % entry)
                f.flush()

            browser_results.to_pickle(path.join(top_dir, "browser"))
            os_results.to_pickle(path.join(top_dir, "os"))

    return 0
if __name__ == '__main__':
#    hdfs_host='100.127.6.35'
    hdfs_host='100.127.13.16'
#    hdfs_port=9820
    hdfs_port=8020

    client = Client(host=hdfs_host, port= hdfs_port, use_trash=False, effective_user='******')

    if len(sys.argv) < 2:
        print 'inf_verification.py path'
        sys.exit(0)

    input_dir=sys.argv[1]

    input_files=[]
    for clip in client.ls([input_dir]):
        if clip['file_type'] == 'd':
            input_files.append(clip['path'])

    for folder in sorted(input_files):
        for inf in client.cat([getInf(folder)]):
            for content in inf:
                start=None
                end=None
                for aline in content.split('\n'):
                    if aline.startswith('startTime'):
                        start=aline.strip()
                    elif aline.startswith('endTime'):
                        end=aline.strip()

                print '{}\t{}\t{}'.format(os.path.basename(folder),start, end)
Example #36
if (run_mode == "swift" or out_mode == "swift"):
    swiftConf = sc._jsc.hadoopConfiguration()
    for key, value in SWIFT_DEFAULT_CONFIGS.items():
        swiftConf.set(key, value)

    swift_client = swift.Connection(user=swift_user,
                                    key=swift_key,
                                    authurl=swift_authurl)

# read list of files
src_files = []

if run_mode == "hdfs":
    # spotify's snakebite as hdfs client
    src_files = [
        hdfs_url + files['path'] for files in hdfs_client.ls([source_files])
    ]

    # deleting output directory if exists
    if (hdfs_client.test(target_dir, exists=True, directory=True)):
        hdfs_client.delete(target_dir)
        hdfs_client.rmdir(target_dir)

elif run_mode == "swift":
    # read list of files from swift
    src_file_regex = re.compile(source_files)
    for data in swift_client.get_container(source_dir)[1]:
        if src_file_regex.match(data['name']):
            src_files.append(data['name'])

    src_files.sort(key=lambda x: os.path.basename(x))
Example #37
class HDFS_topic(object):
	def __init__(self,topic,user,server,port,web_port,base,hdfs_tmp):
		self.topic = topic
		self.username = user
		self.server = server
		self.port = port
		self.base = base
		self.path = ["%s/%s" % (base,topic)]
		self.hdfs_tmp = hdfs_tmp

		try:
			self.client=Client(server,port,effective_user=user) 
			self.hdfsclient=hdfs.client.InsecureClient(\
                 "http://%s:%d" % (server,web_port),user=user)
	 		self.daylist=self.check()
		except:
	 		print "Base path %s does not contain valid structure" % (base)
	 		raise	
	
	#
	# Check basic hdfs access and that directory format is appropriate
	# also builds datelist structure
	#
	def check(self):
		self.content=self.client.ls(self.path)
		ret=[]
		for item in self.content:
			(head,tail) = os.path.split(item['path'])
			try:
				parse(tail,yearfirst=True,dayfirst=True)
				if item['file_type'] == 'd':
					ret.append(tail)
				else:
					print("WARNING: %s is not a directory, skipping\n" % (item['path']))
			except:
				print("WARNING: %s is not in date format, skipping\n"  % (tail))

		if len(ret) > 0:
			ret.sort(key=lambda x: datetime.strptime(x,"%Y-%m-%d"))
			return ret
		else:
			return False

	#
	# Given a date, check if that date is in the dirlist and return the matching dir entry
	#
	def day_in_topic(self, date):
		for item in self.daylist:
			if parse(date) == parse(item):
				return item
		return False


	#
	# Checks and validates the date_from and date_to arguments
	#	
	def check_date_range(self,date_from,date_to):
		if date_from:
			try:
				parse(date_from)
			except:
				raise ValueError("FATAL: start date (%s) invalid date format" % (date_from) )
		
			if ( parse(date_from)  < parse(self.daylist[0])  ) or ( parse(date_from)  > parse(self.daylist[-1]) ):
				raise ValueError("FATAL: start date (%s) not in range (%s ---> %s)" % (date_from,self.daylist[0],self.daylist[-1]))
			else:
				ret_from=parse(date_from).strftime("%Y-%m-%d") 
				while not self.day_in_topic(ret_from):
					print "WARNING: start date %s not in topic %s, trying next day" % (ret_from,self.topic)
					ret_from=datetime.strftime((parse(ret_from)+timedelta(days=1)), "%Y-%m-%d" )
					
				ret_from=self.day_in_topic(ret_from)

				
		else:
				ret_from=self.daylist[0]

		if date_to:
			try:
				parse(date_to)
			except:
				raise ValueError("FATAL: end date (%s) invalid date format" % (date_to) )

			if ( parse(date_to)  < parse(self.daylist[0])  ) or ( parse(date_to)  > parse(self.daylist[-1]) ):
				raise ValueError("FATAL: end date (%s) not in range (%s ---> %s)" % (date_to,self.daylist[0],self.daylist[-1]))
			else:
				ret_to=parse(date_to).strftime("%Y-%m-%d")
		else:
				ret_to=self.daylist[-1]
		
		if (parse(ret_from) > parse(ret_to) ):
			raise ValueError("FATAL: start date (%s) must be <= end date (%s)" % (ret_from,ret_to))


		return (ret_from,ret_to)
		
	
	#
	#  Traverses the list of valid directories and merges each day
	#
	def merge(self,date_from="",date_to=""):
		day=""
		try:
			(day,date_to)=self.check_date_range(date_from,date_to)
		except Exception as err:
			raise ValueError(err)

		print "INFO: Trying to merge %s from %s to %s\n" % (self.topic,day, date_to)

		while (parse(day) <= parse(date_to)):
			if  self.day_in_topic(day):
				self.merge_day(day)
			else:
				print "WARNING: %s is not on %s, skipping\n" % (day,self.path)

			day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" )
			while not self.day_in_topic(day) and parse(day) <= parse(date_to):
				print "WARNING: %s not found in %s, trying next day" % (day,self.topic)
				day=datetime.strftime((parse(day)+timedelta(days=1)), "%Y-%m-%d" )

			day=self.day_in_topic(day)
			if not day:
				return	
				
		return True

	#
	# Given a date, if there are files that are not .snappy, download and remove them, then getmerge, and upload everything
	#
	def merge_day(self,date):
		print "INFO: processing ", date
		daytmp="%s/snappymerge-%s-tmp" % (self.hdfs_tmp,date)
		daypath=["%s/%s/%s/" % (self.base, self.topic,date)]
		#mergedfile="./%s-merged.snappy" % (date)
		mergedfile="./%s-merged.snappy" % (datetime.strftime(datetime.now(),"%Y-%d-%m.%f"))
		day_files=[x['path'] for x in self.client.ls(daypath)]
		print "INFO: DAYPATH: ", daypath
		try:
			os.remove(mergedfile)
		except:
			pass

		
		if len([ x for x in day_files if x.endswith('.snappy') ]) <= 1:
			print "WARNING: %s does not have enough files to getmerge, skipping" % (date)
			return

		if [ file for file in day_files if not file.endswith('.snappy') ]:
				print "WARNING: %s contains a non snappy file (%s), moving *snappy to %s getmerge there\n" % (daypath,file,daytmp)
				self.merge_with_move(daypath[0],daytmp,day_files,mergedfile)
		else:
			print "INFO: MERGING ", daypath[0]
			result=self.client.getmerge(daypath[0],mergedfile)
			print [x for x in result if not x['result']]
		
			print "INFO: DELETING original files in ", daypath[0]
			for file in day_files:
				print "INFO: Deleting original file ", file
				self.hdfsclient.delete(file)

			print "INFO: UPLOADING merged (%s) to %s" % (mergedfile,daypath[0])
			self.hdfsclient.upload(daypath[0],mergedfile,overwrite=True)
			os.remove(mergedfile)

		return

#
# When there are files that do not have the .snappy suffix, merge_with_move first moves everything to an HDFS temp dir, merges there, and uploads
#
	def merge_with_move(self,day_path,day_tmp,dayfiles,merged_file):
		self.hdfsclient.makedirs(day_tmp)


		print "INFO: MOVING files to ", day_tmp
		snap = [x for x in dayfiles if x.endswith(".snappy")]
		result=self.client.rename(snap,day_tmp)
		print [ x['path'] for x in result if not x['result']]

		print "INFO: MERGING files in ", day_tmp
		result=self.client.getmerge(day_tmp,merged_file)
		print [x['path'] for x in result if not x['result']]

		print "INFO: UPLOADING merged (%s) to %s"  % (merged_file,day_path)
		self.hdfsclient.upload(day_path,merged_file,overwrite=True)
		os.remove(merged_file)

		print "INFO: Deleting files on ", day_tmp
		self.hdfsclient.delete(day_tmp,recursive=True)


				
if __name__ == '__main__' :
	import argparse

	count=0

	parser = argparse.ArgumentParser(description="Merge daily historical snappy files into one to save hdfs space")
	parser.add_argument('topic', help="Topic name relative to --base")
	parser.add_argument('--hdfs_user', help="HDFS user name (default: current user)",default=None)
	parser.add_argument('--hdfs_server', help="HDFS server name or ip (default: aquhmstsys022001.c022.digitalriverws.net)",default="aquhmstsys022001.c022.digitalriverws.net")
	parser.add_argument('--hdfs_port', help="HDFS server port number (default:8020)", type=int, default=8020)
	parser.add_argument('--hdfs_tmp', help="HDFS temporary dir to store files to be merged (default:/user/hduser/tmp)", default="/user/hduser/tmp")
	parser.add_argument('--web_port', help="HDFS server WEB port number (default:50070)", type=int, default=50070)
	parser.add_argument('--base', help="Alternate hdfs base path for topic (default:/user/aqueduct/flume)",default="/user/aqueduct/flume")
	parser.add_argument('--start', help="Start Date inclusive  (default: from beginning)")
	parser.add_argument('--end', help="End Date inclusive (default: to end)")


	args = parser.parse_args()
	topic=HDFS_topic(topic=args.topic,user=args.hdfs_user,server=args.hdfs_server,port=args.hdfs_port,\
                     hdfs_tmp=args.hdfs_tmp,web_port=args.web_port,base=args.base)
	try:
		topic.merge(args.start,args.end)
	except Exception as err:
		print err
		exit(1)
Example #38
def get_locations(filename, name_host, name_port, data_root='/data/dfs/dn'):
    client = Client(name_host, name_port, use_trash=False)
    files = list(client.ls([filename]))
    return [pair for file in files for pair in find(file, client, data_root)]
if (run_mode == "swift" or out_mode == "swift"):
  swiftConf = sc._jsc.hadoopConfiguration()
  for key, value in SWIFT_DEFAULT_CONFIGS.items():
    swiftConf.set(key, value)

  swift_client = swift.Connection(
    user = swift_user, 
    key = swift_key, 
    authurl = swift_authurl)

# read list of files
src_files = []

if run_mode == "hdfs":
  # spotify's snakebite as hdfs client
  src_files = [ hdfs_url + files['path'] for files in hdfs_client.ls([source_files]) ]

  # deleting output directory if exists
  if (hdfs_client.test(target_dir, exists = True, directory = True)):
    hdfs_client.delete(target_dir)
    hdfs_client.rmdir(target_dir)

elif run_mode == "swift":  
  # read list of files from swift
  source_files = '|'.join([ '(pagecounts-' + (datetime.now() - timedelta(hours=i)).strftime("%Y%m%d-%H") + '(.*))' for i in range(48, 71) ])
  src_file_regex = re.compile(source_files)
  for data in swift_client.get_container(source_dir)[1]:
     if src_file_regex.match(data['name']):
       src_files.append(data['name'])
  
  src_files.sort(key = lambda x: os.path.basename(x))
def test_request(self):
    from snakebite.client import Client
    client = Client("10.0.137.24", 8022, use_trash=False)
    for x in client.ls(['/user']):
        print x
Example #41
    The library isn't installed - install it:
    pip install snakebite

    Find the address and port for requests:
    hdfs getconf -confKey fs.defaultFS

    Then start a Python terminal and work in it
    (alternatively, prepare a script and run it)
'''
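The value printed by hdfs getconf -confKey fs.defaultFS has the form hdfs://host:port; a minimal sketch of splitting it for the Client constructor (the value below is an assumed example matching the connection used further down):

# assumed example output of hdfs getconf -confKey fs.defaultFS
fs_default = 'hdfs://manager.novalocal:8020'
nn_host, nn_port = fs_default[len('hdfs://'):].split(':')
# Client(nn_host, int(nn_port)) is equivalent to the hard-coded call below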

from snakebite.client import Client

client = Client('manager.novalocal', 8020)

# Let's see what we have in the working directory
for x in client.ls(['/student9_7']):
    print(x)
'''
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605967318187L, 'block_replication': 3, 'modification_time': 1605967318265L, 'length': 1705L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/cur_readme'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605953216696L, 'block_replication': 3, 'modification_time': 1605953220706L, 'length': 7104L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/googlobots.txt'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605966686950L, 'block_replication': 3, 'modification_time': 1605966688013L, 'length': 1705L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/readme'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964109596L, 'block_replication': 2, 'modification_time': 1605946691680L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test'}
{'group': u'supergroup', 'permission': 420, 'file_type': 'f', 'access_time': 1605964267111L, 'block_replication': 3, 'modification_time': 1605964267975L, 'length': 19L, 'blocksize': 134217728L, 'owner': u'student9_7', 'path': '/student9_7/test2'}
{'group': u'supergroup', 'permission': 493, 'file_type': 'd', 'access_time': 0L, 'block_replication': 0, 'modification_time': 1605950057832L, 'length': 0L, 'blocksize': 0L, 'owner': u'student9_7', 'path': '/student9_7/testdir'}
'''

# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'],
                      create_parent=True):
    print(p)
'''