def read_result_from_hdfs(username):
    result = ""
    to_return = {}
    file_path = "/jobs_done/" + username + "/part-00000"
    logger.debug("Reading file " + file_path + " from HDFS")
    try:
        logger.debug("Trying to connect to " + hdfs_namenodes[0] + " namenode")
        hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[0],
                                      port='50070',
                                      user_name='xnet',
                                      timeout=100)
        result = hdfs_client.read_file(file_path)
    except (ActiveHostNotFound, ConnectionError) as e:
        to_return["details_1"] = str(e)
        try:
            logger.debug("Trying to connect to " + hdfs_namenodes[1] +
                         " namenode")
            hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[1],
                                          port='50070',
                                          user_name='xnet',
                                          timeout=100)
            result = hdfs_client.read_file(file_path)
        except (ActiveHostNotFound, ConnectionError) as e2:
            to_return["error"] = ("There was a problem while trying to read "
                                  "the result from HDFS.")
            to_return["details_2"] = str(e2)
            logger.debug(str(to_return))
            return False, to_return

    return True, result
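
This snippet assumes module-level pieces that are not shown here: a `logger`, an `hdfs_namenodes` list with the primary and standby namenode hostnames, and the client/exception imports. A minimal sketch of that assumed preamble (hostnames are placeholders, and the exception sources are inferred from the names caught above) might be:

import logging

from pywebhdfs.webhdfs import PyWebHdfsClient
from pywebhdfs.errors import ActiveHostNotFound  # assumed source of ActiveHostNotFound
from requests.exceptions import ConnectionError  # assumed source of ConnectionError

logger = logging.getLogger(__name__)

# Primary and standby namenode hostnames (placeholders).
hdfs_namenodes = ["namenode1.example.com", "namenode2.example.com"]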
Example #2
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. On an exception, store the latest checkpoint to a local file and
       exit.
    """

    #add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print(options)

    # configurations
    last_seq = options.last_seq

    #get credential
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    #connect to source database
    s = Server('https://%s:%s@%s' %
               (creds['cloudant_user'], creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]
    #print db.info()

    #connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host,
                           port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    #and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db,
                                 include_docs=True,
                                 heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        #print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
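
`configureOptions`, `checkRequiredArguments`, `get_creds`, `processChange`, and `checkpoint` are defined elsewhere in the original script. As one illustration of the checkpointing step, a minimal `checkpoint` that persists the last processed sequence number to a local file (the filename is a placeholder) could look like:

def checkpoint(seq):
    # Persist the latest Cloudant sequence number so a restart can resume
    # from this point via the last_seq option.
    with open('.cloudant_last_seq', 'w') as f:
        f.write(str(seq))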
Example #3
    def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
                 overwrite=False):
        # extract some information from the function
        if udf_name is None:
            udf_name = function.name
        symbol = function.llvm_func.name
        ir = function.llvm_module.to_bitcode()
        return_type = udf_to_impala_type[function.signature.return_type.name]
        arg_types = [udf_to_impala_type[arg.name]
                     for arg in function.signature.args[1:]]

        # ship the IR to the cluster
        hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                      user_name=ic._hdfs_user)
        if hdfs_path is None:
            hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
        if not hdfs_path.endswith('.ll'):
            raise ValueError("The HDFS file name must end with .ll")
        hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

        # register the function in Impala
        if database is None:
            database = ic._temp_db
        impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
        if overwrite:
            ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
        register_query = ("CREATE FUNCTION %s RETURNS %s "
                          "LOCATION '%s' SYMBOL='%s'") % (impala_name,
                                                          return_type,
                                                          hdfs_path, symbol)
        ic._cursor.execute(register_query)
Example #4
def upload_file():
    """
    Upload file
    ---
    tags:
        - Files
    consumes: "multipart/form-data"
    parameters:
        -   name: file
            in: formData
            required: true
            paramType: body
            dataType: file
            type: file
    responses:
        200:
            description: Return a successful message
        401:
            description: Unauthorized
        400:
            description: Bad Request
        500:
            description: Server Internal error
    """
    # Hard-coded config information; you should improve this.
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success':'true'})

    return jsonify({'success':'false'})
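
The view above relies on a Flask application, an `allowed_file` helper, and imports that sit outside this excerpt. A minimal sketch of that assumed wiring (the extension whitelist and route are placeholders) might be:

import time

from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from pywebhdfs.webhdfs import PyWebHdfsClient

app = Flask(__name__)
ALLOWED_EXTENSIONS = {'csv', 'txt', 'json'}  # assumed whitelist


def allowed_file(filename):
    # Accept only filenames whose extension is in the whitelist.
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

# The view above would then be registered with something like:
# app.add_url_rule('/upload', 'upload_file', upload_file, methods=['POST'])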
def solarLog_call(epoch_time):
    r = requests.get("http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID + "&locale=de_ch&username=277555406&password=5a03cdf0a3ff42de09bc85361d8a2f0f&function=dashboard&format=jsonh&solarlog=9112&tiles=Yield|true,Grafic|true,Env|true,Weather|true&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # This will return entire content.
    data['timestamp'] = epoch_time
    # Remove key's with complex JSON structure
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    #del data['decimalseperator']
    logging.debug(data)

    #write data to .json
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with HDFS
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.csv', 'w') as f:  # Just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # upload the csv data to HDFS as well
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079', user_name='hdfs')
    #hdfs_path = 'user/hdfs/from_python'
    hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', '0100')
    #with open('pfadibaar_solarlog_' + epoch_time + '.csv') as file_data:
    #    hdfs.create_file(hdfs_path, data=file_data)

Example #6
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
def saveToStore(path, meta):
    con = happybase.Connection(MasterHbase)
    con.open()
    metaTable = con.table('MetaTable')
    if meta['size'] < largeSize:
        # save to Hbase
        encTable = con.table('EncTable')
        with open(path, 'rb') as f:
            encTable.put(meta['rowkey'], {'enc:data': f.read()})
        metaTable.put(str(meta['rowkey']), {
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to HBase', meta['rowkey'])
    else:
        # save to HDFS
        hdfs = PyWebHdfsClient(host=Master, port='50070', timeout=None, user_name='hduser')
        with open(path, 'rb') as f:
            hdfs.create_file(HDFSMainPath + meta['rowkey'], f)
        metaTable.put(str(meta['rowkey']), {
                'pp:name': str(meta['filename']),
                'pp:checksum': str(meta['checksum']),
                'pp:size': str(meta['size']),
                'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']),
                'pp:often': str(meta['often']),
                'pp:des': str(meta['description'])
                }
              )
        app.logger.debug('%s is saved to HDFS', meta['rowkey'])
    con.close()
Example #8
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
Example #9
def downParts(fpath):
    'Download the part-0??? series of files that Spark wrote to HDFS, concatenating them into a single local file while downloading.'
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [
        f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')
    ]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    total = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, total)
Example #10
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.path = 'user/hdfs'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
Example #11
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = [
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
        '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
        '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar'
    ]

    # Setup a connection with hdfs using namenode.
    hdfs_client = PyWebHdfsClient(host=name_node,
                                  port=webhdfs_port,
                                  user_name=webhdfs_user,
                                  timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)
    # Creates a new file on HDFS and write contents from local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path, 'rb') as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
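
A hypothetical invocation, with the namenode hostname as a placeholder, copies the listed CDH jars into the shared Oozie library directory:

# Uploads the HBase/Pig jars to user/deployment/platform on HDFS.
setup_common_oozie_libs('namenode01.example.com')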
Example #12
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)
    def __init__(self, host, port, user, logger):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)

        global LOGGER
        LOGGER = logger
        LOGGER.debug('webhdfs = %s@%s:%s', user, host, port)
    def load(self, job, task, fifo):
        self.job = job
        self.task = task
        self.fifo = fifo
        self.key = None
        self.script_proc = None
        self.decompress_obj = None
        self.pycurl_callback_exception = None

        if task.data['scheme'] == 's3':
            self.is_anonymous = job.spec.source.aws_access_key is None or job.spec.source.aws_secret_key is None
            if self.is_anonymous:
                s3_conn = S3Connection(anon=True)
            else:
                s3_conn = S3Connection(job.spec.source.aws_access_key,
                                       job.spec.source.aws_secret_key)
            bucket = s3_conn.get_bucket(task.data['bucket'])

            try:
                self.key = bucket.get_key(task.data['key_name'])
            except S3ResponseError as e:
                raise WorkerException(
                    "Received %s %s accessing `%s`, aborting" %
                    (e.status, e.reason, task.data['key_name']))
        elif task.data['scheme'] == 'hdfs':
            fname = task.data['key_name']
            client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                     job.spec.source.webhdfs_port,
                                     user_name=job.spec.source.hdfs_user)
            try:
                filesize = client.get_file_dir_status(
                    fname)['FileStatus']['length']
            except pywebhdfs.errors.FileNotFound:
                raise WorkerException("File '%s' does not exist on HDFS" %
                                      fname)
            self.key = AttrDict({'name': fname, 'size': filesize})
        elif task.data['scheme'] == 'file':
            globber = glob2.Globber()
            fname = globber._normalize_string(task.data['key_name'])

            if not os.path.exists(fname):
                raise WorkerException(
                    "File '%s' does not exist on this filesystem" % fname)
            elif not os.path.isfile(fname):
                raise WorkerException("File '%s' exists, but is not a file" %
                                      fname)

            self.key = AttrDict({
                'name': fname,
                'size': os.path.getsize(fname)
            })
        else:
            raise WorkerException('Unsupported job with paths: %s' %
                                  [str(p) for p in self.job.paths])

        if self.key is None:
            raise WorkerException(
                'Failed to find key associated with task ID %s' % task.task_id)

        self.metrics = DownloadMetrics(self.key.size)
Example #15
 def test_create_throws_exception_for_not_created(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.BAD_REQUEST
     mock_put.side_effect = [self.init_response, self.response]
     with self.assertRaises(errors.PyWebHdfsException):
         webhdfs.create_file(self.path, self.file_data)
Example #16
def webhdfs_connect():
    webhdfs = PyWebHdfsClient(base_uri_pattern=cfg['DEFAULT']['HDFS_BASEURL'],
                              request_extra_opts={
                                  'verify':
                                  cfg['DEFAULT'].get('HDFS_CERT', None),
                                  'auth': get_auth()
                              })
    return webhdfs
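
`cfg` and `get_auth` come from the surrounding module and are not shown. One plausible sketch, assuming an INI file read with `configparser` and HTTP basic auth (both assumptions, not the original project's code), is:

import configparser

from requests.auth import HTTPBasicAuth

cfg = configparser.ConfigParser()
# Expects HDFS_BASEURL (a WebHDFS base URI pattern) and optionally HDFS_CERT.
cfg.read('settings.ini')


def get_auth():
    # Credential keys are assumptions for illustration only.
    return HTTPBasicAuth(cfg['DEFAULT']['HDFS_USER'], cfg['DEFAULT']['HDFS_PASSWORD'])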
Example #17
 def get_file_contents(self,
                       hdfs_path,
                       user_name='trifacta',
                       httpfs_port='14000'):
     hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                            port=httpfs_port,
                            user_name=user_name)
     return hdfs.read_file(hdfs_path).decode('utf-8')
Example #18
 def getHdfsClient(self):
     try:
         hdfs = PyWebHdfsClient(host=self.server, port=self.port, user_name=self.user)
     except Exception:
         etype, evalue, etb = sys.exc_info()
         self.logger.error('Could not connect to webfs service on %s. Exception: %s, Error: %s.' % (self.getConfigurationValue('server'), etype, evalue))
         return None
     return hdfs
Example #19
def from_hdfs(hdfs_path, file_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    binary_file = hdfs.read_file(hdfs_path)
    with open(file_path, 'wb') as f:
        f.write(binary_file)
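
Paired with `to_hdfs` from Example #8 (which uses the same `hdfs-v1` client settings), a hypothetical round trip between the local filesystem and HDFS would be:

# Paths are placeholders.
to_hdfs('/tmp/report.csv', 'data/report.csv')         # local -> HDFS
from_hdfs('data/report.csv', '/tmp/report_copy.csv')  # HDFS -> local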
Example #20
    def test_webhdfs_csv(self):
        from pywebhdfs.webhdfs import PyWebHdfsClient
        dfs = PyWebHdfsClient(host='localhost', port='9870', user_name='hadoop')
        dfs.make_dir("/temp")

        with open("tests/data/data.csv") as input_file:
            dfs.create_file("/temp/data.csv", file_data=input_file, overwrite=True)

        dfs.delete_file_dir("/temp", recursive=True)
Example #21
    def test_init_args_provided(self):
        host = '127.0.0.1'
        port = '50075'
        user_name = 'myUser'

        webhdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.assertEqual(host, webhdfs.host)
        self.assertEqual(port, webhdfs.port)
        self.assertEqual(user_name, webhdfs.user_name)
Example #22
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.response = MagicMock()
     self.requests = MagicMock(return_value=self.response)
     self.path = 'user/hdfs/old_dir'
Example #23
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.path = 'user/hdfs/old_dir'
     self.xattr = 'user.test'
     self.response = MagicMock()
Example #24
def hdfs_client(ic):
    pywebhdfs = importorskip('pywebhdfs')
    if ic._nn_host is None:
        skip("NAMENODE_HOST not set; skipping...")
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs_client = PyWebHdfsClient(host=ic._nn_host,
                                  port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    return hdfs_client
Example #25
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.path = 'user/hdfs'
     self.file_data = u'010101'
     self.response = MagicMock()
     self.response.content = self.file_data
Example #26
 def test_create_returns_file_location(self, mock_put):
     webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                               user_name=self.user_name)
     self.init_response.status_code = http_client.TEMPORARY_REDIRECT
     self.response.status_code = http_client.CREATED
     mock_put.side_effect = [self.init_response, self.response]
     result = webhdfs.create_file(self.path, self.file_data)
     self.assertTrue(result)
     mock_put.assert_called_with(
         self.location, headers=self.expected_headers, data=self.file_data)
Example #27
 def put_file_contents(self,
                       hdfs_path,
                       file_contents,
                       user_name='trifacta',
                       httpfs_port='14000'):
     hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                            port=httpfs_port,
                            user_name=user_name)
     hdfs.create_file(hdfs_path, file_contents, overwrite=True)
     return True
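
Assuming these helpers live on a Trifacta utility object that exposes `trifacta_base_url` (the instance name below is hypothetical), a round trip through HttpFS could look like:

# Write a small text file, then read it back (path is a placeholder).
trifacta_util.put_file_contents('tmp/trifacta/example.txt', 'hello from webhdfs')
print(trifacta_util.get_file_contents('tmp/trifacta/example.txt'))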
Example #28
 def setUp(self):
     self.host = 'hostname'
     self.port = '00000'
     self.user_name = 'username'
     self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                    user_name=self.user_name)
     self.response = MagicMock()
     self.path = 'user/hdfs/old_dir'
     self.new_path = '/user/hdfs/new_dir'
     self.response = MagicMock()
     self.rename = {"boolean": True}
     self.response.json = MagicMock(return_value=self.rename)
    def classify(self, inputJson):
        self.hdfs = PyWebHdfsClient(host=self.config.acm.servers.hdfs.host,
                                    port=self.config.acm.servers.hdfs.restPort,
                                    user_name=self.config.acm.servers.hdfs.fileOwner)
        self.hdfsServerUrl = "hdfs://"+self.config.acm.servers.hdfs.host+":"+str(self.config.acm.servers.hdfs.port)

        if not hasattr(self, 'sc'):
            self.sc = SparkContext()
        if not hasattr(self, 'sqlContext'):
            self.sqlContext = SQLContext(self.sc)


        schema = StructType([StructField('Category', StringType(), True),
                     StructField('Descript', StringType(), True),
                     StructField('Dates', StringType(), True),
                     StructField('DayOfWeek', StringType(), True),
                     StructField('PdDistrict', StringType(), True),
                     StructField('Resolution', StringType(), True),
                     StructField('Address', StringType(), True),
                     StructField('X', DoubleType(), True),
                     StructField('Y', DoubleType(), True)
                    ])
        test = self.sqlContext.createDataFrame(inputJson, schema)

        #pipeline= PipelineModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.pipeline.savepoint")
        pipeline= PipelineModel.load(self.pipelineHdfsPath)


        testData = pipeline.transform(test)
        print("Test Dataset Count: " + str(testData.count()))

        ########################################################## 
        ################## Train/load the model ################## 
        ########################################################## 

        #lrModel = LogisticRegressionModel.load("/home/halil/gitlab/acm/pyspark/acm-text-classification-rest/lr.model.savepoint")
        lrModel = LogisticRegressionModel.load(self.modelHdfsPath)

        predictions = lrModel.transform(testData)

        predictions.filter(predictions['prediction'] == 7)  \
            .select("Descript","Category","probability","label","prediction") \
            .orderBy("probability", ascending=False) \
            .show(n = 10, truncate = 30)

        #.select("probability","label","prediction") \
        resultJson = predictions.filter(predictions['prediction'] == 7)  \
            .select("prediction") \
            .orderBy("probability", ascending=False) \
            .toJSON().collect()
        self.sc.stop()

        return ["al sana ML!", resultJson]
Example #30
 def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
     '''This function establishes a connection to HDFS in preparation for
     creating, retrieving, updating, and deleting files in HDFS. We use
     pywebhdfs to do this via the HDFS REST API.
     (See more: http://pythonhosted.org/pywebhdfs/)
     :param host: HDFS REST host
     :param port: HDFS REST port
     :param user_name: HDFS username (for authentication)
     :param hdfs_path: location to store files (default: '/tmp/')
     :return: Nothing.
     '''
     self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
     self.hdfs_path = hdfs_path
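
Once the connection is established, the stored client supports the usual create/read/delete calls. A minimal usage sketch (the class name, host, and paths are placeholders) might be:

store = MyHdfsStore()  # hypothetical class that defines hdfs_connection()
store.hdfs_connection(host='namenode.example.com', port='50070',
                      user_name='hdfs', hdfs_path='tmp/demo/')
store.hdfs.make_dir(store.hdfs_path)
store.hdfs.create_file(store.hdfs_path + 'example.txt', b'010101', overwrite=True)
print(store.hdfs.read_file(store.hdfs_path + 'example.txt'))
store.hdfs.delete_file_dir(store.hdfs_path, recursive=True)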