Example #1
from hdfs import InsecureClient
import traceback

# CONST (HDFS_URL, HDFS_USER) and log are defined elsewhere in the project


class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # create the directory only if it does not already exist
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
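
A minimal usage sketch for the wrapper above, assuming CONST.HDFS_URL and CONST.HDFS_USER point at a reachable namenode; the path and payload are invented for illustration.

wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.mkdir_hdfs('/tmp/demo')
wrapper.write_hdfs('/tmp/demo/hello.txt', b'hello hdfs', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo/hello.txt'))
print(wrapper.list_hdfs('/tmp/demo'))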
Example #2
from hdfs import InsecureClient  # assumed imports for a runnable snippet
import json
import os


def main():

    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    # create directory in HDFS
    client.makedirs('/test')

    #list content
    ll = client.list('/')
    print(ll)

    # create file in HDFS
    data = [{
        "name": "Anne",
        "salary": 10000
    }, {
        "name": "Victor",
        "salary": 9500
    }]
    with client.write('/test/sample_file.json',
                      encoding='utf-8') as json_file_in_hdfs:
        json.dump(data, json_file_in_hdfs)
    # OR
    client.write(os.path.join('/', 'test', 'sample_file2.json'),
                 data=json.dumps(data),
                 encoding='utf-8')

    # download file from HDFS
    client.download('/test/sample_file.json', './file_from_hadoop.json')

    # upload file to HDFS
    client.upload('/test/local_file_in_hadoop.json', './file_from_hadoop.json')
Example #3
def uploadHDFS(filename):
    #    if ' ' in filename:
    #        aoi_file = rename(aoi_file)
    #    else:
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    #    if aoi_file.split("@").count('-') >= 2:
    dt = str(aoi_file.split("@")[1].split("-", 1)[0][:8])
    #    else:
    #        dt = str(aoi_file.split("_")[-1].split("-")[-2].split("@")[1][:8])
    #        dt = str(aoi_file.split("@")[1].split("-",1)[0][:8])
    folder1 = dt
    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file,
                      overwrite=True)
        print "uploadHDFS ok"
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file,
                      overwrite=True)
        print "uploadHDFS ok"
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
Example #4
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            self.img_dir = data['img_dir']

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")

        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert type(data) == str
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert (type(file_name)) == str
        print("start writing...")
        start = time.time()
        self.hdfs_client.write(file_name,
                               data,
                               overwrite=True,
                               replication=1,
                               append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)

        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
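
The constructor above reads its connection settings from a JSON file; a sketch of writing such a file, with the keys inferred from the code and placeholder values:

import json
import os

# Placeholder connection info matching the keys DataProcessor reads.
connect_info = {
    "namenode_url": "namenode.example.com",
    "port": 9870,
    "user": "hdfs",
    "root_path": "/",
    "img_dir": "/images"
}
os.makedirs('./config', exist_ok=True)
with open('./config/connect_info.json', 'w') as config_file:
    json.dump(connect_info, config_file, indent=4)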
Example #5
    def mkdir_hdfs(self):
        ip_address = self.ip_input.toPlainText()
        port_number = self.port_input.toPlainText()
        user_name = self.user_input.toPlainText()
        dir_name = self.dir_input.toPlainText()
        target_name = dir_name + '/' + self.mkdir_input.toPlainText()
        host_address = 'http://' + ip_address + ':' + port_number
        hadoop = InsecureClient(host_address, user_name)
        hadoop.makedirs(target_name)
Example #6
class HDFSStorage(Storage):
    def __init__(self, bucket_name: str, folder_name: str):
        super().__init__(bucket_name, folder_name)
        self.client = InsecureClient(url=settings.HDFS_CONN,
                                     user=settings.HDFS_USERNAME)

    def setup(self) -> HDFSResource:
        super().setup()

        self.client.makedirs(f"{self.bucket_name}/{self.folder_name}")

        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/")

    def put_file(self,
                 file_path: Union[str, Path],
                 rename: Optional[str] = None) -> HDFSResource:
        if isinstance(file_path, Path):
            file_path = str(file_path)

        file_name = Path(file_path).name if not rename else rename

        # copy file to task directory
        if not file_path.startswith(str(self.local_dir)):
            file_path = shutil.copy(file_path, Path(self.local_dir, file_name))

        try:
            self.client.upload(
                f"{self.bucket_name}/{self.folder_name}/{file_name}",
                file_path)
        except (gaierror, NewConnectionError):
            raise

        return HDFSResource(
            resource=f"hdfs:/{self.bucket_name}/{self.folder_name}/{file_name}"
        )

    def get_file(self, data_file: str) -> str:
        if not data_file.startswith("hdfs:"):
            raise NotValidScheme(
                "Object file prefix is invalid: expected `hdfs:`")

        _, bucket_name, folder_name, file_name = data_file.split("/")
        file_path = Path(self.temp_dir, bucket_name, folder_name, file_name)

        if not file_path.is_file():
            try:
                self.client.download(data_file, file_path)
            except Exception as err:
                print(err)

        return str(file_path)

    def remove_remote_dir(self, omit_files: List[str] = None) -> None:
        pass
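
A quick illustration of how get_file above expects its resource string to split; the bucket, folder and file names are invented for the example.

data_file = "hdfs:/my-bucket/run-42/report.csv"
_, bucket_name, folder_name, file_name = data_file.split("/")
assert (bucket_name, folder_name, file_name) == ("my-bucket", "run-42", "report.csv")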
Example #7
def load_enedis():

    client = InsecureClient('http://localhost:50070', user='******')
    client.makedirs('data')
    print(client.list('/user/cloudera'))

    # load 10 lines
    client.upload(
        '/user/cloudera/data',
        '/home/fitec/projet_fil_rouge/source_des_données/data/consommation_elec_regions_2019_l10.json',
        overwrite=True)
Example #8
from datetime import datetime  # assumed imports for a runnable snippet
from hdfs import InsecureClient
import json
import logging


def save(inp):
    logging.info('Start saving')
    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    name = inp[0]['date']
    path = f'/bronze/{datetime.now().strftime("%Y-%m-%d")}/'

    client.makedirs(path)

    # path already ends with '/', so append the file name directly
    client.write(f'{path}out_of_stock.json', data=json.dumps(inp))
    logging.info('Saving ok')
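
A hypothetical call to save(); the records only need the shape the function touches (a 'date' key on the first element), and the namenode at 127.0.0.1:50070 must be reachable.

save([
    {'date': '2021-01-01', 'product_id': 1},
    {'date': '2021-01-01', 'product_id': 2},
])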
Example #9
from hdfs import InsecureClient  # assumed imports for a runnable snippet
from hdfs.util import HdfsError
from os.path import basename
import logging


class SavedModelUploader(object):
    """Upload a saved model to the Hadoop file system."""
    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)

        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        if self._client.content(path, strict=False):
            return True
        else:
            return False

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        msg = None
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)

        existed = self._exist(hdfs_model_path)
        if overwrite and existed:
            self._del(hdfs_model_path)
        elif not overwrite and existed:
            raise RuntimeError(
                "could not overwrite the model, already existed.")

        try:
            self._client.upload(self._base_path,
                                local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)

        self._logger.info("model upload done")
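
A hypothetical use of the uploader above; the namenode URL, user and paths are placeholders.

uploader = SavedModelUploader('http://namenode:9870', 'hdfs', base_path='/models')
uploader.upload('/tmp/my_saved_model', overwrite=True)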
Example #10
    def __init__(self, current_user):
        # The current_user object must be passed in, otherwise an error is raised
        self.login_name = current_user.login_name
        try:
            # Just test whether HDFS can be reached
            c = InsecureClient("http://master:50070", self.login_name)
            # By default, try to create a folder for this user
            c.makedirs("/lake/usr/" + self.login_name)
        except Exception as error:
            self.is_connection = False
        else:
            self.this_connection = c
Example #11
def handleHdfsUpload(file_path, proj_id, task_id):
    try:
        client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                user="******")
        hdfs_path = "/neurolearn/files/" + proj_id + "/results/" + task_id
        client.makedirs(hdfs_path)
        client.upload(hdfs_path, file_path)
        print('Uploaded Images to HDFS.')
    except Exception as e:
        print(e)
        hdfs_path = ''
    return hdfs_path
Example #12
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        列出根目录
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        写入文件
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        读取文件数据
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)

        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
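
A usage sketch for HdfsDb, following the examples in its docstrings; the '/data' directory is a placeholder.

hd = HdfsDb()
hd.mk_dir(dir_path='/data')
hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
for line in hd.read_file('test.json', dir_path='/data'):
    print(line)
print(hd.list_dir(dir_path='/data'))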
Example #13
def uploadHDFS(filename):
    aoi_file = filename
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    dt = str(aoi_file.split("@")[1].split("_")[0][:8])
    folder1 = dt
    if folder1 in fname1:
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file,
                      overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
    else:
        client.makedirs(hdfs_path + folder1)
        client.upload(hdfs_path + folder1 + "/" + aoi_file,
                      upload_path + aoi_file,
                      overwrite=True)
        shutil.move(upload_path + aoi_file, backup_path + aoi_file)
Example #14
def copy_table_to_hdfs(**kwargs):
    client = InsecureClient('http://127.0.0.1:50070/', user='******')

    logging.info("Creating dir /bronze on hadoop")
    client.makedirs('/bronze')

    _table_name = kwargs['table_name']
    pg_hook = PostgresHook.get_hook(POSTGRES_CONN_ID)

    with client.write(f'/bronze/{_table_name}.csv') as csv_file:
        logging.info("Exporting table to csv file '%s'", csv_file.name)
        pg_hook.copy_expert(
            f"COPY (SELECT * FROM {_table_name}) TO STDOUT WITH HEADER CSV",
            filename=csv_file)
Example #15
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert type(data_path) == str
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'

        with open(self.data_path) as data_file:
            data = json.load(data_file)
            print("Data: ", data)
            self.hdfs_client = InsecureClient(
                url='http://' + data['namenode_url'] + ':' + str(data['port']),
                user=data['user'],
                root=data['root_path'])
            print("hdfs client: ", self.hdfs_client)
            self.img_dir = data['img_dir']
            print("img dir: ", self.img_dir)

        if self.img_dir[-1] != '/':
            self.img_dir += '/'

        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            if len(list_rslt) > 0:
                for name in list_rslt:
                    file_path = self.img_dir + name
                    self.hdfs_client.delete(file_path)

        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)

        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
Example #16
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
        print(len(buf))
        return buf
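
A hypothetical walk-through of HDFSService; note that upload() and download() prepend base_path ('/users/root') to the path they receive, while mkdir() and list() take the full path.

svc = HDFSService()
svc.mkdir('/users/root/demo')
svc.upload('/demo/hello.txt', data='hello')
print(svc.list('/users/root/demo'))
print(svc.download('/demo/hello.txt'))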
Example #17
def uploadHDFS():
    client = InsecureClient('http://10.41.158.65:50070', user='******')
    fname1 = client.list(hdfs_path)
    if rec_dat not in fname1:
        client.makedirs(hdfs_path + "/" + rec_dat)
    src = temp_path + '*'
    backup_path = '/bfdata/buffer/total_pre_arch/'
    dsc = hdfs_path + rec_dat + '/'
    print('hdfs dfs -copyFromLocal ', src, dsc)
    os.system(
        '/home/hadoop/wistron-hadoop/hadoop-2.7.1/bin/hdfs dfs -copyFromLocal '
        + src + ' ' + dsc)
    print('mv -f ', temp_path + "*", backup_path)
    os.system('mv -f ' + temp_path + "* " + backup_path)
    #os.system('/usr/bin/find ' + temp_path + ' -name *.JPG -exec mv {} ' + backup_path + ' \;')
    end_time = time.time()
    com_dat = datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')
Example #18
def handle_uploaded_file(f, data_id, proj_id):
    file_name = str(f.name)
    with open(file_name, 'wb+') as destination:
        for chunk in f.chunks():
            destination.write(chunk)
    data_content = pd.read_csv(file_name, encoding='utf-8')
    data_json = data_content.to_json()

    try:
        client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                user="******")
        hdfs_path = "/neurolearn/files/" + proj_id + "/datasets/" + data_id
        client.makedirs(hdfs_path)
        client.upload(hdfs_path, file_name)
    except Exception:
        hdfs_path = ''

    return data_json, hdfs_path
Example #19
    def get(self):

        # Retrieve the dataset for evaluation
        df = get_data_cassandra()

        print(df.head())
        X = df['total_estimated_load'].values

        # evaluate parameters (p,d,q)  <=> (AR, I, MA)
        p_values = 7
        d_values = 0
        q_values = 5
        #best_cfg, best_score = evaluate_models(X, p_values, d_values, q_values)
        best_cfg = (p_values, d_values, q_values)

        # Train the best model
        model = ARIMA(X, order=best_cfg)
        model_fit = model.fit()

        # save model
        if not os.path.exists(model_local_path):
            # Create the local export folder since it does not exist
            os.makedirs(model_local_path, exist_ok=False)

        model_fit.save(model_local_path + model_name)

        # Connect to the HDFS client
        client = InsecureClient(url='http://namenode:9870', user='******')

        # Create the storage folder for the processed files
        if client.status(model_hdfs_remote_path, strict=False) is None:
            client.makedirs(model_hdfs_remote_path)

        # Copy the model to HDFS
        remote_load_path = client.upload(model_hdfs_remote_path,
                                         model_local_path + model_name,
                                         overwrite=True)
        #print(remote_load_path)

        print(client.list(model_hdfs_remote_path))

        return {'best_cfg': best_cfg, 'status': 'Terminated'}
Example #20
def increment_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')
        query = "COPY (SELECT * FROM " + tableName + " where LastModifiedDate>(select run_time from control_table where table_name='" + tableName + "')) TO '/tmp/" + tableName + "_CDC" + ts + ".csv'"
        cur.execute(query)
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_CDC" + ts + ".csv",
                "Gp/" + tableName + "_CDC" + ts + ".csv")
        ftp.close()
        #Connect To hadoop
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.makedirs("/user/root/greenplum/source/" + tableName + "__ct",
                        "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "__ct/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_CDC" +
            ts + ".csv")
Example #21
class WriteWebHDFS(BatchingBolt):

    # Batch up the tuples every 5 seconds
    ticks_between_batches = 5

    def initialize(self, conf, ctx):
        self.conf = conf
        self.ctx = ctx

        # Keep track of how many tuples we have seen
        self.lines = 0

        # Open a connection via webHDFS to hadoop cluster
        # Will need to replace IP address with address of hadoop cluster
        # Also may need to update local /etc/hosts if that remote
        # address returns a hostname that does not resolve to the same
        # address in this file
        self.client = InsecureClient('http://52.91.211.34:50070/', user='******')

        # Create an HDFS directory name based on bolt startup time
        n = datetime.now()
        self.dirname = 'scanrun-' + n.strftime("%Y%m%d%H%M%S")
        self.client.makedirs(self.dirname)

    def process_batch(self, key, tups):

        # Track number of lines and use it in filename to write
        self.lines += len(tups)
        filename = self.dirname + '/'
        filename = filename + str(self.ctx['taskid']).zfill(2)
        filename = filename + str(self.lines).zfill(10)
        filename = filename + '.txt'

        # Write to HDFS
        with self.client.write(filename, encoding='utf-8') as writer:
            for tup in tups:
                writer.write(tup.values[0] + '\n')

        self.log(self.lines)
        self.log(filename)
Example #22
def full_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')
        query = "COPY (SELECT * FROM " + tableName + ") TO '/tmp/" + tableName + "_FL" + ts + ".csv'"
        cur.execute(query)
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_FL" + ts + ".csv",
                "Gp/" + tableName + "_FL" + ts + ".csv")
        ftp.close()
        #Connect To hadoop
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.delete("/user/root/greenplum/source/" + tableName, True)
        client.makedirs("/user/root/greenplum/source/" + tableName, "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_FL" + ts +
            ".csv")
        sql = "INSERT INTO control_table(table_name) VALUES(%s);"
        cur.execute(sql, (tableName, ))
        connection.commit()
Example #23
from hdfs import InsecureClient
import os
client = InsecureClient("http://localhost:9870", user='******')
client.delete("streamInput/area", True)
client.makedirs("streamInput/area")
# os.removedirs('file')
Example #24
def start_service():

    # Download path of the data file
    file_path = "/home/formation/Downloads/hrl_load_estimated.csv"

    # Connect to the HDFS client (the URL and credentials are masked)
    client = InsecureClient(url='http://*****:*****')

    # ... (webdriver setup and the per-year loop are not shown) ...

            # Wait for the start-date input field to be present
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//input[@default-value='defaultStartDate']")))

            #print(driver.page_source)

            # Enter the start and end dates
            elem = driver.find_element_by_xpath(
                "//input[@default-value='defaultStartDate']")
            elem.clear()
            elem.send_keys("01/01/" + str(year))
            elem = driver.find_element_by_xpath(
                "//input[@default-value='defaultEndDate']")
            elem.clear()
            elem.send_keys("12/31/" + str(year))

            # Wait for the page to reload
            time.sleep(5)

            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//button[text()='Submit']")))

            # Submit the form to reload the data for the requested period
            elem = driver.find_element_by_xpath("//button[text()='Submit']")
            elem.click()

            # Wait for the export button to appear
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "dm-download")))
            elem = driver.find_element_by_class_name("dm-download")
            elem.click()

            # Wait for the file download to complete
            while not os.path.exists(file_path):
                time.sleep(1)

            if os.path.isfile(file_path):
                print("Fichier téléchargé pour l'année {}".format(year))

                # Rename the file
                new_file_name = file_path.replace(".csv",
                                                  "_" + str(year) + ".csv")
                os.rename(file_path, new_file_name)

                # Upload the local CSV file to HDFS
                try:
                    remote_load_path = client.upload('/user/root/data/pjm',
                                                     new_file_name,
                                                     overwrite=True)
                    # print(remote_load_path)
                except Exception:
                    print("error")

                print(client.list('/user/root/data/pjm'))

            else:
                raise ValueError("%s isn't a file!" % file_path)

        finally:
            #driver.quit()
            print("fin du traitement du fichier")

        #assert "No results found." not in driver.page_source
        driver.close()

        time.sleep(10)
Example #25
# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
client_hdfs.content('hdfs_path')

# ==== Remove a directory or File in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Create a Directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Upload File into HDFS ====
client_hdfs.upload('hdfs_path',
                   'local_path',
                   n_threads=1,
                   temp_dir=None,
                   chunk_size=65536,
                   progress=None,
                   cleanup=True,
                   overwrite=True)

# Source : https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
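
The cheat sheet above assumes an already-created client_hdfs; a minimal setup sketch (the namenode URL and user are placeholders):

from hdfs import InsecureClient

client_hdfs = InsecureClient('http://namenode:9870', user='hdfs')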
Example #26
class MasterCrawler:
    def __init__(self, url_ckan, redis_ip, redis_port):
        self.ckan = url_ckan
        self.r = redis.StrictRedis(host=redis_ip, port=redis_port, db=0)
        self.client = InsecureClient('http://cdh1:50070/',
                                     'admin',
                                     root='/user/admin/open_data')

    def formatUrl(self, url):
        urlSplit = url.rsplit('/', 1)
        urlEnd = urllib.quote(urlSplit[1])
        urlStart = urlSplit[0]
        finalUrl = urlStart + "/" + urlEnd
        return finalUrl

    def initializeRedis(self):
        content = self.client.content('dati_gov/dati_gov.json', strict=False)
        if not content:
            with self.client.write('dati_gov/dati_gov.json',
                                   encoding='utf-8') as writer:
                writer.write('')
        request = urllib2.Request(URL_DATI_GOV + "/api/3/action/package_list")
        response = urllib2.urlopen(request)
        assert response.code == 200
        response_dict = json.loads(response.read())
        # Check the contents of the response.
        assert response_dict['success'] is True
        result = response_dict['result']
        test_res = result  #[:2000]
        for res in test_res:
            print res
            self.r.rpush("dataset_id", res)

    def consumeData(self):
        red = self.r
        while (red.llen("dataset_id") != 0):
            dataset_id = red.lpop("dataset_id")
            encRes = urllib.urlencode(
                {"id": unicode(dataset_id).encode('utf-8')})
            request_info = urllib2.Request(URL_DATI_GOV +
                                           "/api/3/action/package_show?" +
                                           encRes)
            #request_info.add_header("Authorization", "Basic %s" % base64string)
            try:
                response_info = urllib2.urlopen(request_info)
                info_dataset = json.loads(response_info.read())
                results = info_dataset['result']
                info = results
                #print json.dumps(info)
                if 'resources' in info:
                    #print info
                    info["m_status_resources"] = "ok"
                    resources = info['resources']
                    name = info['name']
                    idInfo = info['id']
                    for resource in resources:
                        rUrl = resource['url']
                        rFormat = resource['format']
                        rName = resource['name']
                        rId = resource['id']
                        finalUrl = self.formatUrl(rUrl)
                        print finalUrl
                        rInfo = urllib2.Request(finalUrl)
                        try:
                            rReq = urllib2.urlopen(rInfo)
                            if rReq.code == 200:
                                resource["m_status"] = "ok"
                                if "csv" in rFormat.lower():
                                    print "qui passo"
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/csv/" + dataset_id
                                    existDir = self.client.content(
                                        data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".csv"
                                    #with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(
                                            file_path) as writer:
                                        writer.write(data)
                                if "json" in rFormat.lower():
                                    data = rReq.read()
                                    data_dir = "dati_gov/open_api/json/" + dataset_id
                                    existDir = self.client.content(
                                        data_dir, strict=False)
                                    if not existDir:
                                        self.client.makedirs(data_dir)
                                    file_path = data_dir + "/" + rId + ".json"
                                    #    with self.client.write(file_path, encoding='utf-8') as writer:
                                    with self.client.write(
                                            file_path) as writer:
                                        writer.write(data)
                            else:
                                resource["m_status"] = "ko"
                        except Exception, e:
                            resource["m_status"] = "ko"
                            print str(e)
                else:
                    print info
                    info["m_status_resources"] = "ko"
                    print "NO RESOURCES"
                with self.client.write('dati_gov/dati_gov.json',
                                       encoding='utf-8',
                                       append=True) as writer:
                    writer.write(json.dumps(info) + '\n')
            except Exception, e:
                print str(e)
                red.lpush("dataset_error", dataset_id)
Example #27
def kafka_hdfs(opticons=None, hdfshost='', broker='', group='', topics=''):

    hdfshost = argv[0]
    broker = argv[1]
    group = argv[2]
    topics = argv[3:]
    # Consumer configuration
    # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
    conf = {
        'bootstrap.servers': broker,
        'group.id': group,
        'session.timeout.ms': 6000,
        'default.topic.config': {
            'auto.offset.reset': 'smallest'
        }
    }

    # Check to see if -T option exists
    for opt in optlist:
        if opt[0] != '-T':
            continue
        try:
            intval = int(opt[1])
        except ValueError:
            sys.stderr.write("Invalid option value for -T: %s\n" % opt[1])
            sys.exit(1)

        if intval <= 0:
            sys.stderr.write(
                "-T option value needs to be larger than zero: %s\n" % opt[1])
            sys.exit(1)

        conf['stats_cb'] = stats_cb
        conf['statistics.interval.ms'] = int(opt[1])

    # Create logger for consumer (logs will be emitted when poll() is called)
    logger = logging.getLogger('consumer')
    logger.setLevel(logging.DEBUG)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
    logger.addHandler(handler)

    # Create Consumer instance
    # Hint: try debug='fetch' to generate some log messages
    c = Consumer(conf, logger=logger)

    def print_assignment(consumer, partitions):
        print('Assignment:', partitions)

    # Subscribe to topics
    c.subscribe(topics, on_assign=print_assignment)
    # hdfs login
    #client = hdfs.Client('http://%s:50070' % (hdfshost))
    client = InsecureClient('http://%s:50070' % (hdfshost), user='******')
    client.makedirs('/kafka')
    # Read messages from Kafka, print to stdout
    try:
        while True:
            logtime = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
            getper10 = logtime[15:]

            if getper10 == '0:00':
                hive_load(10, logtime)

            msg = c.poll(timeout=1.0)

            if msg is not None:
                # continue

                if msg.error():
                    # Error or event
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        # End of partition event
                        sys.stderr.write(
                            '%s %s [%d] reached end at offset %d\n' %
                            (logtime, msg.topic(), msg.partition(),
                             msg.offset()))
                    elif msg.error():
                        # Error
                        raise KafkaException(msg.error())
                else:
                    sys.stderr.write('%s %s [%d] at offset %d with key %s:\n' %
                                     (logtime, msg.topic(), msg.partition(),
                                      msg.offset(), str(msg.value())))
                    msgstr = msg.value().decode('utf-8')
                    #msgstr = msg.value()
                    msgdict = json.loads(msgstr, object_pairs_hook=OrderedDict)
                    # json.loads does not preserve the original key order; object_pairs_hook keeps it
                    database = msgdict.get('database').encode()
                    table = msgdict.get('table').encode()
                    type = msgdict.get('type').encode()
                    hdfsfile = '%s.%s.%s' % (database, table, type)
                    data = msgdict.get('data')
                    if type == 'insert':
                        datalist = data.values()
                        datastr = ','.join('%s' % id
                                           for id in datalist).encode()
                        try:
                            with client.write('/kafka/%s' % (hdfsfile),
                                              append=True,
                                              encoding='utf-8') as writer:
                                writer.write(datastr + '\n')
                                # json.dump(data, writer)
                        except Exception as e:
                            with client.write('/kafka/%s' %
                                              (hdfsfile)) as writer:
                                writer.write('')
                    elif type == 'update':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    elif type == 'delete':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    else:
                        print(type)

            else:
Example #28
                            with client.write('/kafka/%s' %
                                              (hdfsfile)) as writer:
                                writer.write('')
                    elif type == 'update':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    elif type == 'delete':
                        with open(hdfsfile, 'a') as writer:
                            json.dump(data, writer)
                    else:
                        print(type)

            else:

                continue
    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')

    # Close down consumer to commit final offsets.
    c.close()


if __name__ == '__main__':
    optlist, argv = getopt.getopt(sys.argv[1:], 'T:')
    if len(argv) < 4:
        print_usage_and_exit(sys.argv[0])
    hdfshost = argv[0]
    client = InsecureClient('http://%s:50070' % (hdfshost), user='******')
    client.makedirs('/kafka')
    kafka_hdfs()
Example #29
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path") # 自动递归创建

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
Example #30
from hdfs import InsecureClient

hdfs_client = InsecureClient("http://master004.diablo.hadoop.nm.ted:50070/",
                             user="******")

# for fn in hdfs_client.list("/user/slave/websac/tiktok/2021-04-12"):
#     print(fn)
#     hdfs_client.delete("/user/slave/websac/tiktok/2021-04-12/" + fn)
hdfs_client.makedirs("/user/slave/websac/tiktok/2021-04-12")
Example #31
import pandas as pd
from hdfs import InsecureClient

#Cebd1160/Cebd1160!
# emr update dfs.namenode.http-address hadoop conf
client_hdfs = InsecureClient('http://ec2-34-204-70-68.compute-1.amazonaws.com:50070', 'hadoop')

# Listing all files in HDFS
fnames = client_hdfs.list('/')
print(fnames)

client_hdfs.makedirs('/test')

# with client_hdfs.write('/test/sample-file.txt') as writer:
#     writer.write('adding one line to a file called sample-file.txt')

# Creating a simple Pandas DataFrame
liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# Writing Dataframe to hdfs
with client_hdfs.write('/test/helloworld.csv', encoding='utf-8') as writer:
  df.to_csv(writer)
# #
# # ====== Reading files ======
# with client_hdfs.read('/test/helloworld.csv', encoding='utf-8') as reader:
#   df = pd.read_csv(reader, index_col=0)
#