Example #1
def load_from_hdfs(data_package, file_name):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
	hdfs_str  = data_package.stream_hdfs_file_name
	hdfs_addr = hdfs_str[:hdfs_str.rfind('0/')+1]
	hdfs_path = hdfs_str[hdfs_str.rfind('0/')+2:]

	if log_type in ['time','all']: st = time.time()
	client = InsecureClient(hdfs_addr, user=getpass.getuser())


	with client.read('%s/%s' % (hdfs_path, file_name)) as reader:
		# Image.open needs a binary buffer, so wrap the raw bytes in BytesIO
		data = numpy.array(Image.open(BytesIO(reader.read())))

	print_purple("LOADED")

	return data
Example #2
def crop():
    # Check pictures folders
    if request.args.get('from') is None:
        return 'No "from" directory given.'

    if request.args.get('to') is None:
        return 'No "to" directory given.'

    directory_from = request.args.get('from')
    directory_to = request.args.get('to')

    dask_client = Client('192.168.1.4:8786')
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')

    with hdfs_client.read('/' + directory_from + 'data.csv') as reader:
        data = pd.read_csv(reader)
        data = dd.from_pandas(data, npartitions=24)

    data.map_partitions(compute_crop,
                        directory_from,
                        directory_to,
                        meta='dask.dataframe.core.Series').compute()

    create_csv(directory_to=directory_to)

    return "Crop finished"
Example #3
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # check existence on HDFS: status() returns None when the path is missing
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
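A minimal usage sketch for the wrapper above, assuming CONST.HDFS_URL, CONST.HDFS_USER and the module-level log are configured as in the snippet; the /tmp/demo path is made up for illustration:

hdfs = HdfsWrapper()
hdfs.connect_hdfs()
hdfs.mkdir_hdfs('/tmp/demo')                      # create the directory if it is missing
hdfs.write_hdfs('/tmp/demo/hello.txt', b'hello')  # returns the path on success
print(hdfs.read_hdfs('/tmp/demo/hello.txt'))      # b'hello'
print(hdfs.list_hdfs('/tmp/demo'))                # ['hello.txt']
hdfs.delete_hdfs('/tmp/demo', recursive=True)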
Example #4
def read_hdfs(filename, root_dir='data'):
    data_dir = os.path.join(root_dir, filename)
    client_hdfs = InsecureClient('http://' + os.environ['IP_HDFS'] + ':50070')

    with client_hdfs.read(data_dir, encoding='latin-1') as reader:
        df = pd.read_csv(reader, index_col=0)
    return df
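A quick usage sketch, assuming the IP_HDFS environment variable points at the namenode and that a file data/example.csv (a made-up name) exists on HDFS:

df = read_hdfs('example.csv', root_dir='data')
print(df.head())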
Example #5
def load_from_hdfs(data_package, hdfs_addr, hdfs_path):
#def load_from_hdfs(data_package, file_name='CThead_uchar.raw'):
	if log_type in ['time','all']: st = time.time()
	dp = data_package
	ds = dp.data_range
	ds_seq = [ds[elem][1]-ds[elem][0] for elem in ['z', 'y', 'x'] if elem in ds]


	while True:
		try:
			client = InsecureClient(hdfs_addr, user=getpass.getuser())
			
		
			file_python_dtype = Vivaldi_dtype_to_python_dtype(dp.file_dtype)
			file_bytes = get_bytes(file_python_dtype)
		
			#print "START TO CONNECT HDFS"
			bef = time.time()
			with client.read(hdfs_path, offset=(ds_seq[1]*ds_seq[2]*ds['z'][0]*file_bytes),length=ds_seq[0]*ds_seq[1]*ds_seq[2]*file_bytes) as reader:
				buf = reader.read()
			aft = time.time()
		
			diff = aft - bef
		
			print_bold( "DATA LOADING ENDS from %s -- time elapsed = %.03f (sec) , reading speed = %.03f MB/sec"%(socket.gethostname(), diff, len(buf) / diff * (1024 ** -2)))
			data = numpy.frombuffer(buf, dtype=file_python_dtype).reshape(ds_seq)

			break

		except:
			print(bcolors.WARNING + "Connection Broken" + bcolors.ENDC)
	

	return data
Example #6
def normalize():

	# Check pictures folders
	if request.args.get('from') is None:
		return 'No "from" directory given.'

	if request.args.get('to') is None:
		return 'No "to" directory given.'

	dask_client = Client('192.168.1.4:8786')
	hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')	

	from_directory = request.args.get('from')
	to_directory = request.args.get('to')

	with hdfs_client.read('/' + from_directory + 'data.csv') as reader:
		data = pd.read_csv(reader)
		data = dd.from_pandas(data, npartitions=24)

	data.map_partitions(compute_norm,
						from_directory,
						to_directory,
						meta='dask.dataframe.core.Series').compute()
	
	create_csv(directory_to=to_directory)

	return 'Normalization done.'
Example #7
class HDFSService(object):
    def __init__(self):
        self.hdfs = InsecureClient('http://127.0.0.1:9870', user='******')
        self.base_path = '/users/root'

    def mkdir(self, path):
        return self.hdfs.makedirs(path)

    def list(self, path):
        try:
            return self.hdfs.list(path)
        except HdfsError as e:
            print(e)
            return []

    def get(self, path):
        pass

    def upload(self, path, local_path=None, data=None):
        path = self.base_path + path
        if data is not None:
            return self.hdfs.write(path, data=data)
        elif local_path is not None:
            return self.hdfs.upload(path, local_path)
        return False

    def download(self, path):
        path = self.base_path + path
        with self.hdfs.read(path) as reader:
            print(path)
            buf = reader.read()
        print(len(buf))
        return buf
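A rough usage sketch of the service above; note that upload() and download() prepend base_path ('/users/root') while mkdir() and list() take the raw HDFS path. The paths below are illustrative only:

svc = HDFSService()
svc.mkdir('/users/root/reports')            # raw path, no base_path prefix
svc.upload('/reports/a.txt', data='hello')  # actually writes /users/root/reports/a.txt
print(svc.list('/users/root/reports'))      # ['a.txt']
print(svc.download('/reports/a.txt'))       # raw bytes of the file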
Example #8
def GetSentenceVectorsFromHDFS(file_path, hdfs_url, user):
    hdfs_client = InsecureClient(hdfs_url, user=user)
    with hdfs_client.read(file_path) as reader:
        df = pd.read_csv(reader)

        # convert the vector stored as a string back into a numpy array
        df['vector_'] = df['vector'].map(lambda x: np.asarray(
            x.replace("]", '').replace("[", '').split('  ')).astype(float))
        return df
Example #9
class Prediction_ML():
    def __init__(self, dir_algo, algo, path_img):
        logging.info('prediction_ML.init')
        self.directory_algo = dir_algo
        self.path_img = path_img
        self.algo = algo

        self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                          user='******')
        self.image = self.read_image(self.path_img, 240)

    def read_image(self, path_img, img_size=0):
        logging.info('prediction_ML.read_image')
        img = 0

        try:
            with self.hdfs_client.read(path_img) as reader:
                img = Image.open(reader)
            if img_size != 0:
                img = img.resize((img_size, img_size))
            img = img.convert('L').convert('RGB')
            img = np.asarray(img).flatten()

        except IOError as err:
            logging.error("Error reading image or path")
            logging.error(err)

        except Exception as err:
            logging.error("Unkownown error in read_image")
            logging.error(err)

        return img

    def run(self):
        try:
            self.hdfs_client.download(
                self.directory_algo + self.algo + ".model",
                self.algo + ".model")
            model = joblib.load(self.algo + ".model")
            os.remove(self.algo + ".model")
            label = model.predict([self.image])
            try:
                array_proba = model.predict_proba([self.image])[0]
                proba = array_proba[label[0]]
            except:
                proba = -1

            return label[0], proba

        except IOError as err:
            logging.error('Error model ' + str(self.algo) +
                          ' is not trained yet!')
            logging.error(
                'Train this model first before using it for predictions')
            return -1, 1
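A hedged usage sketch; the algorithm name and HDFS paths below are placeholders, and dir_algo must already contain a trained <algo>.model file:

predictor = Prediction_ML('/models/', 'svm', '/pictures/yes/sample.jpg')
label, proba = predictor.run()
print('predicted label: %s (probability %.2f)' % (label, proba))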
Example #10
def main():
    # Connecting to HDFS
    client = InsecureClient(hdfsServer, user='******')
    
    # Downloading the list of most popular words
    with client.read('/tmp/word_count_100k.csv', encoding='UTF-8') as csvfile: 
        w = csv.DictReader(csvfile)
        word_count_aux = list(w)[0]
    
    # Selecting the 2000 most popular words
    word_count_dict = {key:int(value) for key,value in word_count_aux.items()}
    word_count = collections.Counter(word_count_dict)
    top_words = [word for (word, _) in word_count.most_common(2000)]
    
    # Downloading the trained Logistic Regression model
    with client.read('/tmp/twitterML.model') as modelfile:
        logmodel = pickle.load(modelfile)
    
    # Starting Spark context and streaming
    sc = SparkContext(appName="StreamingKafkaTweetProcessor")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/checkpoint")
    
    # Configuring Spark Streaming with a Kafka Consumer using a JSON deserializer 
    kafkaStream = KafkaUtils.createStream(ssc, zookeeperServer, 
                    'spark-group', {'twitter':1}, 
                    valueDecoder=lambda m: json.loads(m.decode('UTF-8')))
    
    # Extracting the data field
    tweets = kafkaStream.map(lambda v: v[1])
    
    # Analysing the sentiment of each Tweet
    sentiment_tweets = tweets.map(lambda tweet: tweet_sentiment(tweet, logmodel, top_words))
    # Printing 10 Tweets with the sentiment each second
    sentiment_tweets.pprint(10)
    
    # Sending blocks of Tweets to the function responsible to send the to HBase
    sentiment_tweets.foreachRDD(lambda rdd: rdd.foreachPartition(sendToHbase))
    
    ssc.start()
    ssc.awaitTermination()
Example #11
def read_by_small():
    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    files_list = client.list(HDFS_DIR)
    images = []

    for fn in files_list:
        with client.read(hdfs_path=os.path.join(HDFS_DIR, fn)) as reader:
            img = reader.read()
            images.append(img)

    print(len(images))
Example #12
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        列出根目录
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        写入文件
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        读取文件数据
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)

        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
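Pulling the docstring examples together into one short sketch (the /data directory is assumed writable; file and directory names are illustrative):

hd = HdfsDb()
hd.mk_dir(dir_path='/data')
hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
for line in hd.read_file('test.json', dir_path='/data'):
    print(line)
hd.delete('test.json', dir_path='/data')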
Example #13
def download(keyword):
    client = InsecureClient("http://ip_address", user="******")
    root_dir = "/username/dps"
    for folder in client.list(root_dir):
        if keyword not in folder:
            continue
        os.makedirs(os.path.join("data", folder), exist_ok=True)
        for file in client.list(root_dir + "/" + folder):
            target_path = os.path.join("data", folder, file)
            logging.info("Downloading for {}".format(target_path))
            if os.path.exists(target_path):
                logging.warning("{} already exists!".format(target_path))
                continue
            with open(target_path, "wb") as writer, client.read("{}/{}/{}".format(root_dir, folder, file)) as reader:
                writer.write(reader.read())
Example #14
    def get_stopwords_from_hdfs(self):
        stopwords_set = set()

        print "loading stopwords..."
        try:
            client = InsecureClient(self.HDFS_ADDR, user='******')
            with client.read(self.STOPWORDS_PATH,
                             encoding="utf-8",
                             delimiter="\n") as reader:
                for _ in reader:
                    stopwords_set.add(_.encode('utf-8'))
                print "done!!"

        except NameError as n:
            print "hdfs取得數據(停用詞)失敗", n
        return stopwords_set
Example #15
def reducer():
    web_hdfs_interface = InsecureClient('http://localhost:9870', user='')
    with web_hdfs_interface.read(
            '/tic_tac_toe/ml-project/x_mapped.csv') as reader:
        to_reduce = open('x_mapped.csv', newline='\n')
    currentSquare = None
    currendResult = None
    reduced = {}
    print('REDUCING...')
    for line in to_reduce:
        #split the tuple
        square, result = line.strip().split('\t')
        #if dict key does not exist init
        if not reduced.get(square): reduced[square] = {'W': 0, 'L': 0}
        #if square change
        if currentSquare != square:
            #if not first loop print reduced
            if currentSquare is not None:
                print(
                    currentSquare, reduced.get(currentSquare), 'W/L Ratio',
                    float(reduced.get(currentSquare)['W']) /
                    float(reduced.get(currentSquare)['L']))
            #change cur square
            currentSquare = square
            #if result also changed
            if currendResult != result:
                #change result
                currendResult = result
                reduced.get(square)[result] += 1
            else:
                reduced.get(square)[result] += 1
        else:
            if currendResult != result:
                currendResult = result
                reduced.get(square)[result] += 1
            else:
                reduced.get(square)[result] += 1
    #print last
    print(
        currentSquare, reduced.get(currentSquare), 'W/L Ratio',
        float(reduced.get(currentSquare)['W']) /
        float(reduced.get(currentSquare)['L']))
Example #16
def GetMetaDataFromHDFS(mata_data_path, hdfs_url, user):
    vect = None
    names = set()

    def remove_last_comma(line):
        # drop the trailing comma (and newline) from a line
        return line[0:len(line) - 2]

    hdfs_client = InsecureClient(hdfs_url, user=user)

    with hdfs_client.read(mata_data_path) as reader:
        lines = reader.readlines()

        vect = np.asarray(remove_last_comma(lines[1]).split(',')).astype(float)
        for name in remove_last_comma(lines[3]).split(','):
            if (name == ''):
                continue
            names.add(int(name))

    return vect, names
Example #17
def search_img(src, key_value):
    data = []
    hdfs = InsecureClient(current_app.config['WEBHDFS_ADDR'],
                          user=current_app.config['WEBHDFS_USER'])
    with hdfs.read(src) as reader:
        data = reader.read()

    nparr = numpy.frombuffer(data, numpy.uint8)
    img1 = cv2.imdecode(nparr,
                        cv2.IMREAD_COLOR)  # cv2.IMREAD_COLOR in OpenCV 3.1

    orb = cv2.ORB_create()
    # find the keypoints and descriptors with SIFT
    kp1, desc = orb.detectAndCompute(img1, None)

    m = {}
    for key in key_value:
        m[key] = search_img_desc(orb, desc, key_value[key])

    return m
Example #18
    def get_vec_from_hdfs(self):
        word_vec_dict = dict()

        print "loading word vectors..."
        try:
            client = InsecureClient(self.HDFS_ADDR, user='******')
            # delimiter="\n" splits the stream into one record per line
            with client.read(self.VEC_PATH, encoding="utf-8",
                             delimiter="\n") as reader:
                for _ in reader:
                    _ = _.split(' ')
                    key = _[0].encode('utf-8')
                    value = np.array(_[1:len(_) - 1]).astype(float)
                    word_vec_dict[key] = value

            print "done!!"
        except NameError as n:
            print "hdfs取得數據(詞向量)失敗", n

        return word_vec_dict
Example #19
def mapper():
    print("MAPPER READS...")
    web_hdfs_interface = InsecureClient('http://localhost:9870', user='')
    with web_hdfs_interface.read(
            '/tic_tac_toe/ml-project/tic-tac-toe.data') as reader:
        data = pd.read_csv(reader)
    to_reduce = []
    for index, row in data.iterrows():
        #for every column except class
        for square in data.columns[:-1]:
            #instead of 1 or 0 we have W and L
            if row.loc[square] == 'x' and row.loc['Class'] == 'positive':
                print(square, 'W')
                to_reduce.append('%s\t%s' % (square, 'W'))
            elif row.loc[square] == 'x' and row.loc['Class'] == 'negative':
                print(square, 'L')
                to_reduce.append('%s\t%s' % (square, 'L'))
    to_reduce.sort()
    with open('x_mapped.csv', 'w') as csvfile:
        for line in to_reduce:
            csvfile.write(line + '\n')
Example #20
def read_by_bulk(index_fp):
    with open(index_fp) as f:
        file_index_list = f.readline().split(",")
        filename_list = f.readline().split(",")
        file_index_list[-1] = file_index_list[-1].replace("\n", "")
        file_index_list = [int(item) for item in file_index_list]

    client = InsecureClient(HDFS_URL, user=HDFS_USERNAME)
    with client.read(hdfs_path='./bulk_img.tiff') as reader:
        bulk = reader.read()

    images = []
    for i in range(len(file_index_list) - 1):
        img = bulk[file_index_list[i]:file_index_list[i + 1]]
        images.append(img)
    img = bulk[file_index_list[-1]:]
    images.append(img)

    print('Total {} images'.format(len(images)))

    return images
Example #21
class interHDFS:
    def __init__(self, url, user=None, **kwargs):
        self.url = url
        self.user = user
        for k, v in kwargs.items():
            setattr(self, k, v)
        self.connect = InsecureClient(self.url, self.user)
        try:
            self.connect.status('/')
        except Exception as e:
            print(f"[ERROR]:")
            raise ("connected failed!")

    @property
    def apiVersion(self):
        return "v1"

    def listDir(self, dirname: str = '/'):
        return self.connect.list(dirname)

    def getFiles(self, dirname: str, depth: int = 0) -> list:
        l = []
        if not dirname:
            print("dirname is null")
        else:
            for file in self.connect.walk(dirname, depth=depth):
                if file[-1]:
                    for f in file[-1]:
                        l.append(file[0] + '/' + f)
            return l

    def downloadToCsv(self, filename: str) -> None:
        '''splits lines on the '€€' sign and writes a .csv with the same base name under csvdir'''
        with self.connect.read(filename, encoding='utf-8') as reader:
            with open(csvdir + filename.split('/')[-1].split('.')[0] + '.csv',
                      'a+') as cf:
                for line in reader.readlines():
                    newline = line.replace('€€', ',')
                    cf.write(newline)
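A small usage sketch; the namenode URL, user and directory below are placeholders (downloadToCsv additionally relies on a module-level csvdir, so it is left out here):

ih = interHDFS('http://namenode:9870', user='hdfs')
print(ih.apiVersion)        # 'v1'
print(ih.listDir('/'))
for f in ih.getFiles('/data', depth=2):
    print(f)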
Example #22
def run_test(mode):
    client = InsecureClient('http://juneau:46731',
                            user='******')  # HDFS Web UI port!!
    with client.read("/pubg/aggregate/agg_match_stats_0.csv") as f:
        df = pd.read_csv(f, usecols=[1, 3, 4, 9, 12],
                         nrows=50000).replace(to_replace={
                             'tpp': 2,
                             'fpp': 1
                         },
                                              value=None)
    #df = pd.read_csv('agg_match_stats_0.csv', usecols=[1, 3, 4, 9, 12], nrows=50000).replace(to_replace={'tpp': 2, 'fpp': 1}, value=None)
    if mode == 1:
        X = df[df['match_mode'] == 1].drop(
            columns=['match_mode']).values.astype('double')
        T = df[df['match_mode'] == 1].iloc[:,
                                           4:].values.astype('double').reshape(
                                               -1, 1)
    if mode == 2:
        X = df[df['match_mode'] == 2].drop(
            columns=['match_mode']).values.astype('double')
        T = df[df['match_mode'] == 2].iloc[:,
                                           4:].values.astype('double').reshape(
                                               -1, 1)

    network = [5]
    relu = True
    model = nn.NN_distributed(X.shape[1], network, T.shape[1], relu)
    if mode == 1:
        model.load_state_dict(torch.load('Best network (FPP).pth'))
    if mode == 2:
        model.load_state_dict(torch.load('Best network (TPP).pth'))
    Y = model.use_pytorch(X)
    RMSE_model = np.sqrt(np.mean((Y - T)**2))
    print(f'Best Network Test RMSE: {RMSE_model}')
    for i in range(1000, 5000, 500):
        print(
            f'Sample Target {i}: {T[i][0]}, Predicted Value: {model.use_pytorch(X[i])[0]}'
        )
Example #23
class HDFS(BaseRepository):
    def __init__(self, host: str, port, user: str):
        super().__init__()
        self.host = host
        self.port = port
        self.user = user
        self.producer = None

    def connect(self):
        self.conn = InsecureClient(f"http://{self.host}:{self.port}",
                                   user=self.user)
        if os.environ.get("KAFKA_BOOTSTRAP", None):
            self.producer = KafkaProducer(bootstrap_servers=os.environ.get(
                "KAFAKA_BOOTSTRAP", "localhost:1234"))
        else:
            self.producer = None

    def disconnect(self):
        self.save_snapshot()
        if self.producer:
            self.producer.close()

    def insert_rows(self, rows: list[tuple[datetime, str, str, str, str, str]]):
        self.add_buff(rows)
        self.flush()

    def _last_datetime(self, category, date):
        if self.conn.status(f"/krwordcloud/add-article/{date}")['length'] == 0:
            return config.min_date
        tfname = ''
        with tempfile.NamedTemporaryFile("wb") as tf:
            tfname = tf.name
            with self.conn.read(f"/krwordcloud/add-article/{date}",
                                chunk_size=8096) as hf:
                for chunk in hf:
                    tf.write(chunk)
            tf.flush()  # flush buffered writes so the re-opened handle sees the full file
            with open(tfname, 'rb') as tf:
                reader = pyorc.Reader(tf)
                maximum = datetime.datetime \
                    .strptime(f"{date} GMT+0900", "%Y-%m-%d.orc GMT%z")
                for row in reader:
                    if row[0] > maximum and row[1] == category:
                        maximum = row[0]
                if (maximum < config.min_date):
                    return config.min_date
                elif maximum > datetime.datetime.now().replace(tzinfo=KST):
                    return datetime.datetime.now().replace(tzinfo=KST)
                else:
                    return maximum
        os.unlink(tfname)

    def make_entries(self):
        entries = dict()
        hdfs_entries = dict()
        lookup_hdfs = []

        self.load_snapshot()

        for category in config.categories:
            category_rows = list(
                filter(lambda row: row[1] == category, self.buff))
            if len(category_rows) > 0:
                last = max(category_rows, key=lambda row: row[0])
                entries[category] = last[0]
            else:
                lookup_hdfs.append(category)

        try:
            dates = self.conn.list("/krwordcloud/add-article/")
            if len(dates) > 0:
                for category in lookup_hdfs:
                    found = False
                    for last in reversed(dates):
                        try:
                            entries[category] = self._last_datetime(
                                category, last)
                            found = True
                            break
                        except Exception as e:
                            print(e)
                            continue
                    if found is False:
                        entries[category] = config.min_date
            else:
                hdfs_entries = dict.fromkeys(lookup_hdfs, config.min_date)
        except HdfsError:
            entries[category] = config.min_date
        except Exception as e:
            print(e)
        return {
            k: v
            for k, v in sorted({
                **entries,
                **hdfs_entries
            }.items(),
                               key=lambda item: item[1])
        }

    def save_snapshot(self):
        print('save_snapshot')
        with self.conn.write("/krwordcloud/snapshot.json",
                             overwrite=True,
                             encoding="utf-8") as f:
            data = list(
                map(lambda x: (x[0].isoformat(), x[1], x[2], x[3], x[4], x[5]),
                    self.buff))
            json.dump(data, f, ensure_ascii=False)

    def load_snapshot(self):
        print('load_snapshot')
        try:
            with self.conn.read("/krwordcloud/snapshot.json",
                                encoding="utf-8") as f:
                self.buff = list(
                    map(
                        lambda x:
                        (parser.parse(x[0]), x[1], x[2], x[3], x[4], x[5]),
                        json.load(f)))
        except Exception:
            self.buff = []

    def flush(self):
        dates = sorted(list(set(map(lambda row: row[0].date(), self.buff))))
        if len(dates) > 1:
            for d in dates[:-1]:
                data = list(filter(lambda row: row[0].date() == d, self.buff))
                if self.producer:
                    self._kafka_flush(d, data)
                else:
                    self._hdfs_flush(d, data)
            self.buff = list(
                filter(lambda row: row[0].date() == dates[-1], self.buff))
            self.save_snapshot()

    def _kafka_flush(self, date, data):
        self.producer.send(f"add-article-{date}", data)

    def _hdfs_flush(self, date, data):
        with self.conn.write(f"/krwordcloud/add-article/{date}.orc",
                             overwrite=True) as hf:
            tfname = ''
            with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as tf:
                tfname = tf.name
                with pyorc.Writer(
                        tf,
                        schema="struct<field0:timestamp,field1:string," +
                        "field2:string,field3:string>",
                ) as of:
                    of.writerows(data)
            with open(tfname, 'rb') as tf:
                for line in tf:
                    hf.write(line)
            os.unlink(tfname)
Example #24
TOPIC = os.environ["TOPIC"]
HDFS_NAMENODE = os.environ["HDFS_HOSTNAME"]

client_hdfs = InsecureClient(HDFS_NAMENODE)

while True:
    try:
        producer = KafkaProducer(bootstrap_servers=KAFKA_BROKER.split(","))
        print("Producer: Connected to Kafka!")
        break
    except kafka.errors.NoBrokersAvailable as e:
        print(e)
        time.sleep(3)

file_name = "/produce/file/streaming.csv"
with client_hdfs.read(file_name, encoding='utf-8') as reader:
    df = pd.read_csv(reader, low_memory=False)
    df.info(verbose=False)
    df = df[df['PtID'].notna()]
    df['PtID'] = df['PtID'].astype(int)
    df['Year'] = df['Year'].astype(int)
    df['Month'] = df['Month'].astype(int)
    df['Day'] = df['Day'].astype(int)
    print(df.head(10))
    df = df.sort_values(by=['Date', 'Time'])
    for i, row in df.iterrows():
        print(row.to_json())
        producer.send(TOPIC,
                      key=bytes(str(row['RecID']), 'utf-8'),
                      value=bytes(row.to_json(), 'utf-8'))
        if i % 132 == 0:
Example #25
from hdfs import InsecureClient

# log in hdfs server
client = InsecureClient('http://master32:50070', user='******')

# list everything under the HDFS root directory
print(client.list('/'))

path = '/test/aaa.txt'

# Check if the file exists and remove it first
if client.content(path, strict=False) is not None:
    client.delete(path)

print "START TO WRITE FILE"

# write a text file from hdfs
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")

print "DONE"

print "START TO READ FILE"

# read a text file from hdfs
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)

print("DONE")
Example #26
        missing_data = 0  #counter to see how many records in the json file are with empty data.

        #another method to get data from hdfs. Can be used for GPU processing
        #       a=datetime.now()
        #       date=str(a.year)+str(a.month).zfill(2)+str(a.day).zfill(2)
        #       (ret, out, err)= run_cmd(['hadoop', 'fs', '-get', '/data/atl_sprint_2018/lexis_archive/lexis_%sT0000.json'
        #                           %(date), './SocialMediaSprint/'])
        #       if ret == 0:  #if return code is 0, file exists.
        #           with open('lexis_%sT0000.json' %(date), 'r') as file1:
        #       #Today’s file not found - If number of days between current date and last day’s file is > 0,
        #       #then today’s file is not found. Runs after 2 hours.

        if delta.days == 0:  #delta days = 0 ==> the last file was created today. Yay, we have data!
            #read the file from hdfs
            with client.read('/data/atl_sprint_2018/lexis_archive/' + fjson,
                             encoding='utf-8',
                             delimiter='\n') as file1:
                for line in file1:  #each line is a json object (dictionary)
                    try:
                        news_article = json.loads(line)
                        if news_article['Text'] != 'None' and len(
                                news_article['Text'].split(' ')
                        ) >= 100:  #keep articles with at least 100 words
                            all_articles.append(
                                news_article
                            )  #append individual news articles to the data list.
                        else:
                            missing_data += 1
                    except:
                        continue
            #If there is data, processes the data and writes data into sentiment and topic files.
Example #27
from hdfs import InsecureClient
from joblib import load
from io import BytesIO
client_hdfs = InsecureClient('http://localhost:9870', user='******')
path = '/home/hadoop/hdfs/test/xyBernoulliNB()-1.json'

with client_hdfs.read(path) as reader:
    model = load(BytesIO(reader.read()))
print(model)
Example #28
# Task 1
from hdfs import InsecureClient
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# StringIO lets a string be read and written like a file object
from io import StringIO

client = InsecureClient("http://192.168.56.100:50070", user="******")
# check the connection
# print(client)

# read the data
with client.read("output/dept_delay_count/part-r-00000",
                 encoding="utf-8") as reader:
    data = reader.read()

# print(data)
# data is currently a plain string, not a file
# data -> str -> stream

stream = StringIO(data)
# print(stream)
df = pd.read_csv(stream, sep="\t", header=None)
# print(df)
# split the year and month
# print(df[0].str.split(","))
df['year'] = df[0].str.split(",").str[0]
df['month'] = df[0].str.split(",").str[1]
# print(df)
Example #29
# Install hdfs package via pip

import pandas as pd
from hdfs import InsecureClient
import os

# ===== Connect to HDFS =====
#client_hdfs = InsecureClient('hdfs_address:web_port')
client_hdfs = InsecureClient('http://hadoop01.org:50070')

# ===== Read File in HDFS =====
with client_hdfs.read('hdfs_path_file', encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)
print(df)

# ==== Creating a simple Pandas DataFrame =====
liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# ==== Writing Dataframe to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting Content Summary ====
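The snippet stops at the last heading; a plausible continuation uses the client's content() call, which returns the WebHDFS ContentSummary for a path:

summary = client_hdfs.content('/user/hdfs/wiki/helloworld.csv')
print(summary)  # dict with fields such as length, fileCount and spaceConsumed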
Example #30
class MachineLearning():

    # reads images and stores them
    def __init__(self, input_folder, model_folder, img_size=240):
        self.input_folder = input_folder
        self.model_folder = model_folder
        self.hdfs_client = InsecureClient('http://192.168.1.4:9870',
                                          user='******')
        self.imgs, self.labels = self.read_images(input_folder, 240)
        self.default = "svm"

    # reads images from a directory and resizes them
    # returns the list of images and list of labels
    def read_images(self, directory, img_size=0):
        list_img = []
        labels = []
        logging.info('read_images')

        try:
            for name in self.hdfs_client.list('/' + directory + 'yes'):
                if name == "Thumbs.db":
                    continue
                with self.hdfs_client.read('/' + directory + 'yes/' +
                                           name) as reader:
                    img = Image.open(reader)
                if img_size != 0:
                    img = img.resize((img_size, img_size))
                img = img.convert('L').convert('RGB')
                list_img.append(np.asarray(img).flatten())
                labels.append(1)

            for name in self.hdfs_client.list('/' + directory + 'no'):
                if name == "Thumbs.db":
                    continue
                with self.hdfs_client.read('/' + directory + 'no/' +
                                           name) as reader:
                    img = Image.open(reader)
                if img_size != 0:
                    img = img.resize((img_size, img_size))
                img = img.convert('L').convert('RGB')
                list_img.append(np.asarray(img).flatten())
                labels.append(0)

        except Exception as err:
            logging.error("Error in read_images")
            logging.error(err)
            list_img = []
            labels = []

        logging.info("Finished reading images")

        return list_img, labels

    # returns the untrained model for a given algorithm
    def get_model(self, algorithm, params):
        if (algorithm == "knn"):
            return KNeighborsClassifier(**params, n_jobs=-1)
        elif (algorithm == "svm"):
            return SVC(**params,
                       gamma='auto',
                       random_state=0,
                       probability=True)
        elif (algorithm == "gbc"):
            return GradientBoostingClassifier(**params)
        elif (algorithm == "rfc"):
            return RandomForestClassifier(**params, n_estimators=500)
        elif (algorithm == "nn"):
            return neural_network.MLPClassifier(**params)
        else:
            return self.get_model(self.default, params)

    # returns a set of the "best" parameters for a given algorithm
    def get_params(self, algorithm):
        if (algorithm == "knn"):
            return {'n_neighbors': 9}
        elif (algorithm == "svm"):
            return {'kernel': 'poly', 'C': 10**-4}
        elif (algorithm == "gbc"):
            return {'n_estimators': 10}
        elif (algorithm == "rfc"):
            return {
                'max_depth': 8,
                'max_features': "auto",
                'criterion': "gini"
            }
        elif (algorithm == "nn"):
            return {'hidden_layer_sizes': tuple([64 for _ in range(10)])}
        else:
            return self.get_params(self.default)

    # trains a model using the best parameters and returns the score
    def train(self, algorithm, imgs, labels):
        params = self.get_params(algorithm)
        model = self.get_model(algorithm, params)
        logging.info("Training %s with the following parameters:" %
                     (algorithm))
        logging.info(params)

        dask_client = Client(DASK_IP_ADRESS)
        img_train, img_test, lbl_train, lbl_test = train_test_split(
            self.imgs, self.labels, test_size=0.2)

        futures_img_train = dask_client.scatter(img_train)
        futures_img_test = dask_client.scatter(img_test)
        futures_lbl_train = dask_client.scatter(lbl_train)
        futures_lbl_test = dask_client.scatter(lbl_test)

        future_model_fit = dask_client.submit(model.fit, futures_img_train,
                                              futures_lbl_train)

        model = future_model_fit.result()

        future_score_train = dask_client.submit(model.score, futures_img_train,
                                                futures_lbl_train)
        future_score_test = dask_client.submit(model.score, futures_img_test,
                                               futures_lbl_test)

        score_test = future_score_test.result()
        score_train = future_score_train.result()

        logging.info("Training complete, saving model %s to file" %
                     (algorithm))

        # saving the model to file
        with self.hdfs_client.write('/' + str(self.model_folder) +
                                    str(algorithm) + ".model") as writer:
            joblib.dump(model, writer)

        logging.info("Score on training set: %.4f, score on test set: %.4f" %
                     (score_train, score_test))

        return score_train, score_test
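A hedged usage sketch; DASK_IP_ADRESS, the HDFS layout ('<input_folder>yes', '<input_folder>no') and the folder names below are assumptions taken from the class itself:

ml = MachineLearning('pictures/', 'models/')
score_train, score_test = ml.train('svm', ml.imgs, ml.labels)
print('train: %.4f, test: %.4f' % (score_train, score_test))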
Example #31
def log(message):
    print(message)


# API communication
def fetch(table, year, round):
    url = ERGAST_ENDPOINT.format(year, round, table)
    print(f'Fetching URL... {url}')
    response = urllib.request.urlopen(url).read()
    print(f'Received response of length {len(response)}')
    return json.loads(response)


print('Step 1')
try:
    print(f'Reading last fetched race from HDFS: {HDFS_LAST_FETCHED_FILE}')
    with client.read(HDFS_LAST_FETCHED_FILE) as reader:
        lines = reader.read().decode().split('\n')
    last_year = int(lines[0].rstrip())
    last_round = int(lines[1].rstrip())
    print(f'Last fetched race: {last_round}_{last_year}')
except Exception as e:
    log(f'Cannot read last fetched year & round: {e}')
    sys.exit()

print('Step 2')


def is_race_available(year, round):
    j = fetch('results', year, round)
    total = int(j['MRData']['total'])
    return total > 0
Example #32
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"
c = InsecureClient(hdfs_url, user=hdfs_user)

c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path") # 自动递归创建

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
Example #33
# 1. Connect to HDFS and read the result file
# 2. Convert the result into a DataFrame
# 3. Visualize it with Matplotlib
from hdfs import InsecureClient
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO  # reads a string like a file object

client = InsecureClient('http://192.168.56.100:50070', user='******')
# print(client)

# read the data
with client.read('output/arr_delay_count/part-r-00000',
                 encoding='utf-8') as reader:
    data = reader.read()

# print(data)
# data ->str -> stream
stream = StringIO(data)
df = pd.read_csv(stream, sep='\t', header=None)
# print(df)

# transform the data
# print(df[0].str.split(','))
df['year'] = df[0].str.split(',').str[0]
df['month'] = df[0].str.split(',').str[1]

# convert the year and month columns to int
df['year'] = df['year'].astype('int')
df['month'] = df['month'].astype('int')