def open(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        with hdfs.open(test_path, "w") as f:
            f.write(self.data)
        f.fs.close()
        with hdfs.open(test_path) as f:
            self.assertEqual(f.read(), self.data)
        f.fs.close()
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
    docs, doc = [], None
    for line in lines:
        if line.startswith('<doc'):
            doc = line
        elif line.startswith('</doc>'):
            docs.append(doc + line)
        else:
            # line = line.replace('&', '').replace('"', "'")
            doc += line.replace('"', "'")
    for doc in docs:
        dom = bs(doc).find('doc')
        doc = {}
        try:
            doc['id'] = dom.attrs['id']
            doc['url'] = dom.attrs['url']
            doc['title'] = dom.attrs['title']
        except AttributeError:  # malformed <doc> element
            continue
        doc['content'] = dom.text
        doc['md5'] = hashlib.md5(str(doc).encode('utf-8')).hexdigest()
        yield doc
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
def __init__(self, ctx):
    super(AvroReader, self).__init__(ctx)
    isplit = ctx.input_split
    self.region_start = isplit.offset
    self.region_end = isplit.offset + isplit.length
    self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                         DatumReader())
    self.reader.align_after(isplit.offset)
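# Sketch of a matching next() for the AvroReader above (an assumption based on
# pydoop's Avro examples: SeekableDataFileReader is presumed to expose the raw
# file as .reader and the records left in the current block as .block_count):
def next(self):
    pos = self.reader.reader.tell()
    if pos > self.region_end and self.reader.block_count == 0:
        raise StopIteration  # the next split's reader takes over from here
    return pos, next(self.reader)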
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
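# A close() usually pairs with a writer like the one above -- a sketch, not
# taken from the original; avro's DataFileWriter.close() also closes the
# wrapped HDFS handle:
def close(self):
    self.writer.close()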
def map(self, ctx):
    p = BioImgPlane(ctx.value)
    pixels = p.get_xy()
    bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
    fn = hdfs.path.join(self.out_dir, p.name, bn)
    with hdfs.open(fn, 'w') as fo:
        np.save(fo, pixels)
    ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
def put(self):
    src = hdfs.path.split(self.local_paths[0])[-1]
    dest = self.hdfs_paths[0]
    with open(src, "w") as f:
        f.write(self.data)
    hdfs.put(src, dest)
    with hdfs.open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
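# The next() that usually goes with this line-oriented Reader -- a sketch
# patterned on pydoop's wordcount example (the struct-packed byte offset used
# as the key is an assumption):
import struct

def next(self):
    if self.bytes_read > self.isplit.length:  # this split is exhausted
        raise StopIteration
    key = struct.pack(">q", self.isplit.offset + self.bytes_read)
    record = self.file.readline()
    if not record:  # EOF
        raise StopIteration
    self.bytes_read += len(record)
    return (key, record)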
def __init__(self, context): super(Writer, self).__init__(context) self.logger = LOGGER.getChild("Writer") jc = context.job_conf outfn = context.get_default_work_file() self.logger.info("writing to %s", outfn) hdfs_user = jc.get("pydoop.hdfs.user", None) self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t") self.file = hdfs.open(outfn, "wt", user=hdfs_user)
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
        for it in items:
            it = loads(it)
            it['md5'] = hashlib.md5(str(it).encode('utf-8')).hexdigest()
            yield it
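# Hypothetical use of the generator above (the directory name is illustrative):
for doc in json_from_hdfs('/datasets/json_docs'):
    print(doc['md5'])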
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = logging.getLogger("Writer")
    jc = context.getJobConf()
    jc_configure_int(self, jc, "mapred.task.partition", "part")
    jc_configure(self, jc, "mapred.work.output.dir", "outdir")
    jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
    jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
    self.outfn = "%s/part-%05d" % (self.outdir, self.part)
    self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "w", user=hdfs_user)
    self.sep = jc.get("mapred.textoutputformat.separator", "\t")
def _choose_break_points(cls, args):
    n_records, n_breakpoints, path = args
    block_size = n_records * RECORD_LENGTH
    with hdfs.open(path, 'r') as f:
        data = f.read(block_size)
    assert len(data) == block_size
    step = max(n_records // n_breakpoints, 1)
    keys = sorted([data[k:k + KEY_LENGTH]
                   for k in range(0, block_size, RECORD_LENGTH)])
    return [_ for _ in it.islice(keys, step, n_records, step)]
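# A self-contained illustration of the same sampling logic on synthetic
# records (the tiny RECORD_LENGTH/KEY_LENGTH values here are made up):
import itertools as it

RECORD_LENGTH, KEY_LENGTH = 10, 3
data = b"".join(b"%03d" % k + b"x" * 7 for k in (5, 1, 9, 3, 7))
keys = sorted(data[k:k + KEY_LENGTH] for k in range(0, len(data), RECORD_LENGTH))
step = max(len(keys) // 2, 1)  # two break points requested
print(list(it.islice(keys, step, len(keys), step)))  # [b'005', b'009']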
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    self.logger.debug("part: %d", part)
    self.logger.debug("outdir: %s", out_dir)
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "wb", user=hdfs_user)
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            # Submit data (my function)
            submitLine(topic, line, trials=3)
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')
def __init__(self, context):
    super(Reader, self).__init__()
    self.logger = logging.getLogger("Reader")
    self.isplit = pp.InputSplit(context.getInputSplit())
    for a in "filename", "offset", "length":
        self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
    self.file = hdfs.open(self.isplit.filename)
    self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
def __init__(self, context):
    super(Reader, self).__init__(context)
    self.logger = LOGGER.getChild("Reader")
    self.logger.debug('started')
    self.isplit = context.input_split
    for a in "filename", "offset", "length":
        self.logger.debug(
            "isplit.{} = {}".format(a, getattr(self.isplit, a))
        )
    remainder = self.isplit.offset % RECORD_LENGTH
    self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset + self.bytes_read)
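# Sketch (an assumption, not from the original): the fixed-length-record
# next() that would pair with the Reader above, splitting each record into a
# KEY_LENGTH-byte key and the remaining value bytes.
def next(self):
    if self.bytes_read > self.isplit.length:
        raise StopIteration
    record = self.file.read(RECORD_LENGTH)
    if not record:
        raise StopIteration
    self.bytes_read += len(record)
    return record[:KEY_LENGTH], record[KEY_LENGTH:]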
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return the
    result as a single string (it is the caller's responsibility to ensure
    that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
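# Hedged usage sketch for collect_output (the paths are illustrative):
text = collect_output("wc_output")               # whole result in memory
collect_output("wc_output", out_file="wc.txt")   # or streamed to a local file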
def read(readFlag):
    print(readFlag)
    if readFlag == True:
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        print(targetPath)
        # instantiate hadoop
        hdfs.hdfs()
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
def __init__(self, context):
    super(Reader, self).__init__(context)
    self.logger = LOGGER.getChild("Reader")
    self.logger.debug('started')
    self.isplit = context.input_split
    for a in "filename", "offset", "length":
        self.logger.debug(
            "isplit.{} = {}".format(a, getattr(self.isplit, a))
        )
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()
        self.bytes_read += len(discarded)
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            data += [line]
            if i % 5000 == 0:
                # Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
        # submit the rest of the data after the per-line loop
        submitChunk(topic, data, trials=3)
        data = []
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    # for fi in file_lists:
    for i in range(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                # line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")
        for doc in docs:
            dom = bs(doc).find('doc')
            doc = dom.attrs
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc).encode('utf-8')).hexdigest()
            yield doc
        sys.exit(1)
    else:
        return ratings


def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
        .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
        .values()
    return sqrt(predictionsAndRatings.map(
        lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))


for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if int(userid[0]) == int(n):
                    fi.write(data)
                    print(n)
            f.close()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " +
              "MovieLensALS.py movieLensDataDir")
        sys.exit(1)

    # set up environment
    conf = SparkConf() \
import os
import datetime

import forecastio as fo
import pandas as pd
import pydoop.hdfs as hd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df = pd.read_csv(f)
df.head()

api_key = "459009d8daa503cef1e11b190c961ce5"
# selecting the specific date
date = datetime.datetime(2015, 11, 1, 2, 0, 0)
for i in range(len(df)):
    col = ["cities", "time", "temperatureMin", "temperatureMax"]
    lat = df["latitude"].iloc[i]
    lng = df["longitude"].iloc[i]
    # accessing the forecast.io API
    forecast = fo.load_forecast(api_key, lat, lng, time=date)
    day = forecast.daily()
    # retrieving information for the current day
    Day = day.data[0]
    data = {"cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax}
    if i == 0:
        weather = pd.DataFrame(data, index=[0], columns=col)
    else:
        weather1 = pd.DataFrame(data, index=[0], columns=col)
        weather = pd.concat([weather, weather1], ignore_index=True)
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


# import pydoop.hdfs as hdfs
k = 5
# using Hadoop file system
# with hdfs.open('/Users/ming/centroids.txt') as fp:
weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0],
                                           sigmas[0].reshape((2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
import pydoop.hdfs as hdfs

b = hdfs.path.isdir("/data")
want_file = 'traffic.csv'
if b:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/" + want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")
    with open("i8predict_flow/" + want_file, "w") as myfile:  # text mode: we write a str
        myfile.write(str(lines))
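# The read half of the snippet above can also use pydoop's one-shot helper --
# a sketch reusing the same paths; hdfs.load() returns the file content
# (bytes in the default binary mode):
import pydoop.hdfs as hdfs

data = hdfs.load("hdfs://127.0.0.1:9000/data/traffic.csv")
with open("i8predict_flow/traffic.csv", "wb") as out:
    out.write(data)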
## Importing required packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, \
    precision_recall_curve, average_precision_score

## Loading the credit card dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

## Reducing the number of records of the original dataset in case we wish
## to work on a smaller subset of it
ReducedData = CreditCardData.iloc[:, :]

## Shape of the credit card dataset, i.e. the number of rows & columns present
print("\nShape of Credit Card Dataset (rows, columns): " + str(ReducedData.shape))

## Removing duplicate records (if any)
FinalData = ReducedData.drop_duplicates()
print("\nShape of Credit Card Dataset after removing duplicate records "
      "(rows, columns): " + str(FinalData.shape))

## Checking for missing values
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')
BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")
s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)
i = 0
import math

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()
with hdfs.open('/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if len(word) >= 4:
                if word[0] != '<':
                    vocab[word] += 1
        i = i + 1
        # print(i)
# print(counter)

# Convert words to indexes
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + KEY_LENGTH] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
HDFSfiles.append(hdFiles[41:])

fileNames = []
indexName = 'music'
typeName = 'songs'
# IdField = 'songID'
bulkData = []
i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
import os
import json

import pandas as pd
import pydoop.hdfs as hdfs
from pandasql import sqldf

login = ''
senha = ''
os.system('echo ' + senha + ' | kinit ' + login)

dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)
df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
        except:
            print("Reading the json file at " + hdfs.ls(pasta)[i] +
                  " was not successful")

df1 = df[['evtTime', 'reqUser', 'resource', 'access', 'reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access'] == 'SELECT']
# exclude service users
exclusao = pd.DataFrame(['HIVE', 'RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:, 0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')
spark_df = spark.createDataFrame(df1)
{"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}}, {"$sort": SON([("count", -1), ("_id", -1)])} ]) #use reportdate for the filename filename = startdate.strftime('%Y-%m-%d') print(filename) for result_obj in daily_totals['result']: data_dict = result_obj['_id'] date = (str(data_dict['year']) + "-" + str(data_dict['month']) + "-" + str(data_dict['day'])) tag = data_dict['tags'] source = data_dict['source'] count = result_obj['count'] countPositive = result_obj['countPositive'] countNegative = result_obj['countNegative'] countNeutral = result_obj['countNeutral'] data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n') print(data) hdfs_path = '/socialmedia/sentiment/' + filename hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \ '/socialmedia/sentiment' + filename + '.in' logger.info('HDFS file path: %s' % hdfs_path) logger.debug('Data: %s' % data) try: hdfs_file = hdfs.open(hdfs_path, mode='a') hdfs_file.write(data.encode('utf-8')) except IOError, e: logger.debug("IOError: " + e.message) logger.debug("Caught Exception. Will create a new file on hdfs.") hdfs_file = hdfs.open(hdfs_path, mode='w') hdfs_file.write(data.encode('utf-8'))
    colourImg = PIL.Image.open(imgFile)
    # imshow(np.asarray(colourImg))
    nparray = np.asarray(colourImg)
    image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)
    return image


brand = "logitech"
mode = "image"
os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile, names=[
        "image_center", "image_left", "image_right", "steering", "speed"
    ])
# next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []
for index, row in df.iterrows():
import pydoop.hdfs as hdfs
import logging

logging.basicConfig(level=logging.DEBUG)

# the test checks that every line is present in the output files:
# for each line, its match is looked up in the directory for that date
with open('file1.csv', 'r') as in_f:
    for it, in_line in enumerate(in_f):
        date = in_line.strip().split(',')[0]
        for part in [1, 2, 3]:
            with hdfs.open('/data/archive/' + date + '/part-0000' + str(part)) as out_f:
                matching = []
                for out_line in out_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        matching.append(True)
                        break
                    else:
                        matching.append(False)
            if any(matching):
                matching = True
                break
        if not matching:
            logging.debug("Error on line %s, %s", it, in_line)
def __missing__(self, path):
    f = hdfs.open(path, "wb")
    self[path] = f
    return f
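# Context sketch (an assumption, since only __missing__ is shown above):
# defining it on a dict subclass turns the mapping into a lazy cache of open
# HDFS handles, one per output path.
class HandleCache(dict):
    def __missing__(self, path):
        f = hdfs.open(path, "wb")
        self[path] = f
        return f

cache = HandleCache()
cache["/tmp/out/a.bin"].write(b"payload")  # opened on first access
for f in cache.values():
    f.close()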
import pydoop.hdfs as hdfs

# load the reference lines once: a file object is an iterator and would be
# exhausted after the first pass of the outer loop
with open('file1.csv', 'r') as in_f:
    in_lines = in_f.readlines()

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        for out_line in out_f:
            for in_line in in_lines:
                a = set(out_line.strip().split(','))
                if a == set(in_line.strip().split(',')):
                    print(True)
                else:
                    print(False)
                    print(a)
                    print(set(in_line.strip().split(',')))
def __init__(self, context): super(Reader, self).__init__() self.logger = logging.getLogger("Reader") #formatted logger obtained self.file = hdfs.open('HD-2004-2014-d.csv') self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import json

import pydoop.hdfs as hdfs

import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
context.setStatus("initializing") def map(self, context): k = context.getInputKey() tmp_data = csv.reader(f) words = context.getInputValue().split() for w in words: context.emit(w, "1") context.incrementCounter(self.inputWords, len(words)) def close(self): self.logger.info("all done") print "Prediction on HD 30 year data:" f = hdfs.open('/HD-1984-2014-d.csv') tmp_data = csv.reader(f) my_data = list() for item in tmp_data: tmp_item = list() for i in item: tmp_item.append(i) my_data.append(tmp_item) data = my_data[1:] X = list() training_indices = list() for i in xrange(int(len(data) * 0.9)): training_indices.append(i) test_indices = list()
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pandas as pd
import pydoop.hdfs as hd

# create a spark session
# sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
# df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')
with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array
dat = df.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
# dat = np.loadtxt('mydata.csv')
x, y = dat[:, 0], dat[:, 1]
heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap, extent=extent)
# plt.show()
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):
    # Instantiate a Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()

    for file_path in hdfs.ls(input_path):
        # Load the file's content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)

        # Parse the matrix into real-valued arrays,
        # which are then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 + output_attribute_index].astype('float64')

        # The model is trained by iterative refinement
        regressor.partial_fit(input_matrix, output_vector)

        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required for saving the model)
    session = SparkSession(context)

    # Load RDD data from the input path
    input_data = context.textFile(input_path)

    # Split each row into words
    input_data = input_data.map(lambda x: x.split(","))

    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")

    # Drop the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to create the columns
    # that linear regression's fit method requires)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
import boto3
import pydoop.hdfs as hdfs

bucket = 'enhance-it'
s3 = boto3.resource('s3')
file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/'
                 'part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
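# The same upload with the HDFS handle closed deterministically -- a sketch
# reusing the names from the snippet above:
with hdfs.open('hdfs://master:9000/data_for_db/currency=USD/'
               'part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json') as f:
    s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv',
                                 Body=f.read())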
#! /usr/bin/env python
import sys

from pydoop import hdfs

from DataPoint import DataPoint

# print("Start")
# read sys.argv[1] and sys.argv[2] and put them in lists
if len(sys.argv) < 3:
    print("Error: Insufficient Arguments")
    sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])
oldCentroids = []
newCentroids = []
for line in oldCentroidsFile:
    if line.find("\t") != -1:
        (key, value) = line.strip().split("\t")
        oldCentroid = DataPoint(value)
    else:
        oldCentroid = DataPoint(line.strip())
    oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
    (key, value) = line.strip().split("\t")
#! /usr/bin/env python
import sys

from pydoop import hdfs

import DataPoint

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
### Reading the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)
root = parsed.getroot()

prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print("root : ", root)
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if row_precipitacion != '':
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
        prob_precipitacion.append(
            str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if row_cielo != '':
import os

import pandas as pd
import pydoop.hdfs as hdfs
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)

df_y = df['label'] == 3
df_X = df[['x' + str(i) for i in range(1, 49)] +
          ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)