Code example #1
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def open(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     with hdfs.open(test_path, "w") as f:
       f.write(self.data)
     f.fs.close()
     with hdfs.open(test_path) as f:
       self.assertEqual(f.read(), self.data)
     f.fs.close()
Code example #2
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                #line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")

        for doc in docs:
            dom = bs(doc).find('doc')
            doc = {}
            try:
                doc['id'] = dom.attrs['id']
                doc['url'] = dom.attrs['url']
                doc['title'] = dom.attrs['title']
            except AttributeError, e:
                continue
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc)).hexdigest()
            yield doc
Code example #3
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def dump(self):
   for test_path in self.hdfs_paths[0], self.local_paths[0]:
     hdfs.dump(self.data, test_path)
     with hdfs.open(test_path) as fi:
       rdata = fi.read()
     fi.fs.close()
     self.assertEqual(rdata, self.data)
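A minimal sketch of the same dump-and-read-back round trip using pydoop.hdfs's module-level helpers, outside a test class; the path is hypothetical, and hdfs.load / hdfs.rmr are used here as the read and cleanup counterparts of dump:

import pydoop.hdfs as hdfs

path = "/tmp/pydoop_dump_demo.txt"       # hypothetical path
hdfs.dump(b"hello from pydoop\n", path)  # write the whole payload in one call
print(hdfs.load(path))                   # read it back in one call
hdfs.rmr(path)                           # clean up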
Code example #4
File: avrolib.py Project: CynthiaYiqingHuang/pydoop
 def __init__(self, ctx):
     super(AvroReader, self).__init__(ctx)
     isplit = ctx.input_split
     self.region_start = isplit.offset
     self.region_end = isplit.offset + isplit.length
     self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                          DatumReader())
     self.reader.align_after(isplit.offset)
Code example #5
File: avrolib.py Project: CynthiaYiqingHuang/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
Code example #6
 def map(self, ctx):
     p = BioImgPlane(ctx.value)
     pixels = p.get_xy()
     bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
     fn = hdfs.path.join(self.out_dir, p.name, bn)
     with hdfs.open(fn, 'w') as fo:
         np.save(fo, pixels)
     ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Code example #7
File: test_hdfs.py Project: ZEMUSHKA/pydoop
 def put(self):
   src = hdfs.path.split(self.local_paths[0])[-1]
   dest = self.hdfs_paths[0]
   with open(src, "w") as f:
     f.write(self.data)
   hdfs.put(src, dest)
   with hdfs.open(dest) as fi:
     rdata = fi.read()
   self.assertEqual(rdata, self.data)
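The opposite direction, copying a file from HDFS back to the local filesystem, follows the same pattern; a minimal sketch assuming hdfs.get as the counterpart of hdfs.put (both paths are placeholders):

import pydoop.hdfs as hdfs

hdfs_src = "/user/someone/results.txt"   # placeholder HDFS path
local_dest = "results_local.txt"         # placeholder local path
hdfs.get(hdfs_src, local_dest)           # copy from HDFS to the local filesystem
with open(local_dest, "rb") as f:
    print(f.read())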
Code example #8
File: wordcount-rr.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Reader, self).__init__()
   self.isplit = pp.InputSplit(context.getInputSplit())
   self.file = hdfs.open(self.isplit.filename)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Code example #9
File: map_only_python_writer.py Project: crs4/pydoop
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     outfn = context.get_default_work_file()
     self.logger.info("writing to %s", outfn)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
     self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Code example #10
File: hdfs2mongo.py Project: legendlee1314/ooni
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
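Given the hdfs2mongo.py file name, a hedged sketch of how a generator like json_from_hdfs might feed documents into MongoDB with pymongo; the connection parameters, database and collection names, and the input directory are all assumptions:

from pymongo import MongoClient

client = MongoClient("localhost", 27017)         # assumed MongoDB endpoint
collection = client["ooni"]["reports"]           # hypothetical database/collection

for doc in json_from_hdfs("/datasets/reports"):  # hypothetical HDFS directory
    collection.insert_one(doc)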
Code example #11
File: wordcount-full.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   jc = context.getJobConf()
   jc_configure_int(self, jc, "mapred.task.partition", "part")
   jc_configure(self, jc, "mapred.work.output.dir", "outdir")
   jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
   jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Code example #12
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "w", user=hdfs_user)
     self.sep = jc.get("mapred.textoutputformat.separator", "\t")
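The Writer snippets here only show __init__; below is a hedged sketch of the emit and close methods that typically complete such a record writer, written against pydoop.mapreduce.api. The TextWriter class name is hypothetical, and "wt" mode is used so plain strings can be written:

import pydoop.hdfs as hdfs
import pydoop.mapreduce.api as api

class TextWriter(api.RecordWriter):
    def __init__(self, context):
        super(TextWriter, self).__init__(context)
        jc = context.job_conf
        part = jc.get_int("mapred.task.partition")
        out_dir = jc["mapred.work.output.dir"]
        self.sep = jc.get("mapred.textoutputformat.separator", "\t")
        self.file = hdfs.open("%s/part-%05d" % (out_dir, part), "wt")

    def emit(self, key, value):
        # one record per line: key<separator>value
        self.file.write("%s%s%s\n" % (key, self.sep, value))

    def close(self):
        # close both the file and the filesystem handle, as in the examples above
        self.file.close()
        self.file.fs.close()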
Code example #13
File: pterasort.py Project: elzaggo/pydoop
 def _choose_break_points(cls, args):
     n_records, n_breakpoints, path = args
     block_size = n_records * RECORD_LENGTH
     with hdfs.open(path, 'r') as f:
         data = f.read(block_size)
     assert len(data) == block_size
     step = max(n_records // n_breakpoints, 1)
     keys = sorted([data[k:k + KEY_LENGTH]
                    for k in range(0, block_size, RECORD_LENGTH)])
     return [_ for _ in it.islice(keys, step, n_records, step)]
Code example #14
File: ioformats.py Project: elzaggo/pydoop
 def __init__(self, context):
     super(Writer, self).__init__(context)
     self.logger = LOGGER.getChild("Writer")
     jc = context.job_conf
     part = jc.get_int("mapred.task.partition")
     out_dir = jc["mapred.work.output.dir"]
     self.logger.debug("part: %d", part)
     self.logger.debug("outdir: %s", out_dir)
     outfn = "%s/part-%05d" % (out_dir, part)
     hdfs_user = jc.get("pydoop.hdfs.user", None)
     self.file = hdfs.open(outfn, "wb", user=hdfs_user)
Code example #15
File: kafka-producer.py Project: bunop/ccc-capstone
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            
            #Submit data (my function)
            submitLine(topic, line, trials=3)
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
Code example #16
File: avrolib.py Project: wtj/pydoop
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     self.logger = LOGGER.getChild('AvroWriter')
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.logger.debug('created hdfs file %s', outfn)
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
     self.logger.debug('opened AvroWriter')
Code example #17
File: wordcount-full.py Project: ilveroluca/pydoop
 def __init__(self, context):
   super(Reader, self).__init__()
   self.logger = logging.getLogger("Reader")
   self.isplit = pp.InputSplit(context.getInputSplit())
   for a in "filename", "offset", "length":
     self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
   self.file = hdfs.open(self.isplit.filename)
   self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
   self.file.seek(self.isplit.offset)
   self.bytes_read = 0
   if self.isplit.offset > 0:
     discarded = self.file.readline()  # read by reader of previous split
     self.bytes_read += len(discarded)
Code example #18
File: ioformats.py Project: elzaggo/pydoop
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     remainder = self.isplit.offset % RECORD_LENGTH
     self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset + self.bytes_read)
Code example #19
File: features.py Project: manics/pydoop-features
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Code example #20
File: hadut.py Project: crs4/pydoop
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return
    the result as a single string (it is the caller's responsibility to
    ensure that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
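A minimal usage sketch for the helper above (pydoop.hadut is the module the snippet comes from; the directory and file names are placeholders):

import pydoop.hadut as hadut

mr_out_dir = "/user/someone/wc_output"           # placeholder MapReduce output dir
text = hadut.collect_output(mr_out_dir)          # small job: pull everything into memory
print(text.splitlines()[:5])
hadut.collect_output(mr_out_dir, out_file="wc_output.txt")  # large job: append to a local file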
Code example #21
def read(readFlag):
    print(readFlag)
    if (readFlag == True):
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        
        print(targetPath)
        
        # instantiate hadoop
        hdfs.hdfs()
        
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Code example #22
 def __init__(self, context):
     super(Reader, self).__init__(context)
     self.logger = LOGGER.getChild("Reader")
     self.logger.debug('started')
     self.isplit = context.input_split
     for a in "filename", "offset", "length":
         self.logger.debug(
             "isplit.{} = {}".format(a, getattr(self.isplit, a))
         )
     self.file = hdfs.open(self.isplit.filename)
     self.file.seek(self.isplit.offset)
     self.bytes_read = 0
     if self.isplit.offset > 0:
         discarded = self.file.readline()
         self.bytes_read += len(discarded)
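The Reader snippets stop at __init__; below is a hedged sketch of the next, get_progress and close methods that usually accompany this kind of line-oriented reader, modeled on pydoop's word count example rather than on any of the projects above (the LineReader class name is hypothetical):

import pydoop.hdfs as hdfs
import pydoop.mapreduce.api as api

class LineReader(api.RecordReader):
    def __init__(self, context):
        super(LineReader, self).__init__(context)
        self.isplit = context.input_split
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            # the first (partial) line belongs to the previous split
            self.bytes_read += len(self.file.readline())

    def next(self):
        if self.bytes_read > self.isplit.length:
            raise StopIteration
        record = self.file.readline()
        if not record:  # end of file
            raise StopIteration
        key = self.isplit.offset + self.bytes_read
        self.bytes_read += len(record)
        return key, record

    def get_progress(self):
        return min(self.bytes_read / float(self.isplit.length), 1.0)

    def close(self):
        self.file.close()
        self.file.fs.close()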
Code example #23
File: checkrecords.py Project: elzaggo/pydoop
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Code example #24
File: check.py Project: crs4/pydoop
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
Code example #25
File: kafka-producer.py Project: bunop/ccc-capstone
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        
        for i, line in enumerate(handle):
            #strip line
            line = line.strip()
            data += [line]
            
            if i % 5000 == 0:
                #Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" %(i, myfile["name"]))
                
        #for every line
        #submit the rest of the data
        submitChunk(topic, data, trials=3)
        data = []
Code example #26
File: hdfs2mongo.py Project: legendlee1314/ooni
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    #for fi in file_lists:
    for i in xrange(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
            docs, doc = [], None
            for line in lines:
                if line.startswith('<doc'):
                    doc = line
                elif line.startswith('</doc>'):
                    docs.append(doc + line)
                else:
                    #line = line.replace('&', '').replace('"', "'")
                    doc += line.replace('"', "'")

            for doc in docs:
                dom = bs(doc).find('doc')
                doc = dom.attrs
                doc['content'] = dom.text
                doc['md5'] = hashlib.md5(str(doc)).hexdigest()
                yield doc
Code example #27
        sys.exit(1)
    else:
        return ratings

def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
      .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
      .values()
    return sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))
for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if (int(userid[0]) == int(n)):
                    fi.write(data)
                    print n
    f.close()
    if __name__ == "__main__":
        if (len(sys.argv) != 2):
            print "Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " + \
                "MovieLensALS.py movieLensDataDir"
            sys.exit(1)

        # set up environment
        conf = SparkConf() \
Code example #28
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df =  pd.read_csv(f)
    
    
    df=pd.read_csv('/user/cloudera/python/cities_location.csv') 
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015,11,1,2,0,0)
    for i in range(len(df)):
        col = ["cities", "time",  "temperatureMin", "temperatureMax"]
        lat=df["latitude"].iloc[i]
        lng=df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day=day.data[0]
        data={"cities": df["cities"].iloc[i], "time" : Day.time, "temperatureMin" : Day.temperatureMin, "temperatureMax" : Day.temperatureMax}
        if i==0 :
            weather = pd.DataFrame(data, index=[0], columns= col)
        else:
            weather1 = pd.DataFrame(data, index=[0], columns= col)
            weather = pd.concat([weather, weather1], ignore_index=True)
        
Code example #29
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


#import pydoop.hdfs as hdfs
k = 5

#using Hadoop system file
#with hdfs.open('/Users/ming/centroids.txt') as fp:

weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0], sigmas[0].reshape(
        (2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
Code example #30
import pydoop.hdfs as hdfs


b = hdfs.path.isdir("/data")

want_file = 'traffic.csv'

if b == True:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/"+want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")

    with open("i8predict_flow/"+want_file, "wb") as myfile:
        myfile.write(str(lines))
Code example #31
##Importing Required Packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, precision_recall_curve, average_precision_score

##Loading Credit Card Dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

##Reducing the number of records of Original Dataset incase we wish to work on a smaller subset of Dataset
ReducedData = CreditCardData.iloc[:, :]

##Shape of Credit Card Dataset, i.e. number of rows & columns present in Dataset
print("\nShape of Credit Card Dataset (rows, columns): " +
      str(ReducedData.shape))

##Removing Duplicate Records (if any)
FinalData = ReducedData.drop_duplicates()
print(
    "\nShape of Credit Card Dataset after removing duplicate records (rows, columns): "
    + str(FinalData.shape))

##Checking for missing values
Code example #32
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')

BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")

s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)         
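The reverse direction, pulling an object from S3 and writing it into HDFS, can be sketched with the same two libraries; the bucket name is reused from the example above, while the key and HDFS path are placeholders:

import boto3
import pydoop.hdfs as hdfs

s3 = boto3.resource('s3')
BUCKET = "bd-mindbenders12345"

# read the S3 object into memory, then write it out to HDFS
body = s3.Object(BUCKET, "test.txt").get()["Body"].read()
with hdfs.open("hdfs://localhost:9000/from_s3/test.txt", "wb") as out:
    out.write(body)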
Code example #33
File: hdfs.py Project: clrke/hdfs-test
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)

Code example #34
i = 0
import math
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()

with hdfs.open(
        '/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if (len(word) >= 4):
                if (word[0] != '<'):
                    vocab[word] += 1
        i = i + 1
#print(i)
#print(counter)


#Convert words to indexes
Code example #35
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(5, 1000,
                                            '/user/root/genrecords_output')
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Code example #36
    HDFSfiles.append(hdFiles[41:])


fileNames = []

indexName = 'music'
typeName = 'songs'
#IdField = 'songID'


bulkData = []

i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
Code example #37
from pandasql import sqldf
import os


login=''
senha=''

os.system('echo '+senha+' | kinit '+login)
dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)

df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
            
        except:
                print("Leitura do arquivo json em " + hdfs.ls(pasta)[i] + " não foi bem sucedida")

df1 = df[['evtTime','reqUser','resource','access','reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access']=='SELECT']
# exclude service users
exclusao = pd.DataFrame(['HIVE','RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:,0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')

spark_df = spark.createDataFrame(df1)
Code example #38
            {"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}},
            {"$sort": SON([("count", -1), ("_id", -1)])}
        ])
            #use reportdate for the filename
        filename = startdate.strftime('%Y-%m-%d')
        print(filename)
        for result_obj in daily_totals['result']:
            data_dict = result_obj['_id']
            date = (str(data_dict['year']) + "-" +  str(data_dict['month']) + "-" + str(data_dict['day']))
            tag = data_dict['tags']
            source = data_dict['source']
            count = result_obj['count']
            countPositive = result_obj['countPositive']
            countNegative = result_obj['countNegative']
            countNeutral = result_obj['countNeutral']
            data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n')
            print(data)
            hdfs_path = '/socialmedia/sentiment/' + filename
            hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \
                        '/socialmedia/sentiment' + filename + '.in'
            logger.info('HDFS file path: %s' % hdfs_path)
            logger.debug('Data: %s' % data)

            try:
                hdfs_file = hdfs.open(hdfs_path, mode='a')
                hdfs_file.write(data.encode('utf-8'))
            except IOError, e:
                logger.debug("IOError: " + e.message)
                logger.debug("Caught Exception. Will create a new file on hdfs.")
                hdfs_file = hdfs.open(hdfs_path, mode='w')
                hdfs_file.write(data.encode('utf-8'))
Code example #39
        colourImg = PIL.Image.open(imgFile)
        #imshow(np.asarray(colourImg))
        nparray = np.asarray(colourImg)
        image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)

    return image


brand = "logitech"
mode = "image"

os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile,
                     names=[
                         "image_center", "image_left", "image_right",
                         "steering", "speed"
                     ])

#next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []

for index, row in df.iterrows():
Code example #40
	

import pydoop.hdfs as hdfs
import logging
logging.basicConfig(level=logging.DEBUG)

# this test checks that every line of the input file is present in the output files:
# for each line, a matching line is looked up in the directory for the corresponding date

with open('file1.csv', 'r') as in_f:
    for it, in_line in enumerate(in_f):
        date = in_line.strip().split(',')[0]
        for part in [1, 2, 3]:
            with hdfs.open('/data/archive/' + date + '/part-0000' + str(part)) as out_f:
                matching = []
                for out_line in out_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        matching.append(True)
                        break
                    else:
                        matching.append(False)
            if any(matching):
                matching = True
                break
        if not matching:
            logging.debug("Error on line %s ,%s", it, in_line)
Code example #41
 def __missing__(self, path):
     f = hdfs.open(path, "wb")
     self[path] = f
     return f
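A sketch of the kind of dict subclass this __missing__ method belongs to, plus how it might be used to keep one open HDFS writer per output path; the class name and paths are hypothetical:

import pydoop.hdfs as hdfs

class HdfsFileCache(dict):
    # lazily opens one writable HDFS file per path and caches the handle
    def __missing__(self, path):
        f = hdfs.open(path, "wb")
        self[path] = f
        return f

    def close_all(self):
        for f in self.values():
            f.close()

cache = HdfsFileCache()
cache["/tmp/out/a.bin"].write(b"first record\n")    # opened on first access
cache["/tmp/out/a.bin"].write(b"second record\n")   # reuses the cached handle
cache.close_all()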
Code example #42
import pydoop.hdfs as hdfs

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        with open('file1.csv', 'r') as in_f:
            for out_line in out_f:
                for in_line in in_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        print True
                    else:
                        print False
                        print a
                        print set(in_line.strip().split(','))
Code example #43
 def __init__(self, context):
     super(Reader, self).__init__()
     self.logger = logging.getLogger("Reader")  #formatted logger obtained
     self.file = hdfs.open('HD-2004-2014-d.csv')
     self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
Code example #44
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import pydoop.hdfs as hdfs
import json
import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
Code example #45
        context.setStatus("initializing")

    def map(self, context):
        k = context.getInputKey()
        tmp_data = csv.reader(f)
        words = context.getInputValue().split()
        for w in words:
            context.emit(w, "1")
            context.incrementCounter(self.inputWords, len(words))

    def close(self):
        self.logger.info("all done")


print "Prediction on HD 30 year data:"
f = hdfs.open('/HD-1984-2014-d.csv')
tmp_data = csv.reader(f)

my_data = list()
for item in tmp_data:
    tmp_item = list()
    for i in item:
        tmp_item.append(i)
    my_data.append(tmp_item)
data = my_data[1:]
X = list()
training_indices = list()
for i in xrange(int(len(data) * 0.9)):
    training_indices.append(i)

test_indices = list()
Code example #46
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession

import pandas as pd
import pydoop.hdfs as hd

# create a spark session
#sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
#df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')

with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array

dat = df.as_matrix()

# dat = np.loadtxt('mydata.csv')

x, y = dat[:,0], dat[:,1]

heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)  
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]  
plt.clf()  
plt.imshow(heatmap, extent=extent)  
# plt.show()
Code example #47
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):

    # Instantiate a Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()
    for file_path in hdfs.ls(input_path):
        # Load the file content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)
        # Load the numpy matrix and parse it into a matrix of real values,
        # which is then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 +
                                   output_attribute_index].astype('float64')
        # The model is trained through iterative (partial) fitting
        regressor.partial_fit(input_matrix, output_vector)
        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # passed in as a command-line argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the Spark configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")
    # Initialize the session
    # (required in order to save the model)
    session = SparkSession(context)

    # Load the RDD data from the input path
    input_data = context.textFile(input_path)
    # Parse each row into fields
    input_data = input_data.map(lambda x: x.split(","))
    # Skip the header rows
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")
    # Drop the first three fields (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to build the feature columns
    # required by the linear regression fit method)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and then save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
Code example #48
bucket = 'enhance-it'
import boto3
import pydoop.hdfs as hdfs 

s3 = boto3.resource('s3')


file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
Code example #49
File: test_partitioner.py Project: elzaggo/pydoop
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + 10] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
Code example #50
#! /usr/bin/env python

import sys
from pydoop import hdfs
from DataPoint import DataPoint

#print "Start"

# read sys.argv[1] and sys.argv[2]
# put em in lists

if len(sys.argv)<3:
	print "Error: Insufficient Arguments"
	sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])

oldCentroids = []
newCentroids = []

for line in oldCentroidsFile:
	if line.find("\t") != -1:
		(key,value) = line.strip().split("\t")
		oldCentroid = DataPoint(value)
	else:
		oldCentroid = DataPoint(line.strip()) 
	oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
	(key,value) = line.strip().split("\t")
Code example #51
#! /usr/bin/env python

import sys
import DataPoint
from pydoop import hdfs

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
Code example #52
### Reading the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)

root = parsed.getroot()
prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print "root : ", root
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if (row_precipitacion != ''):
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
    prob_precipitacion.append(
        str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if (row_cielo != ''):
Code example #53
import os
import pandas as pd
import pydoop.hdfs as hdfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)
df_y = df['label'] == 3
df_X = df[['x' + str(i)
           for i in range(1, 49)] + ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
Code example #54
import os
import pydoop.hdfs as hd
import datetime
import forecastio as fo
import pandas as pd

with hd.open(
        "hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv"
) as f:
    df = pd.read_csv(f)

    df = pd.read_csv('/user/cloudera/python/cities_location.csv')
    df.head()
    api_key = "459009d8daa503cef1e11b190c961ce5"
    #selecting the specific date
    date = datetime.datetime(2015, 11, 1, 2, 0, 0)
    for i in range(len(df)):
        col = ["cities", "time", "temperatureMin", "temperatureMax"]
        lat = df["latitude"].iloc[i]
        lng = df["longitude"].iloc[i]
        #accessing the forecast.io API
        forecast = fo.load_forecast(api_key, lat, lng, time=date)
        day = forecast.daily()
        #retrieving information for the current day
        Day = day.data[0]
        data = {
            "cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax
        }