def open(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        with hdfs.open(test_path, "w") as f:
            f.write(self.data)
        f.fs.close()
        with hdfs.open(test_path) as f:
            self.assertEqual(f.read(), self.data)
        f.fs.close()
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
    docs, doc = [], None
    for line in lines:
        if line.startswith('<doc'):
            doc = line
        elif line.startswith('</doc>'):
            docs.append(doc + line)
        else:
            # line = line.replace('&', '').replace('"', "'")
            doc += line.replace('"', "'")
    for doc in docs:
        dom = bs(doc).find('doc')
        doc = {}
        try:
            doc['id'] = dom.attrs['id']
            doc['url'] = dom.attrs['url']
            doc['title'] = dom.attrs['title']
        except AttributeError:  # malformed <doc> element
            continue
        doc['content'] = dom.text
        doc['md5'] = hashlib.md5(str(doc).encode('utf-8')).hexdigest()
        yield doc
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
def __init__(self, ctx):
    super(AvroReader, self).__init__(ctx)
    isplit = ctx.input_split
    self.region_start = isplit.offset
    self.region_end = isplit.offset + isplit.length
    self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                         DatumReader())
    self.reader.align_after(isplit.offset)
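# Sketch of a matching next() for the AvroReader above (an assumption based on
# pydoop's Avro examples: SeekableDataFileReader is presumed to expose the raw
# file as .reader and the records left in the current block as .block_count):
def next(self):
    pos = self.reader.reader.tell()
    if pos > self.region_end and self.reader.block_count == 0:
        raise StopIteration  # the next split's reader takes over from here
    return pos, next(self.reader)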
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
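# A close() usually pairs with a writer like the one above -- a sketch, not
# taken from the original; avro's DataFileWriter.close() also closes the
# wrapped HDFS handle:
def close(self):
    self.writer.close()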
def map(self, ctx):
    p = BioImgPlane(ctx.value)
    pixels = p.get_xy()
    bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
    fn = hdfs.path.join(self.out_dir, p.name, bn)
    with hdfs.open(fn, 'w') as fo:
        np.save(fo, pixels)
    ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
def put(self):
    src = hdfs.path.split(self.local_paths[0])[-1]
    dest = self.hdfs_paths[0]
    with open(src, "w") as f:
        f.write(self.data)
    hdfs.put(src, dest)
    with hdfs.open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
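# The next() that usually goes with this line-oriented Reader -- a sketch
# patterned on pydoop's wordcount example (the struct-packed byte offset used
# as the key is an assumption):
import struct

def next(self):
    if self.bytes_read > self.isplit.length:  # this split is exhausted
        raise StopIteration
    key = struct.pack(">q", self.isplit.offset + self.bytes_read)
    record = self.file.readline()
    if not record:  # EOF
        raise StopIteration
    self.bytes_read += len(record)
    return (key, record)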
def __init__(self, context): super(Writer, self).__init__(context) self.logger = LOGGER.getChild("Writer") jc = context.job_conf outfn = context.get_default_work_file() self.logger.info("writing to %s", outfn) hdfs_user = jc.get("pydoop.hdfs.user", None) self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t") self.file = hdfs.open(outfn, "wt", user=hdfs_user)
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
        for it in items:
            it = loads(it)
            it['md5'] = hashlib.md5(str(it).encode('utf-8')).hexdigest()
            yield it
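# Hypothetical use of the generator above (the directory name is illustrative):
for doc in json_from_hdfs('/datasets/json_docs'):
    print(doc['md5'])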
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = logging.getLogger("Writer")
    jc = context.getJobConf()
    jc_configure_int(self, jc, "mapred.task.partition", "part")
    jc_configure(self, jc, "mapred.work.output.dir", "outdir")
    jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
    jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
    self.outfn = "%s/part-%05d" % (self.outdir, self.part)
    self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "w", user=hdfs_user)
    self.sep = jc.get("mapred.textoutputformat.separator", "\t")
def _choose_break_points(cls, args):
    n_records, n_breakpoints, path = args
    block_size = n_records * RECORD_LENGTH
    with hdfs.open(path, 'r') as f:
        data = f.read(block_size)
    assert len(data) == block_size
    step = max(n_records // n_breakpoints, 1)
    keys = sorted([data[k:k + KEY_LENGTH]
                   for k in range(0, block_size, RECORD_LENGTH)])
    return [_ for _ in it.islice(keys, step, n_records, step)]
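# A self-contained illustration of the same sampling logic on synthetic
# records (the tiny RECORD_LENGTH/KEY_LENGTH values here are made up):
import itertools as it

RECORD_LENGTH, KEY_LENGTH = 10, 3
data = b"".join(b"%03d" % k + b"x" * 7 for k in (5, 1, 9, 3, 7))
keys = sorted(data[k:k + KEY_LENGTH] for k in range(0, len(data), RECORD_LENGTH))
step = max(len(keys) // 2, 1)  # two break points requested
print(list(it.islice(keys, step, len(keys), step)))  # [b'005', b'009']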
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    self.logger.debug("part: %d", part)
    self.logger.debug("outdir: %s", out_dir)
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "wb", user=hdfs_user)
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            # Submit data (my function)
            submitLine(topic, line, trials=3)
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')
def __init__(self, context):
    super(Reader, self).__init__()
    self.logger = logging.getLogger("Reader")
    self.isplit = pp.InputSplit(context.getInputSplit())
    for a in "filename", "offset", "length":
        self.logger.debug("isplit.%s = %r" % (a, getattr(self.isplit, a)))
    self.file = hdfs.open(self.isplit.filename)
    self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by reader of previous split
        self.bytes_read += len(discarded)
def __init__(self, context):
    super(Reader, self).__init__(context)
    self.logger = LOGGER.getChild("Reader")
    self.logger.debug('started')
    self.isplit = context.input_split
    for a in "filename", "offset", "length":
        self.logger.debug(
            "isplit.{} = {}".format(a, getattr(self.isplit, a))
        )
    remainder = self.isplit.offset % RECORD_LENGTH
    self.bytes_read = 0 if remainder == 0 else RECORD_LENGTH - remainder
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset + self.bytes_read)
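# Sketch (an assumption, not from the original): the fixed-length-record
# next() that would pair with the Reader above, splitting each record into a
# KEY_LENGTH-byte key and the remaining value bytes.
def next(self):
    if self.bytes_read > self.isplit.length:
        raise StopIteration
    record = self.file.read(RECORD_LENGTH)
    if not record:
        raise StopIteration
    self.bytes_read += len(record)
    return record[:KEY_LENGTH], record[KEY_LENGTH:]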
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided.  Otherwise, return the
    result as a single string (it is the caller's responsibility to ensure
    that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
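# Hedged usage sketch for collect_output (the paths are illustrative):
text = collect_output("wc_output")               # whole result in memory
collect_output("wc_output", out_file="wc.txt")   # or streamed to a local file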
def read(readFlag):
    print(readFlag)
    if readFlag == True:
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        print(targetPath)
        # instantiate hadoop
        hdfs.hdfs()
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
def __init__(self, context):
    super(Reader, self).__init__(context)
    self.logger = LOGGER.getChild("Reader")
    self.logger.debug('started')
    self.isplit = context.input_split
    for a in "filename", "offset", "length":
        self.logger.debug(
            "isplit.{} = {}".format(a, getattr(self.isplit, a))
        )
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()
        self.bytes_read += len(discarded)
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
def processChunk(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        data = []
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            data += [line]
            if i % 5000 == 0:
                # Submit data (my function)
                submitChunk(topic, data, trials=3)
                data = []
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
        # submit the rest of the data after the per-line loop
        submitChunk(topic, data, trials=3)
        data = []
def xml_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    # for fi in file_lists:
    for i in range(0, 1):
        fi = '/datasets/corpus/enwiki-11g/wiki_912'
        with hdfs.open(fi, "r") as f:
            lines = f.read().strip().split('\n')
        docs, doc = [], None
        for line in lines:
            if line.startswith('<doc'):
                doc = line
            elif line.startswith('</doc>'):
                docs.append(doc + line)
            else:
                # line = line.replace('&', '').replace('"', "'")
                doc += line.replace('"', "'")
        for doc in docs:
            dom = bs(doc).find('doc')
            doc = dom.attrs
            doc['content'] = dom.text
            doc['md5'] = hashlib.md5(str(doc).encode('utf-8')).hexdigest()
            yield doc
        sys.exit(1)
    else:
        return ratings


def computeRmse(model, data, n):
    """
    Compute RMSE (Root Mean Squared Error).
    """
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
        .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
        .values()
    return sqrt(predictionsAndRatings.map(
        lambda x: (x[0] - x[1]) ** 2).reduce(add) / float(n))


for n in userArray:
    with open(uFile, "w") as fi:
        with hdfs.open('/user/cloudera/medium/ratings.dat') as f:
            for line in f:
                data = line
                userid = line.split("::")
                if int(userid[0]) == int(n):
                    fi.write(data)
                    print(n)
            f.close()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: /path/to/spark/bin/spark-submit --driver-memory 2g " +
              "MovieLensALS.py movieLensDataDir")
        sys.exit(1)

    # set up environment
    conf = SparkConf() \
import os
import datetime

import forecastio as fo
import pandas as pd
import pydoop.hdfs as hd

with hd.open("hdfs://quickstart.cloudera:8020/user/cloudera/python/cities_location.csv") as f:
    df = pd.read_csv(f)
df.head()

api_key = "459009d8daa503cef1e11b190c961ce5"
# selecting the specific date
date = datetime.datetime(2015, 11, 1, 2, 0, 0)
for i in range(len(df)):
    col = ["cities", "time", "temperatureMin", "temperatureMax"]
    lat = df["latitude"].iloc[i]
    lng = df["longitude"].iloc[i]
    # accessing the forecast.io API
    forecast = fo.load_forecast(api_key, lat, lng, time=date)
    day = forecast.daily()
    # retrieving information for the current day
    Day = day.data[0]
    data = {"cities": df["cities"].iloc[i],
            "time": Day.time,
            "temperatureMin": Day.temperatureMin,
            "temperatureMax": Day.temperatureMax}
    if i == 0:
        weather = pd.DataFrame(data, index=[0], columns=col)
    else:
        weather1 = pd.DataFrame(data, index=[0], columns=col)
        weather = pd.concat([weather, weather1], ignore_index=True)
        result = math.pow(math.e, -0.5 * (x_mu * inverse * x_mu.T))
        return norm_const * result
    else:
        raise NameError("The dimensions of the input don't match")


# import pydoop.hdfs as hdfs
k = 5
# using Hadoop file system
# with hdfs.open('/Users/ming/centroids.txt') as fp:
weights = []
means = []
sigmas = []
with hdfs.open('/Users/user06/parameters.txt') as file:
    for line in file:
        params = line.strip().split("\t")
        weights.append(float(params[0]))
        means.append(np.array(params[1].split(), float))
        sigmas.append(np.array(params[2].split(), float))

for line in sys.stdin:
    line = line.strip()
    point = np.array(line.split(), float)
    p = weights[0] * norm_pdf_multivariate(point, means[0],
                                           sigmas[0].reshape((2, 2)))
    nearest = 0
    for i in range(1, k):
        q = weights[i] * norm_pdf_multivariate(point, means[i],
                                               sigmas[i].reshape((2, 2)))
import pydoop.hdfs as hdfs

b = hdfs.path.isdir("/data")
want_file = 'traffic.csv'
if b:
    print("---get test ---")
    lines = []
    with hdfs.open("hdfs://127.0.0.1:9000/data/" + want_file) as f:
        for line in f:
            # print(line, type(line))
            l = line.decode("utf-8")
            if l is not None and l != "":
                lines.append(l)
    print(lines)
    print("---end get----")
    with open("i8predict_flow/" + want_file, "w") as myfile:  # text mode: we write a str
        myfile.write(str(lines))
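# The read half of the snippet above can also use pydoop's one-shot helper --
# a sketch reusing the same paths; hdfs.load() returns the file content
# (bytes in the default binary mode):
import pydoop.hdfs as hdfs

data = hdfs.load("hdfs://127.0.0.1:9000/data/traffic.csv")
with open("i8predict_flow/traffic.csv", "wb") as out:
    out.write(data)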
## Importing required packages
import numpy as np
import pydoop.hdfs as hd
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_curve, \
    precision_recall_curve, average_precision_score

## Loading the credit card dataset
with hd.open("/user/hduser/creditcard.csv") as f:
    CreditCardData = pd.read_csv(f, header=0)

## Reducing the number of records of the original dataset in case we wish
## to work on a smaller subset of it
ReducedData = CreditCardData.iloc[:, :]

## Shape of the credit card dataset, i.e. the number of rows & columns present
print("\nShape of Credit Card Dataset (rows, columns): " + str(ReducedData.shape))

## Removing duplicate records (if any)
FinalData = ReducedData.drop_duplicates()
print("\nShape of Credit Card Dataset after removing duplicate records "
      "(rows, columns): " + str(FinalData.shape))

## Checking for missing values
import pydoop.hdfs as hdfs
import boto3
import botocore

s3 = boto3.resource('s3')
BUCKET = "bd-mindbenders12345"

file = hdfs.open("hdfs://localhost:9000/test.txt")
s3.Bucket(BUCKET).put_object(Key="test.txt", Body=file)
import pydoop.hdfs as hdfs
import config.hdfs

with hdfs.open(config.hdfs['ur']) as f:
    for line in f:
        print(line)
i = 0
import math

from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


vocab = Counter()
labels = Counter()
with hdfs.open('/user/ds222/assignment-1/DBPedia.verysmall/verysmall_train.txt') as f:
    for line in f:
        first, next = line.split(' ', 1)
        for label in first.split(','):
            labels[label] += 1
        words = next.strip().lower().split()
        for word in words:
            if len(word) >= 4:
                if word[0] != '<':
                    vocab[word] += 1
        i = i + 1
        # print(i)
# print(counter)

# Convert words to indexes
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pickle
import io
from collections import Counter

import pydoop.hdfs as hdfs
from pterasort import Partitioner

RECORD_LENGTH = 91
KEY_LENGTH = 10

fname = Partitioner.initialize_break_points(
    5, 1000, '/user/root/genrecords_output'
)
with io.open('__break_point_cache_file', 'rb') as f:
    data = f.read()
sel = pickle.loads(data)

block_size = 20000 * RECORD_LENGTH
path = '/user/root/genrecords_output/part-m-00000'
with hdfs.open(path, 'rb') as f:
    data = f.read(block_size)
keys = (data[k:k + KEY_LENGTH] for k in range(0, block_size, RECORD_LENGTH))
partitions = Counter(map(sel.select_partition, keys))
print(partitions)
HDFSfiles.append(hdFiles[41:])

fileNames = []
indexName = 'music'
typeName = 'songs'
# IdField = 'songID'
bulkData = []
i = 1
for name in HDFSfiles:
    dataDict = {}
    fopen = hdfs.open("/gaana/gaanaLyrics/" + name)
    header = fopen.read()
    header = re.sub('[^a-zA-Z]', ' ', header)
    header = header.replace("Advertisements", " ")
    header = ''.join([item.lower() for item in header])
    songAndMovie = []
    dlim = "lyrics"
    # nameNew = name.replace("-", " ")
    songAndMovie.append(name)
    dataDict[name] = header
    metaDict = {}
    dataDict = {}
    for elements in songAndMovie:
        songsName = []
        # if "lyrics" in elements:
        songName = elements.split('-')
import os
import json

import pandas as pd
import pydoop.hdfs as hdfs
from pandasql import sqldf

login = ''
senha = ''
os.system('echo ' + senha + ' | kinit ' + login)

dir = '/ranger/audit/hiveServer2/'
list = hdfs.ls(dir)
df = pd.DataFrame()
for pasta in list:
    for i in range(len(hdfs.ls(pasta))):
        try:
            with hdfs.open(hdfs.ls(pasta)[i], 'r') as f:
                jsn = [json.loads(line) for line in f]
                df = df.append([pd.DataFrame(jsn)], sort=True)
        except:
            print("Reading the json file at " + hdfs.ls(pasta)[i] +
                  " was not successful")

df1 = df[['evtTime', 'reqUser', 'resource', 'access', 'reqData']]
df1['reqUser'] = df1['reqUser'].str.upper()
df1 = df1[df1['access'] == 'SELECT']
# exclude service users
exclusao = pd.DataFrame(['HIVE', 'RANGERLOOKUP'])
df1 = df1[~df1.reqUser.isin(exclusao.iloc[:, 0])]
df1['evtTime'] = pd.to_datetime(df1['evtTime'].str[0:16], format='%Y-%m-%d %H:%M')
spark_df = spark.createDataFrame(df1)
{"$group": {"_id": {'source':"$source",'tags':"$tags",'year': "$year_posted",'month':"$month_posted",'day':"$day_posted"}, "count": {"$sum": 1},"countNegative":{"$sum":"$Negative"},"countNeutral":{"$sum":"$Neutral"},"countPositive":{"$sum":"$Positive"}}}, {"$sort": SON([("count", -1), ("_id", -1)])} ]) #use reportdate for the filename filename = startdate.strftime('%Y-%m-%d') print(filename) for result_obj in daily_totals['result']: data_dict = result_obj['_id'] date = (str(data_dict['year']) + "-" + str(data_dict['month']) + "-" + str(data_dict['day'])) tag = data_dict['tags'] source = data_dict['source'] count = result_obj['count'] countPositive = result_obj['countPositive'] countNegative = result_obj['countNegative'] countNeutral = result_obj['countNeutral'] data = (str(date) + "|" + str(tag) + "|" + str(source) + "|" + str(count) + "|" + str(countPositive) + "|" + str(countNegative) + "|" + str(countNeutral)+'\n') print(data) hdfs_path = '/socialmedia/sentiment/' + filename hdfs_path = settings.HDFS_HOST_NAME + ':' + settings.HDFS_PORT + settings.HDFS_ROOT_FOLDER + \ '/socialmedia/sentiment' + filename + '.in' logger.info('HDFS file path: %s' % hdfs_path) logger.debug('Data: %s' % data) try: hdfs_file = hdfs.open(hdfs_path, mode='a') hdfs_file.write(data.encode('utf-8')) except IOError, e: logger.debug("IOError: " + e.message) logger.debug("Caught Exception. Will create a new file on hdfs.") hdfs_file = hdfs.open(hdfs_path, mode='w') hdfs_file.write(data.encode('utf-8'))
    colourImg = PIL.Image.open(imgFile)
    # imshow(np.asarray(colourImg))
    nparray = np.asarray(colourImg)
    image = cv2.cvtColor(nparray, cv2.COLOR_RGB2BGR)
    return image


brand = "logitech"
mode = "image"
os.chdir("/tmp/")
myMachine = kpath.abspath('/tmp/data/input/racetrack/image/')
print(myMachine)

with hpath.open(myMachine + "driving_log.csv") as csvFile:
    df = pd.read_csv(csvFile, names=[
        "image_center", "image_left", "image_right", "steering", "speed"
    ])
# next(df.iterrows())[1]
df.iterrows()

# read and store multiple cameras and steering angles from driving_log.csv
# all three camera images will be used to train the model
images = []
steering_measurements = []
for index, row in df.iterrows():
import pydoop.hdfs as hdfs
import logging

logging.basicConfig(level=logging.DEBUG)

# the test checks that every line is present in the output files:
# for each line, its match is looked up in the directory for that date
with open('file1.csv', 'r') as in_f:
    for it, in_line in enumerate(in_f):
        date = in_line.strip().split(',')[0]
        for part in [1, 2, 3]:
            with hdfs.open('/data/archive/' + date + '/part-0000' + str(part)) as out_f:
                matching = []
                for out_line in out_f:
                    a = set(out_line.strip().split(','))
                    if a == set(in_line.strip().split(',')):
                        matching.append(True)
                        break
                    else:
                        matching.append(False)
            if any(matching):
                matching = True
                break
        if not matching:
            logging.debug("Error on line %s, %s", it, in_line)
def __missing__(self, path):
    f = hdfs.open(path, "wb")
    self[path] = f
    return f
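# Context sketch (an assumption, since only __missing__ is shown above):
# defining it on a dict subclass turns the mapping into a lazy cache of open
# HDFS handles, one per output path.
class HandleCache(dict):
    def __missing__(self, path):
        f = hdfs.open(path, "wb")
        self[path] = f
        return f

cache = HandleCache()
cache["/tmp/out/a.bin"].write(b"payload")  # opened on first access
for f in cache.values():
    f.close()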
import pydoop.hdfs as hdfs

# load the reference lines once: a file object is an iterator and would be
# exhausted after the first pass of the outer loop
with open('file1.csv', 'r') as in_f:
    in_lines = in_f.readlines()

for part in [1, 2, 3]:
    with hdfs.open('/data/archive/2014-04-29/part-0000' + str(part)) as out_f:
        for out_line in out_f:
            for in_line in in_lines:
                a = set(out_line.strip().split(','))
                if a == set(in_line.strip().split(',')):
                    print(True)
                else:
                    print(False)
                    print(a)
                    print(set(in_line.strip().split(',')))
def __init__(self, context): super(Reader, self).__init__() self.logger = logging.getLogger("Reader") #formatted logger obtained self.file = hdfs.open('HD-2004-2014-d.csv') self.logger.debug("readline chunk size = %r" % self.file.chunk_size)
#!/usr/bin/python3
"""mapper.py"""
import sys
import csv
import json

import pydoop.hdfs as hdfs

import costanct as C

azienda_map = {}
with hdfs.open('input/historical_stocks.csv', 'rt') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count > 0:
            ticker, _, name, _, _ = row
            azienda_map[ticker] = {'name': name}
        line_count += 1


def toJson(azione):
    dic = {
        "ticker": azione[0],
        "name": azienda_map[azione[0]],
        "close": azione[2],
        "date": azione[7],
    }
    return json.dumps(dic)


for line in sys.stdin:
    azione = line.split(',')
context.setStatus("initializing") def map(self, context): k = context.getInputKey() tmp_data = csv.reader(f) words = context.getInputValue().split() for w in words: context.emit(w, "1") context.incrementCounter(self.inputWords, len(words)) def close(self): self.logger.info("all done") print "Prediction on HD 30 year data:" f = hdfs.open('/HD-1984-2014-d.csv') tmp_data = csv.reader(f) my_data = list() for item in tmp_data: tmp_item = list() for i in item: tmp_item.append(i) my_data.append(tmp_item) data = my_data[1:] X = list() training_indices = list() for i in xrange(int(len(data) * 0.9)): training_indices.append(i) test_indices = list()
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pandas as pd
import pydoop.hdfs as hd

# create a spark session
# sparkSession = SparkSession.builder.master("local").appName("draw heat map").getOrCreate()
# df_load = sparkSession.read.csv('hdfs://dumbo/user/gx271/pubgETL/mir_death.csv')
with hd.open("hdfs://dumbo/user/gx271/pubgETL/mir_death.csv/part-00006") as f:
    df = pd.read_csv(f)

# convert DataFrame to np array
dat = df.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
# dat = np.loadtxt('mydata.csv')
x, y = dat[:, 0], dat[:, 1]
heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap, extent=extent)
# plt.show()
def main(input_path, output_attribute_index, scikit_output_path,
         spark_output_path):
    # Instantiate a Passive Aggressive Regressor model
    regressor = PassiveAggressiveRegressor()

    for file_path in hdfs.ls(input_path):
        # Load the file's content and build a string matrix from it
        content = hdfs.load(file_path)
        temp = content.split("\n")
        temp = list(map(lambda x: x.split(","), temp))
        temp = list(filter(lambda x: len(x) > 1, temp))
        raw_matrix = np.array(temp)

        # Parse the matrix into real-valued arrays,
        # which are then used to train the model
        # raw_matrix = np.genfromtxt(file_path, delimiter=',', dtype='string')
        input_matrix = raw_matrix[1:, 3:-5].astype('float64')
        output_vector = raw_matrix[1:, -5 + output_attribute_index].astype('float64')

        # The model is trained by iterative refinement
        regressor.partial_fit(input_matrix, output_vector)

        # Print the path of the processed file to the console
        print(file_path)

    # Save the trained model to the output path
    # passed in as an argument
    with hdfs.open(scikit_output_path, 'w') as opened_file:
        pickle.dump(regressor, opened_file)

    # Initialize the application's configuration and execution context
    configuration = SparkConf().setAppName("BigDataProj3_Trainer")
    context = SparkContext(conf=configuration)
    context.setLogLevel("ERROR")

    # Initialize the session
    # (required for saving the model)
    session = SparkSession(context)

    # Load RDD data from the input path
    input_data = context.textFile(input_path)

    # Split each row into words
    input_data = input_data.map(lambda x: x.split(","))

    # Skip the headers
    input_data = input_data.filter(lambda x: x[0] != "Timestamp")

    # Drop the first three columns (Timestamp, Latitude and Longitude)
    # and select the appropriate output column
    # (depending on the output_attribute_index variable)
    input_data = input_data.map(lambda x: list(map(lambda y: float(y), x[
        3:-5])) + [float(x[-5 + output_attribute_index])])

    # Build the corresponding DataFrame object
    # (VectorAssembler is used to create the columns
    # that linear regression's fit method requires)
    input_cols = []
    for i in range(15):
        input_cols.append("_" + str(i + 1))
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    data_frame = assembler.transform(input_data.toDF())

    # Instantiate a LinearRegression object, train it,
    # and save it to the given path
    regression = LinearRegression(featuresCol='features', labelCol='_16')
    model = regression.fit(data_frame)
    model.write().overwrite().save(spark_output_path)
import boto3
import pydoop.hdfs as hdfs

bucket = 'enhance-it'
s3 = boto3.resource('s3')
file = hdfs.open('hdfs://master:9000/data_for_db/currency=USD/'
                 'part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json')
s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv', Body=file)
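# The same upload with the HDFS handle closed deterministically -- a sketch
# reusing the names from the snippet above:
with hdfs.open('hdfs://master:9000/data_for_db/currency=USD/'
               'part-00000-f65c545f-baa0-4bf0-8aa9-0b14957848c4.c000.json') as f:
    s3.Bucket(bucket).put_object(Key='lituation/data_from_hdfs.csv',
                                 Body=f.read())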
#! /usr/bin/env python
import sys

from pydoop import hdfs

from DataPoint import DataPoint

# print("Start")
# read sys.argv[1] and sys.argv[2] and put them in lists
if len(sys.argv) < 3:
    print("Error: Insufficient Arguments")
    sys.exit(-1)

oldCentroidsFile = hdfs.open(sys.argv[1])
newCentroidsFile = hdfs.open(sys.argv[2])
oldCentroids = []
newCentroids = []
for line in oldCentroidsFile:
    if line.find("\t") != -1:
        (key, value) = line.strip().split("\t")
        oldCentroid = DataPoint(value)
    else:
        oldCentroid = DataPoint(line.strip())
    oldCentroids.append(oldCentroid)

for line in newCentroidsFile:
    (key, value) = line.strip().split("\t")
#! /usr/bin/env python
import sys

from pydoop import hdfs

import DataPoint

# Check for sufficient arguments
if len(sys.argv) < 2:
    print("ERROR: Insufficient arguments")
    sys.exit(-1)

# List to hold canopy centers
canopyCenters = []

# Read canopy center file
file = hdfs.open(sys.argv[1])
for line in file:
    if line.find("Warning:") == 0:
        continue
    (key, value) = line.split("\t")
    dp = DataPoint.DataPoint(value.strip())
    canopyCenters.append(dp)

# Assign points to canopies
for line in sys.stdin:
    dp = DataPoint.DataPoint(line.strip())
    insert = True
    for canopyCenter in canopyCenters:
        if dp.checkT1(canopyCenter):
            print(canopyCenter.toString() + "\t" + dp.toString())
### Reading the data from HDFS ###
import pandas as pd
import numpy as np
import pydoop.hdfs as hd
from lxml import objectify

with hd.open("/user/datostiempo/20160525_1341.xml") as archivo:
    parsed = objectify.parse(archivo)
root = parsed.getroot()

prob_precipitacion = []
estado_cielo = []
viento = []
temperatura = []
tempmax = []
tempmin = []
iteraccion = 0
errores = []
print("root : ", root)
for row in root.prediccion.dia:
    for row_precipitacion in row.prob_precipitacion:
        aux_precipitacion = []
        if row_precipitacion != '':
            aux_precipitacion.append(row_precipitacion)
        else:
            errores.append(1)
        prob_precipitacion.append(
            str(sum(aux_precipitacion) / float(len(aux_precipitacion))))
    for row_cielo in row.estado_cielo:
        aux_cielo = []
        if row_cielo != '':
import os

import pandas as pd
import pydoop.hdfs as hdfs
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

conf = SparkConf().setAppName("drunk detection").setMaster("yarn")
sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)

csv_file_path = "hdfs:///drunkdetection/train_data48.csv"
predictor_path = "hdfs:///drunkdetection/shape_predictor_68_face_landmarks.dat"
image_path = "hdfs:///drunkdetection/drunk3.jpg"
model_path = "hdfs:///drunkdetection/rf48.pickle"

with hdfs.open("/drunkdetection/train_data48.csv") as csv:
    df = pd.read_csv(csv, index_col=0)
print(df.columns)

df_y = df['label'] == 3
df_X = df[['x' + str(i) for i in range(1, 49)] +
          ['y' + str(j) for j in range(1, 49)]]
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=15)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)