def move(self):
    for wd in self.local_wd, self.hdfs_wd:
        t1 = self.__make_tree(wd)
        t2 = [_ for _ in t1.children if _.kind == 1][0]
        f2 = t2.children[0]
        hdfs.move(f2.name, t1.name)
        ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)]
        self.assertTrue(os.path.basename(f2.name) in ls)
        self.assertEqual(len(hdfs.ls(t2.name)), 0)
def _poly_rename(path, *args, **kwargs):
    # Dispatch on the path scheme: HDFS sources go through hdfs.move,
    # local sources fall back to os.rename.
    if path.startswith('hdfs:'):
        if not args[0].startswith('hdfs:'):
            raise Exception('poly_open.rename(hdfs, non-hdfs) not implemented')
        return hdfs.move(path, *args, **kwargs)
    else:
        return os.rename(path, *args, **kwargs)
def move(src, dest):
    """
    Move or rename src to dest.

    Args:
        :src: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
        :dest: You can specify either a full hdfs pathname or a relative one (relative to your Project's path in HDFS).
    """
    src = _expand_path(src, project_name())
    dest = _expand_path(dest, project_name(), exists=False)
    return hdfs.move(src, dest)
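# A short usage sketch for the wrapper above. The import path assumes the
# hops-util-py style module this docstring appears to come from, and the
# project file names are purely illustrative:
from hops import hdfs as project_hdfs

# A relative path is resolved against the project's HDFS directory.
project_hdfs.move("Resources/raw.csv", "Resources/archive/raw.csv")
# A full HDFS path such as "hdfs:///Projects/demo/Resources/raw.csv" is accepted as well.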
velocidadmediasuperficie = row.VALOR
iteraccion = iteraccion + 1
# this variable is used to pick up the file name
troceo = k.split('_')
fila = [troceo[0], str(totalvehiculostunel), str(totalvehiculoscalle30),
        str(velocidadmediasuperficie), str(velocidadmediatunel)]
print fila

### Load the data into HBase through the connector ###
# Configuration parameters
batch_size = 1000
#host = "192.168.1.108"
host = 'localhost'
namespace = "calidadaire"
row_count = 0
start_time = time.time()
table_name = "medicion_trafico"

# After everything has been defined, run the script.
conn, batch = connect_to_hbase()
# we also need the hour so it can go into the HBase put
aux_hora = int(troceo[1].split('.')[0])
hora = str(aux_hora / 100)
print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
try:
    insert_row(batch, fila, hora)
    batch.send()
finally:
    # No matter what happens, close the connection.
    conn.close()
# finally, move the file to the already-processed directory
hdfs.move("/user/datostrafico/sin_tratar/" + k, "/user/datostrafico/tratado/")
for datos in row_temp.dato:
    aux_temp.append(datos)
temperatura.append(str(sum(aux_temp) / float(len(aux_temp))))
iteraccion = iteraccion + 1
# this variable is used to pick up the file name
troceo = k.split('_')
# we need to build a single-element list so it can be concatenated with the rest
troceo_aux = [troceo[0]]
fila = troceo_aux + prob_precipitacion + estado_cielo + viento + tempmax + tempmin
print "fila: ", fila

### Load the data into HBase through the connector ###
# Configuration parameters
batch_size = 1000
#host = "192.168.1.108"
host = 'localhost'
namespace = "calidadaire"
row_count = 0
start_time = time.time()
table_name = "medicion_tiempo"

# After everything has been defined, run the script.
conn, batch = connect_to_hbase()
print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
try:
    insert_row(batch, fila)
    batch.send()
finally:
    # No matter what happens, close the connection.
    conn.close()
# finally, move the file to the already-processed directory
hdfs.move("/user/datostiempo/sin_tratar/" + k, "/user/datostiempo/tratado/")
else:
    fecha = str(auxiliar.iloc[1, 2]) + str(auxiliar.iloc[1, 3]) + str(auxiliar.iloc[1, 4])
# drop the station column (no longer needed), the date columns (no longer needed)
# and the V/N flag columns that accompany each value
auxiliar = auxiliar.drop(auxiliar.columns[[0, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20,
                                           22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
                                           42, 44, 46, 48, 50, 52]], axis=1)
# list where the averages to be uploaded to HBase are stored
fila = [fecha]
# loop over the whole auxiliar dataframe to average each magnitude across all
# stations, so a single value per magnitude ends up in the final list
magnitudes = [1, 6, 8, 9, 10, 12, 14, 30]
for i in magnitudes:
    for j in range(1, len(auxiliar.columns)):
        auxiliar_media = auxiliar.loc[auxiliar['magnitud'] == i]
        fila.append(str(auxiliar_media.iloc[:, j].mean()))

### Load the data into HBase through the connector ###
batch_size = 1000
host = 'localhost'
namespace = "calidadaire"
row_count = 0
start_time = time.time()
table_name = "medicion_aire"

# After everything has been defined, run the script.
conn, batch = connect_to_hbase()
print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
try:
    insert_row(batch, fila)
    batch.send()
finally:
    # No matter what happens, close the connection.
    conn.close()
# finally, move the file to the already-processed directory
hdfs.move("/user/datosaire/sin_tratar/" + k, "/user/datosaire/tratado/")
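# The three ingestion snippets above assume two helpers, connect_to_hbase() and
# insert_row(), defined elsewhere in those scripts. A minimal sketch of what they
# might look like, assuming the happybase Thrift client; the column family name
# ('datos') and the row-key layout are hypothetical, not the original helpers:
import happybase

def connect_to_hbase(host='localhost', namespace='calidadaire',
                     table_name='medicion_trafico', batch_size=1000):
    # Open a Thrift connection and create a batch for the target table.
    conn = happybase.Connection(host)
    table = conn.table('%s:%s' % (namespace, table_name))
    batch = table.batch(batch_size=batch_size)
    return conn, batch

def insert_row(batch, fila, hora=None):
    # Use the date (plus, for the traffic table, the hour) as the row key and
    # store the remaining values under a single column family.
    row_key = fila[0] if hora is None else fila[0] + hora
    data = {'datos:col%d' % i: str(v) for i, v in enumerate(fila[1:])}
    batch.put(row_key, data)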
def mrjob(options):
    "Generates and executes MR job script"
    user = os.getenv('USER')
    tstamp = int(time.time())
    hdir = hdfs_dir(options.hdir, options.hdfs_prefix)
    if PYDOOP:
        odir = hdfs.path.join(hdir, options.odir)
        idir = hdfs.path.join(hdir, options.idir)
        schema = hdfs.path.join(hdir, options.schema)
        for name in [hdir, odir, idir]:
            if options.verbose:
                print("Checking %s" % name)
            if not hdfs.path.isdir(name):
                if name in [hdir, idir]:
                    print("ERROR: %s does not exist" % name)
                    sys.exit(1)
#                else:
#                    print("  Creating output directory: %s" % name)
#                    hdfs.mkdir(name)
            elif name == odir:
                # in case odir exists and is not empty, move it somewhere and re-create
                if hdfs.ls(odir):
                    ocache = hdfs.path.normpath(odir) + '_%d' % tstamp
                    if options.verbose:
                        print("  Non-empty output directory exists, saving it in %s" % ocache)
                    hdfs.move(odir, ocache)
#                    hdfs.mkdir(odir)
                # if it's empty, remove it
                else:
                    hdfs.rmr(odir)
        if options.verbose:
            print("Checking %s" % schema)
        if not hdfs.path.isfile(schema):
            print("ERROR: %s does not exist" % schema)
            sys.exit(1)
    else:
        idir = '%s%s' % (hdir, 'data')
        odir = '%s%s' % (hdir, 'mrout')
        schema = '%s%s' % (hdir, options.schema)
        if options.verbose:
            msg = 'pydoop module is not present on this system'
            msg += ', will use input as is without checking'
            print('WARNING:', msg)
    for name in [options.mrpy, options.pydoop, options.avro]:
        if options.verbose:
            print("Checking %s" % name)
        if not os.path.isfile(name):
            print("ERROR: %s does not exist" % name)
            sys.exit(1)
#    module = os.path.basename(os.path.splitext(options.mrpy)[0])
    code = create_mrpy(options.mrpy, options.verbose)
    cmd = """#!/bin/bash
input={input}
output={output}
schema={schema}
ifile=/tmp/mr_{user}_{tstamp}.py
cat << EOF > $ifile
{code}
EOF
module=mr_{user}_{tstamp}
arch_pydoop={pydoop}
arch_avro={avro}
echo "Input URI : $input"
echo "Output URI: $output"
echo "Schema: $schema"
echo "MR script : $ifile"
echo "Module name : $module"
echo "Pydoop archive: $arch_pydoop"
echo "Avro archive : $arch_avro"
echo "-----------------"
echo "Submitting MR job"
pydoop submit \
    --upload-archive-to-cache $arch_pydoop \
    --upload-archive-to-cache $arch_avro \
    -D avro.schema=$schema \
    --do-not-use-java-record-reader \
    --log-level {loglevel} \
    --job-name WMArchive \
    --num-reducers 1 \
    --upload-file-to-cache $ifile \
    --mrv2 $module $input $output
""".format(input=idir, output=odir, user=user, tstamp=tstamp, code=code,
           schema=schema, loglevel=options.loglevel,
           pydoop=os.path.abspath(options.pydoop),
           avro=os.path.abspath(options.avro))
    fobj = NamedTemporaryFile(delete=False)
    fobj.write(cmd)
    fobj.close()
    fstat = os.stat(fobj.name)
    os.chmod(fobj.name, fstat.st_mode | stat.S_IEXEC)
    if options.execute:
        run(fobj.name, options.verbose)
    else:
        if options.verbose:
            print("------- Generated script --------")
        print(open(fobj.name, 'r').read())
        if options.verbose:
            print("---------------------------------")
    # clean up temporary file
    os.unlink(fobj.name)
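# The output-directory handling above can be isolated into a small helper. A
# sketch under the same assumptions (pydoop installed, paths already on HDFS);
# the function name is ours, not part of the original module:
import time
import pydoop.hdfs as hdfs

def archive_or_remove_output(odir):
    """If odir exists and is non-empty, move it aside to a timestamped copy;
    if it exists but is empty, remove it so the MR job can recreate it."""
    if not hdfs.path.isdir(odir):
        return
    if hdfs.ls(odir):
        ocache = hdfs.path.normpath(odir) + '_%d' % int(time.time())
        hdfs.move(odir, ocache)
    else:
        hdfs.rmr(odir)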