Example #1
def move(self):
    # exercise hdfs.move on both the local and the HDFS working directory
    for wd in self.local_wd, self.hdfs_wd:
        t1 = self.__make_tree(wd)
        # pick the first subdirectory of the tree root and its first entry
        t2 = [_ for _ in t1.children if _.kind == 1][0]
        f2 = t2.children[0]
        hdfs.move(f2.name, t1.name)
        # the moved entry must now be listed under the root, and t2 must be empty
        ls = [os.path.basename(_) for _ in hdfs.ls(t1.name)]
        self.assertTrue(os.path.basename(f2.name) in ls)
        self.assertEqual(len(hdfs.ls(t2.name)), 0)
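The test above exercises pydoop's hdfs.move. A minimal stand-alone sketch of the same call, with hypothetical paths and assuming a reachable HDFS:

import pydoop.hdfs as hdfs

# Hypothetical paths: move a file into another directory on HDFS, then list
# the destination to confirm it arrived.
hdfs.move("/user/demo/tree/subdir/file.txt", "/user/demo/tree")
print(hdfs.ls("/user/demo/tree"))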
Example #2
def _poly_rename(path, *args, **kwargs):
    # dispatch on the path scheme: 'hdfs:' paths go through pydoop, local paths through os
    if path.startswith('hdfs:'):
        if not args[0].startswith('hdfs:'):
            raise Exception('poly_open.rename(hdfs, non-hdfs) not implemented')
        return hdfs.move(path, *args, **kwargs)
    else:
        return os.rename(path, *args, **kwargs)
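A brief usage sketch for the dispatcher above, with hypothetical paths:

# The scheme of the source path selects the backend: 'hdfs:' URIs go through
# pydoop's hdfs.move, anything else through the local os.rename.
_poly_rename('hdfs://namenode:8020/tmp/a.txt', 'hdfs://namenode:8020/tmp/b.txt')
_poly_rename('/tmp/a.txt', '/tmp/b.txt')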
Example #3
def move(src, dest):
    """
    Move or rename src to dest.

    Args:
        :src: source path; either a full HDFS pathname or one relative to your Project's path in HDFS.
        :dest: destination path; either a full HDFS pathname or one relative to your Project's path in HDFS.

    """
    src = _expand_path(src, project_name())
    dest = _expand_path(dest, project_name(), exists=False)
    return hdfs.move(src, dest)
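A usage sketch for the wrapper above; the project layout and paths are hypothetical:

# Relative paths are resolved against the project's HDFS directory by
# _expand_path above; a full HDFS path is also accepted.
move("Resources/raw/data.csv", "Resources/processed/data.csv")
move("/Projects/demo/Resources/data.csv", "/Projects/demo/Archive/data.csv")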
Example #4
            velocidadmediasuperficie = row.VALOR

        iteraccion = iteraccion + 1
    # split the file name (k) to get its parts
    troceo = k.split('_')
    fila = [troceo[0],str(totalvehiculostunel),str(totalvehiculoscalle30),str(velocidadmediasuperficie),str(velocidadmediatunel)]
    print fila
    ### Insert the data into HBase using the connector ###
    # Configuration parameters
    batch_size = 1000
    #host = "192.168.1.108"
    host = 'localhost'
    namespace = "calidadaire"
    row_count = 0
    start_time = time.time()
    table_name = "medicion_trafico"
    # After everything has been defined, run the script.
    conn, batch = connect_to_hbase()
    # we need to extract the hour so it can be included in the HBase put
    aux_hora = int(troceo[1].split('.')[0])
    hora = str(aux_hora/100)
    print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
    try:
        insert_row(batch, fila, hora)
        batch.send()
    finally:
        # No matter what happens, close the connection.
        conn.close()
    # finally, move the file to the already-processed ('tratado') directory
    hdfs.move("/user/datostrafico/sin_tratar/"+k, "/user/datostrafico/tratado/")
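The helpers connect_to_hbase() and insert_row() are not shown in these ingestion examples. A minimal sketch of what they might look like with the happybase client, purely as an assumption (the 'datos' column family and column names are hypothetical, mirroring the traffic row built above):

import happybase

def connect_to_hbase():
    # Hypothetical helper: open a Thrift connection and a write batch on the
    # target table; host, table name and batch size mirror the variables above.
    conn = happybase.Connection('localhost')
    table = conn.table('medicion_trafico')
    batch = table.batch(batch_size=1000)
    return conn, batch

def insert_row(batch, fila, hora):
    # Hypothetical helper: use the date (fila[0]) as the row key and store the
    # remaining fields, plus the hour, under a 'datos' column family.
    batch.put(fila[0], {
        'datos:vehiculos_tunel': fila[1],
        'datos:vehiculos_calle30': fila[2],
        'datos:velocidad_superficie': fila[3],
        'datos:velocidad_tunel': fila[4],
        'datos:hora': hora,
    })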
Example #5
                for datos in row_temp.dato:
                    aux_temp.append(datos)
                temperatura.append(str(sum(aux_temp) / float(len(aux_temp))))
        iteraccion = iteraccion + 1
    # split the file name (k) to get its parts
    troceo = k.split('_')
    # wrap it in a one-element list so it can be concatenated with the other lists
    troceo_aux = [troceo[0]]
    fila = troceo_aux+prob_precipitacion+estado_cielo+viento+tempmax+tempmin
    print "fila: ",fila
    ### Insert the data into HBase using the connector ###
    # Configuration parameters
    batch_size = 1000
    #host = "192.168.1.108"
    host = 'localhost'
    namespace = "calidadaire"
    row_count = 0
    start_time = time.time()
    table_name = "medicion_tiempo"
    # After everything has been defined, run the script.
    conn, batch = connect_to_hbase()
    print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
    try:
        insert_row(batch, fila)
        batch.send()
    finally:
        # No matter what happens, close the connection.
        conn.close()
    # finally, move the file to the already-processed ('tratado') directory
    hdfs.move("/user/datostiempo/sin_tratar/"+k, "/user/datostiempo/tratado/")
Example #6
        else:
            fecha = str(auxiliar.iloc[1,2])+str(auxiliar.iloc[1,3])+str(auxiliar.iloc[1,4])
        # drop the station column (no longer needed), the date columns (no longer needed either), and the V/N flag columns next to each value
        auxiliar = auxiliar.drop(auxiliar.columns[[0,2,3,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52]],axis=1)
        # list where the averages to upload to HBase are stored
        fila = [fecha]
        # loop over the auxiliar dataframe, averaging each magnitude across all stations for every remaining column and appending the results to the final row
        magnitudes = [1,6,8,9,10,12,14,30]
        for i in magnitudes:
            for j in range(1,len(auxiliar.columns)):
                auxiliar_media = auxiliar.loc[auxiliar['magnitud']== i]
                fila.append(str(auxiliar_media.iloc[:,j].mean()))
        ### Insert the data into HBase using the connector ###
        batch_size = 1000
        host = 'localhost'
        namespace = "calidadaire"
        row_count = 0
        start_time = time.time()
        table_name = "medicion_aire"
        # After everything has been defined, run the script.
        conn, batch = connect_to_hbase()
        print "Connect to HBase. table name: %s, batch size: %i" % (table_name, batch_size)
        try:
            insert_row(batch, fila)
            batch.send()
        finally:
            # No matter what happens, close the connection.
            conn.close()
        # finally, move the file to the already-processed ('tratado') directory
        hdfs.move("/user/datosaire/sin_tratar/"+k, "/user/datosaire/tratado/")
Example #7
def mrjob(options):
    "Generates and executes MR job script"

    user = os.getenv('USER')
    tstamp = int(time.time())
    hdir = hdfs_dir(options.hdir, options.hdfs_prefix)

    if PYDOOP:
        odir = hdfs.path.join(hdir, options.odir)
        idir = hdfs.path.join(hdir, options.idir)
        schema = hdfs.path.join(hdir, options.schema)
        for name in [
                hdir,
                odir,
                idir,
        ]:
            if options.verbose:
                print("Checking %s" % name)
            if not hdfs.path.isdir(name):
                if name in [hdir, idir]:
                    print("ERROR: %s does not exist" % name)
                    sys.exit(1)
                # else:
                #     print(" Creating output directory: %s" % name)
                #     hdfs.mkdir(name)
            elif name == odir:
                # in case odir exists and is not empty, move it somewhere and re-create
                if hdfs.ls(odir):
                    ocache = hdfs.path.normpath(odir) + '_%d' % tstamp
                    if options.verbose:
                        print(
                            " Non-empty output directory exists, saving it in %s"
                            % ocache)
                    hdfs.move(odir, ocache)
                    # hdfs.mkdir(odir)
                # if it's empty, remove it
                else:
                    hdfs.rmr(odir)

        if options.verbose:
            print("Checking %s" % schema)
        if not hdfs.path.isfile(schema):
            print("ERROR: %s does not exist" % schema)
            sys.exit(1)
    else:
        idir = '%s%s' % (hdir, 'data')
        odir = '%s%s' % (hdir, 'mrout')
        schema = '%s%s' % (hdir, options.schema)
        if options.verbose:
            msg = 'pydoop module is not present on this system'
            msg += ', will use input as is without checking'
            print('WARNING:', msg)
    for name in [options.mrpy, options.pydoop, options.avro]:
        if options.verbose:
            print("Checking %s" % name)
        if not os.path.isfile(name):
            print("ERROR: %s does not exist" % name)
            sys.exit(1)


#     module = os.path.basename(os.path.splitext(options.mrpy)[0])
    code = create_mrpy(options.mrpy, options.verbose)

    cmd = """#!/bin/bash
input={input}
output={output}
schema={schema}
ifile=/tmp/mr_{user}_{tstamp}.py
cat << EOF > $ifile
{code}
EOF

module=mr_{user}_{tstamp}
arch_pydoop={pydoop}
arch_avro={avro}
echo "Input URI : $input"
echo "Output URI: $output"
echo "Schema: $schema"
echo "MR script : $ifile"
echo "Module name : $module"
echo "Pydoop archive: $arch_pydoop"
echo "Avro archive  : $arch_avro"
echo "-----------------"
echo "Submitting MR job"
pydoop submit \
    --upload-archive-to-cache $arch_pydoop \
    --upload-archive-to-cache $arch_avro \
    -D avro.schema=$schema \
    --do-not-use-java-record-reader \
    --log-level {loglevel} \
    --job-name WMArchive \
    --num-reducers 1 \
    --upload-file-to-cache $ifile \
    --mrv2 $module $input $output
    """.format(input=idir,
               output=odir,
               user=user,
               tstamp=tstamp,
               code=code,
               schema=schema,
               loglevel=options.loglevel,
               pydoop=os.path.abspath(options.pydoop),
               avro=os.path.abspath(options.avro))

    fobj = NamedTemporaryFile(delete=False)
    fobj.write(cmd)
    fobj.close()

    fstat = os.stat(fobj.name)
    os.chmod(fobj.name, fstat.st_mode | stat.S_IEXEC)

    if options.execute:
        run(fobj.name, options.verbose)
    else:
        if options.verbose:
            print("------- Generated script --------")
        print(open(fobj.name, 'r').read())
        if options.verbose:
            print("---------------------------------")

    # clean up temporary file
    os.unlink(fobj.name)
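The hdfs.move call above implements an output-directory rotation before resubmitting the job. The same idiom in isolation, as a hypothetical helper:

import time
import pydoop.hdfs as hdfs

def rotate_output_dir(odir):
    # Hypothetical helper mirroring the logic above: a non-empty output
    # directory is moved aside with a timestamp suffix, an empty one is
    # simply removed so the MR job can recreate it.
    if hdfs.path.isdir(odir):
        if hdfs.ls(odir):
            hdfs.move(odir, hdfs.path.normpath(odir) + '_%d' % int(time.time()))
        else:
            hdfs.rmr(odir)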