def getCoordinatesMinMax_bis(dir, sc):
    """Return the min/max of ``mymath.getL`` and ``mymath.getB`` over all rows.

    Every non-empty line is split on ``,``; columns 6 and 9 (ra and decl,
    per the original comment) are converted to float and passed to
    ``mymath.getL`` and ``mymath.getB``.

    Args:
        dir: Path passed to ``sc.textFile``. When the string contains
            ``"csv"`` it is read as-is (single file); otherwise
            ``"/*.csv"`` is appended so all CSV files under it are read.
        sc: Object exposing ``textFile`` (presumably a SparkContext --
            confirm against the caller).

    Returns:
        A list ``[min_l, max_l, min_b, max_b]``.

    Raises:
        IndexError: if the input has no non-empty line, because the
            collected result is then empty.
    """
    # NOTE(review): this is a substring test, so a directory whose path merely
    # contains "csv" is also treated as a single file -- confirm intended.
    if "csv" in dir:
        rdd = sc.textFile(dir)
    else:
        rdd = sc.textFile(dir + "/*.csv")

    def to_bounds(line):
        # Parse the row and evaluate each coordinate once; the same value
        # seeds both the running minimum and the running maximum.
        fields = line.split(',')
        ra = float(fields[6])
        decl = float(fields[9])
        l_value = mymath.getL(ra, decl)
        b_value = mymath.getB(ra, decl)
        # Constant key 1: every row is folded into one accumulator.
        return (1, [l_value, l_value, b_value, b_value])

    def merge_bounds(x, y):
        return [min(x[0], y[0]), max(x[1], y[1]),
                min(x[2], y[2]), max(x[3], y[3])]

    result = (
        rdd.filter(lambda line: len(line) > 0)
        .map(to_bounds)
        .reduceByKey(merge_bounds)
        .collect()
    )
    # Single key -> single (key, bounds) pair.
    return result[0][1]
def partitioning_V3(dir, dir_result, sc, dict):
    """Tag every CSV line with its block number(s) and save the result.

    For each non-empty line, columns 6 and 9 are converted to float, passed
    through ``mymath.getL`` / ``mymath.getB`` and handed, together with the
    raw line, to ``dict.get_block_number_with_margins``. The returned string
    is split on ``"_"`` into entries, each entry on ``":"`` into
    ``(int(block), payload)``; the pairs are partitioned into
    ``len(dict.dictOfCoord)`` partitions and written with
    ``saveAsHadoopFile``.

    Args:
        dir: Path passed to ``sc.textFile``. When the string contains
            ``"csv"`` it is read as-is (single file); otherwise
            ``"/*.csv"`` is appended.
        dir_result: Output path given to ``saveAsHadoopFile``.
        sc: Object exposing ``textFile`` (presumably a SparkContext --
            confirm against the caller).
        dict: Object providing ``get_block_number_with_margins`` and
            ``dictOfCoord``. (The name shadows the builtin but is part of
            the signature, so it is kept.)

    Returns:
        Whatever ``saveAsHadoopFile`` returns.
    """
    # NOTE(review): this is a substring test, so a directory whose path merely
    # contains "csv" is also treated as a single file -- confirm intended.
    if "csv" in dir:
        rdd = sc.textFile(dir)
    else:
        rdd = sc.textFile(dir + "/*.csv")

    def tag_line(line):
        # Split the row once and parse each coordinate once.
        fields = line.split(",")
        ra_text, decl_text = fields[6], fields[9]
        ra = float(ra_text)
        decl = float(decl_text)
        return dict.get_block_number_with_margins(
            mymath.getL(ra, decl), mymath.getB(ra, decl), line
        )

    # NOTE(review): entries are split on "_" and ":" below; if the raw line
    # embedded in the tagged string can itself contain either character, the
    # payload would be cut -- verify against get_block_number_with_margins.
    return (
        rdd.filter(lambda line: len(line) > 0)
        .map(tag_line)
        .flatMap(lambda tagged: tagged.split("_"))
        .map(lambda entry: entry.split(':'))
        .map(lambda parts: (int(parts[0]), parts[1]))
        .partitionBy(len(dict.dictOfCoord))
        .saveAsHadoopFile(dir_result, "org.apache.hadoop.mapred.TextOutputFormat")
    )
def getNbLinePerPatition_V3(dir, sc, dict):
    """Count how many lines fall into each block number.

    For each non-empty line, columns 6 and 9 are converted to float, passed
    through ``mymath.getL`` / ``mymath.getB`` and handed to
    ``dict.get_block_number_with_margins`` with ``1`` as the third argument.
    The returned string is split on ``"_"`` into entries; the integer before
    the ``":"`` of each entry is the block number that gets counted.

    Args:
        dir: Path passed to ``sc.textFile``. When the string contains
            ``"csv"`` it is read as-is (single file); otherwise
            ``"/*.csv"`` is appended.
        sc: Object exposing ``textFile`` (presumably a SparkContext --
            confirm against the caller).
        dict: Object providing ``get_block_number_with_margins``. (The name
            shadows the builtin but is part of the signature, so it is kept.)

    Returns:
        A list of ``(block_number, count)`` pairs ordered by block number.
    """
    # NOTE(review): this is a substring test, so a directory whose path merely
    # contains "csv" is also treated as a single file -- confirm intended.
    if "csv" in dir:
        rdd = sc.textFile(dir)
    else:
        rdd = sc.textFile(dir + "/*.csv")

    def block_tags(line):
        # Split the row once and parse each coordinate once.
        fields = line.split(",")
        ra_text, decl_text = fields[6], fields[9]
        ra = float(ra_text)
        decl = float(decl_text)
        return dict.get_block_number_with_margins(
            mymath.getL(ra, decl), mymath.getB(ra, decl), 1
        )

    # Aggregate first, then sort: sorting before reduceByKey would sort every
    # record and the order would not survive the reduce shuffle anyway.
    tab = (
        rdd.filter(lambda line: len(line) > 0)
        .map(block_tags)
        .flatMap(lambda tagged: tagged.split("_"))
        .map(lambda entry: (int(entry.split(':')[0]), 1))
        .reduceByKey(lambda x, y: x + y)
        .sortByKey()
        .collect()
    )
    return tab