def main():
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = "/data1/gfz-fe/GeoMultiSens/database/sampledata/"

    connection = {
        "database": "usgscache",
        "user": "******",
        "password": "******",
        "host": "localhost",
        "connect_timeout": 3,
        "options": "-c statement_timeout=10000"
    }

    inputFormat = GMSDB(data_path, connection, 26184107)

    env = get_environment()
    level0Set = env.read_custom(data_path, ".*?\\.bsq", True, inputFormat)
    level1Set = level0Set.flat_map(L11Processor())
    # group tiles by scene id and merge them back into scenes
    level1SceneSet = level1Set.group_by(0).reduce(CornerpointAdder())
    level12Set = level1SceneSet.flat_map(L12Processor())

    # just to make program complete
    result = level12Set.filter(DumbFilter())
    result.output()

    env.set_degree_of_parallelism(1)
    env.execute(local=True)
def main():
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = "/data1/gfz-fe/GeoMultiSens/database/sampledata/"

    connection = {
        "database": "usgscache",
        "user": "******",
        "password": "******",
        "host": "localhost",
        "connect_timeout": 3,
        "options": "-c statement_timeout=10000"
    }

    inputFormat = GMSDB(data_path, connection, 26184107)

    env = get_environment()
    level0Set = env.read_custom(data_path, ".*?\\.bsq", True, inputFormat)
    level1Set = level0Set.flat_map(L11Processor())
    level1SceneSet = level1Set.group_by(0).reduce(CornerpointAdder())
    level12Set = level1SceneSet.flat_map(L12Processor())

    level12Set.write_custom(PrintOutput("/opt/output"))

    env.set_parallelism(1)
    env.execute(local=True)
def runner():
    env = get_environment()
    data = env.from_elements(tweets)

    # flat_map each tweet into (1, word) tuples, group by the word,
    # and sum-aggregate the counts to get (count, word) tuples
    data \
        .flat_map(lambda x, c: [(1, word) for word in x.lower().split()]) \
        .group_by(1) \
        .reduce_group(Adder(), combinable=True) \
        .output()

    # execute the plan locally.
    env.execute(local=True)
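The Adder group-reduce used above is not defined in this snippet; a minimal sketch, matching the combinable reducer used by the word-count job later in this collection (the import path assumes the old Flink Python DataSet API):

from flink.functions.GroupReduceFunction import GroupReduceFunction

class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        # take the first (count, word) tuple, add the counts of the remaining
        # tuples in the group, and emit a single (total_count, word) tuple
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))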
def main():
    env = get_environment()
    inputFormat = GDALInputFormat(26184107)
    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True, inputFormat)

    result = data \
        .flat_map(TupleToTile()) \
        .flat_map(Tokenizer()) \
        .filter(Filter())

    result.output()

    env.set_parallelism(1)
    env.execute(local=True)
# imports and class wrapper for the old Flink Python (DataSet) API used throughout these snippets
from flink.plan.Environment import get_environment
from flink.plan.Constants import INT, STRING, WriteMode
from flink.functions.GroupReduceFunction import GroupReduceFunction


class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))


if __name__ == "__main__":
    output_file = 'file:/opt/project/out.txt'
    print('logging results to: %s' % (output_file, ))

    env = get_environment()
    data = env.from_elements("Who's there? I think I hear them. Stand, ho! Who's there?")

    data \
        .flat_map(lambda x, c: [(1, word) for word in x.lower().split()], (INT, STRING)) \
        .group_by(1) \
        .reduce_group(Adder(), (INT, STRING), combinable=True) \
        .map(lambda y: 'Count: %s Word: %s' % (y[0], y[1]), STRING) \
        .write_text(output_file, write_mode=WriteMode.OVERWRITE)

    env.execute(local=True)
def runner():
    env = get_environment()
    data = env.from_elements(tweets)

    # extract (1, bigram) tuples from each tweet, group by the bigram and
    # sum-aggregate the counts, then upsert each (count, bigram) into MongoDB
    data \
        .flat_map(lambda tweet, c:
                  [y
                   for x in [[(1, word)
                              for word in [x.lower().split(' ')[i] + ' ' + x.lower().split(' ')[1+i]
                                           for i in range(0, len(x.lower().split(' ')) - 1)]]
                             for x in tweet]
                   for y in x]) \
        .group_by(1) \
        .reduce_group(Adder(), combinable=True) \
        .map(lambda y: cl.insert({"_id": y[1], "value": y[0]})
             if cl.find({"_id": y[1]}).count() == 0
             else cl.update({"_id": y[1]}, {"value": cl.find_one({"_id": y[1]})['value'] + y[0]})) \
        .output()

    # execute the plan locally.
    env.execute(local=True)
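The upsert in the final map relies on a pymongo collection handle `cl` that is not shown here; a minimal sketch, where the host, database, and collection names are assumptions:

from pymongo import MongoClient

# hypothetical connection; host, database, and collection names are placeholders
client = MongoClient('localhost', 27017)
cl = client['tweets']['bigram_counts']   # documents look like {"_id": bigram, "value": count}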
def main():
    env = get_environment()
    inputFormat = GDALInputFormat(26184107)
    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True, inputFormat)

    result = data \
        .flat_map(TupleToTile()) \
        .flat_map(Tokenizer())

    result.write_custom(GMSOF("/opt/output"))

    env.set_parallelism(1)
    env.execute(local=True)
def classifier():
    env = get_environment()
    data = env.from_elements(
        "Hillary is going to be a better president than Trump #usElection")

    # flat_map extracts the bigrams from the string (tweet).
    # A map function then attaches the probability of each bigram occurring in
    # each class (looked up in the database).
    # The output of that map is filtered: bigrams with probability 0 are dropped.
    # A reduce function computes the product of the remaining probabilities.
    # Finally the results are normalized and formatted for the output file.
    data \
        .flat_map(lambda tweet, c: [(1, word) for word in
                                    [tweet.split(' ')[i] + ' ' + tweet.split(' ')[1+i]
                                     for i in range(0, len(tweet.split(' ')) - 1)]]) \
        .map(lambda x: classify(x)) \
        .filter(lambda x: x[0] != 0 and x[1] != 0) \
        .reduce(Product()) \
        .map(lambda y: 'Class 1 probability: %s Class 2 probability: %s'
             % (y[0] * (cl1.find().count() / (cl1.find().count() + cl2.find().count())),
                y[1] * (cl2.find().count() / (cl1.find().count() + cl2.find().count())))) \
        .write_text(output_file)

    # execute the plan locally.
    env.execute(local=True)
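The Product reducer used above is not defined in this snippet; a minimal sketch of an element-wise product of the two per-class probabilities, assuming the old Flink Python ReduceFunction interface:

from flink.functions.ReduceFunction import ReduceFunction

class Product(ReduceFunction):
    def reduce(self, value1, value2):
        # multiply the class-1 and class-2 probabilities element-wise
        return (value1[0] * value2[0], value1[1] * value2[1])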
def main():
    env = get_environment()
    env.set_sendLargeTuples(True)

    inputFormat = GDALInputFormat(26184107)
    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True, inputFormat)

    #result = data \
    #    .flat_map(TupleToTile()) \
    #    .flat_map(Tokenizer()) \
    #    .flat_map(TileToTuple())

    result = data.filter(Filter())
    result.write_custom(GMSOF("/opt/output"))

    filtered = result.filter(Filter())
    filtered.write_custom(GMSOF("/opt/output"))

    env.set_parallelism(2)
    env.execute(local=False)
# tail of the group-reduce that emits a candidate triad for each second edge
# against all previously collected lower vertex ids
x = iterator.next()
second_edge = [x[0], x[1]]
higher_vertex_id = second_edge[1]
for lowerVertexId in vertices:
    collector.collect((first_edge[0], lowerVertexId, higher_vertex_id))
vertices.append(higher_vertex_id)


class TriadFilter(JoinFunction):
    def join(self, value1, value2):
        return value1


if __name__ == "__main__":
    env = get_environment()
    edges = env.from_elements(
        (1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 5), (3, 4), (3, 7), (3, 8), (5, 6), (7, 8))

    edges_with_degrees = edges \
        .flat_map(EdgeDuplicator(), [INT, INT]) \
        .group_by(0) \
        .sort_group(1, Order.ASCENDING) \
        .reduce_group(DegreeCounter(), [INT, INT, INT, INT]) \
        .group_by(0, 2) \
        .reduce(DegreeJoiner())

    edges_by_degree = edges_with_degrees \
        .map(EdgeByDegreeProjector(), [INT, INT])

    edges_by_id = edges_by_degree \
def json_to_tuple(js, fields):
    return tuple([str(js.get(f, '')) for f in fields])


if __name__ == "__main__":
    # get the base path out of the runtime params
    base_path = sys.argv[1]

    # set up paths to input and output files on disk
    dim_file = 'file://' + base_path + '/data_enrichment/dimensional_data.csv'
    input_file = 'file://' + base_path + '/data_enrichment/input_data.csv'
    output_file = 'file://' + base_path + '/data_enrichment/out.txt'

    # remove the output file, if there is one there already
    if os.path.isfile(output_file):
        os.remove(output_file)

    # set up the environment with a text file source
    env = get_environment()
    input_data = env.read_text(input_file)
    dimensional_data = env.read_csv(dim_file, types=[STRING, STRING])

    input_data \
        .map(lambda x: json_to_tuple(json.loads(x), ['car', 'attr']), (STRING, STRING)) \
        .join(dimensional_data).where(1).equal_to(0) \
        .map(lambda x: 'This %s is %s' % (x[0][0], x[1][1]), STRING) \
        .write_text(output_file, write_mode=WriteMode.OVERWRITE)

    env.execute(local=True)
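For illustration, a hypothetical input record and dimensional row (the values are made up) showing how json_to_tuple feeds the join above:

import json

def json_to_tuple(js, fields):
    # same helper as above: pull the listed fields out of a parsed JSON object
    return tuple([str(js.get(f, '')) for f in fields])

# hypothetical input line and dimensional row:
#   input_data.csv:        {"car": "Tesla", "attr": "electric"}
#   dimensional_data.csv:  electric,battery-powered
record = '{"car": "Tesla", "attr": "electric"}'
print(json_to_tuple(json.loads(record), ['car', 'attr']))   # -> ('Tesla', 'electric')
# the join matches field 1 ('electric') against field 0 of the CSV row,
# and the final map renders 'This Tesla is battery-powered'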
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
from flink.plan.Environment import get_environment
from flink.functions.MapFunction import MapFunction
from flink.functions.CrossFunction import CrossFunction
from flink.functions.JoinFunction import JoinFunction
from flink.functions.CoGroupFunction import CoGroupFunction
from flink.functions.Aggregation import Max, Min, Sum

from utils import Verify, Verify2, Id

# Test multiple jobs in one Python plan file
if __name__ == "__main__":
    env = get_environment()
    env.set_parallelism(1)

    d1 = env.from_elements(1, 6, 12)

    d1 \
        .first(1) \
        .map_partition(Verify([1], "First with multiple jobs in one Python plan file")).output()

    env.execute(local=True)

    env2 = get_environment()
    env2.set_parallelism(1)

    d2 = env2.from_elements(1, 1, 12)

    d2 \
        .map(lambda x: x * 2) \