Example #1
def main():
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = "/data1/gfz-fe/GeoMultiSens/database/sampledata/"

    connection = {
            "database": "usgscache",
            "user": "******",
            "password": "******",
            "host": "localhost",
            "connect_timeout": 3,
            "options": "-c statement_timeout=10000"
        }

    inputFormat = GMSDB(data_path, connection, 26184107)

    env = get_environment()
    level0Set = env.read_custom(data_path, ".*?\\.bsq", True, inputFormat)
    level1Set = level0Set.flat_map(L11Processor())
    # group_by(0).reduce() needs a ReduceFunction; CornerpointAdder as in Example #2
    level1SceneSet = level1Set.group_by(0).reduce(CornerpointAdder())
    level12Set = level1SceneSet.flat_map(L12Processor())

    # just to make the program complete
    result = level12Set.filter(DumbFilter())
    result.output()

    env.set_degree_of_parallelism(1)

    env.execute(local=True)
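DumbFilter is not defined in the snippet; per the "just to make the program complete" comment it only exists so the plan ends in a sink. A minimal pass-through sketch, assuming Flink's Python FilterFunction base class (the class body is an assumption, not GeoMultiSens code):

from flink.functions.FilterFunction import FilterFunction

class DumbFilter(FilterFunction):
    def filter(self, value):
        # hypothetical pass-through: accept every record unchanged
        return True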
Example #2
def main():
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = "/data1/gfz-fe/GeoMultiSens/database/sampledata/"

    connection = {
            "database": "usgscache",
            "user": "******",
            "password": "******",
            "host": "localhost",
            "connect_timeout": 3,
            "options": "-c statement_timeout=10000"
        }

    inputFormat = GMSDB(data_path, connection, 26184107)

    env = get_environment()
    level0Set = env.read_custom(data_path, ".*?\\.bsq", True, inputFormat)
    level1Set = level0Set.flat_map(L11Processor())
    level1SceneSet = level1Set.group_by(0).reduce(CornerpointAdder())
    level12Set = level1SceneSet.flat_map(L12Processor())
    level12Set.write_custom(PrintOutput("/opt/output"))

    env.set_parallelism(1)
    env.execute(local=True)
Example #3
def runner():
    env = get_environment()

    data = env.from_elements(tweets)
    # flat_map each element into (1, word) tuples, group by the word
    # (field 1), and sum the counts to get (count, word) tuples
    data \
        .flat_map(lambda x, c: [(1, word) for word in x.lower().split()]) \
        .group_by(1) \
        .reduce_group(Adder(), combinable=True) \
        .output()

    # execute the plan locally.
    env.execute(local=True)
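Adder is not defined in this snippet; Example #5 below shows its reduce method in context. Because the call passes combinable=True, the reducer may also run as a local combiner before the shuffle. A sketch with an explicit combine step, assuming the old flink-python GroupReduceFunction interface:

from flink.functions.GroupReduceFunction import GroupReduceFunction

class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        # all tuples in a group carry the same word; add up the counts
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))

    def combine(self, iterator, collector):
        # local pre-aggregation; partial (count, word) sums can safely be
        # summed again by reduce, which is what combinable=True relies on
        self.reduce(iterator, collector)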
Example #4
def main():
    env = get_environment()

    inputFormat = GDALInputFormat(26184107)

    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True, inputFormat)

    result = data \
        .flat_map(TupleToTile()) \
        .flat_map(Tokenizer()) \
        .filter(Filter())
    result.output()

    env.set_parallelism(1)

    env.execute(local=True)
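TupleToTile, Tokenizer, and Filter are GeoMultiSens-specific operators that the snippet leaves undefined. For reference, a FilterFunction in this API has the following shape (the predicate is a placeholder, not the project's actual logic):

from flink.functions.FilterFunction import FilterFunction

class Filter(FilterFunction):
    def filter(self, value):
        # placeholder predicate: drop empty records
        return value is not None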
Example #5
class Adder(GroupReduceFunction):
    def reduce(self, iterator, collector):
        # all tuples in a group carry the same word; sum up the counts
        count, word = iterator.next()
        count += sum([x[0] for x in iterator])
        collector.collect((count, word))


if __name__ == "__main__":
    output_file = 'file:/opt/project/out.txt'
    print('logging results to: %s' % (output_file, ))
    env = get_environment()
    data = env.from_elements("Who's there? I think I hear them. Stand, ho! Who's there?")
    data \
        .flat_map(lambda x, c: [(1, word) for word in x.lower().split()], (INT, STRING)) \
        .group_by(1) \
        .reduce_group(Adder(), (INT, STRING), combinable=True) \
        .map(lambda y: 'Count: %s Word: %s' % (y[0], y[1]), STRING) \
        .write_text(output_file, write_mode=WriteMode.OVERWRITE)
    env.execute(local=True)
Example #6
def runner():
    env = get_environment()

    data = env.from_elements(tweets)
    # build consecutive-word bigrams from each tweet as (1, bigram) tuples,
    # group by bigram, sum the counts, and upsert the totals into MongoDB
    data \
        .flat_map(lambda tweet, c: [y for x in [[(1, word) for word in [x.lower().split(' ')[i] + ' ' + x.lower().split(' ')[1+i] for i in range(0, len(x.lower().split(' '))-1)]] 
            for x in tweet] for y in x]) \
        .group_by(1) \
        .reduce_group(Adder(), combinable=True) \
        .map(lambda y: cl.insert({"_id":y[1], "value":y[0]}) if cl.find({"_id":y[1]}).count() == 0 else cl.update({"_id":y[1]}, {"value": cl.find_one({"_id":y[1]})['value'] + 
            y[0]})) \
        .output()

    # execute the plan locally.
    env.execute(local=True)
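The handle cl is never defined in the snippet; judging from the insert/find/update calls it is a MongoDB collection opened with pymongo, along these lines (host, database, and collection names are assumptions):

from pymongo import MongoClient

# hypothetical connection; adjust host, database and collection to your setup
cl = MongoClient('localhost', 27017)['tweets']['bigram_counts']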
Example #7
def main():
    env = get_environment()

    inputFormat = GDALInputFormat(26184107)

    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True,
                           inputFormat)

    result = data \
        .flat_map(TupleToTile()) \
        .flat_map(Tokenizer())

    result.write_custom(GMSOF("/opt/output"))

    env.set_parallelism(1)

    env.execute(local=True)
Example #8
def classifier():
    env = get_environment()

    data = env.from_elements(
        "Hillary is going to be a better president than Trump #usElection")

    # flat_map extracts the bigrams from the string (the tweet)
    # a map function then attaches each bigram's probability of occurring in each class (taken from the database)
    # the map output has to be filtered: bigrams with probability 0 are dropped
    # a reduce function takes the product of the probabilities
    # finally the data is normalized and formatted for the output file
    data \
        .flat_map(lambda tweet, c: [(1, word) for word in [tweet.split(' ')[i] + ' ' + tweet.split(' ')[1+i] for i in range(0, len(tweet.split(' '))-1)]]) \
        .map(lambda x: classify(x)) \
        .filter(lambda x: x[0] != 0 and x[1] != 0) \
        .reduce(Product()) \
        .map(lambda y: 'Class 1 probability: %s Class 2 probability: %s' % (y[0] * (cl1.find().count() / (cl1.find().count() + cl2.find().count())),
            y[1] * (cl2.find().count() / (cl1.find().count() + cl2.find().count())))) \
        .write_text(output_file)

    # execute the plan locally.
    env.execute(local=True)
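Product is not shown either; as the comments above state, the reduce step takes the product of the probabilities, i.e. it multiplies the per-bigram probability pairs element-wise. A sketch, assuming Flink's Python ReduceFunction base class:

from flink.functions.ReduceFunction import ReduceFunction

class Product(ReduceFunction):
    def reduce(self, value1, value2):
        # multiply class-1 and class-2 probabilities pairwise
        return (value1[0] * value2[0], value1[1] * value2[1])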
Example #9
def main():
    env = get_environment()
    env.set_sendLargeTuples(True)

    inputFormat = GDALInputFormat(26184107)

    data = env.read_custom("/opt/gms_sample/", ".*?\\.bsq", True, inputFormat)

    #result = data \
    #    .flat_map(TupleToTile()) \
    #    .flat_map(Tokenizer()) \
    #    .flat_map(TileToTuple())

    result = data.filter(Filter())

    result.write_custom(GMSOF("/opt/output"))

    filtered = result.filter(Filter())
    filtered.write_custom(GMSOF("/opt/output"))

    env.set_parallelism(2)

    env.execute(local=False)
Example #10
            x = iterator.next()
            second_edge = [x[0], x[1]]
            higher_vertex_id = second_edge[1]

            for lowerVertexId in vertices:
                collector.collect((first_edge[0], lowerVertexId, higher_vertex_id))
            vertices.append(higher_vertex_id)


class TriadFilter(JoinFunction):
    def join(self, value1, value2):
        return value1


if __name__ == "__main__":
    env = get_environment()
    edges = env.from_elements(
        (1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 5), (3, 4), (3, 7), (3, 8), (5, 6), (7, 8))

    edges_with_degrees = edges \
        .flat_map(EdgeDuplicator(), [INT, INT]) \
        .group_by(0) \
        .sort_group(1, Order.ASCENDING) \
        .reduce_group(DegreeCounter(), [INT, INT, INT, INT]) \
        .group_by(0, 2) \
        .reduce(DegreeJoiner())

    edges_by_degree = edges_with_degrees \
        .map(EdgeByDegreeProjector(), [INT, INT])

    edges_by_id = edges_by_degree \

Example #11
def json_to_tuple(js, fields):
    return tuple([str(js.get(f, '')) for f in fields])


if __name__ == "__main__":
    # get the base path out of the runtime params
    base_path = sys.argv[1]

    # setup paths to input and output files on disk
    dim_file = 'file://' + base_path + '/data_enrichment/dimensional_data.csv'
    input_file = 'file://' + base_path + '/data_enrichment/input_data.csv'
    output_file = 'file://' + base_path + '/data_enrichment/out.txt'

    # remove the output file, if there is one there already
    # (os.path works on plain paths, so strip the file:// scheme first)
    local_out = output_file[len('file://'):]
    if os.path.isfile(local_out):
        os.remove(local_out)

    # set up the environment with a text file source
    env = get_environment()
    input_data = env.read_text(input_file)
    dimensional_data = env.read_csv(dim_file, types=[STRING, STRING])

    input_data \
        .map(lambda x: json_to_tuple(json.loads(x), ['car', 'attr']), (STRING, STRING)) \
        .join(dimensional_data).where(1).equal_to(0) \
        .map(lambda x: 'This %s is %s' % (x[0][0], x[1][1]), STRING) \
        .write_text(output_file, write_mode=WriteMode.OVERWRITE)

    env.execute(local=True)
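To make the enrichment concrete: json_to_tuple projects the parsed JSON onto the requested fields and defaults missing ones to an empty string (the values below are made up for illustration):

>>> json_to_tuple({'car': 'Audi', 'attr': 'fast'}, ['car', 'attr'])
('Audi', 'fast')
>>> json_to_tuple({'car': 'Audi'}, ['car', 'attr'])
('Audi', '')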
Example #12
from flink.plan.Environment import get_environment
from flink.functions.MapFunction import MapFunction
from flink.functions.CrossFunction import CrossFunction
from flink.functions.JoinFunction import JoinFunction
from flink.functions.CoGroupFunction import CoGroupFunction
from flink.functions.Aggregation import Max, Min, Sum
from utils import Verify, Verify2, Id

# Test multiple jobs in one Python plan file
if __name__ == "__main__":
    env = get_environment()
    env.set_parallelism(1)

    d1 = env.from_elements(1, 6, 12)
    d1 \
        .first(1) \
        .map_partition(Verify([1], "First with multiple jobs in one Python plan file")).output()

    env.execute(local=True)
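Verify, Verify2 and Id come from a local utils module that is not shown. A plausible shape for Verify, assuming it is a MapPartitionFunction that compares the partition's contents with the expected list (a sketch, not the actual utils code):

from flink.functions.MapPartitionFunction import MapPartitionFunction

class Verify(MapPartitionFunction):
    def __init__(self, expected, name):
        super(Verify, self).__init__()
        self.expected = expected
        self.name = name

    def map_partition(self, iterator, collector):
        # drain the partition and compare it against the expectation
        actual = [value for value in iterator]
        if actual != self.expected:
            raise Exception("%s failed: expected %s, got %s"
                            % (self.name, self.expected, actual))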

    env2 = get_environment()
    env2.set_parallelism(1)

    d2 = env2.from_elements(1, 1, 12)
    d2 \
        .map(lambda x: x * 2) \