Esempio n. 1
0
def main():
    """Upload the sample CSV to Hadoop and run two group-by/sum jobs.

    Each job's result is printed and stored in ``cache`` (a mapping defined
    outside this function) under the tuple of mapper arguments that produced
    it. The cache is then written to ``sample_output.txt`` as its ``str()``
    representation.
    """
    # Paths come from config; an explicit setup would look like:
    # Hadoop('bin/hadoop',
    #        '/usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar')
    engine = Hadoop(config.HADOOP_PATH, config.HADOOP_STREAMING_PATH)

    # Put files into Hadoop
    file_path = 'h1b_kaggle_1.csv'
    file_name = os.path.basename(file_path)

    engine.put_file(local_src=file_path, hadoop_dest=file_name, override=False)

    # Map-Reduce Tasks: default output_dir is 'output'
    #
    # mapper_arguments for group_by_mapper are
    # [group-by column, aggregate column], e.g. [5, 6] groups by the
    # 5th column and aggregates the 6th column.
    mapper_argument_sets = ([3, 6], [5, 6])

    for mapper_arguments in mapper_argument_sets:
        # NOTE(review): data_src is the local path while the upload used
        # file_name as the Hadoop destination; they are equal here because
        # file_path has no directory part -- confirm which one map_reduce
        # expects before pointing file_path at a nested location.
        result = engine.map_reduce(data_src=file_path,
                                   mapper='group_by_mapper.py',
                                   mapper_arguments=mapper_arguments,
                                   reducer='value_summation_reducer.py')

        print('output is', result)

        # Key every result by the arguments that actually produced it, so
        # the [5, 6] output is no longer stored under (3, 6) and the
        # [3, 6] output is no longer dropped.
        cache[tuple(mapper_arguments)] = result

    with open('sample_output.txt', 'w') as out_file:
        out_file.write(str(cache))