Exemple #1
0
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """Route every (destination, count) record to one shared reduce key."""

    def map(self, context):
        """Decode the packed count and emit ``(dst, count)`` under key ``0``."""
        destination = context.key
        # The value is unpacked as a big-endian signed 32-bit integer;
        # struct.unpack returns a 1-tuple, hence the tuple unpacking.
        (count,) = struct.unpack(">i", context.value)
        context.emit(0, (destination, count))


class FilterReducer(Reducer):
    """Emit the destinations with the highest counts, largest first."""

    # Maximum number of (dst, count) records emitted by reduce().
    TOP_N = 50

    def reduce(self, context):
        """Emit up to ``TOP_N`` ``(dst, count)`` pairs in descending count order.

        Every item of ``context.values`` is indexed as ``(dst, count)``.
        When the same ``dst`` occurs more than once, the last count seen
        replaces the earlier ones. Entries with equal counts keep the order
        in which their ``dst`` was first seen (``sorted`` is stable).
        """
        counts = {pair[0]: pair[1] for pair in context.values}
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

        # Slice rather than index a fixed range: with fewer than TOP_N
        # distinct destinations a fixed range(50) would raise IndexError.
        for dst, cnt in ranked[:self.TOP_N]:
            context.emit(dst, cnt)


if __name__ == "__main__":
    # Script entry point: build a factory from the mapper and reducer
    # classes defined above and hand it to run_task.
    factory = Factory(FilterMapper, FilterReducer)
    run_task(factory)
Exemple #2
0
from pydoop.mapreduce.api import Mapper, Reducer
"""
Count followers of each node
Input : directed graph
    e.g.) "3   4" indicates that person 3 follows person 4.
Output : (destination, follower count)
    e.g.) "4 2" node 4 has 2 followers.
"""


class DstCountMapper(Mapper):
    """Emit ``(destination, 1)`` for every edge line of the input graph."""

    def map(self, context):
        """Emit one count for the destination node of an edge.

        ``context.value`` is split on whitespace and its second field is
        used as the destination. Lines with fewer than two fields (for
        example blank lines) are skipped instead of raising ``IndexError``.
        """
        fields = context.value.split()
        if len(fields) < 2:
            return
        context.emit(fields[1], 1)


class DstCountReducer(Reducer):
    """Sum the per-edge counts collected for each destination node."""

    def reduce(self, context):
        """Emit ``(destination, follower count)`` for the current key.

        The follower count is the sum of ``context.values`` for
        ``context.key``.

        NOTE(review): the script entry point passes ``auto_serialize=False``
        to ``run_task``; confirm that an int value is accepted on the output
        path, otherwise serialize the total explicitly here.
        """
        context.emit(context.key, sum(context.values))


if __name__ == "__main__":
    # This snippet's import line only brings in Mapper and Reducer, so
    # import the task runner and factory here to avoid a NameError when the
    # script is run on its own.
    from pydoop.mapreduce.pipes import run_task, Factory

    factory = Factory(DstCountMapper, DstCountReducer)
    run_task(factory, auto_serialize=False)
Exemple #3
0
        self.ctx = ctx
        LOGGER.info("Mapper instantiated")

    def map(self, ctx):
        """Emit ``(word, 1)`` for each alphanumeric word in ``ctx.value``."""
        # Replace every run of non-alphanumeric characters with one space,
        # then split on whitespace to obtain the words.
        cleaned = re.sub('[^0-9a-zA-Z]+', ' ', ctx.value)
        for word in cleaned.split():
            ctx.emit(word, 1)


class TReducer(Reducer):
    """Reducer that totals the values of each key and emits the sum as text."""

    def __init__(self, ctx):
        """Initialise the base reducer, keep ``ctx`` and log the creation."""
        super(TReducer, self).__init__(ctx)
        self.ctx = ctx
        LOGGER.info("Reducer instantiated")

    def reduce(self, ctx):
        """Emit ``(key, str(total))`` for the current key."""
        total = sum(ctx.values)
        # The value is deliberately written out as a str, not as an int.
        text = str(total)
        ctx.emit(ctx.key, text)


# Module-level task factory built from TMapper and TReducer; main() passes
# it to run_task.
FACTORY = Factory(mapper_class=TMapper, reducer_class=TReducer)


def main():
    """Run the task described by the module-level ``FACTORY``."""
    run_task(FACTORY)


if __name__ == "__main__":
    # Only start the task when the module is executed as a script.
    main()
Exemple #4
0
import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class FilterMapper(Mapper):
    """
    Filter a wordcount output stream: a record is passed on only when its
    count is equal to or above the configured threshold.
    """
    def __init__(self, context):
        """Read the minimum occurrence count from the job configuration."""
        super(FilterMapper, self).__init__(context)
        self.threshold = context.job_conf.get_int(
            "filter.occurrence.threshold"
        )

    def map(self, context):
        """Emit ``(word, str(count))`` when the count reaches the threshold."""
        word = context.key
        # The value is unpacked as a big-endian signed 32-bit integer.
        (count,) = struct.unpack(">i", context.value)
        if count < self.threshold:
            return
        context.emit(word, str(count))


class FilterReducer(Reducer):
    """Reducer whose ``reduce`` does nothing, so it emits no records."""
    def reduce(self, context):
        # No-op: nothing is emitted for any key.
        pass


if __name__ == "__main__":
    # Script entry point: run the task with FilterMapper and FilterReducer.
    run_task(Factory(FilterMapper, FilterReducer))
Exemple #5
0
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import struct
import re

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):
    """Break each input value into alphanumeric words and count each once."""

    def map(self, context):
        """Emit ``(word, 1)`` for every alphanumeric word in the value."""
        # Runs of non-alphanumeric characters act as word separators.
        normalized = re.sub('[^0-9a-zA-Z]+', ' ', context.value)
        for word in normalized.split():
            context.emit(word, 1)


class WordCountReducer(Reducer):
    """Total the counts of each word and emit the sum in packed binary form."""

    def reduce(self, context):
        """Emit ``(key, packed_total)`` for the current key."""
        total = sum(context.values)
        # Pack the total as a big-endian signed 32-bit integer.
        packed_total = struct.pack(">i", total)
        context.emit(context.key, packed_total)


if __name__ == "__main__":
    # Script entry point: run the task with the word-count mapper and reducer.
    run_task(Factory(WordCountMapper, WordCountReducer))
Exemple #6
0
"""
Filter out words whose occurrence falls below a specified value.
"""

import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper


class FilterMapper(Mapper):
    """
    Read wordcount output records and forward only those whose count is
    equal to or above the configured threshold.
    """
    def __init__(self, context):
        """Load the occurrence threshold from the job configuration."""
        super(FilterMapper, self).__init__(context)
        job_conf = context.job_conf
        threshold = job_conf.get_int("filter.occurrence.threshold")
        self.threshold = threshold

    def map(self, context):
        """Emit ``(word, str(count))`` for counts at or above the threshold."""
        word = context.key
        raw_count = context.value
        # The raw value is unpacked as a big-endian signed 32-bit integer.
        count = struct.unpack(">i", raw_count)[0]
        if not count < self.threshold:
            context.emit(word, str(count))


if __name__ == "__main__":
    # Script entry point: the factory gets a mapper only (no reducer class).
    factory = Factory(FilterMapper)
    # NOTE(review): raw_values=True presumably makes map() receive the packed
    # bytes it unpacks with struct -- confirm against the pydoop docs.
    run_task(factory, raw_values=True)
Exemple #7
0
# under the License.
#
# END_COPYRIGHT

import struct

from pydoop.mapreduce.pipes import run_task, Factory
from pydoop.mapreduce.api import Mapper, Reducer


class WordCountMapper(Mapper):
    """Split each input value on whitespace and count every token once."""

    def map(self, context):
        """Emit ``(token, 1)`` for each whitespace-separated token."""
        tokens = context.value.split()
        for token in tokens:
            context.emit(token, 1)


class WordCountReducer(Reducer):
    """Add up the counts collected for each word."""

    def reduce(self, context):
        """Emit ``(key, total)`` where ``total`` is the sum of the values.

        NOTE(review): the total is emitted as a plain int while the script
        entry point passes ``auto_serialize=False`` to ``run_task``; confirm
        that the output path accepts it unserialized.
        """
        total = sum(context.values)
        context.emit(context.key, total)


if __name__ == "__main__":
    # Script entry point: build the word-count factory and run the task with
    # auto_serialize disabled.
    factory = Factory(WordCountMapper, WordCountReducer)
    run_task(factory, auto_serialize=False)
Exemple #8
0
def main():
    """Run a task built from ``Mapper`` and ``Reducer`` and return its result.

    ``Reducer`` is also passed as the factory's ``combiner_class``; the value
    returned is whatever ``run_task`` returns.
    """
    return run_task(Factory(Mapper, Reducer, combiner_class=Reducer))