Example #1
def run_job():
    """
    Runs the Hadoop pipes task through Pydoop
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    return runTask(Factory(mapper, reducer))
Example #2

"""
This example includes only the bare minimum required to run
wordcount. See wordcount-full.py for an example that uses counters,
RecordReader, etc.
"""

import pydoop.pipes as pp


class Mapper(pp.Mapper):

  def map(self, context):
    words = context.getInputValue().split()
    for w in words:
      context.emit(w, "1")


class Reducer(pp.Reducer):

  def reduce(self, context):
    s = 0
    while context.nextValue():
      s += int(context.getInputValue())
    context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
  pp.runTask(pp.Factory(Mapper, Reducer))
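
For a quick sanity check of the same logic without a Hadoop cluster, here is a minimal, self-contained sketch (plain Python, no pydoop involved) that mirrors the map, shuffle, and reduce phases the classes above implement:

from collections import defaultdict


def local_wordcount(lines):
    # map phase: one (word, 1) pair per token, as Mapper.map above
    pairs = [(w, 1) for line in lines for w in line.split()]
    # shuffle phase: group values by key, as the framework does between tasks
    groups = defaultdict(list)
    for k, v in pairs:
        groups[k].append(v)
    # reduce phase: sum each group's values, as Reducer.reduce above
    return dict((k, sum(vs)) for k, vs in groups.items())


print(local_wordcount(["a b a", "b c"]))  # {'a': 2, 'b': 2, 'c': 1} (ordering may vary)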
Example #3

import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask


class WordCountMapper(Mapper):

  def map(self, context):
    words = context.getInputValue().split()
    for w in words:
      context.emit(w, "1")


class WordCountReducer(Reducer):

  def reduce(self, context):
    s = 0
    while context.nextValue():
      s += int(context.getInputValue())
    context.emit(context.getInputKey(), struct.pack(">i", s))


if __name__ == "__main__":
  runTask(Factory(WordCountMapper, WordCountReducer))
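
Unlike Example #2, this reducer emits its counts in binary: struct.pack(">i", s) serializes each sum as a 4-byte big-endian integer instead of a decimal string, which is the format the filter job in Example #13 below unpacks. A quick round-trip check:

import struct

packed = struct.pack(">i", 42)
assert len(packed) == 4                      # ">i" is a fixed 4-byte encoding
assert struct.unpack(">i", packed)[0] == 42  # and it round-trips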
Example #4

import pydoop.pipes as pp
import re


class Mapper(pp.Mapper):
    def __init__(self, context):
        super(Mapper, self).__init__(context)
        print context  # debug: show the task context object

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")


class Reducer(pp.Reducer):
    def __init__(self, context):
        print "Map"

    def reduce(self, context):
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
    pp.runTask(pp.Factory(mapper_class=Mapper, reducer_class=Reducer))
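
The re.sub in this mapper replaces every run of non-alphanumeric characters with a single space before splitting, so punctuation never ends up inside a token. The tokenization can be checked on its own, outside any job:

import re

line = "Hello, world! Hello... 42?"
words = re.sub('[^0-9a-zA-Z]+', ' ', line).split()
print(words)  # ['Hello', 'world', 'Hello', '42']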
Example #5
import struct

import pydoop.pipes as pp
import pydoop.hdfs as hdfs


class Reader(pp.RecordReader):

    def __init__(self, context):
        super(Reader, self).__init__()
        self.isplit = pp.InputSplit(context.getInputSplit())
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()  # read by reader of previous split
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:  # end of input split
            return (False, "", "")
        key = struct.pack(">q", self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            return (False, "", "")
        self.bytes_read += len(record)
        return (True, key, record)

    def getProgress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


if __name__ == "__main__":
    # Mapper and Reducer here are the wordcount classes from the full example
    pp.runTask(pp.Factory(Mapper, Reducer, record_reader_class=Reader))
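
The offset arithmetic above is the standard trick for newline-delimited records read from byte-based splits: whenever a split starts past offset 0, its reader throws away the first (possibly partial) line, because the reader of the previous split always reads one line past its own boundary. A local sketch of that hand-off, using a hypothetical read_split helper on an in-memory string instead of HDFS:

data = "first line\nsecond line\nthird line\n"

def read_split(data, offset, length):
    # mimic the Reader above on a string instead of an HDFS file
    pos = offset
    if offset > 0:
        pos = data.index("\n", offset) + 1  # skip the partial first line
    records = []
    while pos <= offset + length and pos < len(data):
        end = data.index("\n", pos) + 1
        records.append(data[pos:end])
        pos = end
    return records

print(read_split(data, 0, 15))   # ['first line\n', 'second line\n']
print(read_split(data, 15, 15))  # ['third line\n']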
Example #6
    context.setStatus("Initialization started")
    self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
    jc = context.getJobConf()
    pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
    if self.excludes_fn:
      with open(self.excludes_fn) as f:
        self.excludes = set(l.strip() for l in f if not l.isspace())
    else:
      self.excludes = set()
    context.setStatus("Initialization done")

  def map(self, context):
    ip = context.getInputValue().split(None, 1)[0]
    if ip not in self.excludes:
      context.emit(ip, "1")
    else:
      context.incrementCounter(self.excluded_counter, 1)


class Reducer(pp.Reducer):

  def reduce(self, context):
    s = 0
    while context.nextValue():
      s += int(context.getInputValue())
    context.emit(context.getInputKey(), str(s))


if __name__ == "__main__":
  pp.runTask(pp.Factory(Mapper, Reducer, combiner_class=Reducer))
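
pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "") copies the job configuration property ipcount.excludes into the attribute self.excludes_fn, using the last argument as the default when the property is unset. A rough sketch of that behavior (not pydoop's actual implementation):

def jc_configure_sketch(obj, jc, prop, attr, default=None):
    # read prop from the JobConf when present, else fall back to the default
    value = jc.get(prop) if jc.hasKey(prop) else default
    setattr(obj, attr, value)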
Example #7
import sys
import logging

import pydoop.pipes as pp


class Writer(pp.RecordWriter):

    # the Writer's __init__ (which opens self.file on HDFS and sets
    # self.sep) is not shown in this excerpt
    def close(self):
        self.file.close()
        self.file.fs.close()

    def emit(self, key, value):
        self.file.write("%s%s%s\n" % (key, self.sep, value))


class Partitioner(pp.Partitioner):

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = logging.getLogger("Partitioner")

    def partition(self, key, numOfReduces):
        reducer_id = (hash(key) & sys.maxint) % numOfReduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id


if __name__ == "__main__":
    pp.runTask(pp.Factory(
        Mapper, Reducer,
        record_reader_class=Reader,
        record_writer_class=Writer,
        partitioner_class=Partitioner,
        combiner_class=Reducer,
    ))
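
The partition step above folds a key's hash down to a reducer index; masking with sys.maxint clears the sign bit (Python 2 hashes can be negative), so the modulo always lands in range(numOfReduces). The arithmetic in isolation, with a hypothetical helper name:

import sys

def pick_reducer(key, num_reduces):
    # same expression as Partitioner.partition above
    return (hash(key) & sys.maxint) % num_reduces

assert 0 <= pick_reducer("10.0.0.1", 4) < 4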

Example #8
def main(argv):
    runTask(Factory(FastaMapper, FastaReducer,
                    record_reader_class=FastaReader))
Example #9
def run_task():
    return pp.runTask(pp.Factory(Mapper, Reducer))
Example #10
def run_task():
  return runTask(Factory(Mapper, Reducer))
Example #11
def run_task():
  return runTask(Factory(mapper, reducer))
Example #12
def run_task():
    return runTask(Factory(Mapper, Reducer, combiner_class=Reducer))
Example #13
import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int


class FilterMapper(Mapper):
    """
  Process a wordcount output stream, emitting only records relative to
  words whose count is equal to or above the configured threshold.
  """
    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.getJobConf()
        jc_configure_int(self, jc, "filter.occurrence.threshold", "threshold")

    def map(self, context):
        word, occurrence = (context.getInputKey(), context.getInputValue())
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))


class FilterReducer(Reducer):
    def reduce(self, context):
        pass


if __name__ == "__main__":
    runTask(Factory(FilterMapper, FilterReducer))
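
The FilterMapper above is the consuming side of the binary format produced by the wordcount reducer in Example #3: values arrive as 4-byte big-endian counts and are unpacked with the same ">i" format string before the threshold test. A standalone dry run of that logic on hypothetical in-memory records:

import struct

records = [("spam", struct.pack(">i", 12)), ("ham", struct.pack(">i", 3))]
threshold = 5
for word, value in records:
    occurrence = struct.unpack(">i", value)[0]
    if occurrence >= threshold:
        print((word, occurrence))  # only ('spam', 12) passes the threshold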