コード例 #1
ファイル: seqal_run.py プロジェクト: ilveroluca/seal
def run_job():
    """
    Runs the Hadoop pipes task through Pydoop.

    The Pydoop and Seal imports are function-local, so they are only
    resolved when the job is actually started.

    Returns whatever ``runTask`` returns.
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    return runTask(Factory(mapper, reducer))
コード例 #2
ファイル: seqal_run.py プロジェクト: crs4/seal
def run_job():
    """
    Runs the Hadoop pipes task through Pydoop.

    The Pydoop and Seal imports are function-local, so they are only
    resolved when the job is actually started.

    Returns whatever ``runTask`` returns.
    """
    from pydoop.pipes import runTask, Factory
    from seal.seqal.mapper import mapper
    from seal.seqal.reducer import reducer
    return runTask(Factory(mapper, reducer))
コード例 #3
ファイル: wordcount-minimal.py プロジェクト: ZEMUSHKA/pydoop

"""
This example includes only the bare minimum required to run
wordcount. See wordcount-full.py for an example that uses counters,
RecordReader, etc.
"""

import pydoop.pipes as pp

class Mapper(pp.Mapper):
  """Word-count mapper: emits ``(word, "1")`` per whitespace-separated token."""

  def map(self, context):
    # split() with no arguments breaks on any run of whitespace.
    for token in context.getInputValue().split():
      context.emit(token, "1")

class Reducer(pp.Reducer):
  """Word-count reducer: sums the integer values received for a key."""

  def reduce(self, context):
    total = 0
    # nextValue() advances the value stream; it is falsy once exhausted.
    while context.nextValue():
      total = total + int(context.getInputValue())
    key = context.getInputKey()
    context.emit(key, str(total))

if __name__ == "__main__":
  # Script entry point: build the task factory, then hand it to the runner.
  task_factory = pp.Factory(Mapper, Reducer)
  pp.runTask(task_factory)
コード例 #4
ファイル: wordcount.py プロジェクト: ZEMUSHKA/pydoop
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask

class WordCountMapper(Mapper):
  """Emits ``(word, "1")`` for each whitespace-separated token of the input value."""

  def map(self, context):
    for token in context.getInputValue().split():
      context.emit(token, "1")

class WordCountReducer(Reducer):
  """Sums the counts for a key and emits the total as a packed integer."""

  def reduce(self, context):
    total = 0
    while context.nextValue():
      total = total + int(context.getInputValue())
    key = context.getInputKey()
    # ">i" = big-endian signed 32-bit integer.
    context.emit(key, struct.pack(">i", total))

if __name__ == "__main__":
  # Script entry point: build the task factory, then hand it to the runner.
  task_factory = Factory(WordCountMapper, WordCountReducer)
  runTask(task_factory)
コード例 #5
ファイル: wordcount-minimal.py プロジェクト: xuande/pydoop
# ... RecordReader, etc.  (tail of a module docstring whose beginning is missing)

import pydoop.pipes as pp
import re

class Mapper(pp.Mapper):
    """Word-count mapper that treats every run of non-alphanumerics as a separator."""

    def __init__(self, context):
        # Initialize the pp.Mapper base before the instance is used.
        super(Mapper, self).__init__(context)
        # Debug output; parenthesized so it parses on both Python 2 and 3.
        print(context)

    def map(self, context):
        """Emit ``(word, "1")`` for each alphanumeric token of the input value."""
        # Replace each run of non-alphanumeric characters with one space,
        # then split on whitespace.
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")

class Reducer(pp.Reducer):
    """Word-count reducer: sums the integer values received for a key."""

    def __init__(self, context):
        # Initialize the pp.Reducer base before the instance is used.
        super(Reducer, self).__init__(context)
        # NOTE(review): the message says "Map" although this is the reducer;
        # left unchanged -- confirm intent.
        print("Map")

    def reduce(self, context):
        """Emit ``(key, str(total))`` where total is the sum of the key's values."""
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))

if __name__ == "__main__":
    # Script entry point: build the task factory, then hand it to the runner.
    task_factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)
    pp.runTask(task_factory)
コード例 #6
        super(Reader, self).__init__()
        self.isplit = pp.InputSplit(context.getInputSplit())
        self.file = hdfs.open(self.isplit.filename)
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline(
            )  # read by reader of previous split
            self.bytes_read += len(discarded)

    def close(self):
        """Release the HDFS resources held through ``self.file``."""
        self.file.close()
        # NOTE(review): assumes the file object exposes its filesystem
        # connection as ``fs`` and that it must be closed too -- confirm.
        self.file.fs.close()

    def next(self):
        """
        Read the next line of the split.

        Returns ``(True, key, record)`` where ``key`` is the packed byte
        position (split offset + bytes read so far) and ``record`` is the
        line, or ``(False, "", "")`` once the split or the file is exhausted.
        """
        exhausted = (False, "", "")
        if self.bytes_read > self.isplit.length:  # end of input split
            return exhausted
        position = self.isplit.offset + self.bytes_read
        key = struct.pack(">q", position)  # big-endian signed 64-bit
        line = self.file.readline()
        if line == "":  # end of file
            return exhausted
        self.bytes_read = self.bytes_read + len(line)
        return (True, key, line)

    def getProgress(self):
        """
        Return the fraction of the split consumed so far, capped at 1.0.

        A zero-length split reports 0.0 instead of raising
        ZeroDivisionError.
        """
        split_length = self.isplit.length
        if not split_length:
            return 0.0
        return min(float(self.bytes_read) / split_length, 1.0)

if __name__ == "__main__":
    # Script entry point: wire in the custom record reader and start the task.
    task_factory = pp.Factory(Mapper, Reducer, record_reader_class=Reader)
    pp.runTask(task_factory)
コード例 #7
ファイル: ipcount.py プロジェクト: ilveroluca/pydoop
    context.setStatus("Initialization started")
    self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
    jc = context.getJobConf()
    pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
    if self.excludes_fn:
      with open(self.excludes_fn) as f:
        self.excludes = set(l.strip() for l in f if not l.isspace())
      self.excludes = set()
    context.setStatus("Initialization done")

  def map(self, context):
    """
    Emit ``(ip, "1")`` for the first whitespace-separated field of the
    input value, unless that field is in ``self.excludes``; excluded
    records bump ``self.excluded_counter`` instead.
    """
    fields = context.getInputValue().split(None, 1)
    if not fields:
      # Blank line: there is no leading field to count.
      return
    ip = fields[0]
    if ip not in self.excludes:
      context.emit(ip, "1")
    else:
      # Only records that were actually filtered out count as excluded.
      context.incrementCounter(self.excluded_counter, 1)

class Reducer(pp.Reducer):
  """Sums the integer values received for a key and emits the total as text."""

  def reduce(self, context):
    total = 0
    while context.nextValue():
      total = total + int(context.getInputValue())
    key = context.getInputKey()
    context.emit(key, str(total))

if __name__ == "__main__":
  # Script entry point: the Reducer class is also passed as the combiner.
  task_factory = pp.Factory(Mapper, Reducer, combiner_class=Reducer)
  pp.runTask(task_factory)
コード例 #8
ファイル: wordcount-minimal.py プロジェクト: onlynone/pydoop
# License for the specific language governing permissions and limitations
# under the License.
"""
This example includes only the bare minimum required to run
wordcount. See wordcount-full.py for an example that uses counters,
RecordReader, etc.
"""

import pydoop.pipes as pp

class Mapper(pp.Mapper):
    """Word-count mapper: emits ``(word, "1")`` per whitespace-separated token."""

    def map(self, context):
        for token in context.getInputValue().split():
            context.emit(token, "1")

class Reducer(pp.Reducer):
    """Word-count reducer: sums the integer values received for a key."""

    def reduce(self, context):
        total = 0
        while context.nextValue():
            total = total + int(context.getInputValue())
        key = context.getInputKey()
        context.emit(key, str(total))

if __name__ == "__main__":
    # Script entry point: build the task factory, then hand it to the runner.
    task_factory = pp.Factory(Mapper, Reducer)
    pp.runTask(task_factory)
コード例 #9
ファイル: wordcount-full.py プロジェクト: wtj/pydoop

    def emit(self, key, value):
        """Write one ``key<sep>value`` line, newline-terminated, to ``self.file``."""
        line = "%s%s%s\n" % (key, self.sep, value)
        self.file.write(line)

class Partitioner(pp.Partitioner):
    """Assigns each key to a reducer index derived from the key's hash."""

    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = logging.getLogger("Partitioner")

    def partition(self, key, numOfReduces):
        """Return ``(hash(key) & sys.maxint) % numOfReduces``."""
        # Masking with sys.maxint keeps the hash non-negative before the modulo.
        reducer_id = (hash(key) & sys.maxint) % numOfReduces
        # Lazy %-args: the message is only formatted when DEBUG is enabled,
        # which matters because this runs once per emitted key.
        self.logger.debug("reducer_id: %r", reducer_id)
        return reducer_id

if __name__ == "__main__":
        # NOTE(review): this statement is a bare tuple expression and starts
        # nothing. It looks like the surviving middle line of a truncated
        # task-launch call (presumably pp.runTask(pp.Factory(...))) -- restore
        # the full call from the upstream file before using this script.
        Mapper, Reducer,

# Local Variables:
# mode: python
# End:
コード例 #10
ファイル: distblast_pipes.py プロジェクト: Pfiver/RNA-Seqlyze
def main(argv):
    """
    Start the Pipes task with the FASTA mapper and reducer.

    ``argv`` is accepted but not read here.
    """
    # NOTE(review): the original call was cut off after ``FastaReducer,``.
    # The FastaReader wiring below is assumed from the complete form of this
    # call -- confirm against the upstream file.
    runTask(Factory(FastaMapper, FastaReducer,
                    record_reader_class=FastaReader))
コード例 #11
ファイル: phase_one.py プロジェクト: crs4/biodoop-core
def run_task():
    """Build the Mapper/Reducer factory, run the Pipes task and return its result."""
    factory = pp.Factory(Mapper, Reducer)
    return pp.runTask(factory)
コード例 #12
ファイル: ipcount.py プロジェクト: onlynone/pydoop
        super(Mapper, self).__init__(context)
        context.setStatus("Initialization started")
        self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
        jc = context.getJobConf()
        pu.jc_configure(self, jc, "ipcount.excludes", "excludes_fn", "")
        if self.excludes_fn:
            with open(self.excludes_fn) as f:
                self.excludes = set(l.strip() for l in f if not l.isspace())
            self.excludes = set()
        context.setStatus("Initialization done")

    def map(self, context):
        """
        Emit ``(ip, "1")`` for the first whitespace-separated field of the
        input value, unless that field is in ``self.excludes``; excluded
        records bump ``self.excluded_counter`` instead.
        """
        fields = context.getInputValue().split(None, 1)
        if not fields:
            # Blank line: there is no leading field to count.
            return
        ip = fields[0]
        if ip not in self.excludes:
            context.emit(ip, "1")
        else:
            # Only records that were actually filtered out count as excluded.
            context.incrementCounter(self.excluded_counter, 1)

class Reducer(pp.Reducer):
    """Sums the integer values received for a key and emits the total as text."""

    def reduce(self, context):
        total = 0
        while context.nextValue():
            total = total + int(context.getInputValue())
        key = context.getInputKey()
        context.emit(key, str(total))

if __name__ == "__main__":
    # Script entry point: the Reducer class is also passed as the combiner.
    task_factory = pp.Factory(Mapper, Reducer, combiner_class=Reducer)
    pp.runTask(task_factory)
コード例 #13
ファイル: wordcount-minimal.py プロジェクト: kikkomep/pydoop
import pydoop.pipes as pp
import re

class Mapper(pp.Mapper):
    """Word-count mapper that treats every run of non-alphanumerics as a separator."""

    def __init__(self, context):
        # Initialize the pp.Mapper base before the instance is used.
        super(Mapper, self).__init__(context)
        # Debug output; parenthesized so it parses on both Python 2 and 3.
        print(context)

    def map(self, context):
        """Emit ``(word, "1")`` for each alphanumeric token of the input value."""
        # Replace each run of non-alphanumeric characters with one space,
        # then split on whitespace.
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, "1")

class Reducer(pp.Reducer):
    """Word-count reducer: sums the integer values received for a key."""

    def __init__(self, context):
        # Initialize the pp.Reducer base before the instance is used.
        super(Reducer, self).__init__(context)
        # NOTE(review): the message says "Map" although this is the reducer;
        # left unchanged -- confirm intent.
        print("Map")

    def reduce(self, context):
        """Emit ``(key, str(total))`` where total is the sum of the key's values."""
        s = 0
        while context.nextValue():
            s += int(context.getInputValue())
        context.emit(context.getInputKey(), str(s))

if __name__ == "__main__":
    # Script entry point: build the task factory, then hand it to the runner.
    task_factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)
    pp.runTask(task_factory)
コード例 #14
def run_task():
  """Build the Mapper/Reducer factory, run the Pipes task and return its result."""
  factory = Factory(Mapper, Reducer)
  return runTask(factory)
コード例 #15
ファイル: __init__.py プロジェクト: QwertyManiac/seal-cdh4
def run_task():
  """Build the mapper/reducer factory, run the Pipes task and return its result."""
  factory = Factory(mapper, reducer)
  return runTask(factory)
コード例 #16
ファイル: distblast_pipes.py プロジェクト: 16NWallace/bcbb
def main(argv):
    """
    Start the Pipes task with the FASTA mapper, reducer and record reader.

    ``argv`` is accepted but not read here.
    """
    factory = Factory(
        FastaMapper,
        FastaReducer,
        record_reader_class=FastaReader,
    )
    runTask(factory)
コード例 #17
ファイル: wordcount-rr.py プロジェクト: ilveroluca/pydoop
  def __init__(self, context):
    """
    Open the file behind this task's input split and position the
    stream at the first record this reader is responsible for.
    """
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    # Start reading at the split offset: the keys produced by next() are
    # computed as offset + bytes_read, so the stream must begin there.
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
      # A split that starts mid-file skips its (possibly partial) first line.
      discarded = self.file.readline()  # read by reader of previous split
      self.bytes_read += len(discarded)

  def close(self):
    """Release the HDFS resources opened in ``__init__``."""
    self.file.close()
    # NOTE(review): assumes the file object exposes its filesystem
    # connection as ``fs`` and that it must be closed too -- confirm.
    self.file.fs.close()

  def next(self):
    """
    Read the next line of the split.

    Returns ``(True, key, record)`` where ``key`` is the packed byte
    position (split offset + bytes read so far) and ``record`` is the
    line, or ``(False, "", "")`` once the split or the file is exhausted.
    """
    exhausted = (False, "", "")
    if self.bytes_read > self.isplit.length:  # end of input split
      return exhausted
    position = self.isplit.offset + self.bytes_read
    key = struct.pack(">q", position)  # big-endian signed 64-bit
    line = self.file.readline()
    if line == "":  # end of file
      return exhausted
    self.bytes_read = self.bytes_read + len(line)
    return (True, key, line)

  def getProgress(self):
    """
    Return the fraction of the split consumed so far, capped at 1.0.

    A zero-length split reports 0.0 instead of raising
    ZeroDivisionError.
    """
    split_length = self.isplit.length
    if not split_length:
      return 0.0
    return min(float(self.bytes_read) / split_length, 1.0)

if __name__ == "__main__":
  # Script entry point: wire in the custom record reader and start the task.
  task_factory = pp.Factory(Mapper, Reducer, record_reader_class=Reader)
  pp.runTask(task_factory)
コード例 #18
def run_task():
    """Build the mapper/reducer factory, run the Pipes task and return its result."""
    factory = Factory(mapper, reducer)
    return runTask(factory)
コード例 #19
ファイル: __init__.py プロジェクト: onlynone/pydoop
def run_task():
    """Run the Pipes task with Reducer doubling as the combiner; return its result."""
    factory = Factory(Mapper, Reducer, combiner_class=Reducer)
    return runTask(factory)
コード例 #20
ファイル: __init__.py プロジェクト: crs4/vispa
def run_task():
  """Build the Mapper/Reducer factory, run the Pipes task and return its result."""
  factory = Factory(Mapper, Reducer)
  return runTask(factory)
コード例 #21
import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int

class FilterMapper(Mapper):
    """
    Process a wordcount output stream, emitting only records relative to
    words whose count is equal to or above the configured threshold.
    """

    def __init__(self, context):
        super(FilterMapper, self).__init__(context)
        jc = context.getJobConf()
        # Presumably stores the integer job property
        # "filter.occurrence.threshold" as self.threshold (read by map()).
        jc_configure_int(self, jc, "filter.occurrence.threshold", "threshold")

    def map(self, context):
        """Emit ``(word, str(count))`` when the unpacked count reaches the threshold."""
        word, occurrence = (context.getInputKey(), context.getInputValue())
        # The value is a packed big-endian signed 32-bit integer.
        occurrence = struct.unpack(">i", occurrence)[0]
        if occurrence >= self.threshold:
            context.emit(word, str(occurrence))

class FilterReducer(Reducer):
    """Reducer for the filter job; it performs no work."""

    def reduce(self, context):
        # NOTE(review): the method body was missing from this copy; a no-op
        # is assumed -- confirm against the upstream file.
        pass

if __name__ == "__main__":
    # Script entry point: build the task factory, then hand it to the runner.
    task_factory = Factory(FilterMapper, FilterReducer)
    runTask(task_factory)
コード例 #22
ファイル: filter.py プロジェクト: ilveroluca/pydoop
import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask
from pydoop.utils import jc_configure_int

class FilterMapper(Mapper):
  """
  Process a wordcount output stream, emitting only records relative to
  words whose count is equal to or above the configured threshold.
  """

  def __init__(self, context):
    super(FilterMapper, self).__init__(context)
    jc = context.getJobConf()
    # Presumably stores the integer job property
    # "filter.occurrence.threshold" as self.threshold (read by map()).
    jc_configure_int(self, jc, "filter.occurrence.threshold", "threshold")

  def map(self, context):
    """Emit ``(word, str(count))`` when the unpacked count reaches the threshold."""
    word, occurrence = (context.getInputKey(), context.getInputValue())
    # The value is a packed big-endian signed 32-bit integer.
    occurrence = struct.unpack(">i", occurrence)[0]
    if occurrence >= self.threshold:
      context.emit(word, str(occurrence))

class FilterReducer(Reducer):
  """Reducer for the filter job; it performs no work."""

  def reduce(self, context):
    # NOTE(review): the method body was missing from this copy; a no-op
    # is assumed -- confirm against the upstream file.
    pass

if __name__ == "__main__":
  # Script entry point: build the task factory, then hand it to the runner.
  task_factory = Factory(FilterMapper, FilterReducer)
  runTask(task_factory)
コード例 #23
ファイル: __init__.py プロジェクト: ilveroluca/pydoop
def run_task():
  """Run the Pipes task with Reducer doubling as the combiner; return its result."""
  factory = Factory(Mapper, Reducer, combiner_class=Reducer)
  return runTask(factory)
コード例 #24
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import struct
from pydoop.pipes import Mapper, Reducer, Factory, runTask

class WordCountMapper(Mapper):
    """Emits ``(word, "1")`` for each whitespace-separated token of the input value."""

    def map(self, context):
        for token in context.getInputValue().split():
            context.emit(token, "1")

class WordCountReducer(Reducer):
    """Sums the counts for a key and emits the total as a packed integer."""

    def reduce(self, context):
        total = 0
        while context.nextValue():
            total = total + int(context.getInputValue())
        key = context.getInputKey()
        # ">i" = big-endian signed 32-bit integer.
        context.emit(key, struct.pack(">i", total))

if __name__ == "__main__":
    # Script entry point: build the task factory, then hand it to the runner.
    task_factory = Factory(WordCountMapper, WordCountReducer)
    runTask(task_factory)