def setupjob(job, args):
    """
    Set up a job to run on a date range of directories.

    Jobs expect two arguments, startdate and enddate, both in yyyy-MM-dd format.
    """

    import java.text.SimpleDateFormat as SimpleDateFormat
    import java.util.Date as Date
    import java.util.Calendar as Calendar
    import com.mozilla.util.DateUtil as DateUtil
    import com.mozilla.util.DateIterator as DateIterator
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat as FileInputFormat
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat as MyInputFormat
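    # NOTE: "dateformat" and "pathformat" are module-level constants defined
    # elsewhere in the original script. Per the docstring, dateformat is
    # "yyyy-MM-dd"; pathformat is presumably a two-slot template such as
    # "/testpilot/%s/%s" (study, date) -- a hypothetical value, as the real
    # template is not shown here.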

    if len(args) != 3:
        raise Exception(
            "Usage: <testpilot_study> <startdate-YYYY-MM-DD> <enddate-YYYY-MM-DD>"
        )

    # Used to collect each date in the given range; DateUtil.iterateByDay
    # calls see() once per day with that day's timestamp in epoch millis.
    class MyDateIterator(DateIterator):
        def __init__(self):
            self._list = []

        def get(self):
            return self._list

        def see(self, aTime):
            self._list.append(aTime)

    sdf = SimpleDateFormat(dateformat)
    study = args[0]
    startdate = Calendar.getInstance()
    startdate.setTime(sdf.parse(args[1]))

    enddate = Calendar.getInstance()
    enddate.setTime(sdf.parse(args[2]))

    dates = MyDateIterator()

    DateUtil.iterateByDay(startdate.getTimeInMillis(),
                          enddate.getTimeInMillis(), dates)

    paths = []
    for d in dates.get():
        paths.append(pathformat % (study, sdf.format(Date(d))))

    job.setInputFormatClass(MyInputFormat)
    FileInputFormat.setInputPaths(job, ",".join(paths))
    job.getConfiguration().set("org.mozilla.jydoop.mappertype", "TEXT")
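
# For illustration only: with study "testpilot_study1" (a hypothetical name)
# and the range 2012-03-01 through 2012-03-03, the loop above yields one
# input directory per day, assuming the hypothetical pathformat sketched
# earlier:
#
#   /testpilot/testpilot_study1/2012-03-01
#   /testpilot/testpilot_study1/2012-03-02
#   /testpilot/testpilot_study1/2012-03-03
#
# FileInputFormat.setInputPaths accepts these joined as a single
# comma-separated string.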
def hdfs_setupjob(job, args):
    """
    Similar to the above, but runs on telemetry data that has already been
    exported to HDFS.

    Jobs expect two arguments, startdate and enddate, both in yyyyMMdd format.
    """

    import java.text.SimpleDateFormat as SimpleDateFormat
    import java.util.Date as Date
    import java.util.Calendar as Calendar
    import java.util.concurrent.TimeUnit as TimeUnit
    import com.mozilla.util.DateUtil as DateUtil
    import com.mozilla.util.DateIterator as DateIterator
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat as FileInputFormat
    import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat as MyInputFormat
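    # NOTE: "dateformat", "hdfs_dateformat", and "hdfs_pathformat" are
    # module-level constants defined elsewhere in the original script. Per
    # the docstring, dateformat here is "yyyyMMdd"; hdfs_dateformat and
    # hdfs_pathformat presumably describe the HDFS export layout, e.g.
    # "yyyy-MM-dd" and a one-slot template such as "/telemetry/%s" --
    # hypothetical values, as the real ones are not shown here.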

    if len(args) != 2:
        raise Exception("Usage: <startdate-YYYYMMDD> <enddate-YYYYMMDD>")

    # Used to collect each date in the given range; DateUtil.iterateByDay
    # calls see() once per day with that day's timestamp in epoch millis.
    class MyDateIterator(DateIterator):
        def __init__(self):
            self._list = []

        def get(self):
            return self._list

        def see(self, aTime):
            self._list.append(aTime)

    sdf = SimpleDateFormat(dateformat)
    sdf_hdfs = SimpleDateFormat(hdfs_dateformat)
    startdate = Calendar.getInstance()
    startdate.setTime(sdf.parse(args[0]))

    enddate = Calendar.getInstance()
    enddate.setTime(sdf.parse(args[1]))

    nowdate = Calendar.getInstance()

    # HDFS only contains the last 2 weeks of data (up to yesterday)
    startMillis = startdate.getTimeInMillis()
    endMillis = enddate.getTimeInMillis()
    nowMillis = nowdate.getTimeInMillis()

    startDiff = nowMillis - startMillis
    if TimeUnit.DAYS.convert(startDiff, TimeUnit.MILLISECONDS) > 14:
        raise Exception(
            "HDFS data only includes the past 14 days of history. Try again with more recent dates, or use the HBase data directly."
        )

    endDiff = nowMillis - endMillis
    if TimeUnit.DAYS.convert(endDiff, TimeUnit.MILLISECONDS) < 1:
        raise Exception(
            "HDFS data only includes dates up to yesterday. For (partial) data for today, use the HBase data directly."
        )

    dates = MyDateIterator()

    DateUtil.iterateByDay(startMillis, endMillis, dates)

    paths = []
    for d in dates.get():
        paths.append(hdfs_pathformat % (sdf_hdfs.format(Date(d))))

    job.setInputFormatClass(MyInputFormat)
    FileInputFormat.setInputPaths(job, ",".join(paths))
    job.getConfiguration().set("org.mozilla.jydoop.mappertype", "TEXT")
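
# For illustration only: with the range 20120301 through 20120302 and the
# hypothetical hdfs_pathformat and hdfs_dateformat sketched above, the job's
# input paths would be "/telemetry/2012-03-01,/telemetry/2012-03-02" -- one
# exported directory per day, subject to the 14-day retention window checked
# above.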