Ejemplo n.º 1
0
def compute_median(samples,
                   window_size,
                   check=False,
                   display=None,
                   mode='running'):
    """
    Feed every sample into a RunningMedian of the given window size.

    Both a RunningMedian and a NaiveRunningMedian are created with
    ``window_size`` up front. When ``mode`` is ``'naive'`` the work is handed
    to ``compute_naive_median(samples, window_size, display)`` and nothing
    else happens here. Otherwise each sample is inserted and the current
    median is read back.

    If ``display`` is truthy, the sample index and median are printed whenever
    the index is a multiple of ``display``. If ``check`` is truthy, the same
    sample is also inserted into the naive implementation and the two medians
    are asserted to be equal.

    Returns nothing.
    """
    fast_tracker = RunningMedian(window_size)
    reference_tracker = NaiveRunningMedian(window_size)

    if mode == 'naive':
        compute_naive_median(samples, window_size, display)
        return

    for index, value in enumerate(samples):
        fast_tracker.insert(value)
        current = fast_tracker.median()

        # Periodic progress output; a falsy display (None or 0) disables it.
        if display and index % display == 0:
            print("%5d\t%d" % (index, current))

        if not check:
            continue

        # Cross-check against the naive implementation on the same sample.
        reference_tracker.insert(value)
        expected = reference_tracker.median()
        assert expected == current, "%d != %d" % (current, expected)
def test_five_elements():
    """After adding 12, 4, 5, 3 and 8, the last add() must return 5 (their median)."""
    tracker = RunningMedian()
    for value in (12, 4, 5, 3):
        tracker.add(value)
    latest = tracker.add(8)
    assert_that(latest, equal_to(5))
def test_six_elements_left():
    """After adding 4, 5, 3, 6, 8 and 7, the last add() must return 5.5 (their median)."""
    tracker = RunningMedian()
    for value in (4, 5, 3, 6, 8):
        tracker.add(value)
    latest = tracker.add(7)
    assert_that(latest, equal_to(5.5))
def test_six_elements_right():
    """After adding 12, 4, 5, 3, 8 and 7, the last add() must return 6 (their median)."""
    tracker = RunningMedian()
    for value in (12, 4, 5, 3, 8):
        tracker.add(value)
    latest = tracker.add(7)
    assert_that(latest, equal_to(6))
def _date_sort_key(transaction_dt):
    """Turn an MMDDYYYY date string into an integer YYYYMMDD sort key."""
    return int(transaction_dt[4:8] + transaction_dt[0:2] +
               transaction_dt[2:4])


def processByDate(pathInput):
    """
    Aggregate contributions per (recipient, date) and write one line per pair.

    Every line of the input file is cleaned with ``removeSpace`` and kept only
    if ``checkValidLine(line)[1]`` is truthy. Kept lines are split on "|";
    field 0 is the recipient (CMTE_ID), field 13 the transaction date and
    field 14 the amount. For each (recipient, date) pair the function keeps a
    RunningMedian of the amounts, the number of transactions and the total
    amount.

    After the whole input is consumed, one pipe-delimited line per pair is
    written to the module-level ``pathOutputDate`` file with the fields, in
    order: recipient, date, median contribution, total number of
    transactions, total amount of contributions.
    Lines are sorted alphabetically by recipient and then chronologically by
    date. The date is assumed to be in the FEC MMDDYYYY layout.

    Parameters
    ----------
    pathInput : string
        Path of the pipe-delimited input file.

    Returns
    -------
    no return value. It just writes the processed lines to output file
    """

    # (CMTE_ID, TRANSACTION_DT) -> [RunningMedian, transaction count, total]
    dByDate = {}
    with open(pathInput, 'r') as f_input, open(pathOutputDate,
                                               'w') as f_outputDate:
        for line in f_input:
            line = removeSpace(line)
            # Index 1 of the validity result gates the by-date output.
            if not checkValidLine(line)[1]:
                continue

            fields = line.split("|")
            CMTE_ID, TRANSACTION_DT, TRANSACTION_AMT = fields[0], fields[
                13], fields[14]
            amount = float(TRANSACTION_AMT)
            key = (CMTE_ID, TRANSACTION_DT)

            stats = dByDate.get(key)
            if stats is None:
                rmedian = RunningMedian()
                rmedian.add(amount)
                dByDate[key] = [rmedian, 1, amount]
            else:
                stats[0].add(amount)
                stats[1] += 1
                stats[2] += amount

        # Order by recipient, then by true calendar order (year, month, day).
        # The previous key assembled year+day+month, which put e.g. 1 February
        # ahead of 3 January of the same year.
        ordered = sorted(
            dByDate.items(),
            key=lambda entry: (entry[0][0], _date_sort_key(entry[0][1])))
        for (recipient, date), stats in ordered:
            # '%f' followed by the two rstrip calls drops trailing zeros and a
            # dangling decimal point from the total.
            f_outputDate.write('{0}|{1}|{2}|{3}|{4}\n'.format(
                recipient, date, stats[0].get_median(), stats[1],
                ('%f' % stats[2]).rstrip('0').rstrip('.')))
def processByZip(pathInput):
    """
    Stream the input file and emit running per-(recipient, zip) statistics.

    Each input line is cleaned with ``removeSpace`` and processed only if
    ``checkValidLine(line)[0]`` is truthy. A processed line is split on "|";
    field 0 is the recipient (CMTE_ID), the first five characters of field 10
    are the zip code and field 14 is the amount. For every processed line, in
    input order, one pipe-delimited line is written to the module-level
    ``pathOutputZip`` file containing: recipient, zip code, current median,
    number of transactions so far and total amount so far for that pair.

    Parameters
    ----------
    pathInput : string
        Path of the pipe-delimited input file.

    Returns
    -------
    no return value. It just writes the processed lines to output file
    """
    # (CMTE_ID, ZIP_CODE) -> [RunningMedian, latest median, count, total]
    dByZip = {}
    with open(pathInput, 'r') as f_input, open(pathOutputZip,
                                               'w') as f_outputZip:
        for raw_line in f_input:
            record = removeSpace(raw_line)
            # Index 0 of the validity result gates the by-zip output.
            if not checkValidLine(record)[0]:
                continue

            columns = record.split("|")
            recipient, zip5, amount_text = (columns[0], columns[10][:5],
                                            columns[14])
            amount = float(amount_text)
            pair = (recipient, zip5)

            if pair in dByZip:
                stats = dByZip[pair]
                stats[0].add(amount)
                stats[1] = stats[0].get_median()
                stats[2] += 1
                stats[3] += amount
            else:
                tracker = RunningMedian()
                tracker.add(amount)
                # NOTE(review): the first median for a pair is the amount
                # rounded to a whole number, while later medians are taken
                # as-is from get_median() -- confirm get_median() rounds the
                # same way.
                stats = [tracker, int(round(amount)), 1, amount]
                dByZip[pair] = stats

            # '%f' followed by the two rstrip calls drops trailing zeros and a
            # dangling decimal point from the total.
            total_text = ('%f' % stats[3]).rstrip('0').rstrip('.')
            f_outputZip.write('{0}|{1}|{2}|{3}|{4}\n'.format(
                recipient, zip5, stats[1], stats[2], total_text))
def test_two_elements():
    """After adding 12 and then 4, the second add() must return 8 (their median)."""
    tracker = RunningMedian()
    tracker.add(12)
    latest = tracker.add(4)
    assert_that(latest, equal_to(8))
def test_single():
    """Adding a single value, 12, must return that same value."""
    tracker = RunningMedian()
    latest = tracker.add(12)
    assert_that(latest, equal_to(12))
def test_four_elements():
    """After adding 12, 4, 5 and 3, the last add() must return 4.5 (their median)."""
    tracker = RunningMedian()
    for value in (12, 4, 5):
        tracker.add(value)
    latest = tracker.add(3)
    assert_that(latest, equal_to(4.5))
def test_three_elements():
    """After adding 12, 4 and 5, the last add() must return 5 (their median)."""
    tracker = RunningMedian()
    for value in (12, 4):
        tracker.add(value)
    latest = tracker.add(5)
    assert_that(latest, equal_to(5))