Example #1
0
class StatementParser(object):
    """
    Parser for the text operators of a decompressed PDF statement.

    The constructor decompresses the PDF and hands it to ReTokenizer together
    with the LEX patterns below.

    Each "read_*" method takes position as its argument,
    and returns next token position if read was successful,
    or the same position if it was not.
    """

    # (token name, regex) pairs handed to ReTokenizer.
    # Month names in 'date_range_1' use [A-Za-z]: the range [A-z] would also
    # accept the punctuation characters located between 'Z' and 'a' in ASCII.
    LEX = [('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
           ('date', r'^\[\((\d+/\d+)\)\] TJ$'),
           ('date_range_1', r'^\[\(([A-Za-z]+ \d+, \d{4})'
            r' - ([A-Za-z]+ \d+, \d{4})\)\] TJ$'),
           ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
            r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
           ('layout_tz', r'^(\d+\.\d{2}) Tz$'),
           ('layout_tc', r'^(\d+\.\d{2}) Tc$'),
           ('layout_tw', r'^(\d+\.\d{2}) Tw$'),
           ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
           ('layout_tm', r'^' + (r'(\d+\.\d+ )' * 6) + r'Tm$'),
           ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
           ('text', r'^\[\(([^\)]+)\)\] TJ$')]

    def __init__(self, pdf):
        """Decompress *pdf* and build the tokenizer over the result."""
        self._pdf = decompress_pdf(pdf)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def _scan_transactions(self, read_one):
        """
        Yield every transaction that *read_one* recognizes in the token stream.

        *read_one* is called as read_one(pos, date_from, date_to) and must
        follow the "read_*" convention. Positions where nothing is recognized
        are skipped one token at a time.
        """
        # Read statement dates range.
        date_from, date_to = self.read_first_date_range()

        # Read transactions.
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = read_one(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_card_transactions(self):
        """Generate card account transactions; yields nothing for other statements."""
        # Early check if this is a card account statement at all.
        if '[(Transactions)] TJ' not in self._pdf:
            return

        for trans in self._scan_transactions(self.read_card_transaction):
            yield trans

    def read_cash_transactions(self):
        """Generate cash account transactions; yields nothing for other statements."""
        # Early check if this is a cash account statement at all.
        if '[(Transaction history)] TJ' not in self._pdf:
            return

        for trans in self._scan_transactions(self.read_cash_transaction):
            yield trans

    def read_first_date_range(self):
        """
        Return the first date range found in the token stream.

        The result is a list of two datetimes, or None when the stream holds
        no date range token (callers that unpack the result will then fail).
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, date_range = self.read_date_range(pos)
            if date_range is not None:
                return date_range
            pos += 1
        return None

    def read_card_transaction(self, pos, date_from, date_to):
        """
        Read one card transaction starting at *pos*.

        Expected token order: date, Tm, date, Tm, reference, description
        lines, amounts. Returns (next_pos, Transaction) on success and
        (pos, None) when any part is missing.
        """
        # Amounts whose indent lies in [0, 520] are added, larger ones are
        # subtracted (the "plus" range is tested first, so 520 itself adds).
        INDENT_CHARGES = 520

        startPos = pos

        pos, tdate = self.read_date(pos)
        pos, pdate_layout = self.read_layout_tm(pos)
        pos, pdate = self.read_date(pos)
        pos, ref_layout = self.read_layout_tm(pos)
        pos, ref = self.read_ref(pos)
        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_indent_amount(
            pos,
            range_minus=(INDENT_CHARGES, 9999),
            range_plus=(0, INDENT_CHARGES))

        parts = (tdate, pdate_layout, pdate, ref_layout, ref, desc, amount)
        if any(part is None for part in parts):
            return startPos, None

        tdate = closest_date(tdate, date_from, date_to)
        pdate = closest_date(pdate, date_from, date_to)

        trans = Transaction(ref)
        trans.date = tdate
        trans.rdate = pdate
        trans.type = Transaction.TYPE_UNKNOWN
        trans.raw = desc
        trans.label = desc
        trans.amount = amount
        return pos, trans

    def read_cash_transaction(self, pos, date_from, date_to):
        """
        Read one cash transaction starting at *pos*.

        Expected token order: date, optional "S" marker, description lines,
        amounts. Returns (next_pos, Transaction) on success and (pos, None)
        when the date, description or amount is missing.
        """
        INDENT_BALANCE = 520
        INDENT_WITHDRAWAL = 470

        startPos = pos

        pos, date = self.read_date(pos)
        # The marker is optional, so its value is deliberately ignored.
        pos, _ = self.read_star(pos)
        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_indent_amount(
            pos,
            range_plus=(0, INDENT_WITHDRAWAL),
            range_minus=(INDENT_WITHDRAWAL, INDENT_BALANCE),
            range_skip=(INDENT_BALANCE, 9999))

        if desc is None or date is None or amount is None:
            return startPos, None

        date = closest_date(date, date_from, date_to)

        trans = Transaction(u'')
        trans.date = date
        trans.rdate = date
        trans.type = Transaction.TYPE_UNKNOWN
        trans.raw = desc
        trans.label = desc
        trans.amount = amount
        return pos, trans

    def _read_tm_pairs(self, pos, read_value):
        """
        Read consecutive (Tm layout, value) pairs using *read_value*.

        Returns (next_pos, pairs); next_pos is the position right after the
        last complete pair, so a dangling Tm token is not consumed.
        """
        pairs = []
        while True:
            prevPos = pos
            pos, layout = self.read_layout_tm(pos)
            pos, value = read_value(pos)
            if layout is None or value is None:
                return prevPos, pairs
            pairs.append((layout, value))

    def read_multiline_desc(self, pos):
        """
        Read a description made of one or more (Tm, text) pairs.

        Returns (next_pos, label) with the lines joined by spaces and passed
        through clean_label(), or (pos, None) when no line was read.
        """
        startPos = pos

        pos, pairs = self._read_tm_pairs(pos, self.read_text)
        if not pairs:
            return startPos, None
        return pos, clean_label(' '.join(desc for _, desc in pairs))

    @staticmethod
    def _within(value, bounds):
        """Return True when bounds[0] <= value <= bounds[1] (both inclusive)."""
        return bounds[0] <= value <= bounds[1]

    def read_indent_amount(self,
                           pos,
                           range_skip=(0, 0),
                           range_plus=(0, 0),
                           range_minus=(0, 0)):
        """
        Read consecutive (Tm, amount) pairs and combine them into one total.

        The fifth Tm operand of each pair is compared with the inclusive
        ranges, tested in this order: *range_skip* (ignored), *range_plus*
        (added), *range_minus* (subtracted). Amounts outside every range are
        ignored as well.

        Returns (next_pos, total), or (pos, None) when no pair was read.
        """
        startPos = pos

        # Read layout-amount pairs.
        pos, amounts = self._read_tm_pairs(pos, self.read_amount)
        if not amounts:
            return startPos, None

        # Infer amount type by its indentation in the layout.
        amount_total = AmTr.decimal_amount('0')
        for (_, _, _, _, indent, _), amount in amounts:
            if self._within(indent, range_skip):
                continue
            elif self._within(indent, range_plus):
                amount_total += amount
            elif self._within(indent, range_minus):
                amount_total -= amount
        return pos, amount_total

    def read_star(self, pos):
        """Read the "S" marker in either of its two layouts (first one preferred)."""
        pos1, star1 = self.read_star_1(pos)
        if star1 is not None:
            return pos1, star1
        # Only try the second layout when the first one did not match.
        return self.read_star_2(pos)

    def read_star_1(self, pos):
        """Read the marker laid out as: Tz Tc Tw Tf Tm text Tz Tc Tw Tf."""
        startPos = pos

        vals = []
        for reader in (self.read_layout_tz, self.read_layout_tc,
                       self.read_layout_tw, self.read_layout_tf,
                       self.read_layout_tm):
            pos, v = reader(pos)
            vals.append(v)
        pos, star = self.read_text(pos)
        for reader in (self.read_layout_tz, self.read_layout_tc,
                       self.read_layout_tw, self.read_layout_tf):
            pos, v = reader(pos)
            vals.append(v)

        if star == 'S' and None not in vals:
            return pos, star
        return startPos, None

    def read_star_2(self, pos):
        """Read the marker laid out as: Tf Tm text Tf."""
        startPos = pos

        vals = []
        pos, v = self.read_layout_tf(pos)
        vals.append(v)
        pos, v = self.read_layout_tm(pos)
        vals.append(v)
        pos, star = self.read_text(pos)
        pos, v = self.read_layout_tf(pos)
        vals.append(v)

        if star == 'S' and None not in vals:
            return pos, star
        return startPos, None

    def read_date(self, pos):
        """
        Read a month/day token and return it as a datetime.

        The year of the result is a placeholder (1900, or 1904 for Feb 29).
        Raises ValueError when the token text is not a valid month/day.
        """
        def parse_date(v):
            last_error = None
            for year in (1900, 1904):  # try non-leap and leap years
                fullstr = '%s/%i' % (v, year)
                try:
                    return datetime.datetime.strptime(fullstr, '%m/%d/%Y')
                except ValueError as e:
                    # Keep the error in a separate name: the "as" target
                    # does not outlive the except block on Python 3.
                    last_error = e
            raise last_error

        return self._tok.simple_read('date', pos, parse_date)

    def read_text(self, pos):
        """Read a text token and return its value as unicode."""
        t = self._tok.tok(pos)
        if not t.is_text():
            return (pos, None)
        #TODO: handle PDF encodings properly.
        return (pos + 1, unicode(t.value(), errors='ignore'))

    def read_amount(self, pos):
        """Read an amount token and convert it with AmTr.decimal_amount()."""
        t = self._tok.tok(pos)
        if not t.is_amount():
            return (pos, None)
        return (pos + 1, AmTr.decimal_amount(t.value()))

    def read_date_range(self, pos):
        """
        Read a date range token in either supported format.

        Returns (pos + 1, [date_from, date_to]) or (pos, None).
        """
        t = self._tok.tok(pos)
        if t.is_date_range_1():
            fmt = '%B %d, %Y'
        elif t.is_date_range_2():
            fmt = '%m/%d/%Y'
        else:
            return (pos, None)
        return (pos + 1,
                [datetime.datetime.strptime(v, fmt) for v in t.value()])

    def read_ref(self, pos):
        """Read a 17-character reference token."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_ref() else (pos, None)

    def read_layout_tz(self, pos):
        """Read a Tz layout token."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_tz() else (pos, None)

    def read_layout_tc(self, pos):
        """Read a Tc layout token."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_tc() else (pos, None)

    def read_layout_tw(self, pos):
        """Read a Tw layout token."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_tw() else (pos, None)

    def read_layout_tf(self, pos):
        """Read a Tf layout token."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_tf() else (pos, None)

    def read_layout_tm(self, pos):
        """Read a Tm layout token and return its six operands as floats."""
        t = self._tok.tok(pos)
        if not t.is_layout_tm():
            return (pos, None)
        return (pos + 1, [float(v) for v in t.value()])
Example #2
0
 def __init__(self, pdf):
     """Decompress *pdf*, keep the result, and build a ReTokenizer over it with LEX."""
     content = decompress_pdf(pdf)
     self._pdf = content
     self._tok = ReTokenizer(content, '\n', self.LEX)
Example #3
0
class StatementPage(RawPage):
    """
    Page whose document is a PDF statement; extracts its transactions.

    Each "read_*" method that takes a position returns
    (next_position, value) on success and (same_position, None) otherwise.
    """

    # (token name, regex) pairs handed to ReTokenizer.
    LEX = [
        ('charge_amount', r'^\(\$(\d+(,\d{3})*\.\d{2})\) Tj$'),
        ('payment_amount', r'^\(\\\(\$(\d+(,\d{3})*\.\d{2})\\\)\) Tj$'),
        ('date', r'^\((\d+/\d+)\) Tj$'),
        ('full_date', r'^\((\d+/\d+/\d+)\) Tj$'),
        ('layout_td', r'^([-0-9]+ [-0-9]+) Td$'),
        ('ref', r'^\(([A-Z0-9]{17})\) Tj$'),
        ('text', r'^\((.*)\) Tj$')
    ]

    def __init__(self, *args, **kwArgs):
        """Initialize the page, then decompress and tokenize its PDF document."""
        RawPage.__init__(self, *args, **kwArgs)
        assert self.doc[:4] == '%PDF'
        self._pdf = decompress_pdf(self.doc)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def iter_transactions(self):
        """Return all transactions sorted by date (newest first), label, amount."""
        return sorted(self.read_transactions(),
            cmp=lambda t1, t2: cmp(t2.date, t1.date) or
                               cmp(t1.label, t2.label) or
                               cmp(t1.amount, t2.amount))

    def read_transactions(self):
        """Generate every transaction recognized in the token stream."""
        # Statement typically cover one month.
        # Do 60 days, just to be on a safe side.
        # NOTE: read_closing_date() returns None when no closing date is
        # found, in which case the subtraction below raises TypeError.
        date_to = self.read_closing_date()
        date_from = date_to - timedelta(days=60)

        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = self.read_transaction(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_transaction(self, pos, date_from, date_to):
        """
        Read one transaction starting at *pos*.

        Token order: date, Td, date, Td, ref, Td, text, Td, amount. Only the
        two dates, the description and a non-zero amount are mandatory; the
        Td tokens and the reference may be absent.
        """
        startPos = pos
        pos, tdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, pdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, ref = self.read_ref(pos)
        pos, _ = self.read_layout_td(pos)
        pos, desc = self.read_text(pos)
        pos, _ = self.read_layout_td(pos)
        pos, amount = self.read_amount(pos)
        if tdate is None or pdate is None \
        or desc is None or amount is None or amount == 0:
            return startPos, None

        tdate = closest_date(tdate, date_from, date_to)
        pdate = closest_date(pdate, date_from, date_to)
        # Collapse runs of whitespace in the description.
        desc = u' '.join(desc.split())

        trans = Transaction(ref or u'')
        trans.date = tdate
        trans.rdate = pdate
        trans.type = Transaction.TYPE_UNKNOWN
        trans.raw = desc
        trans.label = desc
        trans.amount = amount
        return pos, trans

    def read_amount(self, pos):
        """Read a payment amount (positive) or, failing that, a charge (negative)."""
        pos, ampay = self.read_payment_amount(pos)
        if ampay is not None:
            return pos, ampay
        return self.read_charge_amount(pos)

    def read_charge_amount(self, pos):
        """Read a charge amount token; the value is returned negated."""
        return self._tok.simple_read('charge_amount', pos,
                                     lambda xs: -AmTr.decimal_amount(xs[0]))

    def read_payment_amount(self, pos):
        """Read a payment amount token (amount wrapped in escaped parentheses)."""
        return self._tok.simple_read('payment_amount', pos,
                                     lambda xs: AmTr.decimal_amount(xs[0]))

    def read_closing_date(self):
        """
        Return the first full date that follows the closing date label.

        Returns None when no full date is found. If the label itself is
        absent, the first scan ends at EOF and no date is returned.
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            next_pos, text = self.read_text(pos)
            if text == u'Statement Closing Date':
                pos = next_pos
                break
            # Advance exactly one token whether or not a text token was read,
            # so a label directly following another text token is not skipped.
            pos += 1
        while not self._tok.tok(pos).is_eof():
            pos, date = self.read_full_date(pos)
            if date is not None:
                return date
            pos += 1
        return None

    def read_text(self, pos):
        """Read a text token and return its value as unicode."""
        t = self._tok.tok(pos)
        #TODO: handle PDF encodings properly.
        return (pos+1, unicode(t.value(), errors='ignore')) \
            if t.is_text() else (pos, None)

    def read_full_date(self, pos):
        """Read a month/day/year token and return it as a datetime."""
        t = self._tok.tok(pos)
        return (pos+1, datetime.strptime(t.value(), '%m/%d/%Y')) \
            if t.is_full_date() else (pos, None)

    @staticmethod
    def _parse_month_day(value):
        """
        Parse "month/day" text into a datetime with a placeholder year.

        Year 1900 is tried first (the same result a bare '%m/%d' parse
        gives); 1904 is the fallback so that Feb 29 is accepted.
        Raises ValueError when neither year yields a valid date.
        """
        last_error = None
        for year in (1900, 1904):
            try:
                return datetime.strptime('%s/%i' % (value, year), '%m/%d/%Y')
            except ValueError as e:
                last_error = e
        raise last_error

    def read_date(self, pos):
        """Read a month/day token and return it as a datetime (placeholder year)."""
        t = self._tok.tok(pos)
        if not t.is_date():
            return (pos, None)
        return (pos + 1, self._parse_month_day(t.value()))

    def read_ref(self, pos):
        """Read a 17-character reference token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_ref() else (pos, None)

    def read_layout_td(self, pos):
        """Read a Td layout token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_layout_td() else (pos, None)
Example #4
0
 def __init__(self, *args, **kwArgs):
     """Run the RawPage initializer, check the PDF header, then tokenize the document."""
     RawPage.__init__(self, *args, **kwArgs)
     header = self.doc[:4]
     assert header == '%PDF'
     content = decompress_pdf(self.doc)
     self._pdf = content
     self._tok = ReTokenizer(content, '\n', self.LEX)
Example #5
0
class StatementParser(object):
    """
    Parser for the text operators of a decompressed PDF statement.

    The constructor decompresses the PDF and hands it to ReTokenizer together
    with the LEX patterns below.

    Each "read_*" method takes position as its argument,
    and returns next token position if read was successful,
    or the same position if it was not.
    """

    # (token name, regex) pairs handed to ReTokenizer.
    # Month names in 'date_range_1' use [A-Za-z]: the range [A-z] would also
    # accept the punctuation characters located between 'Z' and 'a' in ASCII.
    LEX = [
        ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
        ('date', r'^\[\((\d+/\d+)\)\] TJ$'),
        ('date_range_1', r'^\[\(([A-Za-z]+ \d+, \d{4})'
                         r' - ([A-Za-z]+ \d+, \d{4})\)\] TJ$'),
        ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
                         r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
        ('layout_tz', r'^(\d+\.\d{2}) Tz$'),
        ('layout_tc', r'^(\d+\.\d{2}) Tc$'),
        ('layout_tw', r'^(\d+\.\d{2}) Tw$'),
        ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
        ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
        ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),

        ('text', r'^\[\(([^\)]+)\)\] TJ$')
    ]

    def __init__(self, pdf):
        """Decompress *pdf* and build the tokenizer over the result."""
        self._pdf = decompress_pdf(pdf)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def _scan_transactions(self, read_one):
        """
        Yield every transaction that *read_one* recognizes in the token stream.

        *read_one* is called as read_one(pos, date_from, date_to) and must
        follow the "read_*" convention. Positions where nothing is recognized
        are skipped one token at a time.
        """
        # Read statement dates range.
        date_from, date_to = self.read_first_date_range()

        # Read transactions.
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = read_one(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_card_transactions(self):
        """Generate card account transactions; yields nothing for other statements."""
        # Early check if this is a card account statement at all.
        if '[(Transactions)] TJ' not in self._pdf:
            return

        for trans in self._scan_transactions(self.read_card_transaction):
            yield trans

    def read_cash_transactions(self):
        """Generate cash account transactions; yields nothing for other statements."""
        # Early check if this is a cash account statement at all.
        if '[(Transaction history)] TJ' not in self._pdf:
            return

        for trans in self._scan_transactions(self.read_cash_transaction):
            yield trans

    def read_first_date_range(self):
        """
        Return the first date range found in the token stream.

        The result is a list of two datetimes, or None when the stream holds
        no date range token (callers that unpack the result will then fail).
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, date_range = self.read_date_range(pos)
            if date_range is not None:
                return date_range
            pos += 1
        return None

    def read_card_transaction(self, pos, date_from, date_to):
        """
        Read one card transaction starting at *pos*.

        Expected token order: date, Tm, date, Tm, reference, description
        lines, amounts. Returns (next_pos, Transaction) on success and
        (pos, None) when any part is missing.
        """
        # Amounts whose indent lies in [0, 520] are added, larger ones are
        # subtracted (the "plus" range is tested first, so 520 itself adds).
        INDENT_CHARGES = 520

        startPos = pos

        pos, tdate = self.read_date(pos)
        pos, pdate_layout = self.read_layout_tm(pos)
        pos, pdate = self.read_date(pos)
        pos, ref_layout = self.read_layout_tm(pos)
        pos, ref = self.read_ref(pos)
        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_indent_amount(
            pos,
            range_minus=(INDENT_CHARGES, 9999),
            range_plus=(0, INDENT_CHARGES))

        parts = (tdate, pdate_layout, pdate, ref_layout, ref, desc, amount)
        if any(part is None for part in parts):
            return startPos, None

        tdate = closest_date(tdate, date_from, date_to)
        pdate = closest_date(pdate, date_from, date_to)

        trans = Transaction(ref)
        trans.date = tdate
        trans.rdate = pdate
        trans.type = Transaction.TYPE_UNKNOWN
        trans.raw = desc
        trans.label = desc
        trans.amount = amount
        return pos, trans

    def read_cash_transaction(self, pos, date_from, date_to):
        """
        Read one cash transaction starting at *pos*.

        Expected token order: date, optional "S" marker, description lines,
        amounts. Returns (next_pos, Transaction) on success and (pos, None)
        when the date, description or amount is missing.
        """
        INDENT_BALANCE = 520
        INDENT_WITHDRAWAL = 470

        startPos = pos

        pos, date = self.read_date(pos)
        # The marker is optional, so its value is deliberately ignored.
        pos, _ = self.read_star(pos)
        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_indent_amount(
            pos,
            range_plus=(0, INDENT_WITHDRAWAL),
            range_minus=(INDENT_WITHDRAWAL, INDENT_BALANCE),
            range_skip=(INDENT_BALANCE, 9999))

        if desc is None or date is None or amount is None:
            return startPos, None

        date = closest_date(date, date_from, date_to)

        trans = Transaction(u'')
        trans.date = date
        trans.rdate = date
        trans.type = Transaction.TYPE_UNKNOWN
        trans.raw = desc
        trans.label = desc
        trans.amount = amount
        return pos, trans

    def _read_tm_pairs(self, pos, read_value):
        """
        Read consecutive (Tm layout, value) pairs using *read_value*.

        Returns (next_pos, pairs); next_pos is the position right after the
        last complete pair, so a dangling Tm token is not consumed.
        """
        pairs = []
        while True:
            prevPos = pos
            pos, layout = self.read_layout_tm(pos)
            pos, value = read_value(pos)
            if layout is None or value is None:
                return prevPos, pairs
            pairs.append((layout, value))

    def read_multiline_desc(self, pos):
        """
        Read a description made of one or more (Tm, text) pairs.

        Returns (next_pos, label) with the lines joined by spaces and passed
        through clean_label(), or (pos, None) when no line was read.
        """
        startPos = pos

        pos, pairs = self._read_tm_pairs(pos, self.read_text)
        if not pairs:
            return startPos, None
        return pos, clean_label(' '.join(desc for _, desc in pairs))

    @staticmethod
    def _within(value, bounds):
        """Return True when bounds[0] <= value <= bounds[1] (both inclusive)."""
        return bounds[0] <= value <= bounds[1]

    def read_indent_amount(self, pos, range_skip=(0,0), range_plus=(0,0),
                           range_minus=(0,0)):
        """
        Read consecutive (Tm, amount) pairs and combine them into one total.

        The fifth Tm operand of each pair is compared with the inclusive
        ranges, tested in this order: *range_skip* (ignored), *range_plus*
        (added), *range_minus* (subtracted). Amounts outside every range are
        ignored as well.

        Returns (next_pos, total), or (pos, None) when no pair was read.
        """
        startPos = pos

        # Read layout-amount pairs.
        pos, amounts = self._read_tm_pairs(pos, self.read_amount)
        if not amounts:
            return startPos, None

        # Infer amount type by its indentation in the layout.
        amount_total = AmTr.decimal_amount('0')
        for (_, _, _, _, indent, _), amount in amounts:
            if self._within(indent, range_skip):
                continue
            elif self._within(indent, range_plus):
                amount_total += amount
            elif self._within(indent, range_minus):
                amount_total -= amount
        return pos, amount_total

    def read_star(self, pos):
        """Read the "S" marker in either of its two layouts (first one preferred)."""
        pos1, star1 = self.read_star_1(pos)
        if star1 is not None:
            return pos1, star1
        # Only try the second layout when the first one did not match.
        return self.read_star_2(pos)

    def read_star_1(self, pos):
        """Read the marker laid out as: Tz Tc Tw Tf Tm text Tz Tc Tw Tf."""
        startPos = pos

        vals = []
        for reader in (self.read_layout_tz, self.read_layout_tc,
                       self.read_layout_tw, self.read_layout_tf,
                       self.read_layout_tm):
            pos, v = reader(pos)
            vals.append(v)
        pos, star = self.read_text(pos)
        for reader in (self.read_layout_tz, self.read_layout_tc,
                       self.read_layout_tw, self.read_layout_tf):
            pos, v = reader(pos)
            vals.append(v)

        if star == 'S' and None not in vals:
            return pos, star
        return startPos, None

    def read_star_2(self, pos):
        """Read the marker laid out as: Tf Tm text Tf."""
        startPos = pos

        vals = []
        pos, v = self.read_layout_tf(pos)
        vals.append(v)
        pos, v = self.read_layout_tm(pos)
        vals.append(v)
        pos, star = self.read_text(pos)
        pos, v = self.read_layout_tf(pos)
        vals.append(v)

        if star == 'S' and None not in vals:
            return pos, star
        return startPos, None

    def read_date(self, pos):
        """
        Read a month/day token and return it as a datetime.

        The year of the result is a placeholder (1900, or 1904 for Feb 29).
        Raises ValueError when the token text is not a valid month/day.
        """
        def parse_date(v):
            last_error = None
            for year in (1900, 1904):  # try non-leap and leap years
                fullstr = '%s/%i' % (v, year)
                try:
                    return datetime.datetime.strptime(fullstr, '%m/%d/%Y')
                except ValueError as e:
                    # Keep the error in a separate name: the "as" target
                    # does not outlive the except block on Python 3.
                    last_error = e
            raise last_error

        return self._tok.simple_read('date', pos, parse_date)

    def read_text(self, pos):
        """Read a text token and return its value as unicode."""
        t = self._tok.tok(pos)
        if not t.is_text():
            return (pos, None)
        #TODO: handle PDF encodings properly.
        return (pos+1, unicode(t.value(), errors='ignore'))

    def read_amount(self, pos):
        """Read an amount token and convert it with AmTr.decimal_amount()."""
        t = self._tok.tok(pos)
        if not t.is_amount():
            return (pos, None)
        return (pos+1, AmTr.decimal_amount(t.value()))

    def read_date_range(self, pos):
        """
        Read a date range token in either supported format.

        Returns (pos + 1, [date_from, date_to]) or (pos, None).
        """
        t = self._tok.tok(pos)
        if t.is_date_range_1():
            fmt = '%B %d, %Y'
        elif t.is_date_range_2():
            fmt = '%m/%d/%Y'
        else:
            return (pos, None)
        return (pos+1, [datetime.datetime.strptime(v, fmt)
                        for v in t.value()])

    def read_ref(self, pos):
        """Read a 17-character reference token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_ref() else (pos, None)

    def read_layout_tz(self, pos):
        """Read a Tz layout token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_layout_tz() else (pos, None)

    def read_layout_tc(self, pos):
        """Read a Tc layout token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_layout_tc() else (pos, None)

    def read_layout_tw(self, pos):
        """Read a Tw layout token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_layout_tw() else (pos, None)

    def read_layout_tf(self, pos):
        """Read a Tf layout token."""
        t = self._tok.tok(pos)
        return (pos+1, t.value()) if t.is_layout_tf() else (pos, None)

    def read_layout_tm(self, pos):
        """Read a Tm layout token and return its six operands as floats."""
        t = self._tok.tok(pos)
        if not t.is_layout_tm():
            return (pos, None)
        return (pos+1, [float(v) for v in t.value()])
Example #6
0
 def __init__(self, pdf):
     """Store the decompressed form of *pdf* and create the ReTokenizer that reads it."""
     decompressed = decompress_pdf(pdf)
     self._pdf = decompressed
     self._tok = ReTokenizer(decompressed, '\n', self.LEX)
Example #7
0
class StatementParser(object):
    """
    Parser for the text operators of a decompressed PDF statement.

    Each "read_*" method takes position as its argument,
    and returns next token position if read was successful,
    or the same position if it was not.

    Readers that are not defined explicitly (for example read_whitespace or
    read_layout_bt) are synthesized on demand by __getattr__.
    """

    # (token name, regex) pairs handed to ReTokenizer.
    LEX = [
        ('date_range', r'^\((\d{2}/\d{2}/\d{2})-(\d{2}/\d{2}/\d{2})\) Tj$'),
        ('amount', r'^\((-?\$\d+(,\d{3})*\.\d{2})\) Tj$'),
        ('date', r'^\((\d{2}/\d{2})\) Tj$'),
        ('text', r'^\((.*)\) Tj$'),
        ('layout_tf', r'^.* Tf$'),
        ('layout_tm', r'^' + (6*r'([^ ]+) ') + r'Tm$'),
        ('layout_bt', r'^BT$'),
        ('layout_et', r'^ET$'),
        ('whitespace', r'^$')
    ]

    def __init__(self, pdf):
        """Decompress *pdf* and build the tokenizer over the result."""
        self._pdf = decompress_pdf(pdf)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def read_transactions(self):
        """Generate every transaction recognized in the token stream."""
        # Read statement dates range.
        # NOTE: read_first_date_range() returns None when the statement has
        # no date range, in which case this unpacking raises TypeError.
        date_from, date_to = self.read_first_date_range()

        # Read transactions.
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = self.read_transaction(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_first_date_range(self):
        """Return the first date range in the stream, or None if there is none."""
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, date_range = self.read_date_range(pos)
            if date_range is not None:
                return date_range
            else:
                pos += 1
        return None

    def read_date_range(self, pos):
        """Read a date range token; returns (pos + 1, [date_from, date_to])."""
        t = self._tok.tok(pos)
        if t.is_date_range():
            return (pos+1, [datetime.datetime.strptime(v, '%m/%d/%y')
                            for v in t.value()])
        else:
            return (pos, None)

    def read_transaction(self, pos, date_from, date_to):
        """
        Read one transaction starting at *pos*.

        Token order: date, optional second date, description, amount.
        The stored amount is the negation of the amount that was read.
        Returns (next_pos, Transaction) or (pos, None).
        """
        startPos = pos

        pos, tdate = self.read_date(pos)
        pos, pdate = self.read_date(pos)

        # Early check to call read_multiline_desc() only when needed.
        if tdate is None:
            return startPos, None

        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_amount(pos)

        if desc is None or amount is None:
            return startPos, None
        else:
            # Sometimes one date is missing.
            pdate = pdate or tdate

            tdate = closest_date(tdate, date_from, date_to)
            pdate = closest_date(pdate, date_from, date_to)

            trans = Transaction()
            trans.date = tdate
            trans.rdate = pdate
            trans.type = Transaction.TYPE_UNKNOWN
            trans.raw = desc
            trans.label = desc
            trans.amount = -amount
            return pos, trans

    def read_multiline_desc(self, pos):
        """
        Read transaction description which can span over multiple lines.
        Amount must always follow the multiline description.
        But multiline description might be split by page break.
        After reading first line of the description, we skip everything
        which is not an amount and which has different horizontal offset
        than the first read line.

        Returns (next_pos, label) or (pos, None) when no line was read.
        """
        startPos = pos

        descs = []
        xofs = None  # horizontal offset (fifth Tm operand) of the first line
        while not self._tok.tok(pos).is_eof():
            pos, desc_tm = self.read_text(pos)
            if desc_tm is None:
                if not descs:
                    # The description must start right at the given position.
                    break
                prev_pos = pos
                pos, amount = self.read_amount(pos)
                if amount is not None:
                    # Leave the amount for the caller to read.
                    pos = prev_pos
                    break
                # Neither text nor amount: skip the token.
                pos += 1
            else:
                desc, tm = desc_tm
                if xofs is None:
                    _, _, _, _, xofs, _ = tm
                _, _, _, _, xofs_new, _ = tm
                if xofs == xofs_new:
                    descs.append(desc)
                else:
                    # NOTE(review): read_text() has already moved past the
                    # whole text group; this skips one more token - confirm
                    # that is intended.
                    pos += 1

        if descs:
            return pos, clean_label(' '.join(descs))
        else:
            return startPos, None

    def __getattr__(self, name):
        """
        Provide generic "read_<token>" readers for attributes not found normally.

        read_<token>(pos) forwards to the tokenizer's simple_read() with the
        token name. Any other missing attribute raises AttributeError.
        """
        if name.startswith('read_'):
            return lambda pos: self._tok.simple_read(name[5:], pos)
        # Name the missing attribute so the failure can be diagnosed.
        raise AttributeError("%r object has no attribute %r"
                             % (type(self).__name__, name))

    @formatted
    def read_date(self, pos):
        """Read a month/day token as a datetime with a placeholder year."""
        def parse_date(v):
            for year in [1900, 1904]:  # try leap and non-leap years
                fullstr = '%s/%i' % (v, year)
                try:
                    return datetime.datetime.strptime(fullstr, '%m/%d/%Y')
                except ValueError as e:
                    last_error = e
            raise last_error

        return self._tok.simple_read('date', pos, parse_date)

    @formatted
    def read_amount(self, pos):
        """Read an amount token and convert it with AmTr.decimal_amount()."""
        return self._tok.simple_read('amount', pos,
                                     lambda xs: AmTr.decimal_amount(xs[0]))

    def read_text(self, pos):
        """
        Read a complete text group: blank line, BT, Tf, Tm, text, ET.

        Returns (next_pos, (text, tm)) when every element is present,
        otherwise (pos, None).
        """
        startPos = pos
        pos, ws = self.read_whitespace(pos)
        pos, bt = self.read_layout_bt(pos)
        pos, tf = self.read_layout_tf(pos)
        pos, tm = self.read_layout_tm(pos)
        pos, text = self._tok.simple_read('text', pos,
            lambda v: unicode(v, errors='ignore'))
        pos, et = self.read_layout_et(pos)
        if ws is None or bt is None or tf is None \
           or tm is None or text is None or et is None:
            return startPos, None
        else:
            return pos, (text, tm)
Example #8
0
class StatementParser(object):
    """
    Each "read_*" method takes position as its argument,
    and returns next token position if read was successful,
    or the same position if it was not.
    """

    # Token name -> regular expression pairs handed to ReTokenizer.
    LEX = [('date_range', r'^\((\d{2}/\d{2}/\d{2})-(\d{2}/\d{2}/\d{2})\) Tj$'),
           ('amount', r'^\((-?\$\d+(,\d{3})*\.\d{2})\) Tj$'),
           ('date', r'^\((\d{2}/\d{2})\) Tj$'), ('text', r'^\((.*)\) Tj$'),
           ('layout_tf', r'^.* Tf$'),
           ('layout_tm', r'^' + (6 * r'([^ ]+) ') + r'Tm$'),
           ('layout_bt', r'^BT$'), ('layout_et', r'^ET$'),
           ('whitespace', r'^$')]

    def __init__(self, pdf):
        """
        Keep the result of decompress_pdf(pdf) and build a ReTokenizer
        over it using the LEX table.
        """
        self._pdf = decompress_pdf(pdf)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def read_transactions(self):
        """
        Generate every transaction that can be read from the statement.

        Positions where no transaction can be read are skipped one token
        at a time.
        """
        # Read statement dates range.
        date_from, date_to = self.read_first_date_range()

        # Read transactions.
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = self.read_transaction(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_first_date_range(self):
        """
        Scan from the first token and return the first date range found
        as a list of two datetimes.

        Returns None (implicitly) when end of input is reached first.
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, date_range = self.read_date_range(pos)
            if date_range is not None:
                return date_range
            else:
                pos += 1

    def read_date_range(self, pos):
        """
        Read a 'date_range' token at pos.

        Returns (pos + 1, [datetime, ...]) with each captured value parsed
        as '%m/%d/%y', or (pos, None) if the token is not a date range.
        """
        t = self._tok.tok(pos)
        if t.is_date_range():
            return (pos + 1, [
                datetime.datetime.strptime(v, '%m/%d/%y') for v in t.value()
            ])
        else:
            return (pos, None)

    def read_transaction(self, pos, date_from, date_to):
        """
        Read one transaction: one or two dates, a (possibly multiline)
        description and an amount.

        Returns (next_pos, Transaction) on success, or (pos, None) with
        the original position when the tokens do not form a transaction.
        """
        startPos = pos

        pos, tdate = self.read_date(pos)
        pos, pdate = self.read_date(pos)

        # Early check to call read_multiline_desc() only when needed.
        if tdate is None:
            return startPos, None

        pos, desc = self.read_multiline_desc(pos)
        pos, amount = self.read_amount(pos)

        if desc is None or amount is None:
            return startPos, None
        else:
            # Sometimes one date is missing.
            pdate = pdate or tdate

            tdate = closest_date(tdate, date_from, date_to)
            pdate = closest_date(pdate, date_from, date_to)

            trans = Transaction()
            trans.date = tdate
            trans.rdate = pdate
            trans.type = Transaction.TYPE_UNKNOWN
            trans.raw = desc
            trans.label = desc
            # The parsed amount is stored with its sign inverted.
            trans.amount = -amount
            return pos, trans

    def read_multiline_desc(self, pos):
        """
        Read transaction description which can span over multiple lines.
        Amount must always follow the multiline description.
        But multiline description might be split by page break.
        After reading first line of the description, we skip everything
        which is not an amount and which has different horizontal offset
        than the first read line.

        Returns (next_pos, label) where label is the collected lines
        joined with spaces and passed through clean_label, or
        (pos, None) with the original position if no line was read.
        """
        startPos = pos

        descs = []
        xofs = None
        while not self._tok.tok(pos).is_eof():
            pos, desc_tm = self.read_text(pos)
            if desc_tm is None:
                # No text block here.  Before the first line that means
                # there is no description at all.
                if not descs:
                    break
                # Peek for the amount that ends the description, leaving
                # it unconsumed for the caller to read.
                prev_pos = pos
                pos, amount = self.read_amount(pos)
                if amount is not None:
                    pos = prev_pos
                    break
                pos += 1
            else:
                desc, tm = desc_tm
                # The fifth 'layout_tm' value of the first line is the
                # reference offset for all following lines.
                if xofs is None:
                    _, _, _, _, xofs, _ = tm
                _, _, _, _, xofs_new, _ = tm
                if xofs == xofs_new:
                    descs.append(desc)
                else:
                    pos += 1

        if descs:
            return pos, clean_label(' '.join(descs))
        else:
            return startPos, None

    def __getattr__(self, name):
        """
        Supply default "read_<token>" readers (for example read_layout_bt)
        which forward to self._tok.simple_read('<token>', pos).

        Raises AttributeError (naming the attribute) for any other name.
        """
        if name.startswith('read_'):
            return lambda pos: self._tok.simple_read(name[5:], pos)
        # Include the attribute name so the failure can be diagnosed.
        raise AttributeError("%r object has no attribute %r"
                             % (type(self).__name__, name))

    @formatted
    def read_date(self, pos):
        """
        Read a 'date' token ("MM/DD") at pos, converted to a datetime.

        The token has no year.  Parsing with '%m/%d' alone makes
        strptime() assume 1900, which is not a leap year, so "02/29"
        would raise ValueError.  A placeholder year is therefore
        appended: 1900 first (same result as before for every other
        day), then leap year 1904.
        """
        def parse_date(v):
            last_error = None
            for year in (1900, 1904):  # non-leap year, then leap year
                fullstr = '%s/%i' % (v, year)
                try:
                    return datetime.datetime.strptime(fullstr, '%m/%d/%Y')
                except ValueError as e:
                    last_error = e
            raise last_error

        return self._tok.simple_read('date', pos, parse_date)

    @formatted
    def read_amount(self, pos):
        """
        Read an 'amount' token at pos; the first element of its value is
        converted with AmTr.decimal_amount.
        """
        return self._tok.simple_read('amount', pos,
                                     lambda xs: AmTr.decimal_amount(xs[0]))

    def read_text(self, pos):
        """
        Read one text block: whitespace, layout_bt, layout_tf, layout_tm,
        text and layout_et tokens, in that order.

        Returns (next_pos, (text, tm)) when every part was read, or
        (pos, None) with the original position when any part is missing.
        """
        startPos = pos
        pos, ws = self.read_whitespace(pos)
        pos, bt = self.read_layout_bt(pos)
        pos, tf = self.read_layout_tf(pos)
        pos, tm = self.read_layout_tm(pos)
        pos, text = self._tok.simple_read(
            'text', pos, lambda v: unicode(v, errors='ignore'))
        pos, et = self.read_layout_et(pos)
        if ws is None or bt is None or tf is None \
           or tm is None or text is None or et is None:
            return startPos, None
        else:
            return pos, (text, tm)
Example #9
0
 def __init__(self, *args, **kwArgs):
     # Hand all constructor arguments to the RawPage base class.
     RawPage.__init__(self, *args, **kwArgs)
     # The document must start with the '%PDF' marker.
     assert self.doc[:4] == '%PDF'
     # Keep the decompress_pdf() output and build a tokenizer over it
     # with this class's LEX table ('\n' is presumably the token
     # separator -- confirm against ReTokenizer).
     self._pdf = decompress_pdf(self.doc)
     self._tok = ReTokenizer(self._pdf, '\n', self.LEX)
Example #10
0
class StatementPage(RawPage):
    """
    Page holding a PDF statement, with "read_*" methods that parse its
    tokens into transactions.

    Each "read_*" method taking a position returns a
    (next_position, value) pair on success, or (same_position, None)
    when the expected token is not at that position.
    """

    # Token name -> regular expression pairs handed to ReTokenizer.
    LEX = [('charge_amount', r'^\(\$([0-9\.]+)\) Tj$'),
           ('payment_amount', r'^\(\\\(\$([0-9\.]+)\\\)\) Tj$'),
           ('date', r'^\((\d+/\d+)\) Tj$'),
           ('full_date', r'^\((\d+/\d+/\d+)\) Tj$'),
           ('layout_td', r'^([-0-9]+ [-0-9]+) Td$'),
           ('ref', r'^\(([A-Z0-9]{17})\) Tj$'), ('text', r'^\((.*)\) Tj$')]

    def __init__(self, *args, **kwArgs):
        """
        Initialise the RawPage base, check that the document starts with
        '%PDF', then decompress it and build a tokenizer over the result.
        """
        RawPage.__init__(self, *args, **kwArgs)
        assert self.doc[:4] == '%PDF'
        self._pdf = decompress_pdf(self.doc)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def iter_transactions(self):
        """
        Return all transactions as a list ordered by date (newest first),
        then by label and amount (both ascending).
        """
        # The sort directions differ, so one key cannot express the
        # order.  sorted() is stable, so sorting by the secondary keys
        # first and by date afterwards gives the combined ordering.
        # Unlike sorted(cmp=...), this also works on Python 3.
        trs = sorted(self.read_transactions(),
                     key=lambda tr: (tr.label, tr.amount))
        return sorted(trs, key=lambda tr: tr.date, reverse=True)

    def read_transactions(self):
        """
        Generate every transaction that can be read from the statement.

        Positions where no transaction can be read are skipped one token
        at a time.
        """
        # Statement typically cover one month.
        # Do 60 days, just to be on a safe side.
        date_to = self.read_closing_date()
        date_from = date_to - timedelta(days=60)

        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = self.read_transaction(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_transaction(self, pos, date_from, date_to):
        """
        Read one transaction: two dates, a reference, a description and
        an amount, each optionally preceded by a 'layout_td' token.

        Returns (next_pos, Transaction) on success.  Returns (pos, None)
        with the original position when a date, the description or the
        amount is missing, or when the amount is zero.
        """
        startPos = pos
        pos, tdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, pdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, ref = self.read_ref(pos)
        pos, _ = self.read_layout_td(pos)
        pos, desc = self.read_text(pos)
        pos, _ = self.read_layout_td(pos)
        pos, amount = self.read_amount(pos)
        if tdate is None or pdate is None \
           or desc is None or amount is None or amount == 0:
            return startPos, None
        else:
            tdate = closest_date(tdate, date_from, date_to)
            pdate = closest_date(pdate, date_from, date_to)
            # Collapse every run of whitespace into a single space.
            desc = u' '.join(desc.split())

            trans = Transaction(ref or u'')
            trans.date = tdate
            trans.rdate = pdate
            trans.type = Transaction.TYPE_UNKNOWN
            trans.raw = desc
            trans.label = desc
            trans.amount = amount
            return pos, trans

    def read_amount(self, pos):
        """
        Read an amount at pos: a payment amount if present, otherwise
        a charge amount.
        """
        pos, ampay = self.read_payment_amount(pos)
        if ampay is not None:
            return pos, ampay
        return self.read_charge_amount(pos)

    def read_charge_amount(self, pos):
        """Read a 'charge_amount' token; the value is returned negated."""
        t = self._tok.tok(pos)
        return (pos+1, -AmTr.decimal_amount(t.value())) \
            if t.is_charge_amount() else (pos, None)

    def read_payment_amount(self, pos):
        """Read a 'payment_amount' token; the value is returned as is."""
        t = self._tok.tok(pos)
        return (pos+1, AmTr.decimal_amount(t.value())) \
            if t.is_payment_amount() else (pos, None)

    def read_closing_date(self):
        """
        Return the first full date found after the
        u'Statement Closing Date' text token.

        Returns None (implicitly) when end of input is reached first.
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, text = self.read_text(pos)
            if text == u'Statement Closing Date':
                break
            if text is None:
                # Nothing was consumed, so step over this token.  After a
                # successful read pos has already advanced; advancing
                # again would skip the following token unexamined.
                pos += 1
        while not self._tok.tok(pos).is_eof():
            pos, date = self.read_full_date(pos)
            if date is not None:
                return date
            pos += 1

    def read_text(self, pos):
        """Read a 'text' token at pos, converted with unicode()."""
        t = self._tok.tok(pos)
        return (pos+1, unicode(t.value())) \
            if t.is_text() else (pos, None)

    def read_full_date(self, pos):
        """Read a 'full_date' token at pos, parsed as '%m/%d/%Y'."""
        t = self._tok.tok(pos)
        return (pos+1, datetime.strptime(t.value(), '%m/%d/%Y')) \
            if t.is_full_date() else (pos, None)

    def read_date(self, pos):
        """
        Read a 'date' token (month/day, no year) at pos, converted to
        a datetime with a placeholder year.
        """
        t = self._tok.tok(pos)
        if not t.is_date():
            return (pos, None)
        return (pos + 1, self._parse_month_day(t.value()))

    @staticmethod
    def _parse_month_day(value):
        """
        Parse a "month/day" string into a datetime.

        Parsing with '%m/%d' alone makes strptime() assume 1900, which is
        not a leap year, so "02/29" would raise ValueError.  Year 1900 is
        tried first (same result as before for every other day), then
        leap year 1904.  Raises ValueError if neither year works.
        """
        last_error = None
        for year in (1900, 1904):
            try:
                return datetime.strptime('%s/%i' % (value, year),
                                         '%m/%d/%Y')
            except ValueError as e:
                last_error = e
        raise last_error

    def read_ref(self, pos):
        """Read a 'ref' token at pos and return its raw value."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_ref() else (pos, None)

    def read_layout_td(self, pos):
        """Read a 'layout_td' token at pos and return its raw value."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_td() else (pos, None)
Example #11
0
class StatementPage(RawPage):
    """
    Page holding a PDF statement, with "read_*" methods that parse its
    tokens into transactions.

    Each "read_*" method taking a position returns a
    (next_position, value) pair on success, or (same_position, None)
    when the expected token is not at that position.
    """

    # Token name -> regular expression pairs handed to ReTokenizer.
    LEX = [('charge_amount', r'^\(\$(\d+(,\d{3})*\.\d{2})\) Tj$'),
           ('payment_amount', r'^\(\\\(\$(\d+(,\d{3})*\.\d{2})\\\)\) Tj$'),
           ('date', r'^\((\d+/\d+)\) Tj$'),
           ('full_date', r'^\((\d+/\d+/\d+)\) Tj$'),
           ('layout_td', r'^([-0-9]+ [-0-9]+) Td$'),
           ('ref', r'^\(([A-Z0-9]{17})\) Tj$'), ('text', r'^\((.*)\) Tj$')]

    def __init__(self, *args, **kwArgs):
        """
        Initialise the RawPage base, check that the document starts with
        '%PDF', then decompress it and build a tokenizer over the result.
        """
        RawPage.__init__(self, *args, **kwArgs)
        assert self.doc[:4] == '%PDF'
        self._pdf = decompress_pdf(self.doc)
        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def iter_transactions(self):
        """
        Return all transactions as a list ordered by date (newest first),
        then by label and amount (both ascending).
        """
        trs = self.read_transactions()
        # since the sorts are not in the same direction, we can't do in one pass
        # python sorting is stable, so sorting in 2 passes can achieve a multisort
        # the official docs give this way
        trs = sorted(trs, key=lambda tr: (tr.label, tr.amount))
        trs = sorted(trs, key=lambda tr: tr.date, reverse=True)
        return trs

    def read_transactions(self):
        """
        Generate every transaction that can be read from the statement.

        Positions where no transaction can be read are skipped one token
        at a time.
        """
        # Statement typically cover one month.
        # Do 60 days, just to be on a safe side.
        date_to = self.read_closing_date()
        date_from = date_to - timedelta(days=60)

        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, trans = self.read_transaction(pos, date_from, date_to)
            if trans:
                yield trans
            else:
                pos += 1

    def read_transaction(self, pos, date_from, date_to):
        """
        Read one transaction: two dates, a reference, a description and
        an amount, each optionally preceded by a 'layout_td' token.

        Returns (next_pos, Transaction) on success.  Returns (pos, None)
        with the original position when a date, the description or the
        amount is missing, or when the amount is zero.
        """
        startPos = pos
        pos, tdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, pdate = self.read_date(pos)
        pos, _ = self.read_layout_td(pos)
        pos, ref = self.read_ref(pos)
        pos, _ = self.read_layout_td(pos)
        pos, desc = self.read_text(pos)
        pos, _ = self.read_layout_td(pos)
        pos, amount = self.read_amount(pos)
        if tdate is None or pdate is None \
           or desc is None or amount is None or amount == 0:
            return startPos, None
        else:
            tdate = closest_date(tdate, date_from, date_to)
            pdate = closest_date(pdate, date_from, date_to)
            # Collapse every run of whitespace into a single space.
            desc = u' '.join(desc.split())

            trans = Transaction(ref or u'')
            trans.date = tdate
            trans.rdate = pdate
            trans.type = Transaction.TYPE_UNKNOWN
            trans.raw = desc
            trans.label = desc
            trans.amount = amount
            return pos, trans

    def read_amount(self, pos):
        """
        Read an amount at pos: a payment amount if present, otherwise
        a charge amount.
        """
        pos, ampay = self.read_payment_amount(pos)
        if ampay is not None:
            return pos, ampay
        return self.read_charge_amount(pos)

    def read_charge_amount(self, pos):
        """Read a 'charge_amount' token; the value is returned negated."""
        return self._tok.simple_read('charge_amount', pos,
                                     lambda xs: -AmTr.decimal_amount(xs[0]))

    def read_payment_amount(self, pos):
        """Read a 'payment_amount' token; the value is returned as is."""
        return self._tok.simple_read('payment_amount', pos,
                                     lambda xs: AmTr.decimal_amount(xs[0]))

    def read_closing_date(self):
        """
        Return the first full date found after the
        u'Statement Closing Date' text token.

        Returns None (implicitly) when end of input is reached first.
        """
        pos = 0
        while not self._tok.tok(pos).is_eof():
            pos, text = self.read_text(pos)
            if text == u'Statement Closing Date':
                break
            if text is None:
                # Nothing was consumed, so step over this token.  After a
                # successful read pos has already advanced; advancing
                # again would skip the following token unexamined.
                pos += 1
        while not self._tok.tok(pos).is_eof():
            pos, date = self.read_full_date(pos)
            if date is not None:
                return date
            pos += 1

    def read_text(self, pos):
        """
        Read a 'text' token at pos, converted with unicode(); bytes that
        cannot be decoded are dropped (errors='ignore').
        """
        t = self._tok.tok(pos)
        # TODO: handle PDF encodings properly.
        return (pos+1, unicode(t.value(), errors='ignore')) \
            if t.is_text() else (pos, None)

    def read_full_date(self, pos):
        """Read a 'full_date' token at pos, parsed as '%m/%d/%Y'."""
        t = self._tok.tok(pos)
        return (pos+1, datetime.strptime(t.value(), '%m/%d/%Y')) \
            if t.is_full_date() else (pos, None)

    def read_date(self, pos):
        """
        Read a 'date' token (month/day, no year) at pos, converted to
        a datetime with a placeholder year.
        """
        t = self._tok.tok(pos)
        if not t.is_date():
            return (pos, None)
        return (pos + 1, self._parse_month_day(t.value()))

    @staticmethod
    def _parse_month_day(value):
        """
        Parse a "month/day" string into a datetime.

        Parsing with '%m/%d' alone makes strptime() assume 1900, which is
        not a leap year, so "02/29" would raise ValueError.  Year 1900 is
        tried first (same result as before for every other day), then
        leap year 1904.  Raises ValueError if neither year works.
        """
        last_error = None
        for year in (1900, 1904):
            try:
                return datetime.strptime('%s/%i' % (value, year),
                                         '%m/%d/%Y')
            except ValueError as e:
                last_error = e
        raise last_error

    def read_ref(self, pos):
        """Read a 'ref' token at pos and return its raw value."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_ref() else (pos, None)

    def read_layout_td(self, pos):
        """Read a 'layout_td' token at pos and return its raw value."""
        t = self._tok.tok(pos)
        return (pos + 1, t.value()) if t.is_layout_td() else (pos, None)