Python RecordFile Exemples, util.RecordFile Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : records.py Projet : calarrick/drocer

 def __init__(self, filename, rules):
     """Build a record from a dated file name and a rules object.

     The text of ``filename`` before its first '.' becomes ``self.date``;
     the first four characters of that text are parsed as ``self.year``
     and also name the sub-directory of ``DIR`` the file is opened from.
     """
     stem = filename[:filename.index('.')]
     year_text = stem[0:4]

     self.date = stem
     self.year = int(year_text)
     self.f = RecordFile(DIR + year_text + '/' + filename)
     self.rules = rules

Exemple #2

0

Afficher le fichier

Fichier : records.py Projet : ysakthi/drocer

 def __init__(self, filename, rules):
     """Build a record from a dated file name and a rules object.

     The text of ``filename`` before its first '.' becomes ``self.date``;
     the first four characters of that text are parsed as ``self.year``
     and also name the sub-directory of ``DIR`` the file is opened from.
     """
     stem = filename[:filename.index('.')]
     year_text = stem[0:4]

     self.date = stem
     self.year = int(year_text)
     self.f = RecordFile(DIR + year_text + '/' + filename)
     self.rules = rules

Exemple #3

0

Afficher le fichier

Fichier : records.py Projet : calarrick/drocer

class Record:
    """
    #todo description of the record class
    """

    def __init__(self, filename, rules):
        """Build a record from a dated file name and a rules object.

        The text of ``filename`` before its first '.' becomes ``self.date``;
        the first four characters of that text are parsed as ``self.year``
        and also name the sub-directory of ``DIR`` the file is opened from.
        """
        stem = filename[:filename.index('.')]
        year_text = stem[0:4]

        self.date = stem
        self.year = int(year_text)
        self.f = RecordFile(DIR + year_text + '/' + filename)
        self.rules = rules


    @property
    def mayor(self):
        return self.get_prop('mayor')


    @property
    def council_president(self):
        return self.get_prop('council_president')


    @property
    def clerk(self):
        return self.get_prop('clerk')


    @property
    def council_members(self):

        # Grab 80 lines after the line containing "Residence" as a starting pt.
        lines = self.f.get_lines_after("Residence", 80)

        # lines that contain the name and address, look like:
        #    Council Member Name ..................... Address
        main_lines = [line.rstrip() for line in lines if "..." in line]

        # Sometimes the lines don't contain elipses. If that's the case, use
        # an alternative method to get the council members
        if main_lines == []:
            return self.get_council_members()

        # Get the lines containing just the zip code
        zip_code_lines = [line.rstrip()
                          for line in lines if re.search('\d{5}\n', line)]

        # this is in the form of:
        #    {
        #        'Council member 1 name': {
        #            'address': '123 Street Address',
        #            'zipcode': '12345'
        #        },
        #        'Council member 2 name': {...
        #        },...
        #    }
        #
        # Wrong. Should be a list of people objects.
        # Position = Council member
        # Address = Address
        # Zipcode = Zipcode

        council_members = {}

        # keep track of zip codes separately in case an address is missing
        zipcode_ptr = 0

        for i in range(0, len(main_lines)):
            # name is the contents up until "..", remove whitespace with
            # rstrip()
            name = main_lines[i][:main_lines[i].index('..')].rstrip()

            # address is everything after the last index of ".."
            address = main_lines[i][main_lines[i].rfind('..') + 2:]

            # no addres --> no zipcode
            if address == "":
                zipcode = ""
            else: #most lines have zip codes
                zipcode = zip_code_lines[zipcode_ptr]
                zipcode_ptr += 1

            # create the council member
            council_members[name] = {
                'address': address,
                'zipcode': zipcode
            }

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(council_members)

        return council_members


    def get_council_members(self):
        """Alternative method of getting the council members.

        This is for the case where there are no elipses between the name and
        address

        """

        lines = self.f.get_lines_after("Name Residence", 25)
        lines = [line.rstrip() for line in lines if line[0].isdigit()]

        council_members = {}

        for line in lines:
            line = line[2:].strip()
            if ("P.O. Box") in line:
                name = line[:line.index("P.O. Box")].rstrip()
            else:
                name = line[:re.search('\d', line).start()].rstrip()

            line = line.replace(name, "")

            zipcode = line[-5:]

            address = line.replace(zipcode, "").strip()

            council_members[name] = {
                'address': address,
                'zipcode': zipcode
            }

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(council_members)

        return council_members


    @property
    def cabinet(self):
        lines = self.f.get_lines_between("MAYOR",
                                                   ["OFFICE OF", "DEPT.",
                                                    "Ward\n",
                                                    re.compile("\d{5}")])
        cabinet = {}

        # Some titles don't fit on their line. In this case, combine them with
        # the line above
        i = 0
        for line in lines:
            lines[i] = lines[i].strip()
            if ',' not in lines[i]:
                lines[i-1] = ' '.join(lines[i-1:i+1])
                lines.remove(lines[i])
            i += 1

        for line in lines:
            parts = [part.strip() for part in line.split(",")]

            if (u'\u2013' in parts[0] or '_' in parts[0]):
                parts[0] = "vacant"

            titles_with_commas = ["Director", "Acting Director"]
            contained = [part for part in parts if part in titles_with_commas]
            if contained != []:
                i = parts.index(contained[0])
                parts[i:i + 2] = [', '.join(parts[i:i + 2])]

            cabinet[parts[0]] = parts[1:]

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(cabinet)

        return cabinet


    @property
    def departments(self):
        """Parse departments, boards and commissions from the record.

        Takes the lines between "MAYOR" and "MUNICIPAL COURT", cleans them,
        splits them into bodies and moves a leading "Room ..." location off
        each body's first line into the body's 'location' key.

        Returns:
            dict with the keys "departments", "boards" and "commissions",
            each holding the result of the matching ``create_*`` method.
        """
        # approximately the correct chunk of lines
        lines = self.f.get_lines_between("MAYOR", "MUNICIPAL COURT")

        # clean out lines before first "DEPT" and ward lines that made it
        # into the chunk of lines because of the table layout of the pdf
        lines = self.clean_dept_lines(lines)
        if self.year == 1996:
            # debugging output kept from the original implementation
            print(lines)

        new_bodies = []

        # split into various bodies (each dept, board, or commission)
        for body in self.split_depts(lines):
            new_body = self.create_new_body(body)

            # 'lines' is a list for a department and one joined string for
            # a board/commission; either way inspect the first line
            lines_are_list = isinstance(new_body["lines"], list)
            if lines_are_list:
                first = new_body["lines"][0]
            else:
                first = new_body["lines"]

            # if it starts with a room number, give the body a location
            if first[:4] == "Room":

                # Room# is either separated from the rest of the line by a
                # hyphen or a comma. Ignore a hyphen that is part of a title
                if '-' in first and not any(t in first for t in Person.titles):
                    split_pos = first.index('-')
                else:
                    split_pos = first.find(',')

                if split_pos == -1:
                    # nothing follows the room: the whole line is the location
                    location, remainder = first, ''
                else:
                    location = first[:split_pos]
                    remainder = first[split_pos + 1:].strip()

                new_body["location"] = location

                # The old check tested type(new_body), which is always a
                # dict, so a department's list of lines was replaced by a
                # single string. Test the 'lines' value itself instead.
                if lines_are_list:
                    new_body["lines"][0] = remainder
                else:
                    new_body["lines"] = remainder

            else:
                new_body["location"] = ''

            new_bodies.append(new_body)

        all_bodies = {
            "departments": [],
            "boards": [],
            "commissions": [],
        }

        for body in new_bodies:
            if "DEPT" in body["name"]:
                kind = "departments"
            elif "BOARD" in body["name"]:
                kind = "boards"
            else:
                kind = "commissions"
            all_bodies[kind].append(body)

        all_bodies['boards'] = self.create_boards(all_bodies["boards"])
        all_bodies['commissions'] = self.create_commissions(
            all_bodies["commissions"])
        all_bodies['departments'] = self.create_departments(
            all_bodies['departments'])

        return all_bodies

    def create_new_body(self, body):
        """Split a body's lines into its name and the remaining lines.

        The first line looks like "NAME OF DEPT/BOARD/COMMISSION - ...".
        Subtitles (in parentheses) get hyphens before and after them, so the
        name ends at the second hyphen when the line contains "(" and at the
        first hyphen otherwise. A first line without any hyphen is taken to
        be the name in full.

        Args:
            body: list of strings belonging to one body. It is not modified.

        Returns:
            dict with 'name' (str) and 'lines': a list of strings when the
            name contains "DEPT", otherwise the lines joined by spaces into
            one string.
        """
        if not body:
            return {"name": '', "lines": ''}

        first_line = body[0]
        hyphens = [m.start() for m in re.finditer("-", first_line)]

        if not hyphens:
            name, remainder = first_line.strip(), ''
        else:
            # fall back to the only hyphen when a subtitle has no second one
            if '(' in first_line and len(hyphens) > 1:
                split_pos = hyphens[1]
            else:
                split_pos = hyphens[0]
            name = first_line[:split_pos].strip()
            remainder = first_line[split_pos + 1:].strip()

        # work on a copy so the caller's list keeps its original first line
        rest = [remainder] + list(body[1:])

        # boards and commissions are lists of names/positions where the
        # line breaks don't matter, so they are joined together
        if "DEPT" in name:
            body_lines = rest
        else:
            body_lines = ' '.join(rest)

        return {"name": name, "lines": body_lines}


    def create_departments(self, depts):
        """Turn basic department dicts into full department dicts.

        Args:
            depts: list of dicts shaped like
                {name: dept name, location: location,
                 lines: [all lines between this dept and the next]}

        Returns:
            list of dicts with 'name', 'location', 'divisions', 'members'
            and, when an OFFICES section is present, 'offices'. Lines before
            the first OFFICES/DIVISIONS line are passed to
            ``Person.get_people`` one at a time; the rest of the lines are
            handed to ``create_offices`` / ``create_divisions``.
        """
        new_depts = []

        for dept in depts:
            new_dept = {
                'name': dept['name'],
                'location': dept['location'] if 'location' in dept else '',
                'divisions': {},
                'members': [],
            }

            dept_lines = dept['lines']
            for i, line in enumerate(dept_lines):

                if Record.contains_with_spaces('OFFICES', line):
                    offices, office_members, leftovers = self.create_offices(
                        dept_lines[i:], dept['name'])
                    new_dept['offices'] = offices
                    # keep the office members: they used to be overwritten
                    # by the division members before being added
                    new_dept['members'] += office_members
                    if leftovers:
                        divs, div_members = self.create_divisions(
                            leftovers, dept['name'])
                        new_dept['divisions'] = divs
                        new_dept['members'] += div_members
                    break

                elif Record.contains_with_spaces('DIVISIONS', line):
                    divs, div_members = self.create_divisions(
                        dept_lines[i:], dept['name'])
                    new_dept['divisions'] = divs
                    # extend rather than replace, so members collected from
                    # the lines above the DIVISIONS section are not dropped
                    new_dept['members'] += div_members
                    break

                else:
                    new_dept['members'].append(
                        Person.get_people(line, department=dept['name']))

            new_depts.append(new_dept)

        return new_depts


    def clean_dept_lines(self, lines):
        # remove whitespace
        lines = [line.strip() for line in lines if line != "\n"]

        # remove lines before first "DEPT"
        i = 0
        while True:
            line = lines[i]
            if not re.search('[A-Z]{4}', line):
                lines.remove(line)
            else:
                break

        # Ward lines, if they exist
        lines = self.remove_ward_lines(lines)

        # Replace abbreviations with the full word for more consistency
        lines = self.replace_abbreviations(lines)

        return lines

    def replace_abbreviations(self, lines):
        """Expand abbreviations and repair split words, in place.

        Every element of ``lines`` is rewritten with the substitutions
        below applied in order, and the same list object is returned.
        """
        expansions = (
            ("Rm.", "Room"),
            ("Sec'y.", "Secretary"),
            ("Exec.", "Executive"),
            ("Chrm.", "Chairman"),
            ("Asst.", "Assistant"),
            ("Ro om", "Room"),
            ("R oo m", "Room"),
            ("Act. Mgr.", "Account Manager"),
            ("DE PT", "DEPT"),
        )
        split_words = (
            ("CLEV ELA ND", "CLEVELAND"),
            ("COMMISS ION", "COMMISSION"),
        )

        for idx, text in enumerate(lines):
            for short, full in expansions:
                text = text.replace(short, full)

            # two passes: one pass cannot close every gap in "_ _ _"
            for _ in range(2):
                text = text.replace("_ _", "__", 10)

            for broken, fixed in split_words:
                text = text.replace(broken, fixed)

            lines[idx] = text

        return lines

    def create_boards(self, boards):
        boards = self.clean_boards(boards)
        for board in boards:
            members = Person.get_people(board['lines'], department=board['name'])
            board['members'] = members
            del board['lines']
        return boards


    def clean_boards(self, boards):
        """Return ``boards`` unchanged; no cleaning is implemented here."""
        return boards


    def create_commissions(self, commissions):
        return self.create_boards(commissions)


    def remove_ward_lines(self, lines):
        """Remove the "Ward" marker line and the number lines after it.

        Starting at the first element equal to "Ward\\n", consecutive lines
        that are either another "Ward\\n" or contain a digit directly
        followed by a newline are deleted. ``lines`` is modified in place
        and returned.

        Deleting by position fixes two problems of the by-value removal:
        it could delete an earlier, equal line instead of the current one,
        and the scan raised IndexError when the ward block reached the end
        of the list.

        NOTE(review): clean_dept_lines strips every line before calling
        this method, so the exact "Ward\\n" marker cannot match on that
        path - confirm whether the match should ignore whitespace.
        """
        # skip method if these lines aren't present
        if "Ward\n" not in lines:
            return lines

        start = lines.index("Ward\n")
        end = start
        while end < len(lines) and (lines[end] == "Ward\n" or
                                    re.search(r'\d\n', lines[end])):
            end += 1

        del lines[start:end]
        return lines

    def split_depts(self, lines):

        depts = []
        current_dept = []
        previous = ''

        for line in lines:

            # start of a new thing -
            # "DEPT OF..." or "BOARD OF..." or "SOMETHING COMMISSION"
            # add contents of current_dept to the list of output depts
            if re.search('[A-Z]{4}', line) and current_dept != [] and \
                not Record.contains_with_spaces('DIVISIONS', line) and \
                not Record.contains_with_spaces('OFFICES', line):

                # add current_dept to list and reset current_dept/previous line
                depts.append(current_dept)
                current_dept, previous = [], ''

            # if the current line is
            if self.is_incomplete_line(line, previous):
                if self.year == 1996:
                    print (line)
                # get rid of "|||"s that are being used as spacing
                line.replace('|', '', 10)
                current_dept[-1] = ' '.join([current_dept[-1], line])

            else:
                current_dept.append(line.strip())

            previous = line.strip()

        # add the last department to the output
        depts.append(current_dept)

        return depts


    def is_incomplete_line(self, current, previous):
        """ Return whether a line in a dept/board/commission is incomplete"""

        # lines that contain "DIVISIONS" or "OFFICES" don't count
        if self.contains_with_spaces('DIVISIONS', current) or \
            self.contains_with_spaces('OFFICES', current):
            return False

        # "||||" represents spacing, formatting issue with the conversion from
        # pdf to txt
        tabbed_line = '|' in current

        # the previous line ended with a comma (and is not empty string)
        line_following_comma = previous != '' and previous[-1] == ","

        # too short to be a full line
        not_enough_words = current.count(' ') < 3 and \
                           not self.contains_with_spaces('DIVISION', current)

        # starts with numbers -> starts with an address
        # a complete line doesn't start with an address
        starts_with_digits = re.search('^(\d{3})', current)

        outlier_cases = ['Flr., Court Towers, 1200 Ontario', '',
            'Criminal Branch-Justice Center, 8th']

        return tabbed_line or line_following_comma or not_enough_words or \
            starts_with_digits or (current in outlier_cases)


    def clean_sub_depts(self, lines):
        """Strip the "DIVISIONS"/"OFFICES" header from a block of lines.

        The header looks like "DIVISIONS -", "DIVISIONS \\" or
        "DIVISIONS:". Everything up to and including the first such
        separator (tried in that order) is removed from the first line. The
        first line is then dropped when fewer than 3 spaces remain in it,
        and empty lines are removed.

        Args:
            lines: list of strings starting with the header line. It is not
                modified.

        Returns:
            a new list of the remaining lines; [] for empty input. A header
            without any separator no longer raises ValueError: it is kept
            as is and is then subject to the same "fewer than 3 spaces"
            rule.
        """
        if not lines:
            return []

        header = lines[0]
        for split_char in ('-', '\\', ':'):
            if split_char in header:
                # remove "DIVISIONS" and whatever punctuation follows it
                header = header[header.index(split_char) + 1:]
                break
        header = header.strip()

        remaining = list(lines[1:])

        # the rest of the header line is only kept when it has at least
        # 3 spaces (rule inherited from the original implementation)
        if header.count(' ') >= 3:
            remaining.insert(0, header)

        return [line for line in remaining if line != '']


    def create_offices(self, lines, dept_name):
        offices = []
        people = []
        lines = self.clean_sub_depts(lines)

        for i in range(0, len(lines)):
            line = lines[i]

            if "DIVISIONS" in line:
                leftovers = lines[i:]
                break

            if '-' in line:
                parts = line.split('-')

            else:
                if '\\' in line:
                    split_char = '\\'
                else:
                    split_char = ','

                i = line.index(split_char)
                parts = [line[:i], line[i + 1:]]

            division = parts[0].strip()
            person = parts[1].strip()

            offices.append(division)
            people.append(Person.get_people(person, department=dept_name, division=division))

        return offices, people, leftovers

    def create_divisions(self, lines, dept_name):
        divs = []
        people = []

        lines = self.clean_sub_depts(lines)

        for line in lines:
            if '-' in line:
                parts = line.split('-')

            else:
                if '\\' in line:
                    split_char = '\\'
                else:
                    split_char = ','
                #print (line)
                i = line.index(split_char)
                parts = [line[:i], line[i + 1:]]

            division = parts[0].strip()
            person = parts[1].strip()

            divs.append(division)
            people.append(Person.get_people(person, department=dept_name, division=division))

        return divs, people

    @staticmethod
    def contains_with_spaces(keyword, line):
        line = line.replace(' ', '')
        return keyword in line

    def get_prop(self, name):

        if len(self.rules[name][self.year]) == 1:
            return eval('self.f.' + self.rules[name][self.year][0])

        else:
            for rule in self.rules[name][self.year]:
                attempt = eval('self.f.' + rule)
                if attempt not in INVALID_VALS:
                    return attempt

        return "Not found"


    def __repr__(self):
        return "<Record, date="+self.date+">"

Exemple #4

0

Afficher le fichier

Fichier : records.py Projet : ysakthi/drocer

class Record:
    """
    #todo description of the record class
    """
    def __init__(self, filename, rules):
        """Build a record from a dated file name and a rules object.

        The text of ``filename`` before its first '.' becomes ``self.date``;
        the first four characters of that text are parsed as ``self.year``
        and also name the sub-directory of ``DIR`` the file is opened from.
        """
        stem = filename[:filename.index('.')]
        year_text = stem[0:4]

        self.date = stem
        self.year = int(year_text)
        self.f = RecordFile(DIR + year_text + '/' + filename)
        self.rules = rules

    @property
    def mayor(self):
        """Return whatever ``get_prop`` resolves for the 'mayor' rules."""
        value = self.get_prop('mayor')
        return value

    @property
    def council_president(self):
        """Return whatever ``get_prop`` resolves for 'council_president'."""
        value = self.get_prop('council_president')
        return value

    @property
    def clerk(self):
        """Return whatever ``get_prop`` resolves for the 'clerk' rules."""
        value = self.get_prop('clerk')
        return value

    @property
    def council_members(self):

        # Grab 80 lines after the line containing "Residence" as a starting pt.
        lines = self.f.get_lines_after("Residence", 80)

        # lines that contain the name and address, look like:
        #    Council Member Name ..................... Address
        main_lines = [line.rstrip() for line in lines if "..." in line]

        # Sometimes the lines don't contain elipses. If that's the case, use
        # an alternative method to get the council members
        if main_lines == []:
            return self.get_council_members()

        # Get the lines containing just the zip code
        zip_code_lines = [
            line.rstrip() for line in lines if re.search('\d{5}\n', line)
        ]

        # this is in the form of:
        #    {
        #        'Council member 1 name': {
        #            'address': '123 Street Address',
        #            'zipcode': '12345'
        #        },
        #        'Council member 2 name': {...
        #        },...
        #    }
        #
        # Wrong. Should be a list of people objects.
        # Position = Council member
        # Address = Address
        # Zipcode = Zipcode

        council_members = {}

        # keep track of zip codes separately in case an address is missing
        zipcode_ptr = 0

        for i in range(0, len(main_lines)):
            # name is the contents up until "..", remove whitespace with
            # rstrip()
            name = main_lines[i][:main_lines[i].index('..')].rstrip()

            # address is everything after the last index of ".."
            address = main_lines[i][main_lines[i].rfind('..') + 2:]

            # no addres --> no zipcode
            if address == "":
                zipcode = ""
            else:  #most lines have zip codes
                zipcode = zip_code_lines[zipcode_ptr]
                zipcode_ptr += 1

            # create the council member
            council_members[name] = {'address': address, 'zipcode': zipcode}

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(council_members)

        return council_members

    def get_council_members(self):
        """Alternative method of getting the council members.

        This is for the case where there are no elipses between the name and
        address

        """

        lines = self.f.get_lines_after("Name Residence", 25)
        lines = [line.rstrip() for line in lines if line[0].isdigit()]

        council_members = {}

        for line in lines:
            line = line[2:].strip()
            if ("P.O. Box") in line:
                name = line[:line.index("P.O. Box")].rstrip()
            else:
                name = line[:re.search('\d', line).start()].rstrip()

            line = line.replace(name, "")

            zipcode = line[-5:]

            address = line.replace(zipcode, "").strip()

            council_members[name] = {'address': address, 'zipcode': zipcode}

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(council_members)

        return council_members

    @property
    def cabinet(self):
        lines = self.f.get_lines_between(
            "MAYOR", ["OFFICE OF", "DEPT.", "Ward\n",
                      re.compile("\d{5}")])
        cabinet = {}

        # Some titles don't fit on their line. In this case, combine them with
        # the line above
        i = 0
        for line in lines:
            lines[i] = lines[i].strip()
            if ',' not in lines[i]:
                lines[i - 1] = ' '.join(lines[i - 1:i + 1])
                lines.remove(lines[i])
            i += 1

        for line in lines:
            parts = [part.strip() for part in line.split(",")]

            if (u'\u2013' in parts[0] or '_' in parts[0]):
                parts[0] = "vacant"

            titles_with_commas = ["Director", "Acting Director"]
            contained = [part for part in parts if part in titles_with_commas]
            if contained != []:
                i = parts.index(contained[0])
                parts[i:i + 2] = [', '.join(parts[i:i + 2])]

            cabinet[parts[0]] = parts[1:]

        # pp = pprint.PrettyPrinter(indent=4)
        # pp.pprint(cabinet)

        return cabinet

    @property
    def departments(self):
        """Parse departments, boards and commissions from the record.

        Takes the lines between "MAYOR" and "MUNICIPAL COURT", cleans them,
        splits them into bodies and moves a leading "Room ..." location off
        each body's first line into the body's 'location' key.

        Returns:
            dict with the keys "departments", "boards" and "commissions",
            each holding the result of the matching ``create_*`` method.
        """
        # approximately the correct chunk of lines
        lines = self.f.get_lines_between("MAYOR", "MUNICIPAL COURT")

        # clean out lines before first "DEPT" and ward lines that made it
        # into the chunk of lines because of the table layout of the pdf
        lines = self.clean_dept_lines(lines)
        if self.year == 1996:
            # debugging output kept from the original implementation
            print(lines)

        new_bodies = []

        # split into various bodies (each dept, board, or commission)
        for body in self.split_depts(lines):
            new_body = self.create_new_body(body)

            # 'lines' is a list for a department and one joined string for
            # a board/commission; either way inspect the first line
            lines_are_list = isinstance(new_body["lines"], list)
            if lines_are_list:
                first = new_body["lines"][0]
            else:
                first = new_body["lines"]

            # if it starts with a room number, give the body a location
            if first[:4] == "Room":

                # Room# is either separated from the rest of the line by a
                # hyphen or a comma. Ignore a hyphen that is part of a title
                if '-' in first and not any(t in first for t in Person.titles):
                    split_pos = first.index('-')
                else:
                    split_pos = first.find(',')

                if split_pos == -1:
                    # nothing follows the room: the whole line is the location
                    location, remainder = first, ''
                else:
                    location = first[:split_pos]
                    remainder = first[split_pos + 1:].strip()

                new_body["location"] = location

                # The old check tested type(new_body), which is always a
                # dict, so a department's list of lines was replaced by a
                # single string. Test the 'lines' value itself instead.
                if lines_are_list:
                    new_body["lines"][0] = remainder
                else:
                    new_body["lines"] = remainder

            else:
                new_body["location"] = ''

            new_bodies.append(new_body)

        all_bodies = {"departments": [], "boards": [], "commissions": []}

        for body in new_bodies:
            if "DEPT" in body["name"]:
                kind = "departments"
            elif "BOARD" in body["name"]:
                kind = "boards"
            else:
                kind = "commissions"
            all_bodies[kind].append(body)

        all_bodies['boards'] = self.create_boards(all_bodies["boards"])
        all_bodies['commissions'] = self.create_commissions(
            all_bodies["commissions"])
        all_bodies['departments'] = self.create_departments(
            all_bodies['departments'])

        return all_bodies

    def create_new_body(self, body):
        """Split a body's lines into its name and the remaining lines.

        The first line looks like "NAME OF DEPT/BOARD/COMMISSION - ...".
        Subtitles (in parentheses) get hyphens before and after them, so the
        name ends at the second hyphen when the line contains "(" and at the
        first hyphen otherwise. A first line without any hyphen is taken to
        be the name in full.

        Args:
            body: list of strings belonging to one body. It is not modified.

        Returns:
            dict with 'name' (str) and 'lines': a list of strings when the
            name contains "DEPT", otherwise the lines joined by spaces into
            one string.
        """
        if not body:
            return {"name": '', "lines": ''}

        first_line = body[0]
        hyphens = [m.start() for m in re.finditer("-", first_line)]

        if not hyphens:
            name, remainder = first_line.strip(), ''
        else:
            # fall back to the only hyphen when a subtitle has no second one
            if '(' in first_line and len(hyphens) > 1:
                split_pos = hyphens[1]
            else:
                split_pos = hyphens[0]
            name = first_line[:split_pos].strip()
            remainder = first_line[split_pos + 1:].strip()

        # work on a copy so the caller's list keeps its original first line
        rest = [remainder] + list(body[1:])

        # boards and commissions are lists of names/positions where the
        # line breaks don't matter, so they are joined together
        if "DEPT" in name:
            body_lines = rest
        else:
            body_lines = ' '.join(rest)

        return {"name": name, "lines": body_lines}

    def create_departments(self, depts):
        """Turn basic department dicts into full department dicts.

        Args:
            depts: list of dicts shaped like
                {name: dept name, location: location,
                 lines: [all lines between this dept and the next]}

        Returns:
            list of dicts with 'name', 'location', 'divisions', 'members'
            and, when an OFFICES section is present, 'offices'. Lines before
            the first OFFICES/DIVISIONS line are passed to
            ``Person.get_people`` one at a time; the rest of the lines are
            handed to ``create_offices`` / ``create_divisions``.
        """
        new_depts = []

        for dept in depts:
            new_dept = {
                'name': dept['name'],
                'location': dept['location'] if 'location' in dept else '',
                'divisions': {},
                'members': [],
            }

            dept_lines = dept['lines']
            for i, line in enumerate(dept_lines):

                if Record.contains_with_spaces('OFFICES', line):
                    offices, office_members, leftovers = self.create_offices(
                        dept_lines[i:], dept['name'])
                    new_dept['offices'] = offices
                    # keep the office members: they used to be overwritten
                    # by the division members before being added
                    new_dept['members'] += office_members
                    if leftovers:
                        divs, div_members = self.create_divisions(
                            leftovers, dept['name'])
                        new_dept['divisions'] = divs
                        new_dept['members'] += div_members
                    break

                elif Record.contains_with_spaces('DIVISIONS', line):
                    divs, div_members = self.create_divisions(
                        dept_lines[i:], dept['name'])
                    new_dept['divisions'] = divs
                    # extend rather than replace, so members collected from
                    # the lines above the DIVISIONS section are not dropped
                    new_dept['members'] += div_members
                    break

                else:
                    new_dept['members'].append(
                        Person.get_people(line, department=dept['name']))

            new_depts.append(new_dept)

        return new_depts

    def clean_dept_lines(self, lines):
        # remove whitespace
        lines = [line.strip() for line in lines if line != "\n"]

        # remove lines before first "DEPT"
        i = 0
        while True:
            line = lines[i]
            if not re.search('[A-Z]{4}', line):
                lines.remove(line)
            else:
                break

        # Ward lines, if they exist
        lines = self.remove_ward_lines(lines)

        # Replace abbreviations with the full word for more consistency
        lines = self.replace_abbreviations(lines)

        return lines

    def replace_abbreviations(self, lines):
        """Expand abbreviations and repair split words, in place.

        Every element of ``lines`` is rewritten with the substitutions
        below applied in order, and the same list object is returned.
        """
        expansions = (
            ("Rm.", "Room"),
            ("Sec'y.", "Secretary"),
            ("Exec.", "Executive"),
            ("Chrm.", "Chairman"),
            ("Asst.", "Assistant"),
            ("Ro om", "Room"),
            ("R oo m", "Room"),
            ("Act. Mgr.", "Account Manager"),
            ("DE PT", "DEPT"),
        )
        split_words = (
            ("CLEV ELA ND", "CLEVELAND"),
            ("COMMISS ION", "COMMISSION"),
        )

        for idx, text in enumerate(lines):
            for short, full in expansions:
                text = text.replace(short, full)

            # two passes: one pass cannot close every gap in "_ _ _"
            for _ in range(2):
                text = text.replace("_ _", "__", 10)

            for broken, fixed in split_words:
                text = text.replace(broken, fixed)

            lines[idx] = text

        return lines

    def create_boards(self, boards):
        boards = self.clean_boards(boards)
        for board in boards:
            members = Person.get_people(board['lines'],
                                        department=board['name'])
            board['members'] = members
            del board['lines']
        return boards

    def clean_boards(self, boards):
        """Return ``boards`` unchanged; no cleaning is performed here."""
        return boards

    def create_commissions(self, commissions):
        return self.create_boards(commissions)

    def remove_ward_lines(self, lines):
        """Remove the first ``"Ward\\n"`` line and the run that follows it.

        Starting at the first element equal to ``"Ward\\n"``, that element
        and every directly following element that is either another
        ``"Ward\\n"`` or contains a digit immediately before a newline is
        deleted. Removal stops at the first element that matches neither.

        The list is modified in place and the same list object is returned.
        If no ``"Ward\\n"`` element exists the list is returned untouched.
        """
        # skip method if these lines aren't present
        if "Ward\n" not in lines:
            return lines

        start = lines.index("Ward\n")
        end = start + 1

        # Walk to the end of the ward block. The bounds check lets the block
        # sit at the very end of the list without raising IndexError.
        while end < len(lines) and (lines[end] == "Ward\n"
                                    or re.search(r'\d\n', lines[end])):
            end += 1

        # Delete by position: removing by value could drop an equal line
        # that appears earlier in the list instead of the one in the block.
        del lines[start:end]
        return lines

    def split_depts(self, lines):

        depts = []
        current_dept = []
        previous = ''

        for line in lines:

            # start of a new thing -
            # "DEPT OF..." or "BOARD OF..." or "SOMETHING COMMISSION"
            # add contents of current_dept to the list of output depts
            if re.search('[A-Z]{4}', line) and current_dept != [] and \
                not Record.contains_with_spaces('DIVISIONS', line) and \
                not Record.contains_with_spaces('OFFICES', line):

                # add current_dept to list and reset current_dept/previous line
                depts.append(current_dept)
                current_dept, previous = [], ''

            # if the current line is
            if self.is_incomplete_line(line, previous):
                if self.year == 1996:
                    print(line)
                # get rid of "|||"s that are being used as spacing
                line.replace('|', '', 10)
                current_dept[-1] = ' '.join([current_dept[-1], line])

            else:
                current_dept.append(line.strip())

            previous = line.strip()

        # add the last department to the output
        depts.append(current_dept)

        return depts

    def is_incomplete_line(self, current, previous):
        """ Return whether a line in a dept/board/commission is incomplete"""

        # lines that contain "DIVISIONS" or "OFFICES" don't count
        if self.contains_with_spaces('DIVISIONS', current) or \
            self.contains_with_spaces('OFFICES', current):
            return False

        # "||||" represents spacing, formatting issue with the conversion from
        # pdf to txt
        tabbed_line = '|' in current

        # the previous line ended with a comma (and is not empty string)
        line_following_comma = previous != '' and previous[-1] == ","

        # too short to be a full line
        not_enough_words = current.count(' ') < 3 and \
                           not self.contains_with_spaces('DIVISION', current)

        # starts with numbers -> starts with an address
        # a complete line doesn't start with an address
        starts_with_digits = re.search('^(\d{3})', current)

        outlier_cases = [
            'Flr., Court Towers, 1200 Ontario', '',
            'Criminal Branch-Justice Center, 8th'
        ]

        return tabbed_line or line_following_comma or not_enough_words or \
            starts_with_digits or (current in outlier_cases)

    def clean_sub_depts(self, lines):
        """Strip the "DIVISIONS"/"OFFICES" heading from a block of lines.

        The first line is cut after its first separator ('-' if present,
        else a backslash if present, else ':') and stripped. If what is left
        has fewer than three spaces it is dropped altogether. Empty strings
        are removed from the result.

        Args:
            lines: list of strings whose first element carries the heading.

        Returns:
            A new list; the input list is left unmodified. Empty input
            yields an empty list.

        Raises:
            ValueError: if the first line contains none of the separators.
        """
        if not lines:
            return []

        # a set of divisions/offices looks like:
        # "DIVISIONS -" or "DIVISIONS \" or "DIVISIONS:"
        header = lines[0]
        if '-' in header:
            split_char = '-'
        elif '\\' in header:
            split_char = '\\'
        else:
            split_char = ':'

        # remove "DIVISIONS" and whatever punctuation follows it
        remainder = header[header.index(split_char) + 1:].strip()

        # for some reason, the rest of the line doesn't matter if there are
        # fewer than 3 spaces. Don't remember why.
        kept = [remainder] if remainder.count(' ') >= 3 else []
        kept.extend(lines[1:])

        return [line for line in kept if line != '']

    def create_offices(self, lines, dept_name):
        offices = []
        people = []
        lines = self.clean_sub_depts(lines)

        for i in range(0, len(lines)):
            line = lines[i]

            if "DIVISIONS" in line:
                leftovers = lines[i:]
                break

            if '-' in line:
                parts = line.split('-')

            else:
                if '\\' in line:
                    split_char = '\\'
                else:
                    split_char = ','

                i = line.index(split_char)
                parts = [line[:i], line[i + 1:]]

            division = parts[0].strip()
            person = parts[1].strip()

            offices.append(division)
            people.append(
                Person.get_people(person,
                                  department=dept_name,
                                  division=division))

        return offices, people, leftovers

    def create_divisions(self, lines, dept_name):
        divs = []
        people = []

        lines = self.clean_sub_depts(lines)

        for line in lines:
            if '-' in line:
                parts = line.split('-')

            else:
                if '\\' in line:
                    split_char = '\\'
                else:
                    split_char = ','
                #print (line)
                i = line.index(split_char)
                parts = [line[:i], line[i + 1:]]

            division = parts[0].strip()
            person = parts[1].strip()

            divs.append(division)
            people.append(
                Person.get_people(person,
                                  department=dept_name,
                                  division=division))

        return divs, people

    @staticmethod
    def contains_with_spaces(keyword, line):
        line = line.replace(' ', '')
        return keyword in line

    def get_prop(self, name):

        if len(self.rules[name][self.year]) == 1:
            return eval('self.f.' + self.rules[name][self.year][0])

        else:
            for rule in self.rules[name][self.year]:
                attempt = eval('self.f.' + rule)
                if attempt not in INVALID_VALS:
                    return attempt

        return "Not found"

    def __repr__(self):
        return "<Record, date=" + self.date + ">"