Ejemplo n.º 1
0
    def parse_transcript(self, data):
        """Yield the speeches contained in one HTML transcript.

        ``data['url']`` is only printed. ``data['text']`` is the parsed
        document: the element returned by ``find('div', 'texto')`` supplies
        the paragraphs, and the document is also used to create new ``<p>``
        tags (so it is presumably a BeautifulSoup object -- confirm against
        the caller).

        The first ``<p>`` of that element is always skipped. Of the remaining
        paragraphs, one containing ``.-`` starts a new speaker, one starting
        with ``--`` is narrative (a speech with no speaker), and anything
        else is appended to the current speech as a new paragraph.

        This is a generator. The current ``speech`` is yielded each time a
        new speaker or narrative line is met and once more at the end; since
        ``speech`` starts out as ``None``, the first value yielded is ``None``.

        Raises:
            Exception: if a line containing ``.-`` cannot be parsed as a
                speaker line, or if ordinary text is met before any speech
                has been started.
        """
        print("PARSING %s" % data['url'])

        text = data['text'].find('div', 'texto')

        # Sometimes, last paragraph of text isn't in a <p>: wrap every loose
        # string that follows a paragraph (up to the next <p>) in a new <p>,
        # inserted in order after that paragraph.
        for p in text('p')[1:]:
            if re.search(r'\S', unicode(p.next_sibling)):
                last_para = sib = p
                while sib.next_sibling and getattr(sib.next_sibling, 'name') != 'p':
                    sib = sib.next_sibling
                    # Siblings with a tag name are left where they are; only
                    # bare strings are wrapped.
                    if getattr(sib, 'name'):
                        continue
                    new_para = data['text'].new_tag('p')
                    new_para.string = sib.strip()
                    last_para.insert_after(new_para)
                    last_para = new_para

        speech = None
        for p in text('p')[1:]:

            line = p.text.strip()

            # New speaker
            if '.-' in line:
                yield speech
                m = re.match(
                    u'((?:La señora|El señor) [^ ]*)( \([^)]*\))?\.-(.*)',
                    line)
                if not m:
                    # Fail with the offending line rather than with an
                    # AttributeError on None.
                    raise Exception(
                        'Could not parse speaker - need to deal with "%s"' %
                        line.encode('utf-8'))
                speaker = self.fix_name(m.group(1))
                speaker_display = None
                if m.group(2):
                    # A bracket containing "(don" is made part of the speaker
                    # name itself; any other bracket only affects the
                    # displayed name.
                    if '(don' in m.group(2):
                        speaker += m.group(2)
                    else:
                        speaker_display = speaker + m.group(2)
                speech = Speech(
                    speaker=speaker,
                    text=m.group(3).strip(),
                    speaker_display=speaker_display,
                    )
                continue

            # Narrative
            if re.match('--', line):
                yield speech
                speech = Speech(speaker=None, text=line)
                continue

            # We must now already have a speech by the time we're here
            if not speech:
                raise Exception(
                    'Reached here without a speech - need to deal with "%s"' %
                    line.encode('utf-8'))

            speech.add_para(line)

        yield speech
Ejemplo n.º 2
0
def parse_transcript(text, url, para_min_indent=None):
    """Parse the numbered lines of one paginated transcript.

    This is a generator with two modes, selected by ``para_min_indent``:

    * falsy (the default): nothing is yielded for lines recognised as
      times, narrative, headings or new speakers; every other text line is
      yielded as a ``(page, line_number, line)`` tuple with the line-number
      prefix already stripped.
    * otherwise: ``para_min_indent[page]`` is the number of leading spaces
      that marks a new paragraph on that page, and the generator yields the
      current ``speech`` each time a new one starts and once at the end.
      ``speech`` starts as ``None``, so ``None`` can be yielded.

    Args:
        text: iterable of transcript lines. A line containing a form feed is
            a page break; every other non-blank line is expected to start
            with its line number on the page, and the 26th to be the page
            number (both checked with ``assert``).
        url: only used for the progress message.
        para_min_indent: see above; indexed by page number.

    Parsing stops at a line reading ``INDEX``. As side effects,
    ``Speech.reset(True)`` is called and ``Speech.current_time`` /
    ``Speech.current_section`` are updated as times and headings are met.
    """
    print("PARSING %s" % url)

    page, num = 1, 1

    indent = ' ' * 3
    speech = None
    Speech.reset(True)

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)

        # Strip the line number and the fixed indent that follows it
        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)

        # Okay, here we have a non-empty, non-page number, non-index line of just text

        num += 1

        # Empty line
        if re.match(r'\s*$', line):
            continue

        if re.match(' *DRAFT TRANSCRIPT$', line):
            continue

        if re.match(' *INDEX$', line):
            break

        # Date at start
        m = re.match(r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ (September|October|November|December|January|February|March|April|May|June|July) 201\d+$', line)
        if m:
            continue

        # Bracketed line: either a time or a narrative message
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        if m:
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                # Hours 13-19 are parsed with %H, everything else with %I
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line, time_format).time()
            except ValueError:
                # Not a time, so treat the bracketed text as narrative
                if para_min_indent:
                    yield speech
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech( speaker=None, text=line )
            continue

        # Headings
        m = re.match(r'Opening statement by [A-Z ]*$|Submissions? (in reply )?by [A-Z ]*(?: \(continued\))?$', line.strip())
        if m:
            Speech.current_section = Section( heading=string.capwords(line.strip()) )
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            if para_min_indent:
                yield speech
            speaker = fix_name(m.group(1))
            speech = Speech( speaker=speaker, text=m.group(2) )
            continue

        # Without indent information we cannot split paragraphs, so just
        # report the raw line and where it was found.
        if not para_min_indent:
            yield page, num-1, line
            continue

        # New paragraph if indented by at least this page's minimum indent
        m = re.match(' ' * para_min_indent[page], line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    if para_min_indent:
        yield speech
Ejemplo n.º 3
0
def parse_transcript(text, url):
    """Parse the numbered lines of one hearing transcript into speeches.

    This is a generator. It yields the current ``speech`` each time a new
    speech or narrative message starts and once more at the end; ``speech``
    starts as ``None``, so ``None`` can be yielded.

    Args:
        text: iterable of transcript lines. A line containing a form feed is
            a page break; every other non-blank line is expected to start
            with its line number on the page, and the 26th to be the page
            number (both checked with ``assert``). Everything from the index
            onwards is ignored.
        url: printed for progress, and matched against a number of
            hard-coded substrings to apply one-off fixes for particular
            transcripts. ``Speech.reset`` is called with ``'am' in url``.

    Side effects: ``Speech.current_time``, ``Speech.current_section`` and
    ``Speech.witness`` are updated as times, headings and witnesses are met.
    Questions (``Q.``) are attributed to the most recent interviewer and
    answers (``A.``) to ``Speech.witness``.
    """
    print("PARSING %s" % url)

    page, num = 1, 1
    if '2012-05-23pm' in url:
        # This transcript does not start again from page 1 unlike all the others
        page, num = 110, 7
    elif '2011-12-06am' in url:
        # This transcript we're ignoring a special correction, see below
        page, num = 1, 4

    indent = None
    first_indent = None
    interviewer = None
    state = 'text'
    speech = None
    Speech.reset('am' in url)

    if '2012-06-26pm' in url:
        time_format = '(%I.%M %p)'
        Speech.current_time = datetime.strptime('(2.00 pm)', time_format).time()

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            first_indent = None
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Start of index, ignore from then on
        if re.match(r' *\d+ +I ?N ?D ?E ?X$', line) or '...............' in line:
            state = 'index'
            continue
        if state == 'index':
            continue

        # Special case - ignore a one-off correction in this hearing
        if '2011-12-06am' in url and page == 1 and num <= 4 and not re.match(' +4', line):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)

        # Work out, from the first numbered line, how many spaces follow the
        # line number.
        # NOTE(review): if that line has none of these indents, `indent`
        # stays None and the text "None" is interpolated into the pattern
        # below.
        if not indent:
            left_space = len(line) - len(line.lstrip())
            if left_space == 1:
                indent = ' ' * 7
            if left_space == 13 or left_space == 11:
                indent = ' ' * 3

        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)
        num += 1

        # Okay, here we have a non-empty, non-page number, non-index line of just text

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Date at start
        m = re.match(r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ (September|October|November|December|January|February|March|April|May|June|July) 201[12]$', line)
        if m:
            continue

        # Second line of a multiline adjournment message, if it ends in ")"
        if state == 'adjournment':
            state = 'text'
            if re.match(r' *(.*)\)$', line):
                speech.add_text(line.strip())
                continue

        # Time/message about lunch/adjournments
        # (first repair a few times that are missing their closing bracket)
        if re.search(r'\(2.43$', line): line = '(2.43 pm)'
        line = re.sub(r'\((3.23 pm|3.07 pm|11.15 am)$', r'(\1)', line)
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        # Special case one line of normal text that would be caught
        if (m or '[Alarm sounded]' in line or 'Evidence by videolink' in line) \
          and 'published on the Hacked Off website at the time' not in line:
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                if '2011-11-16am' in url and line == '(12.30 am)':
                    line = '(12.30 pm)'
                # Hours 13-19 are parsed with %H, everything else with %I
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line, time_format).time()
            except (AttributeError, ValueError):
                # AttributeError: `m` is None (the line got here through one
                # of the literal-text tests). ValueError: the bracketed text
                # is not a time. Either way it is a narrative message.
                yield speech
                if 'Hearing in private' in line:
                    time_format = '(%I.%M %p)'
                    Speech.current_time = datetime.strptime('(10.30 am)', time_format).time()
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech( speaker=None, text=line )
            continue
        # Multiline message about adjournment
        m = re.match(r' *\(The (hearing|Inquiry|court) adjourned', line, re.I)
        if m:
            yield speech
            state = 'adjournment'
            speech = Speech( speaker=None, text=line.strip() )
            continue

        # Questions
        m = re.match(r'Further questions from ([A-Z ]*)$|Question(?:s|ed|) (?:from by|from|by) (.*?)(?: \(continued\))?$', line.strip())
        if m:
            interviewer = fix_name(m.group(1) or m.group(2))
            continue

        # Headings
        m = re.match('Reply to the Responses to his Application by [A-Z ]*$|Response to .* Application$|Directions [Hh]earing.*$|Application by [A-Z ]*$|Application to become a core participant$|Reading of evidence of [A-Z ]*$|RULING$|Ruling$|(Opening|Closing|Reply) submissions ((on|for) Module 3 )?by [A-Z ]*$|Statement by ([A-Z ]*|Lord Justice Leveson)$|Submissions? by ([A-Z ]*|Mr Garnham)$|Discussion$|Discussion re (procedure|timetable|administrative matters)$|Housekeeping$', line.strip())
        if m:
            Speech.current_section = Section( heading=string.capwords(line.strip()) )
            continue

        # Witness arriving
        m = re.match(r" *((?:[A-Z]|Mr)(?:[A-Z' ,-]|Mc|Mr|and)+?)\s+(\(.*\))$", line)
        if m:
            heading = fix_name(m.group(1))
            Speech.witness = heading
            if Speech.witness == 'Dr Gerald Patrick McCann and Dr Kate Marie McCann':
                Speech.witness = 'Mr McCann' # All the A.s are him
            if Speech.witness == 'Mr James Watson and Mrs Margaret Watson':
                Speech.witness = 'Mrs Watson' # All the A.s are her
            if Speech.witness == 'Mr Matthew Bell and Mr Christopher Johnson':
                # The one A. is actually him from the following session
                Speech.witness = 'Mr Piers Pughe-Morgan'
            narrative = '%s %s.' % (m.group(1), m.group(2))
            if state == 'witness':
                # Witness lines on consecutive text lines share one section
                # and one narrative speech.
                Speech.current_section.heading += ' / ' + Speech.witness
                speech.add_text( narrative )
            else:
                yield speech
                Speech.current_section = Section( heading=heading )
                speech = Speech( speaker=None, text=narrative )
                state = 'witness'
            continue
        else:
            state = 'text'

        # Question/answer (speaker from previous lines)
        if '2011-11-30am' in url and line == 'Q.':
            line = 'Q. From the police.'
        m = re.match(r'([QA])\. (.*)', line)
        if m:
            yield speech
            if m.group(1) == 'A':
                if '2011-12-08am' in url and not Speech.witness:
                    Speech.witness = 'Professor Steven Barnett'
                assert Speech.witness
                speaker = Speech.witness
            else:
                assert interviewer
                speaker = interviewer
            speech = Speech( speaker=speaker, text=m.group(2) )
            # Put the correction alongside the speech
            if '2011-12-06am' in url and m.group(2).strip() == 'Sort of blagging.  I mean, I was trying, as I said, not':
                speech.add_para(speech.text[0][0])
                speech.text[0][0] = "[Mr Leigh's references on page 72 below to \"arms company \" was a slip and Mr Leigh has subsequently corrected this to \"a construction company \" as he intended at the time.]"
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            yield speech
            speaker = fix_name(m.group(1))
            # The first named speaker becomes the interviewer until a
            # "Questions by" line says otherwise.
            if not interviewer:
                interviewer = speaker
            speech = Speech( speaker=speaker, text=m.group(2) )
            continue

        if '2011-10-04' in url or '2011-09-06' in url:
            # New paragraph if indent a bit more than 'usual' for the page
            left_space = len(line) - len(line.lstrip())
            if left_space == 5: left_space = 4
            if page == 113: # Special manual fix for this one
                first_indent = 3
            elif not first_indent:
                first_indent = left_space
            elif first_indent > left_space + 1:
                # The first line must have been a new paragraph. Adjust accordingly.
                first_indent = left_space
                if len(speech.text[-1])>1:
                    speech.add_para(speech.text[-1][-1])
                    del speech.text[-2][-1]
            m = re.match(' ' * (first_indent+2), line)
        else:
            # New paragraph if indent at least 8 spaces
            m = re.match('        ', line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    yield speech
Ejemplo n.º 4
0
    def parse_transcript(self, data):
        """Parse one paginated, line-numbered transcript into speeches.

        ``data['url']`` is only printed, ``data['date']`` is formatted as
        "Month D, YYYY" to recognise the page header, and ``data['text']`` is
        iterated through ``prevnext`` as (previous, current, next) lines.

        A line containing a form feed is a page break. The whole of page 1
        is ignored. On later pages the first remaining line must be the page
        number (checked with ``assert``), the line after it is ignored as a
        heading, and every line after that must start with its line number
        (also asserted). Parsing effectively stops at the certificate line:
        everything from there on is skipped.

        This is a generator. The current ``speech`` is yielded each time a
        narrative message or new speaker is met and once more at the end;
        ``speech`` starts as ``None``, so ``None`` can be yielded.

        Raises:
            Exception: if ordinary text is met before any speech has been
                started.
        """
        print("PARSING %s" % data['url'])

        page, num = 1, 1
        date = '{d:%B} {d.day}, {d:%Y}'.format(d=data['date'])

        speech = None
        state = 'text'
        Speech.reset(True)

        for prev_line, line, next_line in prevnext(data['text']):
            # Page break
            if '\014' in line:
                page += 1
                num = 0
                continue

            # This line was already consumed as the second half of a
            # two-line narrative message.
            if state == 'skip1':
                state = 'text'
                continue

            # Empty line, or line matching page footer
            if re.match(r'\s*$', line):
                continue
            if re.match(r' *(Strehlow & Associates, Inc.$|\(215\) 504-4622$|You created this PDF)',
                        line, re.I):
                continue
            # Line matching page header
            if re.match(' *(Stated Meeting|%s)$' % date, line):
                continue

            # Ignore title page for now
            if page == 1:
                continue

            # Start of certificate/index
            if re.match(
                    r' *(\d+ *)?(CERTIFICATE|C E R T I F I C A T I O N)$',
                    line):
                state = 'index'
            if state == 'index':
                continue

            # Each page starts with page number
            if num == 0:
                m = re.match(r' +(?:Page )?(\d+)$', line)
                assert int(m.group(1)) == page
                num += 1
                continue

            # Heading somewhere within this page, just ignore it
            if num == 1:
                num += 1
                continue

            # Let's check we haven't lost a line anywhere...
            assert re.match(' *%d(   |$)' % num, line), \
                '%s != %s' % (num, line)
            line = re.sub('^ *%d(   |$)' % num, '', line)
            num += 1

            # Ignore line containing only a number and then dashes
            if re.match(r'[\s-]*$', line):
                continue

            # Narrative messages
            m = re.match(r' +(\(.*\))$', line)
            if m:
                yield speech
                speech = Speech(speaker=None, text=line)
                continue
            # Narrative message split over this line and the next one
            m1 = re.match(r' +(\(.*)$', line)
            m2 = re.match(r' *\d+ +(.*\))$', next_line)
            if m1 and m2:
                yield speech
                speech = Speech(
                    speaker=None,
                    text='%s %s' % (m1.group(1), m2.group(1)),
                    )
                # Skip the next line, and count its line number now
                state = 'skip1'
                num += 1
                continue

            # Okay, here we have a non-empty, non-page number, non-narrative
            # line of just text

            # New speaker
            m = re.match(" *([A-Z '.]+):(?: (.*)|$)", line)
            if m:
                yield speech
                speaker = self.fix_name(m.group(1))
                text = m.group(2) or ''
                speech = Speech(speaker=speaker, text=text)
                continue

            # We must now already have a speech by the time we're here
            if not speech:
                raise Exception(
                    'Reached here without a speech - need to deal with "%s"' %
                    line)

            # Any remaining leading space marks a new paragraph
            if re.match(' ', line):
                speech.add_para(line.strip())
            else:
                speech.add_text(line.strip())

        yield speech
Ejemplo n.º 5
0
    def parse_transcript(self, data):
        """Parse one meeting transcript into speeches.

        ``data['url']`` is only printed, ``data['text']`` is iterated through
        ``prevnext`` as (previous, current, next) lines, and ``data['year']``
        completes session headings that carry no year. The ``months`` regex
        fragment comes from the enclosing module.

        Nothing is treated as speech until the heading "Transcript of (the)
        Federal Open Market Committee Meeting/Conference Call on/of" has been
        seen. Date and session headings update ``Speech.current_date`` and
        ``Speech.current_section``. A line consisting of a single digit
        starts a footnote, and the rest of that page is ignored.

        This is a generator. The current ``speech`` is yielded each time a
        bracketed message or new speaker is met and once more at the end;
        ``speech`` starts as ``None``, so ``None`` can be yielded.

        Raises:
            Exception: if ordinary text is met before any speech has been
                started, or if the opening heading is never found.
        """
        print("PARSING %s" % data['url'])

        speech = None
        new_page = False
        started = False
        ignore_rest_of_page = False
        Speech.reset(True)

        for prev_line, line, next_line in prevnext(data['text']):
            # Page break
            if '\014' in line:
                continue

            # Empty line
            if re.match(r'\s*$', line):
                continue

            # Page number
            m = re.match(
                r'%s (?:\d+|\d+...\d+), 200\d *(\d+) of (\d+)$' % months, line)
            if m:
                new_page = True
                ignore_rest_of_page = False
                continue

            # Message about lunch/adjournments
            m = re.match(r' {10,}(\[.*\])$', line)
            if m:
                yield speech
                speech = Speech(speaker=None, text=line)
                continue

            # Headings (a deeply indented line that is not a recognised
            # heading falls through and is handled as ordinary text)
            m = re.match(' {10,}(.*)$', line)
            if m:
                t = m.group(1).strip()
                if re.match(
                        'Transcript of (the )?Federal Open Market Committee (Meeting|Conference Call) (on|of)$',
                        t):
                    started = True
                    continue
                if re.match('END OF MEETING$', t) or re.match(
                        r'%s \d+-\d+, 200\d$' % months, t):
                    continue
                if re.match(r'%s \d+, 200\d$' % months, t):
                    Speech.current_date = datetime.datetime.strptime(
                        t, '%B %d, %Y')
                    continue
                m = re.match(
                    r'(?P<d>%s \d+(?:, 200\d)?)...(?P<s>(?:Morning|Afternoon) Session)'
                    % months, t)
                if not m:
                    m = re.match(
                        r'(?P<s>Afternoon Session)...(?P<d>%s \d+, 200\d)' %
                        months, t)
                if m:
                    d, s = m.group('d'), m.group('s')
                    if '200' not in d:
                        d = '%s, %d' % (d, data['year'])
                    # One-off correction of the date given in this heading
                    if d == 'December 15, 2008' and s == 'Morning Session':
                        d = 'December 16, 2008'
                    Speech.current_date = datetime.datetime.strptime(
                        d, '%B %d, %Y')
                    Speech.current_section = Section(heading='%s, %s' % (s, d))
                    continue

            if not started:
                continue

            # footnote (always just "see materials")
            if re.match('[0-9]$', line):
                ignore_rest_of_page = True
            if ignore_rest_of_page:
                continue

            # Okay, here we have a non-empty, non-page number, non-heading line of just text

            # New speaker
            m = re.match(
                ' *((?:M[RS][. ]+|CHAIRMAN |VICE CHAIRMAN )[A-Z]+|PARTICIPANTS)[.:]? ?[0-9]? ?(.*)',
                line)
            if m:
                yield speech
                new_page = False
                speaker, speaker_display = self.fix_name(m.group(1))
                speech = Speech(speaker=speaker,
                                text=m.group(2),
                                speaker_display=speaker_display)
                continue

            # We must now already have a speech by the time we're here
            if not speech:
                raise Exception(
                    'Reached here without a speech - need to deal with "%s"' %
                    line.strip())

            if re.match(r'\s*$', prev_line):
                if new_page:
                    new_page = False
                    # This line could be a continuation or a new paragraph,
                    # we're not sure as it's a new page. If the next line has
                    # the same indentation, assume it's a continuation,
                    # otherwise a new paragraph.
                    left_space = len(line) - len(line.lstrip())
                    left_space_next = len(next_line) - len(next_line.lstrip())
                    if left_space != left_space_next:
                        speech.add_para(line.strip())
                    else:
                        speech.add_text(line.strip())
                else:
                    # The previous line was blank. If the line is indented, it
                    # must be a new paragraph, in both single and double spaced
                    # text. If it's not, it must be a continuation in double
                    # spaced text.
                    if re.match(' ', line):
                        speech.add_para(line.strip())
                    else:
                        speech.add_text(line.strip())
            else:
                # If the last line wasn't blank, we're in single spaced text
                # and it must be a continuation.
                # Lines ending in one of these answers get a leading <br/>.
                if re.search(
                        r'  (Yes|No|With some reluctance, I will vote yes\.)$',
                        line):
                    line = '<br/>' + line.strip()
                speech.add_text(line.strip())
                new_page = False

        if not started:
            raise Exception('Never found the heading to begin')

        yield speech
Ejemplo n.º 6
0
def parse_transcript(text, url):
    """Parse the numbered lines of one hearing transcript into speeches.

    This is a generator. It yields the current ``speech`` each time a new
    speech or narrative message starts and once more at the end; ``speech``
    starts as ``None``, so ``None`` can be yielded.

    Args:
        text: iterable of transcript lines. A line containing a form feed is
            a page break; every other non-blank line is expected to start
            with its line number on the page, and the 26th to be the page
            number (both checked with ``assert``). Everything from the index
            onwards is ignored.
        url: printed for progress, and matched against a number of
            hard-coded substrings to apply one-off fixes for particular
            transcripts. ``Speech.reset`` is called with ``'am' in url``.

    Side effects: ``Speech.current_time``, ``Speech.current_section`` and
    ``Speech.witness`` are updated as times, headings and witnesses are met.
    Questions (``Q.``) are attributed to the most recent interviewer and
    answers (``A.``) to ``Speech.witness``.
    """
    print("PARSING %s" % url)

    page, num = 1, 1
    if '2012-05-23pm' in url:
        # This transcript does not start again from page 1 unlike all the others
        page, num = 110, 7
    elif '2011-12-06am' in url:
        # This transcript we're ignoring a special correction, see below
        page, num = 1, 4

    indent = None
    first_indent = None
    interviewer = None
    state = 'text'
    speech = None
    Speech.reset('am' in url)

    if '2012-06-26pm' in url:
        time_format = '(%I.%M %p)'
        Speech.current_time = datetime.strptime('(2.00 pm)',
                                                time_format).time()

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            first_indent = None
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Start of index, ignore from then on
        if re.match(r' *\d+ +I ?N ?D ?E ?X$',
                    line) or '...............' in line:
            state = 'index'
            continue
        if state == 'index':
            continue

        # Special case - ignore a one-off correction in this hearing
        if '2011-12-06am' in url and page == 1 and num <= 4 and not re.match(
                ' +4', line):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)

        # Work out, from the first numbered line, how many spaces follow the
        # line number.
        # NOTE(review): if that line has none of these indents, `indent`
        # stays None and the text "None" is interpolated into the pattern
        # below.
        if not indent:
            left_space = len(line) - len(line.lstrip())
            if left_space == 1:
                indent = ' ' * 7
            if left_space == 13 or left_space == 11:
                indent = ' ' * 3

        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)
        num += 1

        # Okay, here we have a non-empty, non-page number, non-index line of just text

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Date at start
        m = re.match(
            r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ (September|October|November|December|January|February|March|April|May|June|July) 201[12]$',
            line)
        if m:
            continue

        # Second line of a multiline adjournment message, if it ends in ")"
        if state == 'adjournment':
            state = 'text'
            if re.match(r' *(.*)\)$', line):
                speech.add_text(line.strip())
                continue

        # Time/message about lunch/adjournments
        # (first repair a few times that are missing their closing bracket)
        if re.search(r'\(2.43$', line): line = '(2.43 pm)'
        line = re.sub(r'\((3.23 pm|3.07 pm|11.15 am)$', r'(\1)', line)
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        # Special case one line of normal text that would be caught
        if (m or '[Alarm sounded]' in line or 'Evidence by videolink' in line) \
          and 'published on the Hacked Off website at the time' not in line:
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                if '2011-11-16am' in url and line == '(12.30 am)':
                    line = '(12.30 pm)'
                # Hours 13-19 are parsed with %H, everything else with %I
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line,
                                                        time_format).time()
            except (AttributeError, ValueError):
                # AttributeError: `m` is None (the line got here through one
                # of the literal-text tests). ValueError: the bracketed text
                # is not a time. Either way it is a narrative message.
                yield speech
                if 'Hearing in private' in line:
                    time_format = '(%I.%M %p)'
                    Speech.current_time = datetime.strptime(
                        '(10.30 am)', time_format).time()
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech(speaker=None, text=line)
            continue
        # Multiline message about adjournment
        m = re.match(r' *\(The (hearing|Inquiry|court) adjourned', line, re.I)
        if m:
            yield speech
            state = 'adjournment'
            speech = Speech(speaker=None, text=line.strip())
            continue

        # Questions
        m = re.match(
            r'Further questions from ([A-Z ]*)$|Question(?:s|ed|) (?:from by|from|by) (.*?)(?: \(continued\))?$',
            line.strip())
        if m:
            interviewer = fix_name(m.group(1) or m.group(2))
            continue

        # Headings
        m = re.match(
            'Reply to the Responses to his Application by [A-Z ]*$|Response to .* Application$|Directions [Hh]earing.*$|Application by [A-Z ]*$|Application to become a core participant$|Reading of evidence of [A-Z ]*$|RULING$|Ruling$|(Opening|Closing|Reply) submissions ((on|for) Module 3 )?by [A-Z ]*$|Statement by ([A-Z ]*|Lord Justice Leveson)$|Submissions? by ([A-Z ]*|Mr Garnham)$|Discussion$|Discussion re (procedure|timetable|administrative matters)$|Housekeeping$',
            line.strip())
        if m:
            Speech.current_section = Section(
                heading=string.capwords(line.strip()))
            continue

        # Witness arriving
        m = re.match(r" *((?:[A-Z]|Mr)(?:[A-Z' ,-]|Mc|Mr|and)+?)\s+(\(.*\))$",
                     line)
        if m:
            heading = fix_name(m.group(1))
            Speech.witness = heading
            if Speech.witness == 'Dr Gerald Patrick McCann and Dr Kate Marie McCann':
                Speech.witness = 'Mr McCann'  # All the A.s are him
            if Speech.witness == 'Mr James Watson and Mrs Margaret Watson':
                Speech.witness = 'Mrs Watson'  # All the A.s are her
            if Speech.witness == 'Mr Matthew Bell and Mr Christopher Johnson':
                # The one A. is actually him from the following session
                Speech.witness = 'Mr Piers Pughe-Morgan'
            narrative = '%s %s.' % (m.group(1), m.group(2))
            if state == 'witness':
                # Witness lines on consecutive text lines share one section
                # and one narrative speech.
                Speech.current_section.heading += ' / ' + Speech.witness
                speech.add_text(narrative)
            else:
                yield speech
                Speech.current_section = Section(heading=heading)
                speech = Speech(speaker=None, text=narrative)
                state = 'witness'
            continue
        else:
            state = 'text'

        # Question/answer (speaker from previous lines)
        if '2011-11-30am' in url and line == 'Q.':
            line = 'Q. From the police.'
        m = re.match(r'([QA])\. (.*)', line)
        if m:
            yield speech
            if m.group(1) == 'A':
                if '2011-12-08am' in url and not Speech.witness:
                    Speech.witness = 'Professor Steven Barnett'
                assert Speech.witness
                speaker = Speech.witness
            else:
                assert interviewer
                speaker = interviewer
            speech = Speech(speaker=speaker, text=m.group(2))
            # Put the correction alongside the speech
            if ('2011-12-06am' in url and m.group(2).strip() ==
                    'Sort of blagging.  I mean, I was trying, as I said, not'):
                speech.add_para(speech.text[0][0])
                speech.text[0][0] = "[Mr Leigh's references on page 72 below to \"arms company \" was a slip and Mr Leigh has subsequently corrected this to \"a construction company \" as he intended at the time.]"
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            yield speech
            speaker = fix_name(m.group(1))
            # The first named speaker becomes the interviewer until a
            # "Questions by" line says otherwise.
            if not interviewer:
                interviewer = speaker
            speech = Speech(speaker=speaker, text=m.group(2))
            continue

        if '2011-10-04' in url or '2011-09-06' in url:
            # New paragraph if indent a bit more than 'usual' for the page
            left_space = len(line) - len(line.lstrip())
            if left_space == 5: left_space = 4
            if page == 113:  # Special manual fix for this one
                first_indent = 3
            elif not first_indent:
                first_indent = left_space
            elif first_indent > left_space + 1:
                # The first line must have been a new paragraph. Adjust accordingly.
                first_indent = left_space
                if len(speech.text[-1]) > 1:
                    speech.add_para(speech.text[-1][-1])
                    del speech.text[-2][-1]
            m = re.match(' ' * (first_indent + 2), line)
        else:
            # New paragraph if indent at least 8 spaces
            m = re.match('        ', line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    yield speech
Ejemplo n.º 7
0
    def parse_transcript(self, data):
        """Parse a paginated, line-numbered transcript into speeches.

        This is a generator. ``data`` is a mapping that must provide:

        * ``data['url']``  - only used for the progress message;
        * ``data['date']`` - a date-like object, formatted as
          "Month D, YYYY" to recognise the per-page date header;
        * ``data['text']`` - the transcript lines, iterated via ``prevnext``
          (defined elsewhere; presumably it yields (previous, current,
          next) line triples - confirm against its definition).

        The current speech is yielded each time a new speaker or narrative
        line starts, and once more at the end of the input. ``speech``
        starts out as ``None``, so the value yielded before the first
        speech has been created is ``None``.

        Raises:
            AssertionError: if the first line after a page break is not the
                expected page number, or a text line does not begin with
                the expected line number.
            Exception: if ordinary text is met before any speaker line.
        """
        print("PARSING %s" % data['url'])

        # `page` counts form feeds seen so far (plus one); `num` is the line
        # number expected at the start of the next numbered line on the page.
        page, num = 1, 1
        date = '{d:%B} {d.day}, {d:%Y}'.format(d=data['date'])

        speech = None
        state = 'text'
        Speech.reset(True)

        for prev_line, line, next_line in prevnext(data['text']):
            # Page break
            if '\014' in line:
                page += 1
                num = 0
                continue

            # The previous line already consumed this one (two-line narrative)
            if state == 'skip1':
                state = 'text'
                continue

            # Empty line, or line matching page footer
            if re.match(r'\s*$', line):
                continue
            # Case-insensitive match; the flag is passed as an argument because
            # an inline "(?i)" that is not at the start of the pattern is
            # rejected by newer versions of the re module.
            if re.match(
                    r' *(Strehlow & Associates, Inc.$|\(215\) 504-4622$|You created this PDF)',
                    line, re.I):
                continue
            # Line matching page header
            if re.match(' *(Stated Meeting|%s)$' % date, line):
                continue

            # Ignore title page for now
            if page == 1:
                continue

            # Start of certificate/index; everything from here on is skipped
            if re.match(r' *(\d+ *)?(CERTIFICATE|C E R T I F I C A T I O N)$',
                        line):
                state = 'index'
            if state == 'index':
                continue

            # Each page starts with page number
            if num == 0:
                m = re.match(r' +(?:Page )?(\d+)$', line)
                # Check the match itself too, so a malformed line fails with
                # a readable assertion instead of an AttributeError on None.
                assert m and int(m.group(1)) == page, \
                    'Expected page number %s, got "%s"' % (page, line)
                num += 1
                continue

            # Heading somewhere within this page, just ignore it
            if num == 1:
                num += 1
                continue

            # Let's check we haven't lost a line anywhere...
            assert re.match(r' *%d(   |$)' % num, line), \
                '%s != %s' % (num, line)
            line = re.sub(r'^ *%d(   |$)' % num, '', line)
            num += 1

            # Ignore line containing only a number and then dashes
            if re.match(r'[\s-]*$', line):
                continue

            # Narrative messages: a whole "(...)" on one line
            m = re.match(r' +(\(.*\))$', line)
            if m:
                yield speech
                speech = Speech(speaker=None, text=line)
                continue

            # Narrative message split over two lines: "(..." here and "...)"
            # on the next (still line-numbered) line. The next line is only
            # inspected when this one opens a bracket, and only if there is
            # a next line at all.
            m1 = re.match(r' +(\(.*)$', line)
            m2 = None
            if m1 and next_line:
                m2 = re.match(r' *\d+ +(.*\))$', next_line)
            if m2:
                yield speech
                speech = Speech(
                    speaker=None,
                    text='%s %s' % (m1.group(1), m2.group(1)),
                )
                # Skip the next line and account for its line number
                state = 'skip1'
                num += 1
                continue

            # Okay, here we have a non-empty, non-page number, non-narrative
            # line of just text

            # New speaker
            m = re.match(" *([A-Z '.]+):(?: (.*)|$)", line)
            if m:
                yield speech
                speaker = self.fix_name(m.group(1))
                text = m.group(2) or ''
                speech = Speech(speaker=speaker, text=text)
                continue

            # We must now already have a speech by the time we're here
            if not speech:
                raise Exception(
                    'Reached here without a speech - need to deal with "%s"' %
                    line)

            # A line still indented after its number was removed starts a new
            # paragraph; anything else continues the current one.
            if re.match(' ', line):
                speech.add_para(line.strip())
            else:
                speech.add_text(line.strip())

        yield speech
Ejemplo n.º 8
0
    def parse_transcript(self, data):
        """Parse a meeting transcript into speeches.

        This is a generator. ``data`` is a mapping that must provide:

        * ``data["url"]``  - only used for the progress message;
        * ``data["text"]`` - the transcript lines, iterated via ``prevnext``
          (defined elsewhere; presumably it yields (previous, current,
          next) line triples - confirm against its definition);
        * ``data["year"]`` - an integer year, appended to session headings
          whose date does not include one.

        Nothing is treated as speech until the "Transcript of ... Federal
        Open Market Committee ..." heading has been seen. Date and session
        headings update ``Speech.current_date`` / ``Speech.current_section``
        as a side effect. The current speech is yielded each time a new
        speaker or bracketed message starts, and once more at the end.
        ``speech`` starts out as ``None``, so the value yielded before the
        first speech has been created is ``None``.

        Raises:
            Exception: if ordinary text is met before any speaker line, or
                if the opening heading is never found.
        """
        print("PARSING %s" % data["url"])

        speech = None
        new_page = False
        started = False
        ignore_rest_of_page = False
        Speech.reset(True)

        for prev_line, line, next_line in prevnext(data["text"]):
            # Page break
            if "\014" in line:
                continue

            # Empty line
            if re.match(r"\s*$", line):
                continue

            # Page number ("<date> N of M"); marks the start of a new page
            m = re.match(r"%s (?:\d+|\d+...\d+), 200\d *(\d+) of (\d+)$" % months, line)
            if m:
                new_page = True
                ignore_rest_of_page = False
                continue

            # Message about lunch/adjournments
            m = re.match(r" {10,}(\[.*\])$", line)
            if m:
                yield speech
                speech = Speech(speaker=None, text=line)
                continue

            # Headings (lines indented by at least ten spaces). A heading that
            # matches none of the patterns below falls through and is handled
            # like any other line.
            m = re.match(" {10,}(.*)$", line)
            if m:
                t = m.group(1).strip()
                if re.match("Transcript of (the )?Federal Open Market Committee (Meeting|Conference Call) (on|of)$", t):
                    started = True
                    continue
                if re.match("END OF MEETING$", t) or re.match(r"%s \d+-\d+, 200\d$" % months, t):
                    continue
                if re.match(r"%s \d+, 200\d$" % months, t):
                    Speech.current_date = datetime.datetime.strptime(t, "%B %d, %Y")
                    continue
                m = re.match(r"(?P<d>%s \d+(?:, 200\d)?)...(?P<s>(?:Morning|Afternoon) Session)" % months, t)
                if not m:
                    m = re.match(r"(?P<s>Afternoon Session)...(?P<d>%s \d+, 200\d)" % months, t)
                if m:
                    d, s = m.group("d"), m.group("s")
                    if "200" not in d:
                        d = "%s, %d" % (d, data["year"])
                    # Manual correction: this morning session is recorded a
                    # day later than its heading says (presumably an error
                    # in the source document - confirm).
                    if d == "December 15, 2008" and s == "Morning Session":
                        d = "December 16, 2008"
                    Speech.current_date = datetime.datetime.strptime(d, "%B %d, %Y")
                    Speech.current_section = Section(heading="%s, %s" % (s, d))
                    continue

            if not started:
                continue

            # footnote (always just "see materials"): skip until the next
            # page-number line resets the flag
            if re.match("[0-9]$", line):
                ignore_rest_of_page = True
            if ignore_rest_of_page:
                continue

            # Okay, here we have a non-empty, non-page number, non-heading line of just text

            # New speaker
            m = re.match(" *((?:M[RS][. ]+|CHAIRMAN |VICE CHAIRMAN )[A-Z]+|PARTICIPANTS)[.:]? ?[0-9]? ?(.*)", line)
            if m:
                yield speech
                new_page = False
                speaker, speaker_display = self.fix_name(m.group(1))
                speech = Speech(speaker=speaker, text=m.group(2), speaker_display=speaker_display)
                continue

            # We must now already have a speech by the time we're here
            if not speech:
                raise Exception('Reached here without a speech - need to deal with "%s"' % line.strip())

            if re.match(r"\s*$", prev_line):
                if new_page:
                    new_page = False
                    # This line could be a continuation or a new paragraph,
                    # we're not sure as it's a new page. If the next line has
                    # the same indentation, assume it's a continuation,
                    # otherwise a new paragraph.
                    left_space = len(line) - len(line.lstrip())
                    # Treat a missing next line (end of input) as an empty
                    # one rather than failing on None.
                    following = next_line or ""
                    left_space_next = len(following) - len(following.lstrip())
                    if left_space != left_space_next:
                        speech.add_para(line.strip())
                    else:
                        speech.add_text(line.strip())
                else:
                    # The previous line was blank. If the line is indented, it
                    # must be a new paragraph, in both single and double spaced
                    # text. If it's not, it must be a continuation in double
                    # spaced text.
                    if re.match(" ", line):
                        speech.add_para(line.strip())
                    else:
                        speech.add_text(line.strip())
            else:
                # If the last line wasn't blank, we're in single spaced text
                # and it must be a continuation.
                # Lines ending in one of these answers get a line break
                # inserted in front of them.
                if re.search(r"  (Yes|No|With some reluctance, I will vote yes\.)$", line):
                    line = "<br/>" + line.strip()
                speech.add_text(line.strip())
                new_page = False

        if not started:
            raise Exception("Never found the heading to begin")

        yield speech