def parse_transcript(self, data):
    """Yield the speeches found in an HTML transcript.

    ``data['url']`` is only printed for progress reporting. ``data['text']``
    is a parsed document that provides ``find`` and ``new_tag``; the
    transcript body is the ``div`` with class ``texto`` inside it.

    This is a generator. The speech collected so far is yielded every time
    a new speaker or a narrative line starts, and once more at the end.
    ``speech`` starts as ``None``, so the first value yielded can be
    ``None``.

    Raises:
        Exception: if a line contains ``.-`` but does not have the expected
            speaker prefix, or if ordinary text is reached before any
            speech has been started.
    """
    print("PARSING %s" % data['url'])

    text = data['text'].find('div', 'texto')

    # Sometimes, last paragraph of text isn't in a <p>
    for p in text('p')[1:]:
        # u'%s' % (x,) is the unicode()/str() of x on both Python 2 and 3.
        if re.search(r'\S', u'%s' % (p.next_sibling,)):
            last_para = sib = p
            while sib.next_sibling and getattr(sib.next_sibling, 'name') != 'p':
                sib = sib.next_sibling
                # Only bare text nodes (no tag name) are wrapped in a new <p>.
                if getattr(sib, 'name'):
                    continue
                new_para = data['text'].new_tag('p')
                new_para.string = sib.strip()
                last_para.insert_after(new_para)
                last_para = new_para

    speech = None
    for p in text('p')[1:]:
        line = p.text.strip()

        # New speaker
        if '.-' in line:
            yield speech
            m = re.match(
                u'((?:La señora|El señor) [^ ]*)( \\([^)]*\\))?\\.-(.*)', line)
            if not m:
                # Fail with the offending line rather than an opaque
                # AttributeError on m.group().
                raise Exception(
                    'Could not parse speaker line - need to deal with "%s"'
                    % line.encode('utf-8'))
            speaker = self.fix_name(m.group(1))
            speaker_display = None
            if m.group(2):
                if '(don' in m.group(2):
                    # The bracket is treated as part of the name itself.
                    speaker += m.group(2)
                else:
                    # Otherwise the bracket only affects the display name.
                    speaker_display = speaker + m.group(2)
            speech = Speech(
                speaker=speaker,
                text=m.group(3).strip(),
                speaker_display=speaker_display,
            )
            continue

        # Narrative
        if re.match('--', line):
            yield speech
            speech = Speech(speaker=None, text=line)
            continue

        # We must now already have a speech by the time we're here
        if not speech:
            raise Exception(
                'Reached here without a speech - need to deal with "%s"'
                % line.encode('utf-8'))

        speech.add_para(line)

    yield speech
def parse_transcript(text, url, para_min_indent=None):
    """Parse a page/line-numbered plain-text transcript.

    ``text`` is an iterable of lines. Every content line is expected to
    start with its line number, and after 25 numbered lines a line holding
    only the page number is expected; both are checked with ``assert``.
    ``url`` is only printed for progress reporting.

    This is a generator with two modes:

    * ``para_min_indent`` falsy: speeches are not yielded. Instead a
      ``(page, line_number, line)`` tuple is yielded for every ordinary
      text line that reaches the end of the loop.
    * ``para_min_indent`` given: it is indexed by page number and gives
      the number of leading spaces that marks a new paragraph on that
      page. The speech collected so far is yielded whenever a new speaker
      or a bracketed message starts, and once more at the end. ``speech``
      starts as ``None``, so the first value yielded can be ``None``.

    Side effects: calls ``Speech.reset(True)`` and updates
    ``Speech.current_time`` and ``Speech.current_section`` as times and
    headings are met.
    """
    print("PARSING %s" % url)

    page, num = 1, 1
    indent = ' ' * 3
    speech = None
    Speech.reset(True)

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)
        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)

        # Okay, here we have a non-empty, non-page number, non-index line
        # of just text
        num += 1

        # Empty line
        if re.match(r'\s*$', line):
            continue

        if re.match(' *DRAFT TRANSCRIPT$', line):
            continue

        if re.match(' *INDEX$', line):
            break

        # Date at start
        m = re.match(
            r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ '
            r'(September|October|November|December|January|February|March|April|May|June|July) '
            r'201\d+$',
            line)
        if m:
            continue

        # Bracketed line: either a time stamp or a narrative message
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        if m:
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line, time_format).time()
            except ValueError:
                # Not a time stamp, so treat it as a narrative message.
                if para_min_indent:
                    yield speech
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech(speaker=None, text=line)
            continue

        # Headings
        m = re.match(
            r'Opening statement by [A-Z ]*$'
            r'|Submissions? (in reply )?by [A-Z ]*(?: \(continued\))?$',
            line.strip())
        if m:
            Speech.current_section = Section(heading=string.capwords(line.strip()))
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            if para_min_indent:
                yield speech
            speaker = fix_name(m.group(1))
            speech = Speech(speaker=speaker, text=m.group(2))
            continue

        if not para_min_indent:
            # No per-page indents supplied: report the raw line instead.
            yield page, num - 1, line
            continue

        # New paragraph if indented at least para_min_indent[page] spaces
        m = re.match(' ' * para_min_indent[page], line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    if para_min_indent:
        yield speech
def parse_transcript(text, url):
    """Parse a page/line-numbered hearing transcript into speeches.

    ``text`` is an iterable of lines. Every content line is expected to
    start with its line number, and after 25 numbered lines a line holding
    only the page number is expected; both are checked with ``assert``.
    ``url`` is printed, and is also matched against several hard-coded
    hearing identifiers to apply one-off corrections.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker, question/answer, witness or narrative message starts, and
    once more at the end. ``speech`` starts as ``None``, so the first
    value yielded can be ``None``.

    Side effects: calls ``Speech.reset('am' in url)`` and updates
    ``Speech.current_time``, ``Speech.current_section`` and
    ``Speech.witness`` while parsing.
    """
    print("PARSING %s" % url)

    page, num = 1, 1
    if '2012-05-23pm' in url:
        # This transcript does not start again from page 1 unlike all the others
        page, num = 110, 7
    elif '2011-12-06am' in url:
        # This transcript we're ignoring a special correction, see below
        page, num = 1, 4

    indent = None
    first_indent = None
    interviewer = None
    state = 'text'
    speech = None
    Speech.reset('am' in url)

    if '2012-06-26pm' in url:
        time_format = '(%I.%M %p)'
        Speech.current_time = datetime.strptime('(2.00 pm)', time_format).time()

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            first_indent = None
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Start of index, ignore from then on
        if re.match(r' *\d+ +I ?N ?D ?E ?X$', line) or '...............' in line:
            state = 'index'
            continue
        if state == 'index':
            continue

        # Special case - ignore a one-off correction in this hearing
        if ('2011-12-06am' in url and page == 1 and num <= 4
                and not re.match(' +4', line)):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)

        # Work out, once, the gap between the line number and the text
        # from the leading space of the first numbered line seen.
        if not indent:
            left_space = len(line) - len(line.lstrip())
            if left_space == 1:
                indent = ' ' * 7
            if left_space == 13 or left_space == 11:
                indent = ' ' * 3

        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)
        num += 1

        # Okay, here we have a non-empty, non-page number, non-index line
        # of just text

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Date at start
        m = re.match(
            r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ '
            r'(September|October|November|December|January|February|March|April|May|June|July) '
            r'201[12]$',
            line)
        if m:
            continue

        # Second line of a multiline adjournment message
        if state == 'adjournment':
            state = 'text'
            if re.match(r' *(.*)\)$', line):
                speech.add_text(line.strip())
                continue

        # Time/message about lunch/adjournments
        if re.search(r'\(2.43$', line):
            line = '(2.43 pm)'
        line = re.sub(r'\((3.23 pm|3.07 pm|11.15 am)$', r'(\1)', line)
        # Special case one line of normal text that would be caught
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        if ((m or '[Alarm sounded]' in line or 'Evidence by videolink' in line)
                and 'published on the Hacked Off website at the time' not in line):
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                if '2011-11-16am' in url and line == '(12.30 am)':
                    line = '(12.30 pm)'
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line, time_format).time()
            except (AttributeError, ValueError):
                # AttributeError: m is None (the line was let in by one of
                # the substring checks). ValueError: the bracket is not a
                # time. Either way this is a narrative message.
                yield speech
                if 'Hearing in private' in line:
                    time_format = '(%I.%M %p)'
                    Speech.current_time = datetime.strptime(
                        '(10.30 am)', time_format).time()
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech(speaker=None, text=line)
            continue

        # Multiline message about adjournment
        # (re.I replaces a trailing "(?i)", which Python 3.11+ rejects)
        m = re.match(r' *\(The (hearing|Inquiry|court) adjourned', line, re.I)
        if m:
            yield speech
            state = 'adjournment'
            speech = Speech(speaker=None, text=line.strip())
            continue

        # Questions
        m = re.match(
            r'Further questions from ([A-Z ]*)$'
            r'|Question(?:s|ed|) (?:from by|from|by) (.*?)(?: \(continued\))?$',
            line.strip())
        if m:
            interviewer = fix_name(m.group(1) or m.group(2))
            continue

        # Headings
        m = re.match(
            r'Reply to the Responses to his Application by [A-Z ]*$'
            r'|Response to .* Application$'
            r'|Directions [Hh]earing.*$'
            r'|Application by [A-Z ]*$'
            r'|Application to become a core participant$'
            r'|Reading of evidence of [A-Z ]*$'
            r'|RULING$'
            r'|Ruling$'
            r'|(Opening|Closing|Reply) submissions ((on|for) Module 3 )?by [A-Z ]*$'
            r'|Statement by ([A-Z ]*|Lord Justice Leveson)$'
            r'|Submissions? by ([A-Z ]*|Mr Garnham)$'
            r'|Discussion$'
            r'|Discussion re (procedure|timetable|administrative matters)$'
            r'|Housekeeping$',
            line.strip())
        if m:
            Speech.current_section = Section(heading=string.capwords(line.strip()))
            continue

        # Witness arriving
        m = re.match(r" *((?:[A-Z]|Mr)(?:[A-Z' ,-]|Mc|Mr|and)+?)\s+(\(.*\))$", line)
        if m:
            heading = fix_name(m.group(1))
            Speech.witness = heading
            if Speech.witness == 'Dr Gerald Patrick McCann and Dr Kate Marie McCann':
                Speech.witness = 'Mr McCann'  # All the A.s are him
            if Speech.witness == 'Mr James Watson and Mrs Margaret Watson':
                Speech.witness = 'Mrs Watson'  # All the A.s are her
            if Speech.witness == 'Mr Matthew Bell and Mr Christopher Johnson':
                # The one A. is actually him from the following session
                Speech.witness = 'Mr Piers Pughe-Morgan'
            narrative = '%s %s.' % (m.group(1), m.group(2))
            if state == 'witness':
                # Consecutive witness lines share one section and one speech.
                Speech.current_section.heading += ' / ' + Speech.witness
                speech.add_text(narrative)
            else:
                yield speech
                Speech.current_section = Section(heading=heading)
                speech = Speech(speaker=None, text=narrative)
            state = 'witness'
            continue
        else:
            state = 'text'

        # Question/answer (speaker from previous lines)
        if '2011-11-30am' in url and line == 'Q.':
            line = 'Q. From the police.'
        m = re.match(r'([QA])\. (.*)', line)
        if m:
            yield speech
            if m.group(1) == 'A':
                if '2011-12-08am' in url and not Speech.witness:
                    Speech.witness = 'Professor Steven Barnett'
                assert Speech.witness
                speaker = Speech.witness
            else:
                assert interviewer
                speaker = interviewer
            speech = Speech(speaker=speaker, text=m.group(2))
            # Put the correction alongside the speech
            if ('2011-12-06am' in url and m.group(2).strip() ==
                    'Sort of blagging. I mean, I was trying, as I said, not'):
                speech.add_para(speech.text[0][0])
                speech.text[0][0] = "[Mr Leigh's references on page 72 below to \"arms company \" was a slip and Mr Leigh has subsequently corrected this to \"a construction company \" as he intended at the time.]"
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            yield speech
            speaker = fix_name(m.group(1))
            if not interviewer:
                interviewer = speaker
            speech = Speech(speaker=speaker, text=m.group(2))
            continue

        if '2011-10-04' in url or '2011-09-06' in url:
            # New paragraph if indent a bit more than 'usual' for the page
            left_space = len(line) - len(line.lstrip())
            if left_space == 5:
                left_space = 4
            if page == 113:
                # Special manual fix for this one
                first_indent = 3
            elif not first_indent:
                first_indent = left_space
            elif first_indent > left_space + 1:
                # The first line must have been a new paragraph. Adjust accordingly.
                first_indent = left_space
                if len(speech.text[-1]) > 1:
                    speech.add_para(speech.text[-1][-1])
                    del speech.text[-2][-1]
            m = re.match(' ' * (first_indent + 2), line)
        else:
            # New paragraph if indent at least 8 spaces
            # NOTE(review): the pattern as seen matches a single leading
            # space, not 8 - confirm the literal's whitespace is intact.
            m = re.match(' ', line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    yield speech
def parse_transcript(self, data):
    """Parse a page/line-numbered meeting transcript into speeches.

    ``data['url']`` is printed, ``data['date']`` is formatted to recognise
    the page header line, and ``data['text']`` is the iterable of lines,
    walked with ``prevnext`` so the following line is available.

    Everything on page 1 is skipped, and so is everything from the
    certificate/index heading onwards. On later pages the first line must
    be the page number and every content line must start with its line
    number; both are checked with ``assert``.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker or a bracketed narrative message starts, and once more at
    the end. ``speech`` starts as ``None``, so the first value yielded can
    be ``None``.

    Raises:
        Exception: if ordinary text is reached before any speech has been
            started.
    """
    print("PARSING %s" % data['url'])

    page, num = 1, 1
    date = '{d:%B} {d.day}, {d:%Y}'.format(d=data['date'])

    speech = None
    state = 'text'
    Speech.reset(True)

    for prev_line, line, next_line in prevnext(data['text']):
        # Page break
        if '\014' in line:
            page += 1
            num = 0
            continue

        # Second half of a two-line narrative message, already consumed
        if state == 'skip1':
            state = 'text'
            continue

        # Empty line, or line matching page footer
        if re.match(r'\s*$', line):
            continue
        # (re.I replaces a trailing "(?i)", which Python 3.11+ rejects)
        if re.match(
                r' *(Strehlow & Associates, Inc.$|\(215\) 504-4622$|You created this PDF)',
                line, re.I):
            continue

        # Line matching page header
        if re.match(' *(Stated Meeting|%s)$' % date, line):
            continue

        # Ignore title page for now
        if page == 1:
            continue

        # Start of certificate/index
        if re.match(r' *(\d+ *)?(CERTIFICATE|C E R T I F I C A T I O N)$', line):
            state = 'index'
        if state == 'index':
            continue

        # Each page starts with page number
        if num == 0:
            m = re.match(r' +(?:Page )?(\d+)$', line)
            assert int(m.group(1)) == page
            num += 1
            continue

        # Heading somewhere within this page, just ignore it
        if num == 1:
            num += 1
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)
        line = re.sub('^ *%d( |$)' % num, '', line)
        num += 1

        # Ignore line containing only a number and then dashes
        if re.match(r'[\s-]*$', line):
            continue

        # Narrative messages
        m = re.match(r' +(\(.*\))$', line)
        if m:
            yield speech
            speech = Speech(speaker=None, text=line)
            continue

        # Narrative message split over this line and the next one
        m1 = re.match(r' +(\(.*)$', line)
        m2 = re.match(r' *\d+ +(.*\))$', next_line)
        if m1 and m2:
            yield speech
            speech = Speech(
                speaker=None,
                text='%s %s' % (m1.group(1), m2.group(1)),
            )
            state = 'skip1'
            num += 1  # account for the line that will be skipped
            continue

        # Okay, here we have a non-empty, non-page number, non-narrative
        # line of just text
        print('%s %s %s' % (page, num, line))

        # New speaker
        m = re.match(" *([A-Z '.]+):(?: (.*)|$)", line)
        if m:
            yield speech
            speaker = self.fix_name(m.group(1))
            text = m.group(2) or ''
            speech = Speech(speaker=speaker, text=text)
            continue

        # We must now already have a speech by the time we're here
        if not speech:
            raise Exception(
                'Reached here without a speech - need to deal with "%s"' % line)

        # A leading space marks a new paragraph; otherwise it is a continuation
        if re.match(' ', line):
            speech.add_para(line.strip())
        else:
            speech.add_text(line.strip())

    yield speech
def parse_transcript(self, data):
    """Parse a Federal Open Market Committee transcript into speeches.

    ``data['url']`` is printed, ``data['text']`` is the iterable of lines
    (walked with ``prevnext`` so the neighbouring lines are available) and
    ``data['year']`` completes session headings that omit the year.

    Nothing except headings is processed until the "Transcript of ..."
    heading has been seen. After a single-digit footnote line, the rest of
    that page is ignored.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker or a bracketed message starts, and once more at the end.
    ``speech`` starts as ``None``, so the first value yielded can be
    ``None``.

    Side effects: calls ``Speech.reset(True)`` and updates
    ``Speech.current_date`` and ``Speech.current_section`` from headings.

    Raises:
        Exception: if ordinary text is reached before any speech has been
            started, or if the opening heading was never found.
    """
    print("PARSING %s" % data['url'])

    speech = None
    new_page = False
    started = False
    ignore_rest_of_page = False
    Speech.reset(True)

    for prev_line, line, next_line in prevnext(data['text']):
        # Page break
        if '\014' in line:
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Page number
        m = re.match(
            r'%s (?:\d+|\d+...\d+), 200\d *(\d+) of (\d+)$' % months, line)
        if m:
            page = int(m.group(1))
            new_page = True
            ignore_rest_of_page = False
            continue

        # Message about lunch/adjournments
        m = re.match(r' {10,}(\[.*\])$', line)
        if m:
            yield speech
            speech = Speech(speaker=None, text=line)
            continue

        # Headings
        m = re.match(' {10,}(.*)$', line)
        if m:
            t = m.group(1).strip()
            if re.match(
                    'Transcript of (the )?Federal Open Market Committee '
                    '(Meeting|Conference Call) (on|of)$', t):
                started = True
                continue
            if re.match('END OF MEETING$', t) or re.match(
                    r'%s \d+-\d+, 200\d$' % months, t):
                continue
            if re.match(r'%s \d+, 200\d$' % months, t):
                Speech.current_date = datetime.datetime.strptime(
                    t, '%B %d, %Y')
                continue
            m = re.match(
                r'(?P<d>%s \d+(?:, 200\d)?)...(?P<s>(?:Morning|Afternoon) Session)'
                % months, t)
            if not m:
                m = re.match(
                    r'(?P<s>Afternoon Session)...(?P<d>%s \d+, 200\d)' % months,
                    t)
            if m:
                d, s = m.group('d'), m.group('s')
                if '200' not in d:
                    d = '%s, %d' % (d, data['year'])
                if d == 'December 15, 2008' and s == 'Morning Session':
                    d = 'December 16, 2008'
                Speech.current_date = datetime.datetime.strptime(
                    d, '%B %d, %Y')
                Speech.current_section = Section(heading='%s, %s' % (s, d))
                # NOTE(review): an indented line matching none of the
                # heading patterns falls through to the text handling
                # below - confirm this `continue` belongs to this branch.
                continue

        if not started:
            continue

        # footnote (always just "see materials")
        if re.match('[0-9]$', line):
            ignore_rest_of_page = True
        if ignore_rest_of_page:
            continue

        # Okay, here we have a non-empty, non-page number, non-heading
        # line of just text

        # New speaker
        m = re.match(
            ' *((?:M[RS][. ]+|CHAIRMAN |VICE CHAIRMAN )[A-Z]+|PARTICIPANTS)'
            '[.:]? ?[0-9]? ?(.*)', line)
        if m:
            yield speech
            new_page = False
            speaker, speaker_display = self.fix_name(m.group(1))
            speech = Speech(speaker=speaker, text=m.group(2),
                            speaker_display=speaker_display)
            continue

        # We must now already have a speech by the time we're here
        if not speech:
            raise Exception(
                'Reached here without a speech - need to deal with "%s"'
                % line.strip())

        if re.match(r'\s*$', prev_line):
            if new_page:
                new_page = False
                # This line could be a continuation or a new paragraph,
                # we're not sure as it's a new page. If the next line has
                # the same indentation, assume it's a continuation,
                # otherwise a new paragraph.
                left_space = len(line) - len(line.lstrip())
                left_space_next = len(next_line) - len(next_line.lstrip())
                if left_space != left_space_next:
                    speech.add_para(line.strip())
                else:
                    speech.add_text(line.strip())
            else:
                # The previous line was blank. If the line is indented, it
                # must be a new paragraph, in both single and double spaced
                # text. If it's not, it must be a continuation in double
                # spaced text.
                if re.match(' ', line):
                    speech.add_para(line.strip())
                else:
                    speech.add_text(line.strip())
        else:
            # If the last line wasn't blank, we're in single spaced text
            # and it must be a continuation.
            if re.search(
                    r' (Yes|No|With some reluctance, I will vote yes\.)$', line):
                line = '<br/>' + line.strip()
            speech.add_text(line.strip())
            new_page = False

    if not started:
        raise Exception('Never found the heading to begin')

    yield speech
def parse_transcript(text, url):
    """Parse a page/line-numbered hearing transcript into speeches.

    ``text`` is an iterable of lines. Every content line is expected to
    start with its line number, and after 25 numbered lines a line holding
    only the page number is expected; both are checked with ``assert``.
    ``url`` is printed, and is also matched against several hard-coded
    hearing identifiers to apply one-off corrections.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker, question/answer, witness or narrative message starts, and
    once more at the end. ``speech`` starts as ``None``, so the first
    value yielded can be ``None``.

    Side effects: calls ``Speech.reset('am' in url)`` and updates
    ``Speech.current_time``, ``Speech.current_section`` and
    ``Speech.witness`` while parsing.
    """
    print("PARSING %s" % url)

    page, num = 1, 1
    if '2012-05-23pm' in url:
        # This transcript does not start again from page 1 unlike all the others
        page, num = 110, 7
    elif '2011-12-06am' in url:
        # This transcript we're ignoring a special correction, see below
        page, num = 1, 4

    indent = None
    first_indent = None
    interviewer = None
    state = 'text'
    speech = None
    Speech.reset('am' in url)

    if '2012-06-26pm' in url:
        time_format = '(%I.%M %p)'
        Speech.current_time = datetime.strptime('(2.00 pm)', time_format).time()

    for line in text:
        # Page break
        if '\014' in line:
            page += 1
            num = 1
            first_indent = None
            continue

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Start of index, ignore from then on
        if re.match(r' *\d+ +I ?N ?D ?E ?X$', line) or '...............' in line:
            state = 'index'
            continue
        if state == 'index':
            continue

        # Special case - ignore a one-off correction in this hearing
        if ('2011-12-06am' in url and page == 1 and num <= 4
                and not re.match(' +4', line)):
            continue

        # Just after last line, there should be a page number
        if num == 26:
            m = re.match(r' +(\d+)$', line)
            assert int(m.group(1)) == page
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)

        # Work out, once, the gap between the line number and the text
        # from the leading space of the first numbered line seen.
        if not indent:
            left_space = len(line) - len(line.lstrip())
            if left_space == 1:
                indent = ' ' * 7
            if left_space == 13 or left_space == 11:
                indent = ' ' * 3

        line = re.sub('^ *%d(%s|$)' % (num, indent), '', line)
        num += 1

        # Okay, here we have a non-empty, non-page number, non-index line
        # of just text

        # Empty line
        if re.match(r'\s*$', line):
            continue

        # Date at start
        m = re.match(
            r' *((Mon|Tues|Wednes|Thurs|Fri)day,? ?)?\d+ '
            r'(September|October|November|December|January|February|March|April|May|June|July) '
            r'201[12]$',
            line)
        if m:
            continue

        # Second line of a multiline adjournment message
        if state == 'adjournment':
            state = 'text'
            if re.match(r' *(.*)\)$', line):
                speech.add_text(line.strip())
                continue

        # Time/message about lunch/adjournments
        if re.search(r'\(2.43$', line):
            line = '(2.43 pm)'
        line = re.sub(r'\((3.23 pm|3.07 pm|11.15 am)$', r'(\1)', line)
        # Special case one line of normal text that would be caught
        m = re.match(r' *(\(.*\))(?:break|s)?$', line)
        if ((m or '[Alarm sounded]' in line or 'Evidence by videolink' in line)
                and 'published on the Hacked Off website at the time' not in line):
            try:
                line = m.group(1)
                line = line.replace('O2', '02')
                if '2011-11-16am' in url and line == '(12.30 am)':
                    line = '(12.30 pm)'
                if re.match(r'\(1[3-9]\.', line):
                    time_format = '(%H.%M %p)'
                else:
                    time_format = '(%I.%M %p)'
                Speech.current_time = datetime.strptime(line, time_format).time()
            except (AttributeError, ValueError):
                # AttributeError: m is None (the line was let in by one of
                # the substring checks). ValueError: the bracket is not a
                # time. Either way this is a narrative message.
                yield speech
                if 'Hearing in private' in line:
                    time_format = '(%I.%M %p)'
                    Speech.current_time = datetime.strptime(
                        '(10.30 am)', time_format).time()
                if 'The luncheon adjournment' in line and not Speech.current_time:
                    continue
                speech = Speech(speaker=None, text=line)
            continue

        # Multiline message about adjournment
        # (re.I replaces a trailing "(?i)", which Python 3.11+ rejects)
        m = re.match(r' *\(The (hearing|Inquiry|court) adjourned', line, re.I)
        if m:
            yield speech
            state = 'adjournment'
            speech = Speech(speaker=None, text=line.strip())
            continue

        # Questions
        m = re.match(
            r'Further questions from ([A-Z ]*)$'
            r'|Question(?:s|ed|) (?:from by|from|by) (.*?)(?: \(continued\))?$',
            line.strip())
        if m:
            interviewer = fix_name(m.group(1) or m.group(2))
            continue

        # Headings
        m = re.match(
            r'Reply to the Responses to his Application by [A-Z ]*$'
            r'|Response to .* Application$'
            r'|Directions [Hh]earing.*$'
            r'|Application by [A-Z ]*$'
            r'|Application to become a core participant$'
            r'|Reading of evidence of [A-Z ]*$'
            r'|RULING$'
            r'|Ruling$'
            r'|(Opening|Closing|Reply) submissions ((on|for) Module 3 )?by [A-Z ]*$'
            r'|Statement by ([A-Z ]*|Lord Justice Leveson)$'
            r'|Submissions? by ([A-Z ]*|Mr Garnham)$'
            r'|Discussion$'
            r'|Discussion re (procedure|timetable|administrative matters)$'
            r'|Housekeeping$',
            line.strip())
        if m:
            Speech.current_section = Section(heading=string.capwords(line.strip()))
            continue

        # Witness arriving
        m = re.match(r" *((?:[A-Z]|Mr)(?:[A-Z' ,-]|Mc|Mr|and)+?)\s+(\(.*\))$", line)
        if m:
            heading = fix_name(m.group(1))
            Speech.witness = heading
            if Speech.witness == 'Dr Gerald Patrick McCann and Dr Kate Marie McCann':
                Speech.witness = 'Mr McCann'  # All the A.s are him
            if Speech.witness == 'Mr James Watson and Mrs Margaret Watson':
                Speech.witness = 'Mrs Watson'  # All the A.s are her
            if Speech.witness == 'Mr Matthew Bell and Mr Christopher Johnson':
                # The one A. is actually him from the following session
                Speech.witness = 'Mr Piers Pughe-Morgan'
            narrative = '%s %s.' % (m.group(1), m.group(2))
            if state == 'witness':
                # Consecutive witness lines share one section and one speech.
                Speech.current_section.heading += ' / ' + Speech.witness
                speech.add_text(narrative)
            else:
                yield speech
                Speech.current_section = Section(heading=heading)
                speech = Speech(speaker=None, text=narrative)
            state = 'witness'
            continue
        else:
            state = 'text'

        # Question/answer (speaker from previous lines)
        if '2011-11-30am' in url and line == 'Q.':
            line = 'Q. From the police.'
        m = re.match(r'([QA])\. (.*)', line)
        if m:
            yield speech
            if m.group(1) == 'A':
                if '2011-12-08am' in url and not Speech.witness:
                    Speech.witness = 'Professor Steven Barnett'
                assert Speech.witness
                speaker = Speech.witness
            else:
                assert interviewer
                speaker = interviewer
            speech = Speech(speaker=speaker, text=m.group(2))
            # Put the correction alongside the speech
            if ('2011-12-06am' in url and m.group(2).strip() ==
                    'Sort of blagging. I mean, I was trying, as I said, not'):
                speech.add_para(speech.text[0][0])
                speech.text[0][0] = "[Mr Leigh's references on page 72 below to \"arms company \" was a slip and Mr Leigh has subsequently corrected this to \"a construction company \" as he intended at the time.]"
            continue

        # New speaker
        m = re.match(' *((?:[A-Z -]|Mc)+): (.*)', line)
        if m:
            yield speech
            speaker = fix_name(m.group(1))
            if not interviewer:
                interviewer = speaker
            speech = Speech(speaker=speaker, text=m.group(2))
            continue

        if '2011-10-04' in url or '2011-09-06' in url:
            # New paragraph if indent a bit more than 'usual' for the page
            left_space = len(line) - len(line.lstrip())
            if left_space == 5:
                left_space = 4
            if page == 113:
                # Special manual fix for this one
                first_indent = 3
            elif not first_indent:
                first_indent = left_space
            elif first_indent > left_space + 1:
                # The first line must have been a new paragraph. Adjust accordingly.
                first_indent = left_space
                if len(speech.text[-1]) > 1:
                    speech.add_para(speech.text[-1][-1])
                    del speech.text[-2][-1]
            m = re.match(' ' * (first_indent + 2), line)
        else:
            # New paragraph if indent at least 8 spaces
            # NOTE(review): the pattern as seen matches a single leading
            # space, not 8 - confirm the literal's whitespace is intact.
            m = re.match(' ', line)
        if m:
            speech.add_para(line.strip())
            continue

        # If we've got this far, hopefully just a normal line of speech
        speech.add_text(line.strip())

    yield speech
def parse_transcript(self, data):
    """Parse a page/line-numbered meeting transcript into speeches.

    ``data['url']`` is printed, ``data['date']`` is formatted to recognise
    the page header line, and ``data['text']`` is the iterable of lines,
    walked with ``prevnext`` so the following line is available.

    Everything on page 1 is skipped, and so is everything from the
    certificate/index heading onwards. On later pages the first line must
    be the page number and every content line must start with its line
    number; both are checked with ``assert``.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker or a bracketed narrative message starts, and once more at
    the end. ``speech`` starts as ``None``, so the first value yielded can
    be ``None``.

    Raises:
        Exception: if ordinary text is reached before any speech has been
            started.
    """
    print("PARSING %s" % data['url'])

    page, num = 1, 1
    date = '{d:%B} {d.day}, {d:%Y}'.format(d=data['date'])

    speech = None
    state = 'text'
    Speech.reset(True)

    for prev_line, line, next_line in prevnext(data['text']):
        # Page break
        if '\014' in line:
            page += 1
            num = 0
            continue

        # Second half of a two-line narrative message, already consumed
        if state == 'skip1':
            state = 'text'
            continue

        # Empty line, or line matching page footer
        if re.match(r'\s*$', line):
            continue
        # (re.I replaces a trailing "(?i)", which Python 3.11+ rejects)
        if re.match(
                r' *(Strehlow & Associates, Inc.$|\(215\) 504-4622$|You created this PDF)',
                line, re.I):
            continue

        # Line matching page header
        if re.match(' *(Stated Meeting|%s)$' % date, line):
            continue

        # Ignore title page for now
        if page == 1:
            continue

        # Start of certificate/index
        if re.match(r' *(\d+ *)?(CERTIFICATE|C E R T I F I C A T I O N)$', line):
            state = 'index'
        if state == 'index':
            continue

        # Each page starts with page number
        if num == 0:
            m = re.match(r' +(?:Page )?(\d+)$', line)
            assert int(m.group(1)) == page
            num += 1
            continue

        # Heading somewhere within this page, just ignore it
        if num == 1:
            num += 1
            continue

        # Let's check we haven't lost a line anywhere...
        assert re.match(' *%d( |$)' % num, line), '%s != %s' % (num, line)
        line = re.sub('^ *%d( |$)' % num, '', line)
        num += 1

        # Ignore line containing only a number and then dashes
        if re.match(r'[\s-]*$', line):
            continue

        # Narrative messages
        m = re.match(r' +(\(.*\))$', line)
        if m:
            yield speech
            speech = Speech(speaker=None, text=line)
            continue

        # Narrative message split over this line and the next one
        m1 = re.match(r' +(\(.*)$', line)
        m2 = re.match(r' *\d+ +(.*\))$', next_line)
        if m1 and m2:
            yield speech
            speech = Speech(
                speaker=None,
                text='%s %s' % (m1.group(1), m2.group(1)),
            )
            state = 'skip1'
            num += 1  # account for the line that will be skipped
            continue

        # Okay, here we have a non-empty, non-page number, non-narrative
        # line of just text
        print('%s %s %s' % (page, num, line))

        # New speaker
        m = re.match(" *([A-Z '.]+):(?: (.*)|$)", line)
        if m:
            yield speech
            speaker = self.fix_name(m.group(1))
            text = m.group(2) or ''
            speech = Speech(speaker=speaker, text=text)
            continue

        # We must now already have a speech by the time we're here
        if not speech:
            raise Exception(
                'Reached here without a speech - need to deal with "%s"' % line)

        # A leading space marks a new paragraph; otherwise it is a continuation
        if re.match(' ', line):
            speech.add_para(line.strip())
        else:
            speech.add_text(line.strip())

    yield speech
def parse_transcript(self, data):
    """Parse a Federal Open Market Committee transcript into speeches.

    ``data["url"]`` is printed, ``data["text"]`` is the iterable of lines
    (walked with ``prevnext`` so the neighbouring lines are available) and
    ``data["year"]`` completes session headings that omit the year.

    Nothing except headings is processed until the "Transcript of ..."
    heading has been seen. After a single-digit footnote line, the rest of
    that page is ignored.

    This is a generator. The speech collected so far is yielded whenever a
    new speaker or a bracketed message starts, and once more at the end.
    ``speech`` starts as ``None``, so the first value yielded can be
    ``None``.

    Side effects: calls ``Speech.reset(True)`` and updates
    ``Speech.current_date`` and ``Speech.current_section`` from headings.

    Raises:
        Exception: if ordinary text is reached before any speech has been
            started, or if the opening heading was never found.
    """
    print("PARSING %s" % data["url"])

    speech = None
    new_page = False
    started = False
    ignore_rest_of_page = False
    Speech.reset(True)

    for prev_line, line, next_line in prevnext(data["text"]):
        # Page break
        if "\014" in line:
            continue

        # Empty line
        if re.match(r"\s*$", line):
            continue

        # Page number
        m = re.match(r"%s (?:\d+|\d+...\d+), 200\d *(\d+) of (\d+)$" % months, line)
        if m:
            page = int(m.group(1))
            new_page = True
            ignore_rest_of_page = False
            continue

        # Message about lunch/adjournments
        m = re.match(r" {10,}(\[.*\])$", line)
        if m:
            yield speech
            speech = Speech(speaker=None, text=line)
            continue

        # Headings
        m = re.match(" {10,}(.*)$", line)
        if m:
            t = m.group(1).strip()
            if re.match(
                    "Transcript of (the )?Federal Open Market Committee "
                    "(Meeting|Conference Call) (on|of)$", t):
                started = True
                continue
            if re.match("END OF MEETING$", t) or re.match(r"%s \d+-\d+, 200\d$" % months, t):
                continue
            if re.match(r"%s \d+, 200\d$" % months, t):
                Speech.current_date = datetime.datetime.strptime(t, "%B %d, %Y")
                continue
            m = re.match(
                r"(?P<d>%s \d+(?:, 200\d)?)...(?P<s>(?:Morning|Afternoon) Session)" % months, t)
            if not m:
                m = re.match(r"(?P<s>Afternoon Session)...(?P<d>%s \d+, 200\d)" % months, t)
            if m:
                d, s = m.group("d"), m.group("s")
                if "200" not in d:
                    d = "%s, %d" % (d, data["year"])
                if d == "December 15, 2008" and s == "Morning Session":
                    d = "December 16, 2008"
                Speech.current_date = datetime.datetime.strptime(d, "%B %d, %Y")
                Speech.current_section = Section(heading="%s, %s" % (s, d))
                # NOTE(review): an indented line matching none of the
                # heading patterns falls through to the text handling
                # below - confirm this `continue` belongs to this branch.
                continue

        if not started:
            continue

        # footnote (always just "see materials")
        if re.match("[0-9]$", line):
            ignore_rest_of_page = True
        if ignore_rest_of_page:
            continue

        # Okay, here we have a non-empty, non-page number, non-heading
        # line of just text

        # New speaker
        m = re.match(
            " *((?:M[RS][. ]+|CHAIRMAN |VICE CHAIRMAN )[A-Z]+|PARTICIPANTS)"
            "[.:]? ?[0-9]? ?(.*)", line)
        if m:
            yield speech
            new_page = False
            speaker, speaker_display = self.fix_name(m.group(1))
            speech = Speech(speaker=speaker, text=m.group(2), speaker_display=speaker_display)
            continue

        # We must now already have a speech by the time we're here
        if not speech:
            raise Exception(
                'Reached here without a speech - need to deal with "%s"' % line.strip())

        if re.match(r"\s*$", prev_line):
            if new_page:
                new_page = False
                # This line could be a continuation or a new paragraph,
                # we're not sure as it's a new page. If the next line has
                # the same indentation, assume it's a continuation,
                # otherwise a new paragraph.
                left_space = len(line) - len(line.lstrip())
                left_space_next = len(next_line) - len(next_line.lstrip())
                if left_space != left_space_next:
                    speech.add_para(line.strip())
                else:
                    speech.add_text(line.strip())
            else:
                # The previous line was blank. If the line is indented, it
                # must be a new paragraph, in both single and double spaced
                # text. If it's not, it must be a continuation in double
                # spaced text.
                if re.match(" ", line):
                    speech.add_para(line.strip())
                else:
                    speech.add_text(line.strip())
        else:
            # If the last line wasn't blank, we're in single spaced text
            # and it must be a continuation.
            if re.search(r" (Yes|No|With some reluctance, I will vote yes\.)$", line):
                line = "<br/>" + line.strip()
            speech.add_text(line.strip())
            new_page = False

    if not started:
        raise Exception("Never found the heading to begin")

    yield speech