def test_dailydigest_I67(self):
    '''Check the I67H bell code against the Daily Digest tables.

    The code is exercised twice: first by calling process_lines directly
    and inspecting the resulting state, then by feeding the same data
    through InputParser.makelines and comparing the text written to a
    StringIO buffer.
    '''
    import io

    from locator.dailydigest import DailyDigestInputParser

    data = b'\x07I67H'
    locator_table = DailyDigestInputParser.LOCATOR_TABLE
    font_table = DailyDigestInputParser.FONT_TABLE

    state_stack, _ = process_lines(
        data, (None, b'G2'),
        locator_table=locator_table,
        font_table=font_table)
    # The last entry of the returned stack is taken as the current state;
    # its first element is the action dict.
    current_state = state_stack[-1]
    self.assertEqual(
        current_state[0],
        {'end': '</span>',
         'grid': b'',
         'start': "<span class='bell-I67H dailydigest-extension'>"})

    out_io = io.StringIO()
    current_state = (None, b'G2')  # start as Grid 2
    parser = InputParser()
    for _page, _page_match, line in parser.makelines(data, output=out_io):
        state_stack, _ = process_lines(
            line, current_state, outputf=out_io,
            locator_table=locator_table,
            font_table=font_table)
        # Carry only the final state into the next line.  Previously the
        # whole returned stack was assigned to current_state, which would
        # be passed back in place of an (action, grid) pair on any second
        # iteration.
        current_state = state_stack[-1]
    self.assertEqual(
        out_io.getvalue(),
        "<span class='bell-I67H dailydigest-extension'>")
def test_dailydigest_I67(self):
    '''Check the I67H bell code against the Daily Digest tables.

    Two paths are covered: a direct process_lines call whose resulting
    state is inspected, and a run through InputParser.makelines whose
    output is captured in a StringIO buffer and compared as text.
    '''
    import io

    from locator.dailydigest import DailyDigestInputParser

    data = b'\x07I67H'
    expected_span = "<span class='bell-I67H dailydigest-extension'>"
    table_kwargs = {
        'locator_table': DailyDigestInputParser.LOCATOR_TABLE,
        'font_table': DailyDigestInputParser.FONT_TABLE,
    }

    state_stack, _ = process_lines(data, (None, b'G2'), **table_kwargs)
    # The final stack entry is the current state; element 0 is its action.
    current_state = state_stack[-1]
    self.assertEqual(
        current_state[0],
        {
            'end': '</span>',
            'grid': b'',
            'start': expected_span,
        })

    out_io = io.StringIO()
    current_state = (None, b'G2')  # start as Grid 2
    parser = InputParser()
    for _page, _page_match, line in parser.makelines(data, output=out_io):
        state_stack, _ = process_lines(
            line, current_state, outputf=out_io, **table_kwargs)
        # Only the last state is threaded into the next call.  The earlier
        # code assigned the entire stack to current_state, which would be
        # handed back instead of an (action, grid) pair on a second line.
        current_state = state_stack[-1]
    contents = out_io.getvalue()
    self.assertEqual(contents, expected_span)
def parse_io(self, inputfile=None, current_state=(None, b'G2'),
             outputfile=None, locator_table=None, font_table=None,
             postfix=None):
    '''Render the data read from inputfile as HTML and return the sink.

    When outputfile is None a new io.StringIO is created and returned;
    callers will usually want to rewind it before reading:

        output = parse_io(...)
        output.seek(0)

    Otherwise the supplied file handle is written to and returned.

    locator_table and font_table fall back to self.LOCATOR_TABLE and
    self.FONT_TABLE when not given (or falsy).  current_state is the
    state handed to process_lines for the first line; postfix is passed
    through to process_lines unchanged.
    '''
    locator_table = locator_table or self.LOCATOR_TABLE
    font_table = font_table or self.FONT_TABLE
    out = io.StringIO() if outputfile is None else outputfile

    raw = inputfile.read().strip()
    current_page = None

    output("<html>", outf=out)
    for page, _page_match, line in self.makelines(raw, output=out):
        state_stack, _ = process_lines(
            line,
            current_state,
            outputf=out,
            locator_table=locator_table,
            font_table=font_table,
            postfix=postfix)
        # Logged before the update, so this shows the state the line was
        # processed with.
        logger.debug("Current_state:%s", current_state)
        logger.debug("Page:%s Current_page:%s", page, current_page)
        current_state = state_stack[-1]

        if not page:
            continue
        if not current_page:
            # First page seen: remember it, nothing to emit yet.
            current_page = page
        elif page != current_page:
            # Page changed: emit the marker for the page just finished.
            output(b"<center>[Page:" + current_page + b"] </center>",
                   outf=out)
            current_page = page

    # Emit the marker for the final page, if any page was seen.
    if current_page:
        output(b"<center>[Page:" + current_page + b"] </center>", outf=out)
    output("</html>", outf=out)
    return out
def test_dailydigest_I01_actions(self):
    '''An I01 line run through the Daily Digest tables ends in an
    <h3><em> action on grid G2.'''
    from locator.dailydigest import DailyDigestInputParser

    data = b'\x07I01Monday, April 18, 2016\xadD382'
    state_stack, _ = process_lines(
        data,
        (None, b'G2'),
        locator_table=DailyDigestInputParser.LOCATOR_TABLE,
        font_table=DailyDigestInputParser.FONT_TABLE)

    # Element 0 of the last stack entry is the action dict under test.
    action = state_stack[-1][0]
    expectations = (
        ('end', '</em></h3>'),
        ('grid', b'G2'),
        ('start', '<h3><em>'),
    )
    for key, wanted in expectations:
        self.assertEqual(action.get(key), wanted)
def test_I01(self):
    '''An I01 line run through the Congressional Record Index tables
    yields empty start/end markup on grid G2.'''
    data = b'''I01AAGENES, ALEXA'''
    state_stack, _ = process_lines(
        data,
        (None, b'G2'),
        locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE,
        font_table=CongressionalRecordIndexInputParser.FONT_TABLE)

    # Element 0 of the last stack entry is the action dict under test.
    action = state_stack[-1][0]
    for key, wanted in (('end', ''), ('grid', b'G2'), ('start', '')):
        self.assertEqual(action.get(key), wanted)
def test_I03(self):
    '''An I03 line run through the Congressional Record Index tables is
    wrapped in <h2> on grid G2.'''
    data = b''' I03Bills and resolutions cosponsored '''
    state_stack, _ = process_lines(
        data,
        (None, b'G2'),
        locator_table=CongressionalRecordIndexInputParser.LOCATOR_TABLE,
        font_table=CongressionalRecordIndexInputParser.FONT_TABLE)

    # Element 0 of the last stack entry is the action dict under test.
    action = state_stack[-1][0]
    for key, wanted in (('end', '</h2>'), ('grid', b'G2'), ('start', '<h2>')):
        self.assertEqual(action.get(key), wanted)
def parse_io(self, inputfile=None, current_state=(None, b'G2'),
             outputfile=None, locator_table=None, font_table=None,
             postfix=None):
    '''Convert the contents of inputfile to HTML.

    The result is written to outputfile when one is supplied; otherwise
    a fresh io.StringIO is used.  Either way the sink is returned, and
    it is not rewound, so a typical call looks like:

        output = parse_io(...)
        output.seek(0)

    A falsy locator_table or font_table is replaced by the instance's
    LOCATOR_TABLE / FONT_TABLE.  current_state seeds the first
    process_lines call and postfix is forwarded to it as-is.
    '''
    if not locator_table:
        locator_table = self.LOCATOR_TABLE
    if not font_table:
        font_table = self.FONT_TABLE
    out = outputfile if outputfile is not None else io.StringIO()

    def write_page_marker(page_id):
        # Centered marker naming the page that has just been completed.
        output(b"<center>[Page:" + page_id + b"] </center>", outf=out)

    payload = inputfile.read()
    payload = payload.strip()
    current_page = None

    output("<html>", outf=out)
    for page, _match, line in self.makelines(payload, output=out):
        stack, _ = process_lines(
            line, current_state, outputf=out,
            locator_table=locator_table, font_table=font_table,
            postfix=postfix)
        # Both debug lines report the values in effect before this line's
        # results are applied.
        logger.debug("Current_state:%s", current_state)
        logger.debug("Page:%s Current_page:%s", page, current_page)
        current_state = stack[-1]

        if page:
            if not current_page:
                current_page = page
            if page != current_page:
                # Page changed: close out the previous page first.
                write_page_marker(current_page)
                current_page = page

    if current_page:
        # The last page never sees a "page changed" event, so mark it here.
        write_page_marker(current_page)
    output("</html>", outf=out)
    return out
def test_I05(self):
    '''An I05 line run through the Congressional Record Index tables is
    wrapped in <p> on grid G2.'''
    from locator.congressionalrecordindex import CongressionalRecordIndexInputParser

    data = b'''I05Committee to escort Japanese Prime Minister, Shinzo Abe, into the House Chamber, H2503 [29AP] '''
    tables = {
        'locator_table': CongressionalRecordIndexInputParser.LOCATOR_TABLE,
        'font_table': CongressionalRecordIndexInputParser.FONT_TABLE,
    }
    state_stack, _ = process_lines(data, (None, b'G2'), **tables)

    # Element 0 of the last stack entry is the action dict under test.
    action = state_stack[-1][0]
    self.assertEqual(action.get('end'), '</p>')
    self.assertEqual(action.get('grid'), b'G2')
    self.assertEqual(action.get('start'), '<p>')
def test_dailydigest_I01_actions(self):
    '''Parsing an I01 line with the Daily Digest tables leaves an
    <h3><em> ... </em></h3> action on grid G2 as the current state.'''
    from locator.dailydigest import DailyDigestInputParser

    data = b'\x07I01Monday, April 18, 2016\xadD382'
    tables = {
        'locator_table': DailyDigestInputParser.LOCATOR_TABLE,
        'font_table': DailyDigestInputParser.FONT_TABLE,
    }
    state_stack, _ = process_lines(data, (None, b'G2'), **tables)

    # The current state is the last stack entry; its action comes first.
    action, *_rest = state_stack[-1]
    self.assertEqual(action.get('end'), '</em></h3>')
    self.assertEqual(action.get('grid'), b'G2')
    self.assertEqual(action.get('start'), '<h3><em>')
def parse_io(self, inputfile=None, current_state=(None, b'G2'),
             outputfile=None, locator_table=None, font_table=None,
             postfix=None, year=None):
    '''Yield one rendered document per stanza found in inputfile.

    This is a generator.  The data read from inputfile is split with
    self.make_stanzas and every stanza is rendered into its own
    io.StringIO buffer, starting again from the initial current_state.

    Parameters:
        inputfile: object whose read() returns the raw data.
        current_state: state each stanza starts from.
        outputfile: not used by this method.
        locator_table, font_table, postfix: forwarded unchanged to
            process_lines.
        year: forwarded to self.process_stanza_title; self.year is used
            when year is not given and self.year is set.

    Yields:
        ((name, line_name), out) -- name and line_name are the pair
        returned by self.process_stanza_title for a line whose state
        stack carries bell code I01; out is the stanza's buffer, already
        rewound to the beginning.  Nothing is yielded while name is empty.
    '''
    if self.year and not year:
        year = self.year
    orig_current_state = current_state
    inputdata = inputfile.read()
    # NOTE(review): name is not reset per stanza, so a stanza without an
    # I01 line is yielded under the previous stanza's name -- confirm this
    # carry-over is intended.
    name = ""
    for stanza in self.make_stanzas(inputdata):
        logger.debug("CRI stanza:%s", stanza)
        out = io.StringIO()
        # For every sub document in the dat file reset the state to the
        # start.
        current_state = orig_current_state
        current_state_stack = []
        # NOTE(review): locator_table / font_table are passed on even when
        # None; no fallback to the instance tables happens here -- verify
        # process_lines handles that.
        stanza_lines = self.makelines(stanza, output=out)
        for cnt, (_page, _page_match, line) in enumerate(stanza_lines):
            ret_current_state_stack, _ = process_lines(
                line,
                current_state,
                outputf=out,
                locator_table=locator_table,
                font_table=font_table,
                postfix=postfix)
            current_state = ret_current_state_stack[-1]
            logger.debug("Current state:%s", current_state)
            logger.debug("Previous state :%s", ret_current_state_stack[0])
            logger.debug("[%d] line:[%s] states[%s]",
                         cnt, line, ret_current_state_stack)
            current_state_stack.append((ret_current_state_stack, line))

        # Look for the stanza title: any state after the first in a line's
        # stack that carries the I01 bell code.
        for state, line in current_state_stack:
            # The first item in every stack is the previous state, so skip
            # it.  The length check keeps a one-element stack from raising
            # IndexError.
            if len(state) > 1 and state[1]:
                for action, _grid in state[1:]:
                    if action and action.get('bellcode') == b'I01':
                        name, line_name = self.process_stanza_title(
                            line, year)

        # Close whatever markup the final state left open.
        if current_state[0] and current_state[0].get('end'):
            logger.debug("\tcurrent_state.end:%s",
                         current_state[0].get('end'))
            output(current_state[0].get('end'), outf=out)
        # Rewind to the beginning now that we are finished with output.
        out.seek(0)
        # If there is no name then we don't bother with the section.
        if name:
            yield ((name, line_name), out)
def parse_io(self, inputfile=None, current_state=(None, b'G2'),
             outputfile=None, locator_table=None, font_table=None,
             postfix=None, year=None):
    '''Generate a rendered buffer for each stanza read from inputfile.

    The method is a generator: inputfile.read() is split into stanzas by
    self.make_stanzas, and each stanza is rendered into a new
    io.StringIO, with the state reset to the current_state passed in.

    Parameters:
        inputfile: object whose read() returns the raw data.
        current_state: state each stanza starts from.
        outputfile: not used by this method.
        locator_table, font_table, postfix: forwarded unchanged to
            process_lines.
        year: forwarded to self.process_stanza_title; when not given and
            self.year is set, self.year is used.

    Yields:
        ((name, line_name), out) -- name and line_name are what
        self.process_stanza_title returned for a line whose state stack
        carries bell code I01; out is the stanza's buffer, rewound to the
        beginning.  Nothing is yielded while name is empty.
    '''
    if self.year and not year:
        year = self.year
    orig_current_state = current_state
    inputdata = inputfile.read()
    # NOTE(review): name persists across stanzas; a stanza with no I01
    # line is yielded under the previous name -- confirm this is intended.
    name = ""
    for stanza in self.make_stanzas(inputdata):
        logger.debug("CRI stanza:%s", stanza)
        out = io.StringIO()
        # For every sub document in the dat file reset the state to the
        # start.
        current_state = orig_current_state
        per_line_stacks = []
        # NOTE(review): the tables are forwarded even when None, with no
        # fallback to the instance tables -- verify process_lines copes.
        for cnt, (_page, _page_match, line) in enumerate(
                self.makelines(stanza, output=out)):
            ret_current_state_stack, _ = process_lines(
                line,
                current_state,
                outputf=out,
                locator_table=locator_table,
                font_table=font_table,
                postfix=postfix)
            current_state = ret_current_state_stack[-1]
            logger.debug("Current state:%s", current_state)
            logger.debug("Previous state :%s", ret_current_state_stack[0])
            logger.debug("[%d] line:[%s] states[%s]",
                         cnt, line, ret_current_state_stack)
            per_line_stacks.append((ret_current_state_stack, line))

        # Check all non-first items in each stack for the I01 bell code,
        # which marks the stanza title line.
        for state, title_line in per_line_stacks:
            # The first item is the previous state, so it is skipped.
            # Testing the length first avoids an IndexError when the stack
            # holds nothing else.
            if len(state) > 1 and state[1]:
                for action, _grid in state[1:]:
                    if action and action.get('bellcode') == b'I01':
                        name, line_name = self.process_stanza_title(
                            title_line, year)

        # Emit the closing markup of the state the stanza ended in.
        if current_state[0] and current_state[0].get('end'):
            logger.debug("\tcurrent_state.end:%s",
                         current_state[0].get('end'))
            output(current_state[0].get('end'), outf=out)
        # Rewind to the beginning now that we are finished with output.
        out.seek(0)
        # If there is no name then we don't bother with the section.
        if name:
            yield ((name, line_name), out)