def reformat(filename):
    """
    Print the transcript stored in ``filename`` one spoken line at a time.

    Each output row is the chunk's formatted timestamp, the speaker and the
    text, separated by tabs.
    """
    for chunk in TranscriptParser(filename).get_chunks():
        # Every line in a chunk shares that chunk's timestamp.
        stamp = seconds_to_timestamp(chunk['timestamp'])
        for entry in chunk['lines']:
            fields = (stamp, entry['speaker'], entry['text'])
            print("%s\t%s:\t %s" % fields)
def index(self):
    """
    Index every chunk produced by ``self.parser`` into Redis.

    For each chunk that falls inside a known act this writes:

    * ``log_line:<id>:page`` and ``log_line:<id>:info`` (offset, page, act,
      UTC time and, when known, transcript page, language, note and key
      scene), plus ``previous``/``next`` links between consecutive chunks;
    * ``log_line:<id>:lines`` and ``log_line:<id>:images``;
    * ``speaker:<name>``, ``page:...``, ``transcript_page:...``,
      ``log_lines:<mission>``, ``transcript:<name>`` and ``label:<label>``
      membership;
    * a search index entry via ``self.add_to_search_index``;
    * ``times_mentioned`` counters for glossary terms found in the text.

    Chunks for which no act includes the timestamp are reported and
    skipped. After the last chunk, the final transcript page number is
    stored in ``pages:<mission>`` unless a value is already present.

    Raises:
        Exception: if a chunk's timestamp is not strictly later than the
            previously indexed chunk's timestamp.
    """
    current_labels = {}
    current_transcript_page = None
    current_page = 1
    current_page_lines = 0
    current_lang = None
    last_act = None
    previous_log_line_id = None
    previous_timestamp = None
    launch_time = int(
        self.redis_conn.hget(
            "mission:%s" % self.mission_name, "utc_launch_time"))

    acts = list(Act.Query(self.redis_conn, self.mission_name))
    key_scenes = list(KeyScene.Query(self.redis_conn, self.mission_name))
    glossary_items = dict([
        (item.identifier.lower(), item)
        for item in Glossary.Query(self.redis_conn, self.mission_name)
    ])

    for chunk in self.parser.get_chunks():
        timestamp = chunk['timestamp']
        log_line_id = "%s:%i" % (self.transcript_name, timestamp)

        # Compare explicitly against None instead of relying on the
        # interpreter's ordering of None relative to integers.
        if (previous_timestamp is not None
                and timestamp <= previous_timestamp):
            raise Exception("%s should be after %s" % (
                seconds_to_timestamp(timestamp),
                seconds_to_timestamp(previous_timestamp)))

        # See if there's transcript page info, and update it if so
        if chunk['meta'].get('_page', 0):
            current_transcript_page = int(chunk["meta"]['_page'])
        if chunk['meta'].get('_lang', None):
            current_lang = chunk['meta']['_lang']
        if current_transcript_page:
            self.redis_conn.set(
                "log_line:%s:page" % log_line_id, current_transcript_page)

        # Look up the act
        for act in acts:
            if act.includes(timestamp):
                break
        else:
            print("Error: No act for timestamp %s" % seconds_to_timestamp(
                timestamp))
            continue

        # If we've filled up the current page, go to a new one
        if current_page_lines >= self.LINES_PER_PAGE or (
                last_act is not None and last_act != act):
            current_page += 1
            current_page_lines = 0
        last_act = act

        # First, create a record with some useful information
        info_key = "log_line:%s:info" % log_line_id
        info_record = {
            "offset": chunk['offset'],
            "page": current_page,
            "act": act.number,
            "utc_time": launch_time + timestamp,
        }
        if current_transcript_page:
            info_record["transcript_page"] = current_transcript_page
        if current_lang:
            info_record["lang"] = current_lang
        # And an editorial note if present
        if '_note' in chunk['meta']:
            info_record["note"] = chunk['meta']['_note']
        self.redis_conn.hmset(info_key, info_record)

        # Look up the key scene
        for key_scene in key_scenes:
            if key_scene.includes(timestamp):
                self.redis_conn.hset(
                    info_key, 'key_scene', key_scene.number)
                break

        # Create the doubly-linked list structure
        if previous_log_line_id:
            self.redis_conn.hset(
                info_key, "previous", previous_log_line_id)
            self.redis_conn.hset(
                "log_line:%s:info" % previous_log_line_id,
                "next",
                log_line_id,
            )
        previous_log_line_id = log_line_id
        previous_timestamp = timestamp

        # Also store the text
        for line in chunk['lines']:
            self.redis_conn.rpush(
                "log_line:%s:lines" % log_line_id,
                u"%(speaker)s: %(text)s" % line,
            )
        # Join the lines with a space so the last word of one line is not
        # fused with the next line's speaker; fused words can never match
        # a glossary term below.
        text = u" ".join(
            u"%s %s" % (line['speaker'], line['text'])
            for line in chunk['lines'])

        # Store any images
        for i, image in enumerate(chunk['meta'].get("_images", [])):
            # Make the image id
            image_id = "%s:%s" % (log_line_id, i)
            # Push it onto the images list
            self.redis_conn.rpush(
                "log_line:%s:images" % log_line_id, image_id)
            # Store the image data
            self.redis_conn.hmset("image:%s" % image_id, image)

        # Add that logline ID for the people involved
        speakers = set([line['speaker'] for line in chunk['lines']])
        for speaker in speakers:
            self.redis_conn.sadd("speaker:%s" % speaker, log_line_id)

        # Add it to the index for this page
        self.redis_conn.rpush(
            "page:%s:%i" % (self.transcript_name, current_page),
            log_line_id)
        # Add it to the index for this transcript page
        # NOTE(review): when no '_page' has been seen yet this pushes onto
        # a key ending in ":None" - confirm whether that is intended.
        self.redis_conn.rpush(
            "transcript_page:%s:%s" % (
                self.transcript_name, current_transcript_page),
            log_line_id)

        # Add it into the transcript and everything sets
        self.redis_conn.zadd(
            "log_lines:%s" % self.mission_name,
            log_line_id, chunk['timestamp'])
        self.redis_conn.zadd(
            "transcript:%s" % self.transcript_name,
            log_line_id, chunk['timestamp'])

        # Read the new labels into current_labels
        has_labels = False
        if '_labels' in chunk['meta']:
            for label, endpoint in chunk['meta']['_labels'].items():
                if label in current_labels:
                    # Only ever extend an active label; an endpoint of
                    # None leaves the existing expiry untouched.
                    if endpoint is not None:
                        current_labels[label] = max(
                            current_labels[label], endpoint)
                elif endpoint is not None:
                    current_labels[label] = endpoint
                else:
                    # No endpoint: the label applies to this chunk only.
                    self.redis_conn.sadd("label:%s" % label, log_line_id)
                    has_labels = True

        # Expire any old labels. Iterate over a snapshot because entries
        # are deleted from the dict inside the loop.
        for label, endpoint in list(current_labels.items()):
            if endpoint < chunk['timestamp']:
                del current_labels[label]

        # Apply any surviving labels
        for label in current_labels:
            self.redis_conn.sadd("label:%s" % label, log_line_id)
            has_labels = True

        # And add this logline to search index
        if has_labels:
            print("weight = 3 for %s" % log_line_id)
            weight = 3.0  # magic!
        else:
            weight = 1.0
        self.add_to_search_index(
            mission=self.mission_name,
            id=log_line_id,
            chunk=chunk,
            weight=weight,
            timestamp=timestamp,
        )

        # For any mentioned glossary terms, add to them.
        for word in text.split():
            word = word.strip(",;-:'\"").lower()
            if word in glossary_items:
                glossary_item = glossary_items[word]
                self.redis_conn.hincrby(
                    "glossary:%s" % glossary_item.id,
                    "times_mentioned",
                    1,
                )

        # Increment the number of log lines we've done
        current_page_lines += len(chunk['lines'])

    # Record the last transcript page seen, unless a value is already
    # stored for this transcript.
    pages_set = self.redis_conn.hexists(
        "pages:%s" % self.mission_name, self.transcript_name)
    if not pages_set and current_transcript_page:
        print("%s original pages: %d" % (
            self.transcript_name, current_transcript_page))
        self.redis_conn.hset(
            "pages:%s" % self.mission_name,
            self.transcript_name,
            current_transcript_page)
def index(self):
    """
    Index every chunk produced by ``self.parser`` into Redis.

    For each chunk that falls inside a known act this writes:

    * ``log_line:<id>:page`` and ``log_line:<id>:info`` (offset, page, act,
      UTC time and, when known, transcript page and key scene), plus
      ``previous``/``next`` links between consecutive chunks;
    * ``log_line:<id>:lines`` and ``log_line:<id>:images``;
    * ``speaker:<name>``, ``page:...``, ``log_lines:<mission>``,
      ``transcript:<name>`` and ``label:<label>`` membership;
    * a search index entry via ``self.add_to_search_index``;
    * ``times_mentioned`` counters for glossary terms found in the text.

    Chunks for which no act includes the timestamp are reported and
    skipped.
    """
    current_labels = {}
    current_transcript_page = None
    current_page = 1
    current_page_lines = 0
    last_act = None
    previous_log_line_id = None
    launch_time = int(
        self.redis_conn.hget(
            "mission:%s" % self.mission_name, "utc_launch_time"))

    acts = list(Act.Query(self.redis_conn, self.mission_name))
    key_scenes = list(KeyScene.Query(self.redis_conn, self.mission_name))
    glossary_items = dict([
        (item.identifier.lower(), item)
        for item in Glossary.Query(self.redis_conn, self.mission_name)
    ])

    for chunk in self.parser.get_chunks():
        timestamp = chunk['timestamp']
        log_line_id = "%s:%i" % (self.transcript_name, timestamp)

        # See if there's transcript page info, and update it if so
        if chunk['meta'].get('_page', 0):
            current_transcript_page = int(chunk["meta"]['_page'])
        if current_transcript_page:
            self.redis_conn.set(
                "log_line:%s:page" % log_line_id, current_transcript_page)

        # Look up the act
        for act in acts:
            if act.includes(timestamp):
                break
        else:
            print("Error: No act for timestamp %s" % seconds_to_timestamp(
                timestamp))
            continue

        # If we've filled up the current page, go to a new one
        if current_page_lines >= self.LINES_PER_PAGE or (
                last_act is not None and last_act != act):
            current_page += 1
            current_page_lines = 0
        last_act = act

        # First, create a record with some useful information
        info_key = "log_line:%s:info" % log_line_id
        info_record = {
            "offset": chunk['offset'],
            "page": current_page,
            "act": act.number,
            "utc_time": launch_time + timestamp,
        }
        if current_transcript_page:
            info_record["transcript_page"] = current_transcript_page
        self.redis_conn.hmset(info_key, info_record)

        # Look up the key scene
        for key_scene in key_scenes:
            if key_scene.includes(timestamp):
                self.redis_conn.hset(
                    info_key, 'key_scene', key_scene.number)
                break

        # Create the doubly-linked list structure
        if previous_log_line_id:
            self.redis_conn.hset(
                info_key, "previous", previous_log_line_id)
            self.redis_conn.hset(
                "log_line:%s:info" % previous_log_line_id,
                "next",
                log_line_id,
            )
        previous_log_line_id = log_line_id

        # Also store the text
        for line in chunk['lines']:
            self.redis_conn.rpush(
                "log_line:%s:lines" % log_line_id,
                "%(speaker)s: %(text)s" % line,
            )
        # Join the lines with a space so the last word of one line is not
        # fused with the next line's speaker; fused words can never match
        # a glossary term below.
        text = " ".join(
            "%s %s" % (line['speaker'], line['text'])
            for line in chunk['lines'])

        # Store any images
        for i, image in enumerate(chunk['meta'].get("_images", [])):
            # Make the image id
            image_id = "%s:%s" % (log_line_id, i)
            # Push it onto the images list
            self.redis_conn.rpush(
                "log_line:%s:images" % log_line_id, image_id)
            # Store the image data
            self.redis_conn.hmset("image:%s" % image_id, image)

        # Add that logline ID for the people involved
        speakers = set([line['speaker'] for line in chunk['lines']])
        for speaker in speakers:
            self.redis_conn.sadd("speaker:%s" % speaker, log_line_id)

        # Add it to the index for this page
        self.redis_conn.rpush(
            "page:%s:%i" % (self.transcript_name, current_page),
            log_line_id)

        # Add it into the transcript and everything sets
        self.redis_conn.zadd(
            "log_lines:%s" % self.mission_name,
            log_line_id, chunk['timestamp'])
        self.redis_conn.zadd(
            "transcript:%s" % self.transcript_name,
            log_line_id, chunk['timestamp'])

        # Read the new labels into current_labels
        has_labels = False
        if '_labels' in chunk['meta']:
            for label, endpoint in chunk['meta']['_labels'].items():
                if label in current_labels:
                    # Only ever extend an active label; an endpoint of
                    # None leaves the existing expiry untouched.
                    if endpoint is not None:
                        current_labels[label] = max(
                            current_labels[label], endpoint)
                elif endpoint is not None:
                    current_labels[label] = endpoint
                else:
                    # No endpoint: the label applies to this chunk only.
                    self.redis_conn.sadd("label:%s" % label, log_line_id)
                    has_labels = True

        # Expire any old labels. Iterate over a snapshot because entries
        # are deleted from the dict inside the loop.
        for label, endpoint in list(current_labels.items()):
            if endpoint < chunk['timestamp']:
                del current_labels[label]

        # Apply any surviving labels
        for label in current_labels:
            self.redis_conn.sadd("label:%s" % label, log_line_id)
            has_labels = True

        # And add this logline to search index
        if has_labels:
            print("weight = 3 for %s" % log_line_id)
            weight = 3.0  # magic!
        else:
            weight = 1.0
        self.add_to_search_index(
            mission=self.mission_name,
            id=log_line_id,
            lines=chunk['lines'],
            weight=weight,
            timestamp=timestamp,
        )

        # For any mentioned glossary terms, add to them.
        for word in text.split():
            word = word.strip(",;-:'\"").lower()
            if word in glossary_items:
                glossary_item = glossary_items[word]
                self.redis_conn.hincrby(
                    "glossary:%s" % glossary_item.id,
                    "times_mentioned",
                    1,
                )

        # Increment the number of log lines we've done
        current_page_lines += len(chunk['lines'])
def get_chunks(self, offset=0):
    """
    Reads the log lines from the file in order and yields them.

    Lines come from ``self.get_lines(offset)`` and are decoded as UTF-8.
    Each yielded chunk is a dict with the keys:

    * ``timestamp`` - integer parsed from the ``[...]`` header, or the
      result of ``timestamp_to_seconds`` when it is not a plain integer;
    * ``lines`` - list of ``{"speaker": ..., "text": ...}`` dicts;
    * ``meta`` - dict of ``_name: value`` metadata items (JSON-decoded,
      falling back to treating the value as a JSON string);
    * ``offset`` - the running offset minus the length of the header line.

    Blank lines and lines whose first non-blank character is ``#`` are
    ignored. Metadata, speaker and continuation lines that appear before
    the first timestamp header are reported and skipped.

    Raises:
        ValueError: if a timestamp header cannot be parsed.
    """
    current_chunk = None
    reuse_line = None
    lines = iter(self.get_lines(offset))
    while True:
        # If there's a line to reuse, use that, else read a new
        # line from the file.
        if reuse_line:
            line = reuse_line
            reuse_line = None
        else:
            try:
                line = next(lines)
            except StopIteration:
                break
            offset += len(line)
            line = line.decode("utf8")

        stripped = line.strip()
        # If it's a comment or empty line, ignore it.
        if not stripped or stripped[0] == "#":
            continue

        # If it's a timestamp header, make a new chunk object.
        elif line[0] == "[":
            # Read the timestamp
            stamp_text = line[1:].split("]")[0]
            try:
                timestamp = int(stamp_text)
            except ValueError:
                try:
                    timestamp = timestamp_to_seconds(stamp_text)
                except ValueError:
                    print("Error: invalid timestamp %s" % (line[1:], ))
                    raise
            if current_chunk:
                yield current_chunk
            # Start a new log line item
            current_chunk = {
                "timestamp": timestamp,
                "lines": [],
                "meta": {},
                "offset": offset - len(line),
            }

        # If it's metadata, read the entire thing.
        elif line[0] == "_":
            # Meta item
            name, blob = line.split(":", 1)
            # Gather indented continuation lines into the blob; the first
            # non-indented line is handed back to the main loop.
            while True:
                try:
                    line = next(lines)
                except StopIteration:
                    break
                offset += len(line)
                line = line.decode("utf8")
                if not line.strip() or line.strip()[0] == "#":
                    continue
                if line[0] in string.whitespace:
                    blob += line
                else:
                    reuse_line = line
                    break
            # Without a chunk there is nowhere to attach the metadata (and
            # no timestamp for the JSON error message below).
            if current_chunk is None:
                print("Error: Metadata item before first timestamp "
                      "header. Key: %s" % (name,))
                continue
            # Parse the blob
            blob = blob.strip()
            if blob:
                try:
                    data = json.loads(blob)
                except ValueError:
                    try:
                        data = json.loads('"%s"' % blob)
                    except ValueError:
                        print("Error: Invalid json at timestamp %s, key %s" %
                              (seconds_to_timestamp(timestamp), name))
                        continue
                current_chunk['meta'][name.strip()] = data

        # If it's a continuation, append to the current line
        elif line[0] in string.whitespace:
            # Continuation line
            if not current_chunk:
                print("Error: Continuation line before first timestamp header. Line: %s" %
                      (line))
            elif not current_chunk['lines']:
                print("Error: Continuation line before first speaker name.")
            else:
                current_chunk['lines'][-1]['text'] += " " + line.strip()

        # If it's a new line, start a new line. Shock.
        else:
            # New line of speech
            if current_chunk is None:
                print("Error: Speaker line before first timestamp "
                      "header. Line: %s" % (line,))
                continue
            try:
                speaker, text = line.split(":", 1)
            except ValueError:
                print("Error: First speaker line not in Name: Text format: %s." % (line,))
            else:
                line = {
                    "speaker": speaker.strip(),
                    "text": text.strip(),
                }
                current_chunk['lines'].append(line)

    # Finally, if there's one last chunk, yield it.
    if current_chunk:
        yield current_chunk
def build_mission(self, mission):
    """
    Render a line-frequency bar graph and HTML image map for every act of
    ``mission``.

    For each act this counts the log lines of the mission's main transcript
    in consecutive time segments, runs ``convert`` to draw one bar per
    segment on top of ``self.graph_background_file``, runs ``composite`` to
    overlay a marker for each key scene, and stores the image map markup
    and its id in the ``act:<mission>:<act>:stats`` Redis hash.

    Raises:
        OSError: if the output directory does not exist and cannot be
            created.
    """
    print("Building data visualisations for %s..." % mission.name)
    for act in list(Act.Query(self.redis_conn, mission.name)):
        print(' ... %s' % act.title)

        # Split the act into sections, one for each bar on the graph.
        # Never let the step drop to zero: for acts shorter than 92
        # seconds the floor division yields 0 and the loop below would
        # never advance.
        act_duration = act.end - act.start
        section_duration = max(1, act_duration // 92)

        # Count the number of log lines in each segment
        # and find the maximum number of log lines in a segment
        t = act.start
        segment_line_counts = []
        max_line_count = 0
        real_output_path = self.image_output_path % mission.name
        while t < act.end:
            # Load log lines for this segment
            query = LogLine.Query(self.redis_conn, mission.name).transcript(
                mission.main_transcript).range(t, t + section_duration)
            line_count = len(list(query))
            # Store segment stats
            max_line_count = max(line_count, max_line_count)
            segment_line_counts.append(
                (t, t + section_duration, line_count))
            t += section_duration

        # Make sure we have an output directory and work out where to
        # write the image
        try:
            os.makedirs(real_output_path)
        except OSError:
            # An already-existing directory is fine; anything else (for
            # example a permissions problem) must not be hidden.
            if not os.path.isdir(real_output_path):
                raise
        graph_file = 'graph_%s_%s.png' % (mission.name, act.number)
        output_path = '%s/%s' % (real_output_path, graph_file)

        # Add initial draw command
        draw_commands = [
            'convert',
            self.graph_background_file,
            '-fill', self.graph_bar_colour,
        ]

        # Add initial image map tags
        image_map_id = '%s_%s_frequency_graph' % (mission.name, act.number)
        image_map = [
            '<map id="%s" name="%s">' % (image_map_id, image_map_id)
        ]

        # Iterate over the segments and add them to the draw commands and
        # image map
        bar_width = 6
        bar_spacing = 4
        for i, (start, end, count) in enumerate(segment_line_counts):
            # max(..., 1) avoids dividing by zero when no segment has lines
            height = int(round(
                count / float(max(max_line_count, 1))
                * self.max_bar_height))

            top_left_x = i * (bar_width + bar_spacing) + 2
            top_left_y = self.max_bar_height - height + 14
            bottom_right_x = top_left_x + bar_width
            bottom_right_y = self.max_bar_height + 14
            coords = '%s,%s,%s,%s' % (
                top_left_x, top_left_y, bottom_right_x, bottom_right_y)

            draw_commands.append('-draw')
            draw_commands.append('rectangle %s' % coords)

            # Empty segments get a (zero-height) bar but no clickable area
            if height > 0:
                image_map.append('<area shape="rect" coords="%(coords)s" href="%(url)s" alt="%(alt)s">' % {
                    "url": '/%s/%s/#show-selection' % (
                        seconds_to_timestamp(start),
                        seconds_to_timestamp(end)),
                    "alt": '%d lines between %s and %s' % (
                        count,
                        seconds_to_timestamp(start),
                        seconds_to_timestamp(end)),
                    "coords": coords,
                })

        # Output the basic graph image
        draw_commands.append(output_path)
        subprocess.call(draw_commands)

        # Horizontal scale for key scene markers; guard against a
        # zero-length act so the division cannot fail.
        pixels_per_second = (
            self.graph_background_width / float(max(act_duration, 1)))

        # Iterate over the key scenes adding them to the graph and image
        # map
        for i, key_scene in enumerate(act.key_scenes()):
            print('     - %s' % key_scene.title)

            top_left_x = int(
                pixels_per_second * (key_scene.start - act.start)) + 2
            top_left_y = self.max_bar_height + 5 + 14
            bottom_right_x = top_left_x + 20
            bottom_right_y = top_left_y + 20
            marker_image = self.key_scene_marker_files % (i + 1)

            subprocess.call([
                'composite',
                '-geometry', '+%s+%s' % (top_left_x, top_left_y),
                marker_image,
                output_path,
                output_path,
            ])

            image_map.append('<area shape="rect" coords="%(coords)s" href="%(url)s" alt="%(alt)s">' % {
                "url": '/%s/%s/#show-selection' % (
                    seconds_to_timestamp(key_scene.start),
                    seconds_to_timestamp(key_scene.end)),
                "alt": key_scene.title.decode('utf-8'),
                "coords": '%s,%s,%s,%s' % (
                    top_left_x, top_left_y,
                    bottom_right_x, bottom_right_y),
            })

        # Finalise the image map
        image_map.append('</map>')

        self.redis_conn.hmset(
            'act:%s:%s:stats' % (mission.name, act.number),
            {
                "image_map": "\n".join(image_map),
                "image_map_id": image_map_id,
            }
        )
def build_mission(self, mission):
    """
    Render a line-frequency bar graph and HTML image map for every act of
    ``mission``.

    For each act this counts the log lines of the mission's main transcript
    in consecutive time segments, runs ``convert`` to draw two end markers
    and one bar per segment on a transparent canvas of
    ``self.width`` x ``self.height``, runs ``composite`` to overlay a
    marker for each key scene, and stores the image map markup and its id
    in the ``act:<mission>:<act>:stats`` Redis hash.

    Raises:
        OSError: if the output directory does not exist and cannot be
            created.
    """
    print("Building data visualisations for %s..." % mission.name)
    for act in list(Act.Query(self.redis_conn, mission.name)):
        print(' ... %s' % act.title)

        # Split the act into sections, one for each bar on the graph.
        # Never let the step drop to zero: for acts shorter than 92
        # seconds the floor division yields 0 and the loop below would
        # never advance.
        act_duration = act.end - act.start
        section_duration = max(1, act_duration // 92)

        # Count the number of log lines in each segment
        # and find the maximum number of log lines in a segment
        t = act.start
        segment_line_counts = []
        max_line_count = 0
        real_output_path = self.image_output_path % mission.name
        while t < act.end:
            # Load log lines for this segment
            query = LogLine.Query(self.redis_conn, mission.name).transcript(
                mission.main_transcript).range(t, t + section_duration)
            line_count = len(list(query))
            # Store segment stats
            max_line_count = max(line_count, max_line_count)
            segment_line_counts.append(
                (t, t + section_duration, line_count))
            t += section_duration

        # Make sure we have an output directory and work out where to
        # write the image
        try:
            os.makedirs(real_output_path)
        except OSError:
            # An already-existing directory is fine; anything else (for
            # example a permissions problem) must not be hidden.
            if not os.path.isdir(real_output_path):
                raise
        graph_file = 'graph_%s_%s.png' % (mission.name, act.number)
        output_path = '%s/%s' % (real_output_path, graph_file)

        # Add initial draw command. The path primitives need their
        # closing single quote to be well-formed -draw arguments.
        draw_commands = [
            'convert',
            '-size', '%dx%d' % (self.width, self.height),
            'xc:transparent',
            '-fill', self.end_marker_colour,
            '-draw', "path 'M 1,1 L 10,1 L 5,8 L 1,1'",
            '-draw', "path 'M 890,1 L 900,1 L 895,8 L 890,1'",
            '-fill', self.graph_bar_colour,
        ]

        # Add initial image map tags
        image_map_id = '%s_%s_frequency_graph' % (mission.name, act.number)
        image_map = [
            '<map id="%s" name="%s">' % (image_map_id, image_map_id)
        ]

        # Iterate over the segments and add them to the draw commands and
        # image map
        bar_width = 6
        bar_spacing = 4
        for i, (start, end, count) in enumerate(segment_line_counts):
            # max(..., 1) avoids dividing by zero when no segment has lines
            height = int(round(
                count / float(max(max_line_count, 1))
                * self.max_bar_height))

            top_left_x = i * (bar_width + bar_spacing) + 2
            top_left_y = self.max_bar_height - height + 14
            bottom_right_x = top_left_x + bar_width
            bottom_right_y = self.max_bar_height + 14
            coords = '%s,%s,%s,%s' % (
                top_left_x, top_left_y, bottom_right_x, bottom_right_y)

            draw_commands.append('-draw')
            draw_commands.append('rectangle %s' % coords)

            # Empty segments get a (zero-height) bar but no clickable area
            if height > 0:
                image_map.append(
                    '<area shape="rect" coords="%(coords)s" href="%(url)s" alt="%(alt)s">' % {
                        "url": '/%s/%s/#show-selection' % (
                            seconds_to_timestamp(start),
                            seconds_to_timestamp(end)),
                        "alt": '%d lines between %s and %s' % (
                            count,
                            seconds_to_timestamp(start),
                            seconds_to_timestamp(end)),
                        "coords": coords,
                    })

        # Output the basic graph image
        draw_commands.append(output_path)
        subprocess.call(draw_commands)

        # Horizontal scale for key scene markers; guard against a
        # zero-length act so the division cannot fail.
        pixels_per_second = (
            self.graph_background_width / float(max(act_duration, 1)))

        # Iterate over the key scenes adding them to the graph and image
        # map
        for i, key_scene in enumerate(act.key_scenes()):
            print('     - %s' % key_scene.title)

            top_left_x = int(
                pixels_per_second * (key_scene.start - act.start)) + 2
            top_left_y = self.max_bar_height + 5 + 14
            bottom_right_x = top_left_x + 20
            bottom_right_y = top_left_y + 20
            marker_image = self.key_scene_marker_files % (i + 1)

            subprocess.call([
                'composite',
                '-geometry', '+%s+%s' % (top_left_x, top_left_y),
                marker_image,
                output_path,
                output_path,
            ])

            image_map.append(
                '<area shape="rect" coords="%(coords)s" href="%(url)s" alt="%(alt)s">' % {
                    "url": '/%s/%s/#show-selection' % (
                        seconds_to_timestamp(key_scene.start),
                        seconds_to_timestamp(key_scene.end)),
                    "alt": key_scene.title.decode('utf-8'),
                    "coords": '%s,%s,%s,%s' % (
                        top_left_x, top_left_y,
                        bottom_right_x, bottom_right_y),
                })

        # Finalise the image map
        image_map.append('</map>')

        self.redis_conn.hmset(
            'act:%s:%s:stats' % (mission.name, act.number),
            {
                "image_map": "\n".join(image_map),
                "image_map_id": image_map_id,
            })
def get_chunks(self, offset=0):
    """
    Reads the log lines from the file in order and yields them.

    Lines come from ``self.get_lines(offset)`` and are decoded as UTF-8.
    Each yielded chunk is a dict with the keys:

    * ``timestamp`` - integer parsed from the ``[...]`` header, or the
      result of ``timestamp_to_seconds`` when it is not a plain integer;
    * ``lines`` - list of ``{"speaker": ..., "text": ...}`` dicts;
    * ``meta`` - dict of ``_name: value`` metadata items (JSON-decoded,
      falling back to treating the value as a JSON string);
    * ``offset`` - the running offset minus the length of the header line.

    Blank lines and lines whose first non-blank character is ``#`` are
    ignored. Metadata, speaker and continuation lines that appear before
    the first timestamp header are reported and skipped.
    """
    current_chunk = None
    reuse_line = None
    lines = iter(self.get_lines(offset))
    while True:
        # If there's a line to reuse, use that, else read a new
        # line from the file.
        if reuse_line:
            line = reuse_line
            reuse_line = None
        else:
            try:
                line = next(lines)
            except StopIteration:
                break
            offset += len(line)
            line = line.decode("utf8")

        stripped = line.strip()
        # If it's a comment or empty line, ignore it.
        if not stripped or stripped[0] == "#":
            continue

        # If it's a timestamp header, make a new chunk object.
        elif line[0] == "[":
            # Read the timestamp
            stamp_text = line[1:].split("]")[0]
            try:
                timestamp = int(stamp_text)
            except ValueError:
                timestamp = timestamp_to_seconds(stamp_text)
            if current_chunk:
                yield current_chunk
            # Start a new log line item
            current_chunk = {
                "timestamp": timestamp,
                "lines": [],
                "meta": {},
                "offset": offset - len(line),
            }

        # If it's metadata, read the entire thing.
        elif line[0] == "_":
            # Meta item
            name, blob = line.split(":", 1)
            # Gather indented continuation lines into the blob; the first
            # non-indented line is handed back to the main loop.
            while True:
                try:
                    line = next(lines)
                except StopIteration:
                    break
                offset += len(line)
                line = line.decode("utf8")
                if not line.strip() or line.strip()[0] == "#":
                    continue
                if line[0] in string.whitespace:
                    blob += line
                else:
                    reuse_line = line
                    break
            # Without a chunk there is nowhere to attach the metadata (and
            # no timestamp for the JSON error message below).
            if current_chunk is None:
                print("Error: Metadata item before first timestamp "
                      "header. Key: %s" % (name,))
                continue
            # Parse the blob
            blob = blob.strip()
            if blob:
                try:
                    data = json.loads(blob)
                except ValueError:
                    try:
                        data = json.loads('"%s"' % blob)
                    except ValueError:
                        print("Error: Invalid json at timestamp %s, key %s" %
                              (seconds_to_timestamp(timestamp), name))
                        continue
                current_chunk['meta'][name.strip()] = data

        # If it's a continuation, append to the current line
        elif line[0] in string.whitespace:
            # Continuation line
            if not current_chunk:
                print("Error: Continuation line before first timestamp header. Line: %s" %
                      (line))
            elif not current_chunk['lines']:
                print("Error: Continuation line before first speaker name. Timestamp %s" %
                      (seconds_to_timestamp(timestamp)))
            else:
                current_chunk['lines'][-1]['text'] += " " + line.strip()

        # If it's a new line, start a new line. Shock.
        else:
            # New line of speech
            if current_chunk is None:
                print("Error: Speaker line before first timestamp "
                      "header. Line: %s" % (line,))
                continue
            try:
                speaker, text = line.split(":", 1)
            except ValueError:
                print("Error: First speaker line not in Name: Text format: %s. Timestamp %s" %
                      (line, seconds_to_timestamp(timestamp)))
            else:
                line = {
                    "speaker": speaker.strip(),
                    "text": text.strip(),
                }
                current_chunk['lines'].append(line)

    # Finally, if there's one last chunk, yield it.
    if current_chunk:
        yield current_chunk