def glossary(request):
    """Render the glossary page for the mission attached to ``request``.

    Every item returned by ``Glossary.Query(...).items()`` for the request's
    mission is collected, ordered by its ``abbr`` attribute and handed to the
    ``glossary/glossary.html`` template under the ``terms`` context key.
    """
    query = Glossary.Query(request.redis_conn, request.mission.name)

    # Materialise the query results first, then order them in place.
    terms = list(query.items())
    terms.sort(key=lambda entry: entry.abbr)

    context = {'terms': terms}
    return render_to_response(
        'glossary/glossary.html',
        context,
        context_instance=RequestContext(request),
    )
def glossary_link(match, request):
    """Build the HTML replacement for a ``[glossary:term]`` style match.

    Parameters:
        match: regex match object. Group 1 is the glossary term; group 2,
            when the pattern defines it, is the text to display
            (``[glossary:term|display]``).
        request: current request, or a falsy value. When truthy, its
            ``redis_conn`` and ``mission.name`` are used to look the term up.

    Returns:
        A unicode HTML fragment. When the term is known and has a
        description, the display text is wrapped in an ``<abbr>`` or ``<i>``
        element whose ``title`` is the (HTML-escaped) description. Unless the
        term is known and has no extended description, the result is further
        wrapped in a link to the glossary page anchored at the term's slug.

    NOTE(review): the display text is inserted as-is, presumably because the
    caller has already made it HTML-safe -- confirm against the caller.
    """
    # Local import keeps this block self-contained; Django is already a
    # dependency of this module (reverse / slugify).
    from django.utils.html import escape

    term = match.group(1)

    # Defaults used when there is no request or the term is unknown: no
    # tooltip, but still link to the glossary page.
    gitem = None
    title = ""
    more_information = True
    tag = 'i'

    # Try to look up the definition
    if request:
        try:
            gitem = Glossary(request.redis_conn, request.mission.name, term)
        except ValueError:
            # Unknown term: keep the defaults and link to a slugified anchor.
            gitem = None
        else:
            title = gitem.description
            more_information = bool(gitem.extended_description)
            if gitem.type == 'abbreviation':
                tag = 'abbr'

    try:
        # full syntax [glossary:term|display]
        display = match.group(2)
    except IndexError:
        # abbreviated syntax [glossary:term] matched by a one-group pattern
        display = None
    if display is None:
        # Also covers patterns where group 2 exists but is optional and did
        # not participate in the match (group() returns None in that case).
        display = term

    if title:
        # The description lands inside a single-quoted attribute, so it must
        # be escaped or an apostrophe in it would terminate the attribute.
        display = u"<%(tag)s class='jargon' title='%(title)s'>%(text)s</%(tag)s>" % {
            "tag": tag,
            "title": escape(title),
            "text": display,
        }

    if not more_information:
        return display

    anchor = gitem.slug if gitem is not None else slugify(term)
    return u"<a href='%s#%s'>%s</a>" % (
        reverse("glossary"),
        anchor,
        display,
    )
def index(self):
    """Index every chunk of this transcript into Redis and the search index.

    Chunks come from ``self.parser.get_chunks()`` and must be in strictly
    increasing timestamp order. For each chunk that falls inside a known act
    this writes, in order:

    * ``log_line:<id>:page`` and the ``log_line:<id>:info`` hash (offset,
      page, act, utc_time, optional transcript_page / lang / note /
      key_scene, and previous / next links to neighbouring log lines),
    * ``log_line:<id>:lines`` and ``log_line:<id>:images`` lists plus one
      ``image:<id>`` hash per image,
    * membership in ``speaker:<name>``, ``page:<transcript>:<n>``,
      ``transcript_page:<transcript>:<page>``, ``log_lines:<mission>``,
      ``transcript:<transcript>`` and ``label:<label>``,
    * a search index entry via ``self.add_to_search_index`` (weight 3.0
      when any label applies, otherwise 1.0),
    * ``times_mentioned`` increments on glossary items whose identifier
      appears as a word in the chunk.

    After all chunks, the last seen transcript page number is stored in the
    ``pages:<mission>`` hash unless an entry for this transcript exists.

    Raises:
        Exception: if a chunk's timestamp is not after the previous one.
    """
    current_labels = {}
    current_transcript_page = None
    current_page = 1
    current_page_lines = 0
    current_lang = None
    last_act = None
    previous_log_line_id = None
    previous_timestamp = None

    launch_time = int(
        self.redis_conn.hget("mission:%s" % self.mission_name, "utc_launch_time"))
    acts = list(Act.Query(self.redis_conn, self.mission_name))
    key_scenes = list(KeyScene.Query(self.redis_conn, self.mission_name))
    # Glossary items keyed by lower-cased identifier for word lookups below.
    glossary_items = dict(
        (item.identifier.lower(), item)
        for item in Glossary.Query(self.redis_conn, self.mission_name)
    )

    for chunk in self.parser.get_chunks():
        timestamp = chunk['timestamp']
        meta = chunk['meta']
        log_line_id = "%s:%i" % (self.transcript_name, timestamp)

        # The explicit None check spells out what the first iteration
        # relies on (nothing to compare against yet).
        if previous_timestamp is not None and timestamp <= previous_timestamp:
            raise Exception("%s should be after %s" % (
                seconds_to_timestamp(timestamp),
                seconds_to_timestamp(previous_timestamp)))

        # See if there's transcript page info, and update it if so
        if meta.get('_page', 0):
            current_transcript_page = int(meta['_page'])
        if meta.get('_lang', None):
            current_lang = meta['_lang']
        if current_transcript_page:
            self.redis_conn.set("log_line:%s:page" % log_line_id,
                                current_transcript_page)

        # Look up the act
        for act in acts:
            if act.includes(timestamp):
                break
        else:
            # No act covers this timestamp: report it and skip the chunk.
            print("Error: No act for timestamp %s" % seconds_to_timestamp(
                timestamp))
            continue

        # If we've filled up the current page, go to a new one. A change of
        # act also forces a new page.
        if current_page_lines >= self.LINES_PER_PAGE or (
                last_act is not None and last_act != act):
            current_page += 1
            current_page_lines = 0
        last_act = act

        # First, create a record with some useful information
        info_key = "log_line:%s:info" % log_line_id
        info_record = {
            "offset": chunk['offset'],
            "page": current_page,
            "act": act.number,
            "utc_time": launch_time + timestamp,
        }
        if current_transcript_page:
            info_record["transcript_page"] = current_transcript_page
        if current_lang:
            info_record["lang"] = current_lang
        # And an editorial note if present
        if '_note' in meta:
            info_record["note"] = meta['_note']

        self.redis_conn.hmset(
            info_key,
            info_record,
        )

        # Look up the key scene (first match wins)
        for key_scene in key_scenes:
            if key_scene.includes(timestamp):
                self.redis_conn.hset(info_key, 'key_scene', key_scene.number)
                break

        # Create the doubly-linked list structure
        if previous_log_line_id:
            self.redis_conn.hset(
                info_key,
                "previous",
                previous_log_line_id,
            )
            self.redis_conn.hset(
                "log_line:%s:info" % previous_log_line_id,
                "next",
                log_line_id,
            )
        previous_log_line_id = log_line_id
        previous_timestamp = timestamp

        # Also store the text
        spoken = []
        for line in chunk['lines']:
            self.redis_conn.rpush(
                "log_line:%s:lines" % log_line_id,
                u"%(speaker)s: %(text)s" % line,
            )
            spoken.append(u"%s %s" % (line['speaker'], line['text']))
        # Join with a space so the last word of one line is not fused with
        # the speaker of the next; otherwise glossary terms at line
        # boundaries are never counted below.
        text = u" ".join(spoken)

        # Store any images
        for i, image in enumerate(meta.get("_images", [])):
            # Make the image id
            image_id = "%s:%s" % (log_line_id, i)
            # Push it onto the images list
            self.redis_conn.rpush(
                "log_line:%s:images" % log_line_id,
                image_id,
            )
            # Store the image data
            self.redis_conn.hmset(
                "image:%s" % image_id,
                image,
            )

        # Add that logline ID for the people involved
        speakers = set(line['speaker'] for line in chunk['lines'])
        for speaker in speakers:
            self.redis_conn.sadd("speaker:%s" % speaker, log_line_id)

        # Add it to the index for this page
        self.redis_conn.rpush(
            "page:%s:%i" % (self.transcript_name, current_page),
            log_line_id)
        # Add it to the index for this transcript page
        # NOTE(review): before any _page has been seen this key ends in
        # "None" -- confirm that is intended.
        self.redis_conn.rpush(
            "transcript_page:%s:%s" % (self.transcript_name,
                                       current_transcript_page),
            log_line_id)

        # Add it into the transcript and everything sets
        self.redis_conn.zadd("log_lines:%s" % self.mission_name,
                             log_line_id, timestamp)
        self.redis_conn.zadd("transcript:%s" % self.transcript_name,
                             log_line_id, timestamp)

        # Read the new labels into current_labels
        has_labels = False
        if '_labels' in meta:
            for label, endpoint in meta['_labels'].items():
                if label in current_labels:
                    # Already running: only ever extend its endpoint. A
                    # label repeated without an endpoint changes nothing.
                    if endpoint is not None:
                        current_labels[label] = max(
                            current_labels[label], endpoint)
                elif endpoint is not None:
                    current_labels[label] = endpoint
                else:
                    # No endpoint: the label applies to this log line only.
                    self.redis_conn.sadd("label:%s" % label, log_line_id)
                    has_labels = True

        # Expire any old labels (iterate over a copy; we delete as we go)
        for label, endpoint in list(current_labels.items()):
            if endpoint < timestamp:
                del current_labels[label]

        # Apply any surviving labels
        for label in current_labels:
            self.redis_conn.sadd("label:%s" % label, log_line_id)
            has_labels = True

        # And add this logline to search index
        if has_labels:
            print("weight = 3 for %s" % log_line_id)
            weight = 3.0  # magic!
        else:
            weight = 1.0
        self.add_to_search_index(
            mission=self.mission_name,
            id=log_line_id,
            chunk=chunk,
            weight=weight,
            timestamp=timestamp,
        )

        # For any mentioned glossary terms, add to them.
        for word in text.split():
            word = word.strip(",;-:'\"").lower()
            if word in glossary_items:
                glossary_item = glossary_items[word]
                self.redis_conn.hincrby(
                    "glossary:%s" % glossary_item.id,
                    "times_mentioned",
                    1,
                )

        # Increment the number of log lines we've done
        current_page_lines += len(chunk['lines'])

    # Record how many original transcript pages there were, unless an
    # entry for this transcript is already present.
    pages_set = self.redis_conn.hexists("pages:%s" % self.mission_name,
                                        self.transcript_name)
    if not pages_set and current_transcript_page:
        print("%s original pages: %d" % (self.transcript_name,
                                         current_transcript_page))
        self.redis_conn.hset("pages:%s" % self.mission_name,
                             self.transcript_name,
                             current_transcript_page)
def _query(self):
    """Return a ``Glossary.Query`` built from this object's request.

    The query is constructed from ``self.request.redis_conn`` and
    ``self.request.mission.name``.
    """
    request = self.request
    return Glossary.Query(request.redis_conn, request.mission.name)