def __init__(self, transcript_dir, cache_dir, sim_tolerance=-0.4, bag_tolerance=0.7):
    '''
    Store the matching thresholds, load the transcripts from disk and
    restore the phrase-match cache.

    Arguments:
        transcript_dir (str): directory containing speech transcripts.
            Transcripts should be of format:
                Transcript title
                Transcript timestamp
                Transcript text
        cache_dir (str): directory to write cache output
        sim_tolerance (float, default=-.4): minimum similarity of valid
            quote-transcript match
        bag_tolerance (float, default=.7): minimum proportion of words in
            quote that must be present in transcript for string alignment
            process to occur.
    '''
    # Plain configuration first; cache_dir is stored before the cache is
    # loaded at the end of this constructor.
    self.cache_dir = cache_dir
    self.bag_tolerance = bag_tolerance
    self.sim_tolerance = sim_tolerance

    # fetch_transcripts hands back an ordering plus the transcript text.
    ordering, text = mu.fetch_transcripts(transcript_dir)
    self.transcript_text = text

    # Each ordering entry is indexed as (time, name, ...): field 0 feeds
    # transcript_times and field 1 feeds transcript_names.
    times = []
    for entry in ordering:
        times.append(entry[0])
    self.transcript_times = times

    names = []
    for entry in ordering:
        names.append(entry[1])
    self.transcript_names = names

    # Restore previously cached phrase matches under the class-level cache name.
    self.phrasematch_cache = self.load_cached_dict(self.PHRASEMATCH_CACHENAME)
# Groups quotes into families of similar quotes and deduplicates articles.

import cPickle

import article_utils as au
import match_utils as mu

# NOTE(review): `gq` is used below but no import for it is visible in this
# part of the file -- confirm it is brought into scope elsewhere.

TRANSCRIPT_DIR = '/NLP/creativity/work/pres_addrs/transcripts'  # transcripts
POSTFILTERED_MENTION_OUTPUT = 'output/postfiltered_mentions'  # output (pickle) of postfilter_matches.py
POSTPROCESSED_OUTPUT = 'output/postprocessed_mentions'  # output file (pickle)

print('loading stuff')
# Pickles are binary data, so read the file in binary mode.
with open(POSTFILTERED_MENTION_OUTPUT, 'rb') as f:
    mentions = cPickle.load(f)
torder, ttext = mu.fetch_transcripts(TRANSCRIPT_DIR)

print('clustering quotes')
# Collapse the mentions down to their distinct alignments before grouping.
quotes = list(set(au.get_alignment(x) for x in mentions))
transcript_to_quotelist = gq.get_transcript_to_quotelist(quotes)
# group_all is given the second field of every torder entry
# (presumably the transcript name -- TODO confirm against fetch_transcripts).
alignment_to_family_id, family_id_to_alignments = gq.group_all(
    transcript_to_quotelist, [x[1] for x in torder])
print(len(family_id_to_alignments))

print('deduplicating stuff')
url_to_article = au.simple_dedup(mentions)
families_to_articles = au.group_articles_by_cluster(
    mentions, url_to_article, alignment_to_family_id)

# Only the article lists are needed here; the family keys are not used.
article_groups = []
for articles in families_to_articles.itervalues():
    article_groups.extend(au.get_article_groups(articles))