def __init__(self, transcript_dir, cache_dir, sim_tolerance = -0.4, bag_tolerance = 0.7):
		'''
			Set up matching thresholds, load all transcripts, and restore the
			phrase-match cache from disk.

			Arguments:
				transcript_dir (str): directory of speech transcripts, each formatted as:
						Transcript title
						Transcript timestamp
						Transcript text
				cache_dir (str): directory where cache output is written
				sim_tolerance (float, default=-.4): minimum similarity for a valid
					quote-transcript match
				bag_tolerance (float, default=.7): minimum fraction of a quote's words
					that must appear in a transcript before string alignment is attempted
		'''
		# Matching thresholds used downstream.
		self.sim_tolerance = sim_tolerance
		self.bag_tolerance = bag_tolerance
		self.cache_dir = cache_dir

		# fetch_transcripts yields an ordering plus the transcript text; each
		# ordering entry carries (timestamp, name) at positions 0 and 1.
		order, self.transcript_text = mu.fetch_transcripts(transcript_dir)
		self.transcript_times = [entry[0] for entry in order]
		self.transcript_names = [entry[1] for entry in order]

		# Restore any previously-computed phrase matches.
		self.phrasematch_cache = self.load_cached_dict(self.PHRASEMATCH_CACHENAME)
import article_utils as au 
import match_utils as mu
import cPickle

'''
	groups quotes into families of similar quotes and deduplicates articles.
'''

TRANSCRIPT_DIR = '/NLP/creativity/work/pres_addrs/transcripts' # transcripts
POSTFILTERED_MENTION_OUTPUT = 'output/postfiltered_mentions' # output (pickle) of postfilter_matches.py
POSTPROCESSED_OUTPUT = 'output/postprocessed_mentions' # output file (pickle)

print 'loading stuff'
with open(POSTFILTERED_MENTION_OUTPUT) as f:
	mentions = cPickle.load(f)
torder, ttext = mu.fetch_transcripts(TRANSCRIPT_DIR)

print 'clustering quotes'
quotes = list(set([au.get_alignment(x) for x in mentions]))

transcript_to_quotelist = gq.get_transcript_to_quotelist(quotes)

alignment_to_family_id, family_id_to_alignments = gq.group_all(transcript_to_quotelist, 
			[x[1] for x in torder])
print len(family_id_to_alignments)
print 'deduplicating stuff'
url_to_article = au.simple_dedup(mentions)
families_to_articles = au.group_articles_by_cluster(mentions, url_to_article, alignment_to_family_id)
article_groups = []
for famset, articles in families_to_articles.iteritems():
	article_groups += au.get_article_groups(articles)