# Requires Python 2. DB_PATH, get_sides, document_iterator, print_highlight,
# store_as_csv, and calculate_and_store_cor_matrix are assumed to be defined
# elsewhere in this module.
import sqlite3
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def get_text_passages(search_term, historian_name_last=None, scope=1,
                      side_question=None, side_answer=None, type=None,
                      year_start=1990, year_end=2017, document_type=None):
    con = sqlite3.connect(DB_PATH)
    cur = con.cursor()
    side_question, side_answer = get_sides(side_question, side_answer)
    docs = document_iterator(year_start=year_start, year_end=year_end,
                             side_question=side_question, type=type,
                             search_term=search_term, format='text_passages',
                             historian_name_last=historian_name_last,
                             document_type=document_type)
    doc_list = []
    years = {i: 0 for i in range(year_start, year_end + 1)}
    witnesses = Counter()
    count = 0
    for doc in docs:
        count += 1
        date, text, doc_id, qas_id, last_name, first_name, historian_side = doc
        witnesses[u'{},{}'.format(last_name, first_name)] += 1
        years[int(date[:4])] += 1
        heading = u"Witness: {}, {} ({}). Date: {}. Document ID: {}".format(
            last_name, first_name, historian_side, date, doc_id)
        print_highlight(heading, heading, 'bold')
        # Parameterized query instead of string interpolation, so document
        # ids containing quotes cannot break or inject into the SQL.
        cur.execute('''SELECT qas.text, qas.type FROM qas
                       WHERE qas.document = ? AND qas.id >= ? AND qas.id <= ?;''',
                    (doc_id, qas_id - scope, qas_id + scope))
        rows = cur.fetchall()
        qas = u''
        for row in rows:
            qas += u"Type: {}.\t{}".format(row[1], row[0])
            doc_list.append({
                'witness': u'{}, {}'.format(last_name, first_name),
                'doc_id': doc_id,
                'date': date,
                'year': int(date[:4]),
                'type': row[1],
                'text': row[0]
            })
        print_highlight(qas, search_term)
    print "{} Documents".format(count)
    con.close()
    return doc_list, years, witnesses
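
# Hypothetical usage sketch, not part of the original module: it relies only
# on the documented return values (doc_list, years, witnesses); the search
# term and side/type values are examples.
def _example_get_text_passages():
    # Search for 'addiction' in answers by defendant-side witnesses,
    # including one surrounding question/answer on each side (scope=1).
    doc_list, years, witnesses = get_text_passages(
        'addiction', side_answer='Defendant', type='A', scope=1)
    # years maps each year in range to a hit count; witnesses counts
    # passages per "Last,First" witness name.
    for year in sorted(years):
        if years[year]:
            print "{}: {} passages".format(year, years[year])
    print witnesses.most_common(5)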

def ngrams(year_start=1990, year_end=2016, type='Q', side_question=None,
           side_answer=None, term=None, historian_name_last=None,
           document_type=None):
    side_question, side_answer = get_sides(side_question, side_answer)
    vectorizer = CountVectorizer(max_features=100000)
    docs = document_iterator(type=type, side_question=side_question,
                             format='docs_only')
    vectorizer.fit(docs)
    features = vectorizer.get_feature_names()
    vocabulary = {features[i]: i for i in range(len(features))}
    # One row of token counts per year in [year_start, year_end].
    word_counts = np.zeros(shape=(year_end - year_start + 1, len(vocabulary)),
                           dtype=np.int)
    # Restrict the counting pass to the year range so the row index below
    # stays in bounds.
    docs = document_iterator(type=type, side_question=side_question,
                             year_start=year_start, year_end=year_end,
                             historian_name_last=historian_name_last,
                             document_type=document_type)
    tokenizer = vectorizer.build_tokenizer()
    for doc in docs:
        year = int(doc[0][:4])
        document = doc[1].lower()
        for token in tokenizer(document):
            # max_features caps the vocabulary, so rare tokens may be
            # missing; skip them instead of raising a KeyError.
            if token in vocabulary:
                word_counts[year - year_start, vocabulary[token]] += 1
    totals = np.sum(word_counts, axis=1)
    word_counts = word_counts[:, vocabulary[term]]
    word_frequencies = 1.0 * word_counts / totals
    viz_formatting = {'Plaintiff': 's', 'Defendant': '^'}
    if type == 'A':
        label = '{} in Answers by {} Witnesses.'.format(term, side_answer)
        viz_format = 'b{}'.format(viz_formatting[side_answer])
    elif type == 'Q':
        label = '{} in Questions by {} Lawyers.'.format(term, side_question)
        viz_format = 'r{}'.format(viz_formatting[side_question])
    return {
        'year_start': year_start,
        'year_end': year_end,
        'term': term,
        'word_counts': word_counts,
        'word_frequencies': word_frequencies,
        'label': label,
        'viz_format': viz_format,
        'side_question': side_question,
        'side_answer': side_answer,
        'type': type,
        'historian_name_last': historian_name_last
    }
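
# Hypothetical plotting sketch, not part of the original module: it assumes
# matplotlib is available and that viz_format is a matplotlib format string
# ('bs' = blue squares, 'r^' = red triangles), which matches the
# viz_formatting dict above.
def _example_plot_ngrams():
    import matplotlib.pyplot as plt
    result = ngrams(term='addiction', type='Q', side_question='Defendant')
    years = range(result['year_start'], result['year_end'] + 1)
    plt.plot(years, result['word_frequencies'], result['viz_format'],
             label=result['label'])
    plt.xlabel('Year')
    plt.ylabel('Relative frequency')
    plt.legend()
    plt.show()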

def passages(search_term, historian_last_name=None, side_question=None,
             side_answer=None, year_start=1987, year_end=2017, scope=0,
             type=None):
    '''
    Passages lets you find text passages with many different configuration
    options. All passages get stored as csv files in the csv folder.

    The parameters you can pass are:
    search_term (required)  search term or expression to look for
    historian_last_name     last name of the historian to look for
    side_question           side that poses the question ("Plaintiff" or "Defendant")
    side_answer             side of the witness answering the question ("Plaintiff" or "Defendant")
    type                    questions or answers ("Q" or "A")
    year_start              earliest year to use (default: 1987)
    year_end                final year to use (default: 2017)
    scope                   0: only return the passage that includes the search term (default)
                            1: also return the preceding and succeeding question/answer
                            2: and so forth

    Examples
    # Find all passages mentioning 'various' between 1987 and 2017
    passages('various')

    # Find all passages by Kyriakoudes that mention addiction
    passages('addiction', historian_last_name="Kyriakoudes")

    # Find all answers by Kyriakoudes that mention addiction, including the
    # surrounding questions
    passages('addiction', historian_last_name="Kyriakoudes", scope=1, type='A')

    # Find all questions that mention addiction between 2000 and 2015
    passages('addiction', type='Q', year_start=2000, year_end=2015)

    # Find all questions by defendant lawyers that mention addiction
    passages('addiction', side_question='Defendant', type='Q')
    '''
    if side_question or side_answer:
        side_question, side_answer = get_sides(side_question, side_answer)
    doc_list, years, witnesses = get_text_passages(
        search_term, historian_name_last=historian_last_name,
        side_question=side_question, side_answer=side_answer,
        year_start=year_start, year_end=year_end, scope=scope, type=type)
    store_as_csv(doc_list, years, witnesses, search_term, type, side_answer)

def load_cor_matrix(qa='A', side_question=None, side_answer=None,
                    year_start=1990, year_end=2016, ngram_range=(1, 1)):
    side_question, side_answer = get_sides(side_question, side_answer)
    try:
        # Load a cached correlation matrix if one was stored previously.
        m = np.load('cor_{}_{}_{}_{}.npz'.format(qa, side_answer,
                                                 ngram_range[0],
                                                 ngram_range[1]))
        cor_mat = m['cor_mat']
        features = m['features']
        cooc_mat = m['cooc_mat']
    except IOError:
        # No cache yet: compute the matrices and store them for next time.
        cor_mat, cooc_mat, features = calculate_and_store_cor_matrix(
            qa, side_question, side_answer, year_start, year_end, ngram_range)
    features_lookup = {features[i]: i for i in range(len(features))}
    return cor_mat, cooc_mat, features, features_lookup
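
# Hypothetical usage sketch, not part of the original module: it assumes
# cor_mat is a square term-by-term correlation matrix whose rows and columns
# follow the order of features, which is what the returned features_lookup
# suggests.
def _example_top_correlated_terms():
    cor_mat, cooc_mat, features, features_lookup = load_cor_matrix(
        qa='A', side_answer='Defendant')
    # The ten terms most strongly correlated with 'addiction'.
    idx = features_lookup['addiction']
    for i in np.argsort(cor_mat[idx])[::-1][:10]:
        print features[i], cor_mat[idx, i]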