Esempio n. 1
0
def get_text_passages(search_term, historian_name_last=None, scope=1, side_question=None, side_answer=None, type=None,
                      year_start=1990, year_end=2017, document_type=None):
    '''
    Print every passage that matches ``search_term`` and collect it together with its context.

    Hits come from ``document_iterator(..., format='text_passages')``. For each hit a bold heading
    is printed, the question/answer rows whose id lies within ``scope`` of the hit are fetched from
    the ``qas`` table, printed with the search term highlighted, and appended to the result list.

    :param search_term: term forwarded to ``document_iterator`` and highlighted in the printed text
    :param historian_name_last: forwarded to ``document_iterator``
    :param scope: number of ``qas`` ids before and after the hit to include (0 = only the hit itself)
    :param side_question: resolved through ``get_sides`` and forwarded to ``document_iterator``
    :param side_answer: resolved through ``get_sides``; not used any further in this function
    :param type: forwarded to ``document_iterator``
    :param year_start: first year of the per-year counter; forwarded to ``document_iterator``
    :param year_end: last year (inclusive) of the per-year counter; forwarded to ``document_iterator``
    :param document_type: forwarded to ``document_iterator``
    :return: tuple ``(doc_list, years, witnesses)`` where ``doc_list`` holds one dict per fetched
             ``qas`` row (keys: witness, doc_id, date, year, type, text), ``years`` maps every year
             in ``[year_start, year_end]`` to its number of hits and ``witnesses`` is a ``Counter``
             of hits per ``"last,first"`` name
    :raises KeyError: if a hit is dated outside ``[year_start, year_end]``
    '''

    con = sqlite3.connect(DB_PATH)
    try:
        cur = con.cursor()

        side_question, side_answer = get_sides(side_question, side_answer)

        docs = document_iterator(year_start=year_start, year_end=year_end, side_question=side_question, type=type,
                                 search_term=search_term, format='text_passages',
                                 historian_name_last=historian_name_last, document_type=document_type)

        doc_list = []
        years = {year: 0 for year in range(year_start, year_end + 1)}
        witnesses = Counter()
        count = 0
        for doc in docs:
            count += 1
            date, text, doc_id, qas_id, last_name, first_name, historian_side = doc
            year = int(date[:4])

            # NOTE: the counter key has no space after the comma, unlike the 'witness' field below.
            # Both spellings are kept as they were because downstream code may rely on them.
            witnesses[u'{},{}'.format(last_name, first_name)] += 1
            years[year] += 1

            heading = u"Witness: {}, {} ({}). Date: {}. Document ID: {}".format(last_name, first_name,
                                                                                historian_side, date, doc_id)
            print_highlight(heading, heading, 'bold')

            # Bound parameters instead of string formatting: no quoting problems, and the value can
            # no longer be mistaken for an identifier (SQLite treats "..." as a column name first).
            cur.execute('''SELECT qas.text, qas.type FROM qas
                           WHERE qas.document = ? AND qas.id >= ? AND qas.id <= ?;''',
                        (doc_id, qas_id - scope, qas_id + scope))

            witness = u'{}, {}'.format(last_name, first_name)
            printed_parts = []
            for qa_text, qa_type in cur.fetchall():
                printed_parts.append(u"Type: {}.\t{}".format(qa_type, qa_text))
                doc_list.append({
                    'witness': witness,
                    'doc_id': doc_id,
                    'date': date,
                    'year': year,
                    'type': qa_type,
                    'text': qa_text
                })

            print_highlight(u''.join(printed_parts), search_term)
    finally:
        # Release the database handle even when iteration or printing fails.
        con.close()

    # Single-argument call form prints identically on Python 2 and Python 3.
    print("{} Documents".format(count))

    return doc_list, years, witnesses
Esempio n. 2
0
def ngrams(year_start=1990, year_end=2016, type = 'Q', side_question=None, side_answer=None, term=None,
           historian_name_last=None, document_type=None):
    '''
    Count how often ``term`` is used per year and return the counts with plotting metadata.

    The vocabulary (at most 100000 features) is fitted on all documents of the given ``type`` and
    ``side_question``. The counts are then taken from the documents additionally restricted by
    ``historian_name_last`` and ``document_type``.

    :param year_start: first year of the result rows
    :param year_end: last year (inclusive) of the result rows
    :param type: 'A' (answers) or 'Q' (questions)
    :param side_question: resolved through ``get_sides``; used to select documents
    :param side_answer: resolved through ``get_sides``; used for the label when ``type`` is 'A'
    :param term: vocabulary term to report; compared against lower-cased tokens
    :param historian_name_last: forwarded to ``document_iterator`` for the counting pass
    :param document_type: forwarded to ``document_iterator`` for the counting pass
    :return: dict with the request parameters plus ``word_counts`` (count of ``term`` per year),
             ``word_frequencies`` (count divided by all counted tokens of that year; the division
             is by zero for years without any counted token), ``label`` and ``viz_format``
    :raises ValueError: if ``type`` is neither 'A' nor 'Q'
    :raises KeyError: if ``term`` is not in the fitted vocabulary (this includes ``term=None``)
    '''

    # Fail before the expensive fitting pass; previously an unsupported type only surfaced at the
    # very end as an unbound local variable.
    if type not in ('A', 'Q'):
        raise ValueError("type must be 'A' or 'Q', got {!r}".format(type))

    side_question, side_answer = get_sides(side_question, side_answer)

    vectorizer = CountVectorizer(max_features=100000)
    vectorizer.fit(document_iterator(type=type, side_question=side_question, format='docs_only'))

    # vocabulary_ is the fitted term -> column index mapping, i.e. exactly what enumerating the
    # feature names would give, and it exists in every scikit-learn release.
    vocabulary = vectorizer.vocabulary_

    n_years = year_end - year_start + 1
    word_counts = np.zeros(shape=(n_years, len(vocabulary)), dtype=int)

    docs = document_iterator(type=type, side_question=side_question, historian_name_last=historian_name_last,
                             document_type=document_type)
    tokenizer = vectorizer.build_tokenizer()
    for doc in docs:
        year_idx = int(doc[0][:4]) - year_start
        # Skip documents outside the requested range: a negative index would silently be counted
        # for a different year and a too large one would raise IndexError.
        if not 0 <= year_idx < n_years:
            continue
        for token in tokenizer(doc[1].lower()):
            # Tokens cut off by max_features are not part of the vocabulary.
            token_idx = vocabulary.get(token)
            if token_idx is not None:
                word_counts[year_idx, token_idx] += 1

    totals = np.sum(word_counts, axis=1)

    word_counts = word_counts[:, vocabulary[term]]
    word_frequencies = 1.0 * word_counts / totals

    viz_formatting = {'Plaintiff': 's',
                      'Defendant': '^'}

    if type == 'A':
        label = '{} in Answers by {} Witnesses.'.format( term, side_answer)
        viz_format = 'b{}'.format(viz_formatting[side_answer])
    else:
        label = '{} in Questions by {} Lawyers.'.format(term, side_question)
        viz_format = 'r{}'.format(viz_formatting[side_question])

    return {
        'year_start': year_start,
        'year_end': year_end,
        'term': term,
        'word_counts': word_counts,
        'word_frequencies': word_frequencies,
        'label': label,
        'viz_format': viz_format,
        'side_question': side_question,
        'side_answer': side_answer,
        'type': type,
        'historian_name_last': historian_name_last
    }
Esempio n. 3
0
def get_text_passages(search_term, historian_name_last=None, scope=1, side_question=None, side_answer=None, type=None,
                      year_start=1990, year_end=2017):
    '''
    Print every passage that matches ``search_term`` and collect it together with its context.

    Hits come from ``document_iterator(..., format='text_passages')``. For each hit a bold heading
    is printed, the question/answer rows whose id lies within ``scope`` of the hit are fetched from
    the ``qas`` table, printed with the search term highlighted, and appended to the result list.

    :param search_term: term forwarded to ``document_iterator`` and highlighted in the printed text
    :param historian_name_last: forwarded to ``document_iterator``
    :param scope: number of ``qas`` ids before and after the hit to include (0 = only the hit itself)
    :param side_question: resolved through ``get_sides`` and forwarded to ``document_iterator``
    :param side_answer: resolved through ``get_sides``; not used any further in this function
    :param type: forwarded to ``document_iterator``
    :param year_start: first year of the per-year counter; forwarded to ``document_iterator``
    :param year_end: last year (inclusive) of the per-year counter; forwarded to ``document_iterator``
    :return: tuple ``(doc_list, years, witnesses)`` where ``doc_list`` holds one dict per fetched
             ``qas`` row (keys: witness, doc_id, date, year, type, text), ``years`` maps every year
             in ``[year_start, year_end]`` to its number of hits and ``witnesses`` is a ``Counter``
             of hits per ``"last,first"`` name
    :raises KeyError: if a hit is dated outside ``[year_start, year_end]``
    '''

    con = sqlite3.connect(DB_PATH)
    try:
        cur = con.cursor()

        side_question, side_answer = get_sides(side_question, side_answer)

        docs = document_iterator(year_start=year_start, year_end=year_end, side_question=side_question, type=type,
                                 search_term=search_term, format='text_passages',
                                 historian_name_last=historian_name_last)

        doc_list = []
        years = {year: 0 for year in range(year_start, year_end + 1)}
        witnesses = Counter()
        count = 0
        for doc in docs:
            count += 1
            date, text, doc_id, qas_id, last_name, first_name, historian_side = doc
            year = int(date[:4])

            # NOTE: the counter key has no space after the comma, unlike the 'witness' field below.
            # Both spellings are kept as they were because downstream code may rely on them.
            witnesses[u'{},{}'.format(last_name, first_name)] += 1
            years[year] += 1

            heading = u"Witness: {}, {} ({}). Date: {}. Document ID: {}".format(last_name, first_name,
                                                                                historian_side, date, doc_id)
            print_highlight(heading, heading, 'bold')

            # Bound parameters instead of string formatting: no quoting problems, and the value can
            # no longer be mistaken for an identifier (SQLite treats "..." as a column name first).
            cur.execute('''SELECT qas.text, qas.type FROM qas
                           WHERE qas.document = ? AND qas.id >= ? AND qas.id <= ?;''',
                        (doc_id, qas_id - scope, qas_id + scope))

            witness = u'{}, {}'.format(last_name, first_name)
            printed_parts = []
            for qa_text, qa_type in cur.fetchall():
                printed_parts.append(u"Type: {}.\t{}".format(qa_type, qa_text))
                doc_list.append({
                    'witness': witness,
                    'doc_id': doc_id,
                    'date': date,
                    'year': year,
                    'type': qa_type,
                    'text': qa_text
                })

            print_highlight(u''.join(printed_parts), search_term)
    finally:
        # Release the database handle even when iteration or printing fails.
        con.close()

    # Single-argument call form prints identically on Python 2 and Python 3.
    print("{} Documents".format(count))

    return doc_list, years, witnesses
Esempio n. 4
0
def ngrams(year_start=1990, year_end=2016, type = 'Q', side_question=None, side_answer=None, term=None):
    '''
    Count how often ``term`` is used per year and return the counts with plotting metadata.

    The vocabulary (at most 100000 features) is fitted on all documents of the given ``type`` and
    ``side_question``; the same selection of documents is then tokenized and counted per year.

    :param year_start: first year of the result rows
    :param year_end: last year (inclusive) of the result rows
    :param type: 'A' (answers) or 'Q' (questions)
    :param side_question: resolved through ``get_sides``; used to select documents
    :param side_answer: resolved through ``get_sides``; used for the label when ``type`` is 'A'
    :param term: vocabulary term to report; compared against lower-cased tokens
    :return: dict with the request parameters plus ``word_counts`` (count of ``term`` per year),
             ``word_frequencies`` (count divided by all counted tokens of that year; the division
             is by zero for years without any counted token), ``label`` and ``viz_format``
    :raises ValueError: if ``type`` is neither 'A' nor 'Q'
    :raises KeyError: if ``term`` is not in the fitted vocabulary (this includes ``term=None``)
    '''

    # Fail before the expensive fitting pass; previously an unsupported type only surfaced at the
    # very end as an unbound local variable.
    if type not in ('A', 'Q'):
        raise ValueError("type must be 'A' or 'Q', got {!r}".format(type))

    side_question, side_answer = get_sides(side_question, side_answer)

    vectorizer = CountVectorizer(max_features=100000)
    vectorizer.fit(document_iterator(type=type, side_question=side_question, format='docs_only'))

    # vocabulary_ is the fitted term -> column index mapping, i.e. exactly what enumerating the
    # feature names would give, and it exists in every scikit-learn release.
    vocabulary = vectorizer.vocabulary_

    n_years = year_end - year_start + 1
    word_counts = np.zeros(shape=(n_years, len(vocabulary)), dtype=int)

    docs = document_iterator(type=type, side_question=side_question)
    tokenizer = vectorizer.build_tokenizer()
    for doc in docs:
        year_idx = int(doc[0][:4]) - year_start
        # Skip documents outside the requested range: a negative index would silently be counted
        # for a different year and a too large one would raise IndexError.
        if not 0 <= year_idx < n_years:
            continue
        for token in tokenizer(doc[1].lower()):
            # Tokens cut off by max_features are not part of the vocabulary.
            token_idx = vocabulary.get(token)
            if token_idx is not None:
                word_counts[year_idx, token_idx] += 1

    totals = np.sum(word_counts, axis=1)

    word_counts = word_counts[:, vocabulary[term]]
    word_frequencies = 1.0 * word_counts / totals

    viz_formatting = {'Plaintiff': 's',
                      'Defendant': '^'}

    if type == 'A':
        label = '{} in Answers by {} Witnesses.'.format( term, side_answer)
        viz_format = 'b{}'.format(viz_formatting[side_answer])
    else:
        label = '{} in Questions by {} Lawyers.'.format(term, side_question)
        viz_format = 'r{}'.format(viz_formatting[side_question])

    return {
        'year_start': year_start,
        'year_end': year_end,
        'term': term,
        'word_counts': word_counts,
        'word_frequencies': word_frequencies,
        'label': label,
        'viz_format': viz_format,
        'side_question': side_question,
        'side_answer': side_answer,
        'type': type
    }
Esempio n. 5
0
def passages(search_term, historian_last_name=None, side_question=None, side_answer=None, year_start=1987,
             year_end=2017, scope=0, type=None):
    '''
    Passages lets you find text passages with a lot of different configuration options.
    All passages get stored as csv files, found in the csv folder

    The parameters you can pass are:

    search_term             (required) search term or expression to look for
    historian_last_name     last name of the historian to look for
    side_question           side that poses the question ("Plaintiff" or "Defendant")
    side_answer             side of the witness who answers the question ("Plaintiff" or "Defendant")
    type                    questions or answers ("Q" or "A")
    year_start              earliest year to use (default: 1987)
    year_end                final year to use (default: 2017)
    scope                       0: only return passage that includes the search term. (default)
                                1: return the passage that includes the search term as well as the preceding
                                    and succeeding question/answer
                                2: and so forth


    Examples

    # Find all passages mentioning various between 1987 and 2017
    passages('various')

    # Find all passages by Kyriakoudes that mention addiction
    passages('addiction', historian_last_name="Kyriakoudes")

    # Find all answers by Kyriakoudes that mention addiction, include the surrounding questions
    passages('addiction', historian_last_name="Kyriakoudes", scope=1, type='A')

    # Find all questions that mention addiction between 2000 and 2015
    passages('addiction', type='Q', year_start=2000, year_end=2015)

    # Find all questions by defendant lawyers that mention addiction
    passages('addiction', side_question='Defendant', type='Q')


    :return: None; the results are handed to store_as_csv
    '''

    if side_question or side_answer:
        side_question, side_answer = get_sides(side_question, side_answer)

    # `type` has to reach the search itself; it used to be passed to store_as_csv only, so the
    # documented question/answer filter never restricted the passages that were found.
    doc_list, years, witnesses = get_text_passages(search_term, historian_name_last=historian_last_name,
                                                    side_question=side_question, side_answer=side_answer,
                                                    type=type, year_start=year_start, year_end=year_end,
                                                    scope=scope)

    store_as_csv(doc_list, years, witnesses, search_term, type, side_answer)
Esempio n. 6
0
def load_cor_matrix(qa='A',
                    side_question=None,
                    side_answer=None,
                    year_start=1990,
                    year_end=2016,
                    ngram_range=(1, 1)):
    '''
    Load the cached correlation matrix, or calculate and store it when no cache file can be read.

    The cache file is ``cor_<qa>_<side_answer>_<ngram_range[0]>_<ngram_range[1]>.npz`` in the
    current working directory.

    NOTE: the file name depends neither on ``side_question`` nor on ``year_start``/``year_end``,
    so an existing cache file is returned even when it was produced with different values for
    those parameters.

    :param qa: part of the cache file name; forwarded to ``calculate_and_store_cor_matrix``
    :param side_question: resolved through ``get_sides``; only used when the matrix is calculated
    :param side_answer: resolved through ``get_sides``; part of the cache file name
    :param year_start: only used when the matrix is calculated
    :param year_end: only used when the matrix is calculated
    :param ngram_range: pair whose two entries are part of the cache file name
    :return: tuple ``(cor_mat, cooc_mat, features, features_lookup)`` where ``features_lookup``
             maps every entry of ``features`` to its position
    '''

    side_question, side_answer = get_sides(side_question, side_answer)

    cache_path = 'cor_{}_{}_{}_{}.npz'.format(qa, side_answer,
                                              ngram_range[0],
                                              ngram_range[1])
    try:
        # The archive keeps its file open until it is closed, so read the three arrays inside
        # the context manager and let it release the handle.
        with np.load(cache_path) as archive:
            cor_mat = archive['cor_mat']
            features = archive['features']
            cooc_mat = archive['cooc_mat']

    except IOError:
        cor_mat, cooc_mat, features = calculate_and_store_cor_matrix(
            qa, side_question, side_answer, year_start, year_end, ngram_range)

    features_lookup = {feature: position for position, feature in enumerate(features)}

    return cor_mat, cooc_mat, features, features_lookup
Esempio n. 7
0
def passages(search_term,
             historian_last_name=None,
             side_question=None,
             side_answer=None,
             year_start=1987,
             year_end=2017,
             scope=0,
             type=None):
    '''
    Passages lets you find text passages with a lot of different configuration options.
    All passages get stored as csv files, found in the csv folder

    The parameters you can pass are:

    search_term             (required) search term or expression to look for
    historian_last_name     last name of the historian to look for
    side_question           side that poses the question ("Plaintiff" or "Defendant")
    side_answer             side of the witness who answers the question ("Plaintiff" or "Defendant")
    type                    questions or answers ("Q" or "A")
    year_start              earliest year to use (default: 1987)
    year_end                final year to use (default: 2017)
    scope                       0: only return passage that includes the search term. (default)
                                1: return the passage that includes the search term as well as the preceding
                                    and succeeding question/answer
                                2: and so forth


    Examples

    # Find all passages mentioning various between 1987 and 2017
    passages('various')

    # Find all passages by Kyriakoudes that mention addiction
    passages('addiction', historian_last_name="Kyriakoudes")

    # Find all answers by Kyriakoudes that mention addiction, include the surrounding questions
    passages('addiction', historian_last_name="Kyriakoudes", scope=1, type='A')

    # Find all questions that mention addiction between 2000 and 2015
    passages('addiction', type='Q', year_start=2000, year_end=2015)

    # Find all questions by defendant lawyers that mention addiction
    passages('addiction', side_question='Defendant', type='Q')


    :return: None; the results are handed to store_as_csv
    '''

    if side_question or side_answer:
        side_question, side_answer = get_sides(side_question, side_answer)

    # `type` has to reach the search itself; it used to be passed to store_as_csv only, so the
    # documented question/answer filter never restricted the passages that were found.
    doc_list, years, witnesses = get_text_passages(
        search_term,
        historian_name_last=historian_last_name,
        side_question=side_question,
        side_answer=side_answer,
        type=type,
        year_start=year_start,
        year_end=year_end,
        scope=scope)

    store_as_csv(doc_list, years, witnesses, search_term, type, side_answer)