def get_bills(congress_session_min,
              congress_session_max,
              num_bills=100,
              chamber='both',
              offset=0):
        """Fetch bills for a range of congressional sessions and pickle them.

        The requested range is clamped to [105, CURRENT_CONGRESS] before any
        request is made.  Each session's bills are saved separately under
        'Bill Set-<session>-3'.

        Args:
            congress_session_min: First session to fetch (raised to 105 if
                lower).
            congress_session_max: Last session to fetch, inclusive (lowered to
                PropublicaScraper.CURRENT_CONGRESS if higher).
            num_bills: Forwarded to PropublicaScraper.get_session_bills.
            chamber: Forwarded to PropublicaScraper.get_session_bills.
            offset: Forwarded to PropublicaScraper.get_session_bills.
        """
        abs_min_bill_congress = 105

        # Clamp both ends of the requested range to the supported window.
        first_session = max(congress_session_min, abs_min_bill_congress)
        last_session = min(congress_session_max,
                           PropublicaScraper.CURRENT_CONGRESS)

        for session in range(first_session, last_session + 1):
            fetched = PropublicaScraper.get_session_bills(
                session=session,
                chamber=chamber,
                num_bills=num_bills,
                offset=offset)

            session_label = str(session)
            target_name = 'Bill Set-' + session_label + '-3'
            print('Done with congressional session: ' + session_label)
            print('Saving to ' + target_name + '...')
            Pickler.save_obj(fetched, target_name)

        print('Done!')
Exemple #2
0
    def reload_bill_json(session, offset):
        """Re-index the bills in 'z_tmp_json.json' by bill id and pickle them.

        Reads the list at data['results'][0]['summarization'], keys every
        entry by its 'bill_id', and saves the mapping as '<session>-<offset>'.
        """
        with open('z_tmp_json.json') as json_file:
            payload = json.load(json_file)
            bills_by_id = {
                entry['bill_id']: entry
                for entry in payload['results'][0]['summarization']
            }

            target_name = str(session) + '-' + str(offset)
            Pickler.save_obj(bills_by_id, target_name)
Exemple #3
0
    def merger(session, merge_set):
        """Merge partial bill pickles for one session into a complete set.

        Starts from 'Bill Set-<session>-3', overlays every
        '<session>-<subset>' pickle named in ``merge_set`` (later subsets win
        on duplicate keys), and saves the result as
        'Bill Set-<session>-Complete'.
        """
        session_label = str(session)
        merged = Pickler.load_obj('Bill Set-' + session_label + '-3')

        for subset in merge_set:
            print(subset)
            partial = Pickler.load_obj(session_label + '-' + subset)
            for bill_key in partial:
                merged[bill_key] = partial[bill_key]

        Pickler.save_obj(merged, 'Bill Set-' + session_label + '-Complete')
Exemple #4
0
 def __init__(self, pickle_file):
     """Create the cache/SQL helpers and remember which pickle to load.

     ``pickle_file`` equal to 'House Members.pkl' selects the House; any
     other value selects the Senate.
     """
     self.pickle_file = pickle_file
     self.local_dir = path.dirname(path.abspath(__file__))

     # Helper objects, created in the same order as before.
     self.pickler = Pickler()
     self.congress_cache = CongressCache()
     self.president_cache = PresidentCache()
     self.house_member_cache = HouseMemberCache()
     self.senate_member_cache = SenateMemberCache()
     self.bill_cache = BillCache()
     self.sqlEx = SqlExecutor()

     self.congressional_body = (
         'House' if self.pickle_file == 'House Members.pkl' else 'Senate')
Exemple #5
0
    def get_list(self, offline=True):
        """Return the unverified article list, from disk or from NewsAPI.

        Args:
            offline: When true, replace ``self.unverified`` with the pickled
                'test_list' and return it.  Otherwise request result pages
                1-99 for cnn.com, append each article's title and url to
                ``self.unverified``, pickle the list as 'test_list', and
                return it.
        """
        if offline:
            self.unverified = Pickler.load_obj('test_list')
            return self.unverified

        # NOTE(review): the API key is hard-coded in source and the request
        # has no timeout -- consider moving the key to configuration.
        auth_headers = {'Authorization': '127ae28c3362490c94e16d337a103f70'}
        page_url = ('https://newsapi.org/v2/everything'
                    '?domains=cnn.com&language=en&page=')

        for page in range(1, 100):
            print(page)
            payload = requests.get(page_url + str(page),
                                   headers=auth_headers).json()

            for item in payload['articles']:
                entry = {'title': item['title'], 'url': item['url']}
                self.unverified.append(entry)

        Pickler.save_obj(self.unverified, 'test_list')
        return self.unverified
Exemple #6
0
import time
from tool.Pickler import Pickler
from db.summarization import ExtractiveSummarizer
from nltk.tokenize import sent_tokenize

# Demo script: summarize one pickled article and report the size reduction
# together with the total run time.
start_time = time.time()

articles = Pickler.load_obj('test-articles')
article = articles[4]

summ = ExtractiveSummarizer(article)

# Both summaries are computed; only the first-n summary is printed below.
short = summ.get_first_n_summary(shorten_to_len=4)
short_2 = summ.get_similarity_summary(shorten_to_len=4)

# Length of the shortened summary relative to the stored one, as a percentage.
saved_pct = (len(short) / len(article['summary'])) * 100

print('',
      article['title'],
      short,
      '',
      'Compressed to {0:0.2f}% of original'.format(saved_pct),
      '',
      sep='\n')
# Kept as a separate call so the elapsed time is read after the output above.
print('Program executed in: {0:.2f}s'.format(time.time() - start_time))
Exemple #7
0
 def write_out_list(self):
     """Run clear_duplicates(), then pickle ``self.verified``.

     The list is saved under the name 'verified_training_set'.
     """
     self.clear_duplicates()
     Pickler.save_obj(self.verified, 'verified_training_set')
 def get_members_of_congress(house, file_cache, min_session, max_session):
     """Fetch members via PropublicaScraper, pickle them, and return them.

     Args:
         house: Forwarded to PropublicaScraper.get_members_of_congress.
         file_cache: Name under which the fetched members are pickled.
         min_session: Forwarded to the scraper.
         max_session: Forwarded to the scraper.

     Returns:
         Whatever PropublicaScraper.get_members_of_congress returned.
     """
     fetched_members = PropublicaScraper.get_members_of_congress(
         house, min_session, max_session)
     Pickler.save_obj(fetched_members, file_cache)
     return fetched_members
Exemple #9
0
class PickleToDB:
    """Copy pickled and JSON congressional data into the cache layer.

    Each ``populate_*`` method reads one data source (a pickle under
    ``obj/complete`` or ``presidents.json``) and stores its rows through the
    matching cache object created in ``__init__``.
    """

    # Member fields handed positionally to store_house_member and
    # store_senate_member.  The computed date of birth is inserted between
    # the identity fields and the detail fields; order must not change.
    _MEMBER_IDENTITY_FIELDS = ('id', 'first_name', 'middle_name', 'last_name')
    _MEMBER_DETAIL_FIELDS = (
        'gender', 'party', 'leadership_role', 'twitter_account',
        'facebook_account', 'youtube_account', 'cspan_id', 'icpsr_id',
        'crp_id', 'fec_candidate_id', 'in_office', 'seniority', 'total_votes',
        'missed_votes', 'total_present', 'office', 'phone', 'fax', 'state',
    )
    _HOUSE_ONLY_FIELDS = ('district', 'at_large', 'missed_votes_pct',
                          'votes_with_party_pct')
    _SENATE_ONLY_FIELDS = ('senate_class', 'state_rank')

    def __init__(self, pickle_file):
        """Create the cache/SQL helpers and remember which pickle to load.

        Args:
            pickle_file: File name of the pickle inside ``obj/complete``.
                'House Members.pkl' selects the House; any other value
                selects the Senate.
        """
        self.pickler = Pickler()
        self.congress_cache = CongressCache()
        self.president_cache = PresidentCache()
        self.house_member_cache = HouseMemberCache()
        self.senate_member_cache = SenateMemberCache()
        self.bill_cache = BillCache()
        self.sqlEx = SqlExecutor()
        self.pickle_file = pickle_file
        self.local_dir = path.dirname(path.abspath(__file__))
        if self.pickle_file == 'House Members.pkl':
            self.congressional_body = 'House'
        else:
            self.congressional_body = 'Senate'

    @staticmethod
    def _to_ordinal(date_string):
        """Return the proleptic Gregorian ordinal of a 'YYYY-MM-DD' string."""
        return dt.datetime.strptime(date_string, '%Y-%m-%d').toordinal()

    def _load_pickle(self):
        """Load ``self.pickle_file`` from the ``obj/complete`` directory."""
        object_path = path.join(self.local_dir, '..', '..', 'obj', 'complete',
                                self.pickle_file)
        return self.pickler.load_obj(object_path)

    def populate_president_cache(self):
        """Store every president from ``presidents.json`` not yet cached.

        Dates are stored as ordinals; a null 'left_office' is stored as None.
        """
        object_path = path.join(self.local_dir, '..', '..', 'data',
                                'json_files', 'presidents.json')
        with open(object_path) as json_file:
            presidents = json.load(json_file)

        for pres in presidents:
            if self.president_cache.get_president_by_num(
                    pres['number']) is not None:
                continue

            left_office = pres['left_office']
            left_office_date = (self._to_ordinal(left_office)
                                if left_office is not None else None)
            self.president_cache.store_president(
                pres['number'], pres['president'], pres['birth_year'],
                pres['death_year'], self._to_ordinal(pres['took_office']),
                left_office_date, pres['party'])

    def populate_congressional_sessions(self):
        """Store start/end ordinals for every Congress from 1789 to today.

        Sessions 1-72 run March 4 to March 3 two years later.  Session 73 is
        the transition session: it starts March 4 and ends January 3 two
        years later.  Later sessions end on January 3.
        """
        session_number = 1
        year = 1789
        current_year = dt.datetime.today().year
        while year <= current_year:
            if self.congress_cache.get_congress_session(
                    session_number) is None:
                if session_number < 73:
                    start_suffix, end_suffix = '-03-04', '-03-03'
                elif session_number == 73:
                    # The 73rd Congress ended on 1935-01-03, i.e. year + 2;
                    # the previous year + 3 produced 1936.
                    start_suffix, end_suffix = '-03-04', '-01-03'
                else:
                    # NOTE(review): start is January 2 although later
                    # Congresses convene on January 3 -- kept as-is; confirm
                    # whether the one-day offset is intentional.
                    start_suffix, end_suffix = '-01-02', '-01-03'
                self.congress_cache.store_congress_session(
                    session_number,
                    self._to_ordinal(str(year) + start_suffix),
                    self._to_ordinal(str(year + 2) + end_suffix))
            year = year + 2
            session_number = session_number + 1

    def populate_president_to_session(self):
        """Link each president to every congressional session they overlap.

        Walks presidents and sessions in parallel, advancing whichever one
        ends first, and stops when either cache runs out of rows.
        """
        president_number = 1
        session_number = 1
        while True:
            president_data = self.president_cache.get_president_by_num(
                president_number)
            session_data = self.congress_cache.get_congress_session(
                session_number)
            if session_data is None or president_data is None:
                break
            president_start_date = president_data['TOOK_OFFICE']
            president_end_date = president_data['LEFT_OFFICE']
            session_start_date = session_data['START_DATE']
            session_end_date = session_data['END_DATE']
            if president_end_date is None:
                # Sitting president: treat today as the end of the term.
                president_end_date = dt.datetime.today().toordinal()

            if (president_start_date <= session_start_date
                    <= president_end_date <= session_end_date):
                # The term ends inside this session: next president.
                advance_president = True
            elif session_end_date >= president_start_date:
                # The session ends inside this term: next session.
                advance_president = False
            else:
                # Neither counter can advance here; looping again would
                # repeat this state forever, so stop instead.
                print("Wait this should work!")
                break

            if self.president_cache.check_president_session_cache(
                    president_number, session_number) is None:
                self.president_cache.store_president_session(
                    session_number, president_number)

            if advance_president:
                president_number = president_number + 1
            else:
                session_number = session_number + 1

    def populate_members_cache(self):
        """Store every pickled member not yet present in the body's cache.

        A missing or empty 'date_of_birth' is stored as ordinal 0.
        """
        members = self._load_pickle()

        if self.congressional_body == 'House':
            lookup_member = self.house_member_cache.get_house_member_by_id
            store_member = self.house_member_cache.store_house_member
            body_fields = self._HOUSE_ONLY_FIELDS
        else:
            lookup_member = self.senate_member_cache.get_senate_member_by_id
            store_member = self.senate_member_cache.store_senate_member
            body_fields = self._SENATE_ONLY_FIELDS

        for key in members:
            member = members[key]

            date_of_birth_string = member.get('date_of_birth')
            # None (key absent) and '' both mean "unknown".
            if date_of_birth_string:
                date_of_birth = self._to_ordinal(date_of_birth_string)
            else:
                date_of_birth = 0

            if lookup_member(member['id']) is not None:
                continue

            store_args = [member.get(field)
                          for field in self._MEMBER_IDENTITY_FIELDS]
            store_args.append(date_of_birth)
            store_args.extend(
                member.get(field)
                for field in self._MEMBER_DETAIL_FIELDS + body_fields)
            store_member(*store_args)

    def populate_members_to_session(self):
        """Link every pickled member to each session they served in.

        Members whose stored session count already equals the length of their
        session range are skipped.
        """
        members = self._load_pickle()

        if self.congressional_body == 'House':
            sessions_for = (
                self.house_member_cache.get_session_from_house_member_id)
            store_link = self.house_member_cache.store_house_member_to_session
        else:
            sessions_for = (
                self.senate_member_cache.get_session_from_senate_member_id)
            store_link = (
                self.senate_member_cache.store_senate_member_to_session)

        for key in members:
            member = members[key]
            member_id = member['id']
            # Order the two bounds explicitly: the keys used to be read
            # swapped, which made the range below empty for any member who
            # served more than one session.
            first_session, last_session = sorted(
                (member['first_session'], member['last_session']))

            if len(sessions_for(member_id)) == (
                    last_session - first_session) + 1:
                continue

            for session_num in range(first_session, last_session + 1):
                store_link(session_num, member_id)

    def populate_bill_cache(self):
        """Store every pickled bill through the bill cache.

        The session number is taken from the last three characters of
        'bill_id'; 'introduced_date' is stored as an ordinal.
        """
        bills = self._load_pickle()
        for key in bills:
            bill = bills[key]

            # NOTE(review): a bill whose latest action is 'Received in the
            # Senate.' is flagged as enacted -- behaviour preserved from the
            # original condition; confirm this is intended.
            enacted = (
                bill.get('latest_major_action') == 'Received in the Senate.'
                or bill.get('enacted') is not None)
            vetoed = bill.get('vetoed') is not None

            congress_session = int(bill['bill_id'][-3:])
            self.bill_cache.store_bill(
                bill.get('bill_id'), bill['title'], bill.get('short_title'),
                congress_session,
                self._to_ordinal(bill.get('introduced_date')),
                bill.get('congressdotgov_url'), bill.get('active'), enacted,
                vetoed, bill.get('summary'), bill.get('latest_major_action'))