def handle_noargs(self, **options): doc_type = "RCR SRP" file_type = "html" base_url = 'http://repcloakroom.house.gov/news/' page = urllib2.urlopen( "http://repcloakroom.house.gov/news/DocumentQuery.aspx?DocumentTypeID=1501&Page=1" ) add_date = datetime.datetime.now() soup = BeautifulSoup(page) rows = soup.findAll('span', {"class": "middlecopy"}) for row in rows: if row.find('span', {"class": "middleheadline"}): title = str( row.find('span', { "class": "middleheadline" }).contents[1]).replace('<b>', '').replace('</b>', '').strip() bill_list = extract_legislation(title) date_str = row.find('span', { "class": "middleheadline" }).parent.contents[5].contents[0].replace(' -', '').strip() release_date = time.strftime( '%Y-%m-%d', time.strptime(date_str, '%b %d, %Y')) year = int( time.strftime('%Y', time.strptime(date_str, '%b %d, %Y'))) congress = congress_from_year(year) description = unicode( row.find('span', { "class": "middleheadline" }).parent.contents[6]).strip() if not bill_list: bill_list = extract_legislation(description) if title == "": title = "".join(bill_list) file_name = row.find('span', { "class": "middleheadline" }).parent.contents[7]['href'] original_url = "%s%s" % (base_url, file_name) gov_id = "SRP-%s-%s-%s" % (congress, bill_list[0].replace( ' ', '').replace('.', ''), release_date) doc = { 'gov_id': gov_id, 'original_url': original_url, 'file_name': file_name, 'title': title, 'description': description, 'congress': congress, 'year': year, 'release_date': release_date, 'bill_list': bill_list } print doc
def handle_noargs(self, **options): doc_type = "CBO CE" file_type = "pdf" d = feedparser.parse("http://www.cbo.gov/rss/latest10.xml") for entry in d.entries: title_dict = split_title(entry.title) release_date = entry.updated_parsed release_date = datetime.datetime(release_date[0], release_date[1], release_date[2]) congress = congress_from_year(release_date.year) add_date = datetime.datetime.now() title = title_dict['title'] bill_list = extract_legislation(title) if len(bill_list) > 0: bill_num = bill_list[0] gov_id = "%s-%s" % (congress, bill_num.replace( '.', '').replace(' ', '')) else: bill_num = None gov_id = None if 'description' in entry: description = entry.description original_url = entry.link entry = { 'release_date': release_date, 'congress': congress, 'add_date': add_date, 'title': title, 'description': description, 'bill_list': bill_list, 'original_url': original_url } print entry
def handle_noargs(self, **options): doc_type = "DPC LB" file_type = "html" add_date = datetime.datetime.now() year = add_date.year congress = congress_from_year(year) url_prefix = "http://dpc.senate.gov/" url = "%sdpcreports.cfm?cf_year=%s&doctype=lb" % (url_prefix, year) page = urllib2.urlopen(url) soup = BeautifulSoup(page) rows = soup.findAll('p', {"class": "doclist"}) for row in rows: file_name = row('a')[0]['href'].strip() p = re.compile('dpcdoc\.cfm\?doc_name=') standard_format = p.findall(file_name) if standard_format: gov_id = file_name.replace('dpcdoc.cfm?doc_name=', '').upper() else: gov_id = None original_url = "%s%s" % (url_prefix, file_name) local_file = '' title = row('a')[0].string description = '' bill_list = extract_legislation(title) date_str = row.contents[3].string.replace('(', '').replace(')', '') release_date = time.strftime('%Y-%m-%d', time.strptime(date_str, '%m/%d/%y')) matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id, release_date=release_date) if len(matches) == 0: if gov_id: local_file = archive_file(original_url, gov_id, doc_type, file_type) time.sleep(2) full_text = None doc = Document(gov_id=gov_id, release_date=release_date, add_date=add_date, title=title, description=description, doc_type=doc_type, original_url=original_url, local_file=local_file, full_text=full_text) doc.save() for bill_num in bill_list: bill_dupe = DocumentLegislation.objects.filter( congress=congress).filter( bill_num=bill_num).filter(document=doc) if not bill_dupe: bill = DocumentLegislation(congress=congress, bill_num=bill_num, document=doc) bill.save()
def handle_noargs(self, **options): doc_type = "RCR SRP" file_type = "html" base_url = 'http://repcloakroom.house.gov/news/' page = urllib2.urlopen("http://repcloakroom.house.gov/news/DocumentQuery.aspx?DocumentTypeID=1501&Page=1") add_date = datetime.datetime.now() soup = BeautifulSoup(page) rows = soup.findAll('span', { "class":"middlecopy" }) for row in rows: if row.find('span', { "class":"middleheadline" }): title = str(row.find('span', { "class":"middleheadline" }).contents[1]).replace('<b>', '').replace('</b>', '').strip() bill_list = extract_legislation(title) date_str = row.find('span', { "class":"middleheadline" }).parent.contents[5].contents[0].replace(' -', '').strip() release_date = time.strftime('%Y-%m-%d', time.strptime(date_str, '%b %d, %Y')) year = int(time.strftime('%Y', time.strptime(date_str, '%b %d, %Y'))) congress = congress_from_year(year) description = unicode(row.find('span', { "class":"middleheadline" }).parent.contents[6]).strip() if not bill_list: bill_list = extract_legislation(description) if title == "": title = "".join(bill_list) file_name = row.find('span', { "class":"middleheadline" }).parent.contents[7]['href'] original_url = "%s%s" % (base_url, file_name) gov_id = "SRP-%s-%s-%s" % (congress, bill_list[0].replace(' ', '').replace('.', ''), release_date) matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id, release_date=release_date) if len(matches) == 0: print_url = original_url.replace('DocumentSingle', 'DocumentPrint') #print_page = urllib2.urlopen(print_url).read() #full_text = ''.join(BeautifulSoup(print_page).findAll(text=True)).replace('DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"', '').strip() #full_text = re.sub("\s+" , " ", full_text) full_text = None if gov_id: local_file = archive_file(print_url, gov_id, doc_type, file_type) doc = Document(gov_id=gov_id, release_date=release_date, add_date=add_date, title=title, description=description, doc_type=doc_type, original_url=original_url, local_file=local_file, full_text=full_text) doc.save() for bill in bill_list: bill_num = clean_bill_num(bill) bill = DocumentLegislation(congress=congress, bill_num=bill_num, document=doc) bill.save()


# Backfill: re-parse every saved Document's title for bill numbers and link
# any that aren't already recorded.
def handle_noargs(self, **options):
    docs = Document.objects.all()
    for doc in docs:
        bill_list = extract_legislation(doc.title)
        congress = congress_from_year(doc.release_date.year)
        if bill_list:
            for bill_num in bill_list:
                bill_dupe = DocumentLegislation.objects.filter(
                    congress=congress).filter(bill_num=bill_num).filter(
                        document=doc)
                if not bill_dupe:
                    bill = DocumentLegislation(congress=congress,
                                               bill_num=bill_num,
                                               document=doc)
                    bill.save()
            print "%s %s" % (doc.gov_id, bill_list)
def handle_noargs(self, **options): doc_type = "CBO CE" file_type = "pdf" d = feedparser.parse("http://www.cbo.gov/rss/latest10.xml") for entry in d.entries: title_dict = split_title(entry.title) release_date = entry.updated_parsed release_date=datetime.datetime(release_date[0], release_date[1], release_date[2]) congress = congress_from_year(release_date.year) add_date = datetime.datetime.now() title = title_dict['title'] bill_list = extract_legislation(title) if len(bill_list) > 0: bill_num = bill_list[0] gov_id = "%s-%s" % (congress, bill_num.replace('.', '').replace(' ', '')) else: bill_num = None gov_id = None if 'description' in entry: description = entry.description original_url = entry.link matches = Document.objects.filter(doc_type=doc_type, gov_id=gov_id, release_date=release_date) if len(matches) == 0: if gov_id: local_file = archive_file(original_url, gov_id, doc_type, file_type) #full_text = pdf_extract_text(local_file, original_url) full_text = None doc = Document(gov_id=gov_id, release_date=release_date, add_date=add_date, title=title, description=description, doc_type=doc_type, original_url=original_url, local_file=local_file, full_text=full_text) doc.save() for bill_num in bill_list: bill_dupe = DocumentLegislation.objects.filter(congress=congress).filter(bill_num=bill_num).filter(document=doc) if not bill_dupe: bill = DocumentLegislation(congress=congress, bill_num=bill_num, document=doc) bill.save()
def handle_noargs(self, **options): doc_type = "DPC LB" file_type = "html" add_date = datetime.datetime.now() year = add_date.year congress = congress_from_year(year) url_prefix = "http://dpc.senate.gov/" url = "%sdpcreports.cfm?cf_year=%s" % (url_prefix, year) page = urllib2.urlopen(url) soup = BeautifulSoup(page) rows = soup.findAll('p', {"class": "doclist"}) for row in rows: file_name = row('a')[0]['href'].strip() p = re.compile('dpcdoc\.cfm\?doc_name=') standard_format = p.findall(file_name) if standard_format: gov_id = file_name.replace('dpcdoc.cfm?doc_name=', '').upper() else: gov_id = None original_url = "%s%s" % (url_prefix, file_name) local_file = '' title = row('a')[0].string description = '' bill_list = extract_legislation(title) date_str = row.contents[3].string.replace('(', '').replace(')', '') release_date = time.strftime('%Y-%m-%d', time.strptime(date_str, '%m/%d/%y')) doc = { 'original_url': original_url, 'local_file': local_file, 'title': title, 'description': description, 'bill_list': bill_list, 'release_date': release_date } print doc