def date_format(date):
    """Return the last two comma-separated pieces of *date*, joined by ", ".

    The argument is converted with ``str()``; parenthesised ``(...)`` and
    bracketed ``[...]`` fragments are deleted before the text is split on
    commas. Each of the two trailing pieces is stripped of surrounding
    whitespace.

    Raises:
        IndexError: if the cleaned text contains no comma, so there are not
            two pieces to join.
    """
    cleaned = str(date)
    # Remove "(...)" first, then "[...]", exactly in that order.
    for pattern in (r'\([^)]*\)', r'\[(.*?)\]'):
        cleaned = re.sub(pattern, '', cleaned)

    pieces = cleaned.rsplit(',')
    # Index (rather than unpack) so that a missing comma raises IndexError.
    second_last = str(pieces[-2]).strip()
    last = str(pieces[-1]).strip()
    return second_last + ", " + last
def format_date_for_box_score(date_string):
    """Rearrange *date_string* into the concatenation ``year + month + day``.

    The first five characters of *date_string* are discarded (presumably a
    weekday prefix such as ``"Sat, "`` -- confirm against callers). The rest
    is expected to look like ``"Oct 5, 2019"``: a three-character month key,
    a day number and a year.

    The month key is looked up in the module-level ``months`` mapping, which
    is defined outside this block (presumably abbreviation -> month-number
    string; verify). The day is normalised to at least two digits, so both
    ``"5"`` and ``"05"`` yield ``"05"``.

    Raises:
        ValueError: if the day portion is not an integer.
        TypeError: if the month lookup yields a non-string (for example
            ``None`` for an unknown key), since the parts are concatenated.
    """
    # "Oct 5, 2019" -> "Oct-5-2019"
    date = date_string[5:].replace(',', '').replace(' ', '-')

    # get month
    month = months.get(date[:3])

    # get day and year: everything after "Mon-" is "<day>-<year>"
    parts = date[4:].rsplit('-', 1)
    # int() validates the day; zfill pads single digits without turning an
    # already padded "05" into "005".
    day = str(int(parts[0])).zfill(2)
    year = parts[1] if len(parts) > 1 else ''

    return year + month + day
def _today_string():
    """Return the current local date formatted with ``"%B %d, %Y"``."""
    return datetime.now().strftime("%B %d, %Y")


def _date_after_marker(text, marker):
    """Pull a date out of *text* once *marker* and bracketed notes are removed.

    The last comma-separated chunk is used. If that chunk is purely digits
    (a year split off from "Month D, YYYY") it is rejoined with the chunk
    before it; otherwise it is treated as "Month YYYY" and a day of 1 is
    inserted at every space.
    """
    cleaned = re.sub(r'\([^)]*\)', '', text)
    cleaned = re.sub(r'\[(.*?)\]', '', cleaned)
    chunks = cleaned.replace(marker, '').rsplit(',')
    last = chunks[-1].strip()
    if last.isdigit() and len(chunks) > 1:
        return chunks[-2].strip() + ", " + last
    return last.replace(" ", " 1, ")


def odd_link(b, date, l, directory):
    """Work out a date string and title for a link the normal parsing missed.

    Args:
        b: element exposing ``get_text()``, ``.string``, ``.text`` and
            ``.parent`` (looks like a BeautifulSoup tag -- confirm).
        date: text previously taken to be the date, or ``None``.
        l: link element; its ``get("href")`` is consulted when available.
        directory: name of the report directory being processed.

    Returns:
        dict with keys ``"date_string"`` and ``"real_title"``. Both values
        are ``False`` when the link is judged not to be a document.
    """
    text = b.get_text()

    # not links to docs
    try:
        link = l.get("href")
    except (AttributeError, TypeError):
        # l has no usable .get (e.g. it is None): treat as "no link"
        link = None

    # these are not documents; a missing href (None) is simply skipped
    if isinstance(link, str):
        if link.endswith((".gov", ".gov/")) or link == "/usao/eousa/index.html":
            return {"date_string": False, "real_title": False}

    # section for documents without dates:
    if date is not None:
        stripped = date.strip()
        if stripped == "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995":
            return {"date_string": "June 1, 1996", "real_title": "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995"}
        if date == "Audit Report GR-30-00-001":
            return {"date_string": "November 1, 2000", "real_title": "McMechen, West Virginia Police Department, Audit Report GR-30-00-001"}
        # no date, one other entry, giving it the same date
        if date == "Georgia's Department of Corrections":
            return {"date_string": "November 1, 2000", "real_title": "United States Marshals Service Cost Proposal for the Intergovernmental Service Agreement for Detention Facilities with the City of Atlanta, Georgia’s Department of Corrections"}
        # confirmed no dates for these
        no_dates = (
            "Audit Report GR-40-99-014",
            "Audit Report GR-40-99-011",
            "Evaluation and Inspections Report I-2000-021",
            "Evaluation and Inspections Report I-2000-018",
            "Audit Report 99-03",
        )
        if stripped in no_dates:
            return {"date_string": _today_string(), "real_title": text}

    # Intergovernmental Agreements for Detention Space External Reports don't
    # always have dates, not even on the documents, using today.
    # NOTE(review): this check does not read `date`, so it is applied whether
    # or not `date` is None -- confirm that matches the intended nesting.
    if directory == "Intergovernmental Agreements for Detention Space (IGAs)":
        return {"date_string": _today_string(), "real_title": text}

    # need to get rid of these markers to process
    for marker in ("Released Publicly", "Revised"):
        if marker in text:
            return {"date_string": _date_after_marker(text, marker), "real_title": text}

    try:
        # case 1, date is wrong because it is in the paragraph and completely
        # written out
        title = b.string
        date_string = date_format(title)
    except Exception:
        # these are lists of links that are different variants of the same
        # report in a list
        # case where there is a list in a paragraph tag
        listy = b.parent.parent
        text = str(listy.previous_sibling)
        title = text
        # case where there is a paragraph above a list
        if len(text) < 4:
            text = listy.previous_sibling.previous_sibling
            # drop the first 3 and last 4 characters of the rendered sibling
            # (presumably "<p>" and "</p>" -- confirm)
            title = str(text)[3:-4]
        date = re.sub(r'\([^)]*\)', '', title)
        # NOTE(review): this bracket pattern differs from the r'\[(.*?)\]'
        # used elsewhere in this function; left unchanged.
        date = re.sub(r'\[[^)]*\]', '', date)
        date_string = date.rsplit(',')[-1].strip()
        # the chunk cannot contain a comma, so a day of 1 is always inserted
        date_string = date_string.replace(" ", " 1, ")

    # for the DOJ combined page
    if date_string == 'id="content" 1, name="content">':
        text = b.text
        text = re.sub(r'\([^)]*\)', '', text)
        day = text.split(",")[-1].split('—')[0].strip()
        date_string = day.replace(" ", " 1, ")
        title = b.text

    ## uncomment for debugging
    # try:
    #     date = datetime.strptime(date_string, "%B %d, %Y")
    # except:
    #     print('hit one')
    #     print("b: ", b.text)
    #     print("l: ", l)
    #     print("date: ", date)
    #     print("date string", date_string)
    #     print("directory", directory)
    #     exit()

    return {"real_title": title, "date_string": date_string}