Exemple #1
0
def date_format(date):
  """Return the last two comma-separated fields of *date* as "field, field".

  The argument is passed through str() first.  Any "(...)" and "[...]"
  spans are removed before the text is split on commas, and both kept
  fields are stripped of surrounding whitespace.

  Raises IndexError when the cleaned text contains no comma, because
  there is then no second-to-last field to read.
  """
  cleaned = re.sub(r'\([^)]*\)', '', str(date))
  cleaned = re.sub(r'\[(.*?)\]', '', cleaned)
  fields = cleaned.rsplit(',')
  # Index -2 is read first, so comma-less input fails before anything is built.
  tail = [str(fields[i]).strip() for i in (-2, -1)]
  return ", ".join(tail)
Exemple #2
0
def format_date_for_box_score(date_string):
	"""Build a year + month + day key from a date such as "Sat, Nov 5, 2016".

	The first five characters of *date_string* are skipped (a weekday
	prefix like "Sat, " in the example above).  The remainder must hold
	exactly three whitespace-separated tokens once commas are dropped:
	month name, day number and year.

	The first three letters of the month are looked up in the module-level
	``months`` mapping; its values are presumably two-digit strings such
	as "11" -- verify against its definition.  The day is zero-padded to
	two digits, so both "5" and "05" yield "05".

	Raises:
		ValueError: if there are not exactly three tokens or the day is
			not an integer.
		TypeError: if the month is not present in ``months`` (the lookup
			returns None, which cannot be concatenated).
	"""
	# Tokenising on whitespace instead of slicing at fixed offsets keeps the
	# result correct when the input has doubled or trailing spaces.
	month_as_string, day, year = date_string[5:].replace(',', ' ').split()
	month = months.get(month_as_string[:3])
	# Going through int() avoids double-padding a day that is already "05".
	day = '{:02d}'.format(int(day))
	return year + month + day
Exemple #3
0
def _strip_annotations(text):
  # Remove "(...)" and "[...]" asides so they are not mistaken for date parts.
  text = re.sub(r'\([^)]*\)', '', text)
  return re.sub(r'\[(.*?)\]', '', text)


def _today_string():
  # Today's date in the "%B %d, %Y" layout used for every date_string.
  return datetime.now().strftime("%B %d, %Y")


def _date_after_marker(text, marker):
  # Extract the trailing date from link text after deleting `marker`.
  chunks = _strip_annotations(text).replace(marker, '').rsplit(',')
  last = chunks[-1].strip()
  if last.isdigit() and len(chunks) > 1:
    # "..., March 5, 2015": the final chunk is only the year, so rejoin it
    # with the "Month day" chunk that precedes it.
    return chunks[-2].strip() + ", " + last
  # "March 2015" style: no day given, so use the first of the month.
  return last.replace(" ", " 1, ")


def odd_link(b, date, l, directory):
  """Work out a title and date for a link whose date needs special handling.

  Args:
    b: element wrapping the link text; must provide get_text(), .string,
      .text and .parent (presumably a BeautifulSoup tag -- confirm with
      the caller).
    date: the date text found so far, or None.  It is only compared
      against a handful of known titles/report numbers; the general
      extraction below re-reads the text from `b` instead.
    l: the anchor element; its "href" is read with l.get("href").  May be
      None or lack an href.
    directory: name of the report directory being processed.

  Returns:
    A dict with keys "date_string" and "real_title".  Both are False when
    the link is not a document (a bare ".gov" site link or the EOUSA
    index page).  Otherwise "date_string" is text intended to be in
    "Month D, YYYY" form.
  """
  text = b.get_text()

  # Links to a site root are navigation, not documents.
  try:
    link = l.get("href")
  except (AttributeError, TypeError):
    # No usable anchor element was supplied; nothing to screen out.
    link = None
  if isinstance(link, str):
    if link.endswith((".gov", ".gov/")) or link == "/usao/eousa/index.html":
      return {"date_string": False, "real_title": False}

  # Documents known to carry no usable date of their own.
  if date is not None:
    if date.strip() == "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995":
      return {"date_string": "June 1, 1996", "real_title": "Alleged Deception of Congress: The Congressional Task Force on Immigration Reform's Fact-Finding Visit to the Miami District of INS in June 1995"}
    if date == "Audit Report GR-30-00-001":
      return {"date_string": "November 1, 2000", "real_title":"McMechen, West Virginia Police Department, Audit Report GR-30-00-001"}
    # no date, one other entry, giving it the same date
    if date == "Georgia's Department of Corrections":
      return {"date_string": "November 1, 2000", "real_title":"United States Marshals Service Cost Proposal for the Intergovernmental Service Agreement for Detention Facilities with the City of Atlanta, Georgia’s Department of Corrections"}
    # confirmed no dates for these; fall back to today's date
    no_dates = ("Audit Report GR-40-99-014", "Audit Report GR-40-99-011", "Evaluation and Inspections Report I-2000-021", "Evaluation and Inspections Report I-2000-018", "Audit Report 99-03")
    if date.strip() in no_dates:
      return {"date_string": _today_string(), "real_title": text}
    # Intergovernmental Agreements for Detention Space External Reports
    # don't always have dates, not even on the documents, using today
    if directory == "Intergovernmental Agreements for Detention Space (IGAs)":
      return {"date_string": _today_string(), "real_title": text}

  # These phrases sit next to the date and must be removed before parsing.
  # "Released Publicly" is checked first, matching the original precedence.
  for marker in ("Released Publicly", "Revised"):
    if marker in text:
      return {"date_string": _date_after_marker(text, marker), "real_title": text}

  # General case.  This runs even when `date` is None: the steps below never
  # read `date`, and skipping them left date_string/title unassigned.
  try:
    # case 1: the date is written out in full inside the element's own text
    title = b.string
    date_string = date_format(title)
  except (IndexError, AttributeError, TypeError):
    # date_format found no comma-separated date (b.string may be None).
    # These are lists of links that are different variants of the same
    # report; the heading carrying the date precedes the list.
    listy = b.parent.parent
    title = str(listy.previous_sibling)
    if len(title) < 4:
      # Only a short (presumably whitespace) node sits directly before the
      # list: use the node before that and trim 3 leading / 4 trailing
      # characters (presumably a "<p>...</p>" wrapper -- confirm).
      title = str(listy.previous_sibling.previous_sibling)[3:-4]
    date = re.sub(r'\([^)]*\)', '', title)
    date = re.sub(r'\[[^)]*\]', '', date)
    # The last comma-separated chunk is taken as "Month Year"; insert day 1.
    date_string = date.rsplit(',')[-1].strip().replace(" ", " 1, ")

  # for the DOJ combined page, where the fallback picked up page markup
  if date_string == 'id="content" 1, name="content">':
    title = b.text
    without_parens = re.sub(r'\([^)]*\)', '', title)
    # Keep what precedes the em dash in the last comma-separated chunk.
    day = without_parens.split(",")[-1].split('—')[0].strip()
    date_string = day.replace(" ", " 1, ")

  ## uncomment for debugging
  # try:
  #   date = datetime.strptime(date_string, "%B %d, %Y")
  # except:
  #   print('hit one')
  #   print("b:  ", b.text)
  #   print("l:  ", l)
  #   print("date: ", date)
  #   print("date string", date_string)
  #   print("directory", directory)
  #   exit()

  return {"real_title": title, "date_string": date_string}