コード例 #1
0
def compare_bills(b1, b2):
    from bill.billtext import get_bill_text_metadata
    fn1 = get_bill_text_metadata(b1, None)['xml_file']
    fn2 = get_bill_text_metadata(b2, None)['xml_file']
    text1 = extract_text(fn1)
    state = prepare_text1(text1)
    print b1.id, unicode(b1).encode("utf8")
    print b2.id, unicode(b2).encode("utf8")
    ratio1, ratio2, text = compare_text(extract_text(fn2), *state)
    print ratio1, ratio2, ratio1 * ratio2
    print(text[:1000].encode("utf8"))
    print
コード例 #2
0
def compare_bills(b1, b2):
  from bill.billtext import get_bill_text_metadata
  fn1 = get_bill_text_metadata(b1, None)['xml_file']
  fn2 = get_bill_text_metadata(b2, None)['xml_file']
  text1 = extract_text(fn1)
  state = prepare_text1(text1)
  print b1.id, unicode(b1).encode("utf8")
  print b2.id, unicode(b2).encode("utf8")
  ratio1, ratio2, text = compare_text(extract_text(fn2), *state)
  print ratio1, ratio2, ratio1*ratio2
  print(text[:1000].encode("utf8"))
  print
コード例 #3
0
def compare_bills(b1, b2):
  from bill.billtext import get_bill_text_metadata
  fn1 = get_bill_text_metadata(b1, None)['xml_file']
  fn2 = get_bill_text_metadata(b2, None)['xml_file']
  text1 = extract_text(fn1)
  state = prepare_text1(text1)
  print(b1.id, str(b1))
  print(b2.id, str(b2))
  ratio1, ratio2, text = compare_text(extract_text(fn2), *state)
  print(ratio1, ratio2, len(text))
  print(text)
  print(is_text_incorporated(ratio1, ratio2, len(text)))
  print()
コード例 #4
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'bill/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type),
                                          subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type),
                                    subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' %
                              term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('bill/liv111.xml', 'bill/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type),
                                              subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type),
                                        subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/%s/bills/*/*/data.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    else:
        files = glob.glob(settings.CONGRESS_DATA_PATH +
                          '/*/bills/*/*/data.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or int(options.congress) > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.match(
                re.escape(settings.CONGRESS_DATA_PATH) +
                r'/(?P<congress>\d+)/bills/(?P<bill_type>[a-z]+)/(?P<bill_type_2>[a-z]+)(?P<number>\d+)/data.xml',
                fname)

            try:
                b = Bill.objects.get(congress=int(m.group("congress")),
                                     bill_type=BillType.by_slug(
                                         m.group("bill_type")),
                                     number=m.group("number"))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print("No bill text?", fname, b.introduced_date)
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    b.update_index(bill_index)  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print("Unchanged metadata file but bill doesn't exist:", fname)
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print(fname)
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = str(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                if axn.xpath("string(@state)") == "REFERRED":
                    continue  # we don't track this state
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn, encoding=str),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print(bill)
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print("Bill is no longer on disk: ", b.id, b)

    # The rest is for current only...

    if options.congress and int(options.congress) != settings.CURRENT_CONGRESS:
        return

    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
コード例 #5
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' % options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' % options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress>42) and (bill_index and not options.disable_events) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(textfile):
                    b.update_index(bill_index) # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (
                	repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))),
                	BillStatus.by_xml_code(axn.xpath("string(@state)")),
                	axn.xpath("string(text)"),
                    etree.tostring(axn),
                	) )
                
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise

            if bill_index:
                bill.update_index(bill_index)

            if not options.disable_events:
                bill.create_events()
                
        File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b
        
    # The rest is for current only...
    
    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return
        
    # Find what might be coming up this week.
    load_docs_house_gov(options, bill_index)
    load_senate_floor_schedule(options, bill_index)
コード例 #6
0
        existing_comps = set()
        if os.path.exists(csv_fn):
            for row in csv.reader(open(csv_fn)):
                timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \
                   = row
                existing_comps.add(
                    ((b1_id, b1_versioncode), (b2_id, b2_versioncode)))
                writer.writerow(row)

        # For each enacted bill..
        for b1 in tqdm(enacted_bills):
            # Load the enacted bill's text.

            # Loads current metadata and text for the bill.
            try:
                md1 = get_bill_text_metadata(b1, None)
                text1 = extract_text(md1['xml_file'])
            except TypeError:  # no bill text at all (accessing [...] of None)
                continue
            except KeyError:  # no xml_file
                continue
            except ValueError:  # xml is bad
                continue

            state = prepare_text1(text1)

            # Use Solr's More Like This query to get a preliminary list of
            # bills textually similar to each enacted bill, which lets us
            # cut down on the number of comparisons that we need to run
            # by a factor of around 100. Pull between 10 and 50 similar
            # bills -- depending on how large of a bill the enacted bill
コード例 #7
0
    csv_fn = "data/us/%d/text_comparison.csv" % congress
    existing_comps = set()
    if os.path.exists(csv_fn):
      for row in csv.reader(open(csv_fn)):
        timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \
           = row
        existing_comps.add( ((b1_id, b1_versioncode), (b2_id, b2_versioncode) ) )
        writer.writerow(row)

    # For each enacted bill..
    for b1 in tqdm.tqdm(enacted_bills):
      # Load the enacted bill's text.

      # Loads current metadata and text for the bill.
      try:
        md1 = get_bill_text_metadata(b1, None)
        text1 = extract_text(md1['xml_file'])
      except KeyError: # no xml_file
        continue
      except ValueError: # xml is bad
        continue

      state = prepare_text1(text1)

      # Use Solr's More Like This query to get a preliminary list of
      # bills textually similar to each enacted bill, which lets us
      # cut down on the number of comparisons that we need to run
      # by a factor of around 100. Pull between 10 and 50 similar
      # bills -- depending on how large of a bill the enacted bill
      # is. An authorization bill can have lots of bills incorporated
      # into it, but a short bill could not have very many.
コード例 #8
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()

    # Cache existing terms. There aren't so many.
    existing_terms = {}
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()

        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type),
                                          subterm.name)]
                term.subterms.add(subterm)
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)

                    existing_terms[(int(subterm.term_type),
                                    subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' %
                              term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type),
                                              subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)

                        existing_terms[(int(subterm.term_type),
                                        subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' %
                                  term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills

    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress and int(options.congress) <= 42:
        files = glob.glob('data/congress/%s/bills/*/*/*.xml' %
                          options.congress)
        log.info('Parsing unitedstates/congress bills of only congress#%s' %
                 options.congress)
    elif options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')

    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]

    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()

        # With indexing or events enabled, if the bill metadata file hasn't changed check
        # the bill's latest text file for changes so we can create a text-is-available
        # event and so we can index the bill's text.
        if (not options.congress or options.congress > 42) and (
                bill_index and not options.disable_events
        ) and not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1),
                                     bill_type=BillType.by_xml_code(
                                         m.group(2)),
                                     number=m.group(3))
                seen_bill_ids.append(b.id)

                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.congress >= 103 and b.introduced_date < (
                            datetime.now() - timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["text_file"]
                if os.path.exists(textfile) and File.objects.is_changed(
                        textfile):
                    bill_index.update_object(
                        b, using="bill")  # index the full text
                    b.create_events()  # events for new bill text documents
                    File.objects.save_file(textfile)

                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass  # just parse as normal

        if options.slow:
            time.sleep(1)

        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            try:
                bill = bill_processor.process(Bill(), node)
            except:
                print fname
                raise

            seen_bill_ids.append(bill.id)  # don't delete me later

            if bill.congress >= 93:
                bill.source = "thomas-congproj"
            elif bill.congress >= 82:
                bill.source = "statutesatlarge"
                if bill.current_status == BillStatus.enacted_signed:
                    bill.current_status = BillStatus.enacted_unknown
            elif bill.congress <= 42:
                bill.source = "americanmemory"
            else:
                raise ValueError()

            # So far this is just for American Memory bills.
            if node.xpath("string(source/@url)"):
                bill.source_link = unicode(node.xpath("string(source/@url)"))
            else:
                bill.source_link = None

            actions = []
            for axn in tree.xpath("actions/*[@state]"):
                actions.append((
                    repr(
                        bill_processor.parse_datetime(
                            axn.xpath("string(@datetime)"))),
                    BillStatus.by_xml_code(axn.xpath("string(@state)")),
                    axn.xpath("string(text)"),
                    etree.tostring(axn),
                ))

            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/enacted"):
                bill.sliplawpubpriv = "PUB" if axn.get(
                    "type") == "public" else "PRI"
                bill.sliplawnum = int(axn.get("number").split("-")[1])

            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index: bill_index.update_object(bill, using="bill")

            if not options.disable_events:
                bill.create_events()

        File.objects.save_file(fname)

    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        for b in Bill.objects.filter(congress=options.congress).exclude(
                id__in=seen_bill_ids):
            print "Bill is no longer on disk: ", b.id, b

    # The rest is for current only...

    if options.congress and int(options.congress) != CURRENT_CONGRESS:
        return

    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"",
                  dhg_html)
    if not m:
        log.error(
            'No docs.house.gov download link found at http://docs.house.gov.')
    else:

        def bt_re(bt):
            return re.escape(bt[1]).replace(r"\.", r"\.?\s*")

        try:
            dhg = etree.parse(
                urllib.urlopen("http://docs.house.gov/floor/" +
                               m.group(1))).getroot()
        except:
            print "http://docs.house.gov/floor/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            if billname is None: continue  # weird but OK
            m = re.match(
                r"\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType) +
                r")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if not billname.strip().endswith(" __"):
                    log.error(
                        'Could not parse legis-num "%s" in docs.house.gov.' %
                        billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1), re.I):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS,
                                                    bill_type=bt[0],
                                                    number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(
                                item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index:
                                bill_index.update_object(bill, using="bill")

                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error(
                                'Could not find bill "%s" in docs.house.gov.' %
                                billname)
                        break
                else:
                    log.error(
                        'Could not parse legis-num bill type "%s" in docs.house.gov.'
                        % m.group(1))

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen(
        "http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm"
    ).read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting",
                        sfs).group(1)
        for congress, bill_type, number in re.findall(
                r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)",
                sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress,
                                    bill_type=bill_type,
                                    number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(
                    days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
コード例 #9
0
def main(options):
    """
    Process bill terms and bills
    """

    # Terms

    term_processor = TermProcessor()
    terms_parsed = set()
    
    # Cache existing terms. There aren't so many.
    existing_terms = { }
    for term in BillTerm.objects.all():
        existing_terms[(int(term.term_type), term.name)] = term

    log.info('Processing old bill terms')
    TERMS_FILE = 'data/us/liv.xml'
    tree = etree.parse(TERMS_FILE)
    for node in tree.xpath('/liv/top-term'):
        term = term_processor.process(BillTerm(), node)
        term.term_type = TermType.old
        try:
            # No need to update an existing term because there are no other attributes.
            term = existing_terms[(int(term.term_type), term.name)]
            terms_parsed.add(term.id)
        except:
            log.debug("Created %s" % term)
            term.save()
            term.subterms.clear()
            
        for subnode in node.xpath('./term'):
            subterm = term_processor.process(BillTerm(), subnode)
            subterm.term_type = TermType.old
            try:
                # No need to update an existing term because there are no other attributes.
                subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                term.subterms.add(subterm) 
                terms_parsed.add(subterm.id)
            except:
                try:
                    log.debug("Created %s" % subterm)
                    subterm.save()
                    term.subterms.add(subterm)
                    
                    existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                    terms_parsed.add(subterm.id)
                except IntegrityError:
                    log.error('Duplicated term %s' % term_processor.display_node(subnode))

    log.info('Processing new bill terms')
    for FILE in ('data/us/liv111.xml', 'data/us/crsnet.xml'):
        tree = etree.parse(FILE)
        for node in tree.xpath('/liv/top-term'):
            term = term_processor.process(BillTerm(), node)
            term.term_type = TermType.new
            try:
                # No need to update an existing term because there are no other attributes.
                term = existing_terms[(int(term.term_type), term.name)]
                terms_parsed.add(term.id)
            except:
                log.debug("Created %s" % term)
                term.save()
                term.subterms.clear()

            for subnode in node.xpath('./term'):
                subterm = term_processor.process(BillTerm(), subnode)
                subterm.term_type = TermType.new
                try:
                    # No need to update an existing term because there are no other attributes.
                    subterm = existing_terms[(int(subterm.term_type), subterm.name)]
                    terms_parsed.add(subterm.id)
                    term.subterms.add(subterm)
                except:
                    try:
                        log.debug("Created %s" % term)
                        subterm.save()
                        term.subterms.add(subterm)
                        
                        existing_terms[(int(subterm.term_type), subterm.name)] = subterm
                        terms_parsed.add(subterm.id)
                    except IntegrityError:
                        log.error('Duplicated term %s' % term_processor.display_node(subnode))

    for term in existing_terms.values():
        if not term.id in terms_parsed:
            log.debug("Deleted %s" % term)
            term.delete()

    # Bills
    
    bill_index = None
    if not options.disable_indexing:
        from bill.search_indexes import BillIndex
        bill_index = BillIndex()

    if options.congress:
        files = glob.glob('data/us/%s/bills/*.xml' % options.congress)
        log.info('Parsing bills of only congress#%s' % options.congress)
    else:
        files = glob.glob('data/us/*/bills/*.xml')
        
    if options.filter:
        files = [f for f in files if re.match(options.filter, f)]
        
    log.info('Processing bills: %d files' % len(files))
    total = len(files)
    progress = Progress(total=total, name='files', step=100)

    bill_processor = BillProcessor()
    seen_bill_ids = []
    for fname in files:
        progress.tick()
        
        if not File.objects.is_changed(fname) and not options.force:
            m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)

            try:
                b = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
                seen_bill_ids.append(b.id)
                
                # Update the index/events for any bill with recently changed text
                textfile = get_bill_text_metadata(b, None)
                if not textfile:
                    if b.introduced_date < (datetime.now()-timedelta(days=14)).date():
                        print "No bill text?", fname, b.introduced_date
                    continue
                textfile = textfile["plain_text_file"]
                if (bill_index and not options.disable_events) and File.objects.is_changed(textfile):
                    bill_index.update_object(b, using="bill") # index the full text
                    b.create_events() # events for new bill text documents
                    File.objects.save_file(textfile)
                    
                continue
            except Bill.DoesNotExist:
                print "Unchanged metadata file but bill doesn't exist:", fname
                pass # just parse as normal
            
        if options.slow:
            time.sleep(1)
            
        skip_stuff = False
            
        tree = etree.parse(fname)
        for node in tree.xpath('/bill'):
            if not skip_stuff:
                try:
                    bill = bill_processor.process(Bill(), node)
                except:
                    print fname
                    raise
            else:
                m = re.search(r"/(\d+)/bills/([a-z]+)(\d+)\.xml$", fname)
                bill = Bill.objects.get(congress=m.group(1), bill_type=BillType.by_xml_code(m.group(2)), number=m.group(3))
           
            seen_bill_ids.append(bill.id) # don't delete me later
            
            actions = []
            bill.sliplawpubpriv = None
            bill.sliplawnum = None
            for axn in tree.xpath("actions/*[@state]"):
                actions.append( (repr(bill_processor.parse_datetime(axn.xpath("string(@datetime)"))), BillStatus.by_xml_code(axn.xpath("string(@state)")), axn.xpath("string(text)")) )
                
                if actions[-1][1] in (BillStatus.enacted_signed, BillStatus.enacted_veto_override):
                    bill.sliplawpubpriv = "PUB" if axn.get("type") == "public" else "PRI"
                    bill.sliplawnum = int(axn.get("number").split("-")[1])
                    
            bill.major_actions = actions
            try:
                bill.save()
            except:
                print bill
                raise
            if bill_index: bill_index.update_object(bill, using="bill")
            
            if not options.disable_events:
                bill.create_events()

        if not skip_stuff:
            File.objects.save_file(fname)
        
    # delete bill objects that are no longer represented on disk.... this is too dangerous.
    if options.congress and not options.filter and False:
        # this doesn't work because seen_bill_ids is too big for sqlite!
        Bill.objects.filter(congress=options.congress).exclude(id__in = seen_bill_ids).delete()
        
    # Parse docs.house.gov for what might be coming up this week.
    import iso8601
    dhg_html = urllib.urlopen("http://docs.house.gov/floor/").read()
    m = re.search(r"class=\"downloadXML\" href=\"(Download.aspx\?file=.*?)\"", dhg_html)
    if not m:
        log.error('No docs.house.gov download link found at http://docs.house.gov.')
    else:
        def bt_re(bt): return re.escape(bt[1]).replace(r"\.", "\.?\s*")
        try:
            dhg = etree.parse(urllib.urlopen("http://docs.house.gov/floor/" + m.group(1))).getroot()
        except:
            print "http://docs.house.gov/" + m.group(1)
            raise
        # iso8601.parse_date(dhg.get("week-date")+"T00:00:00").date()
        for item in dhg.xpath("category/floor-items/floor-item"):
            billname = item.xpath("legis-num")[0].text
            m = re.match("\s*(?:Concur in the Senate Amendment to |Senate Amendment to )?("
                + "|".join(bt_re(bt) for bt in BillType)
                + ")(\d+)\s*(\[Conference Report\]\s*)?$", billname, re.I)
            if not m:
                if billname.strip() != "H.R. __":
                    log.error('Could not parse legis-num "%s" in docs.house.gov.' % billname)
            else:
                for bt in BillType:
                    if re.match(bt_re(bt) + "$", m.group(1)):
                        try:
                            bill = Bill.objects.get(congress=CURRENT_CONGRESS, bill_type=bt[0], number=m.group(2))
                            bill.docs_house_gov_postdate = iso8601.parse_date(item.get("add-date")).replace(tzinfo=None)
                            bill.save()
                            if bill_index: bill_index.update_object(bill, using="bill")
                            
                            if not options.disable_events:
                                bill.create_events()
                        except Bill.DoesNotExist:
                            log.error('Could not find bill "%s" in docs.house.gov.' % billname)
                        break
                else:
                    log.error('Could not parse legis-num bill type "%s" in docs.house.gov.' % billname)

    # Parse Senate.gov's "Floor Schedule" blurb for coming up tomorrow.
    now = datetime.now()
    sfs = urllib.urlopen("http://www.senate.gov/pagelayout/legislative/d_three_sections_with_teasers/calendars.htm").read()
    try:
        sfs = re.search(r"Floor Schedule([\w\W]*)Previous Meeting", sfs).group(1)
        for congress, bill_type, number in re.findall(r"http://hdl.loc.gov/loc.uscongress/legislation.(\d+)([a-z]+)(\d+)", sfs):
            bill_type = BillType.by_slug(bill_type)
            bill = Bill.objects.get(congress=congress, bill_type=bill_type, number=number)
            if bill.senate_floor_schedule_postdate == None or now - bill.senate_floor_schedule_postdate > timedelta(days=7):
                bill.senate_floor_schedule_postdate = now
                bill.save()
                if bill_index: bill_index.update_object(bill, using="bill")
                if not options.disable_events:
                    bill.create_events()
    except Exception as e:
        log.error('Could not parse Senate Floor Schedule: ' + repr(e))
コード例 #10
0
        if os.path.exists(csv_fn):
            for row in csv.reader(open(csv_fn)):
                timestamp, b1_id, b1_versioncode, b1_ratio, b2_id, b2_versioncode, b2_ratio, cmp_text_len, cmp_text \
                   = row
                existing_comps.add(
                    ((b1_id, b1_versioncode), (b2_id, b2_versioncode)))
                writer.writerow(row)

        # For each enacted bill, find the set of bills to compare it to.
        comps = []
        for b1 in tqdm(enacted_bills, desc="Finding comparison pairs"):
            # Load the enacted bill's text.

            # Loads current metadata for the bill.
            try:
                md1 = get_bill_text_metadata(b1, None)
                if 'xml_file' not in md1: continue  # no xml text
                text1 = extract_text(md1['xml_file'])
            except TypeError:  # no bill text at all (accessing [...] of None)
                continue
            except ValueError:  # xml is bad
                continue

            # Use Solr's More Like This query to get a preliminary list of
            # bills textually similar to each enacted bill, which lets us
            # cut down on the number of comparisons that we need to run
            # by a factor of around 100. Pull between 10 and 300 similar
            # bills -- depending on how large of a bill the enacted bill
            # is. An authorization bill can have lots of bills incorporated
            # into it, but a short bill could not have very many.
            from haystack.query import SearchQuerySet