Exemple #1
0
    def test_find(self):
        """Check ``BillFinder.find`` for lookups by bill name and bill number."""
        fixtures = [
            Bill({'id': 'Bill:0', 'name': '法律案A'}),
            Bill({'id': 'Bill:1', 'name': '法律案B'}),
            Bill({'id': 'Bill:2', 'name': '法律案A', 'billNumber': '第1号'}),
        ]
        finder = BillFinder(fixtures)

        def hit_count(text):
            # Number of bills returned for ``text``.
            return len(finder.find(text))

        def hit_id(text, position):
            # Id of the bill at ``position`` in the result for ``text``.
            return finder.find(text)[position].id

        # Two fixtures share this name; they are expected in fixture order.
        assert hit_count('法律案A') == 2
        assert hit_id('法律案A', 0) == 'Bill:0'
        assert hit_id('法律案A', 1) == 'Bill:2'

        # A name used by exactly one fixture.
        assert hit_count('法律案B') == 1
        assert hit_id('法律案B', 0) == 'Bill:1'

        # A name used by no fixture.
        assert hit_count('法律案C') == 0

        # Lookup by bill number.
        assert hit_count('第1号') == 1
        assert hit_id('第1号', 0) == 'Bill:2'

        assert hit_count('第2号') == 0
Exemple #2
0
 def __init__(self, *args, **kwargs):
     """Initialise the spider and create the clients and finders it uses.

     All positional and keyword arguments are forwarded unchanged to the
     parent class initialiser.
     """
     super(SpiderTemplate, self).__init__(*args, **kwargs)

     # Set these two third-party loggers to WARNING level.
     for logger_name in ('elasticsearch', 'sgqlc'):
         logging.getLogger(logger_name).setLevel(logging.WARNING)

     # Backend clients.
     self.gql_client = GraphQLClient()
     self.es_client = ElasticsearchClient()

     # Finder helpers, all built with their default arguments.
     self.bill_finder = BillFinder()
     self.minutes_finder = MinutesFinder()
     self.committee_finder = CommitteeFinder()
     self.member_finder = MemberFinder()
Exemple #3
0
    def test_match(self):
        """Check ``BillFinder.is_text_match`` on the name and aliases fields."""
        finder = BillFinder(bills=[], search_fields=['name', 'aliases'])

        def matches(attrs, text, exact):
            # Build a fresh Bill from ``attrs`` and test it against ``text``.
            return finder.is_text_match(Bill(attrs),
                                        text=text,
                                        exact_match=exact)

        # Name field: a partial text matches only when exact_match is off.
        assert matches({'name': '法律案A'}, '法律案', exact=False)
        assert matches({'name': '法律案A'}, '法律案A', exact=True)
        assert not matches({'name': '法律案A'}, '法律案', exact=True)

        # Aliases field: one alias must equal the text on its own; pieces
        # spread over several aliases do not count.
        assert matches({'aliases': ['猫ちゃん', '法律案']}, '法律案', exact=True)
        assert not matches({'aliases': ['法律', '案']}, '法律案', exact=True)
Exemple #4
0
    def test_find(self):
        """Check ``BillFinder.find`` / ``find_one``, including keyword filters."""
        fixtures = [
            Bill({'id': 'Bill:0', 'name': '法律案A'}),
            Bill({'id': 'Bill:1', 'name': '法律案B'}),
            Bill({
                'id': 'Bill:2',
                'name': '法律案A',
                'billNumber': '第100回国会閣法第1号',
                'category': 'KAKUHOU',
            }),
        ]
        finder = BillFinder(fixtures)

        def hit_count(text, **filters):
            # Number of bills returned for ``text`` under optional filters.
            return len(finder.find(text, **filters))

        def hit_id(text, position):
            # Id of the bill at ``position`` in the result for ``text``.
            return finder.find(text)[position].id

        # Ambiguous name: two hits in fixture order, and find_one must raise.
        assert hit_count('法律案A') == 2
        assert hit_id('法律案A', 0) == 'Bill:0'
        assert hit_id('法律案A', 1) == 'Bill:2'
        with pytest.raises(ValueError):
            finder.find_one('法律案A')

        # Unique name: one hit, and find_one returns it.
        assert hit_count('法律案B') == 1
        assert hit_id('法律案B', 0) == 'Bill:1'
        assert finder.find_one('法律案B').id == 'Bill:1'

        # Unknown name: no hits, and find_one must raise.
        assert hit_count('法律案C') == 0
        with pytest.raises(ValueError):
            finder.find_one('法律案C')

        # Lookup by the trailing part of a bill number.
        assert hit_count('第1号') == 1
        assert hit_id('第1号', 0) == 'Bill:2'

        assert hit_count('第2号') == 0
        # A shared prefix of every name is expected to hit all fixtures.
        assert hit_count('法律案') == 3
        # The parenthesised suffix is expected not to prevent a match.
        assert hit_count('法律案A(成立)') == 2

        # Filtering by diet number.
        assert hit_count('法律案', diet_number=100) == 1
        assert hit_count('法律案', diet_number=101) == 0

        # Filtering by bill category.
        assert hit_count('法律案', category=BillCategory.KAKUHOU) == 1
        assert hit_count('法律案', category=BillCategory.SANHOU) == 0
Exemple #5
0
    def test_find(self):
        """Check ``BillFinder.find`` and ``find_one`` by name and bill number."""
        fixtures = [
            Bill({'id': 'Bill:0', 'name': '法律案A'}),
            Bill({'id': 'Bill:1', 'name': '法律案B'}),
            Bill({'id': 'Bill:2', 'name': '法律案A', 'billNumber': '第1号'}),
        ]
        finder = BillFinder(fixtures)

        def hit_count(text):
            # Number of bills returned for ``text``.
            return len(finder.find(text))

        def hit_id(text, position):
            # Id of the bill at ``position`` in the result for ``text``.
            return finder.find(text)[position].id

        # Ambiguous name: two hits in fixture order, and find_one must raise.
        assert hit_count('法律案A') == 2
        assert hit_id('法律案A', 0) == 'Bill:0'
        assert hit_id('法律案A', 1) == 'Bill:2'
        with pytest.raises(ValueError):
            finder.find_one('法律案A')

        # Unique name: one hit, and find_one returns it.
        assert hit_count('法律案B') == 1
        assert hit_id('法律案B', 0) == 'Bill:1'
        assert finder.find_one('法律案B').id == 'Bill:1'

        # Unknown name: no hits, and find_one must raise.
        assert hit_count('法律案C') == 0
        with pytest.raises(ValueError):
            finder.find_one('法律案C')

        # Lookup by bill number.
        assert hit_count('第1号') == 1
        assert hit_id('第1号', 0) == 'Bill:2'

        assert hit_count('第2号') == 0
        # A shared prefix of every name is expected to hit all fixtures.
        assert hit_count('法律案') == 3
        # The parenthesised suffix is expected not to prevent a match.
        assert hit_count('法律案A(成立)') == 2
Exemple #6
0
import argparse
import logging
from urllib.parse import urlparse

import pandas as pd

from politylink.graphql.client import GraphQLClient
from politylink.graphql.schema import Url
from politylink.helpers import BillFinder
from politylink.idgen import idgen

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
# Both objects below are constructed once, at import time, with default
# arguments. NOTE(review): the constructors may contact a backend service —
# confirm before importing this module from other code.
bill_finder = BillFinder()
client = GraphQLClient()


def build_url(url, title):
    """Build a ``Url`` object for *url* and assign it a generated id.

    Args:
        url: URL string; its network location (minus a leading ``www.``)
            is stored as the domain.
        title: Title stored on the resulting object.

    Returns:
        A ``Url`` with ``url``, ``title`` and ``domain`` set, and ``id``
        set to the value ``idgen`` returns for the object.
    """
    domain = urlparse(url).netloc
    # Strip only a leading "www." label. str.replace would also remove the
    # substring from inside a host such as "newwww.example.com".
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    # Use a separate name so the ``url`` string parameter is not rebound
    # to an object of a different type.
    url_obj = Url({'url': url, 'title': title, 'domain': domain})
    url_obj.id = idgen(url_obj)
    return url_obj


def main(fp):
    df = pd.read_csv(fp).fillna('')
    LOGGER.info(f'loaded {len(df)} records from {fp}')

    urls, from_ids, to_ids = [], [], []
    for _, row in df.iterrows():
        try:
            bill = bill_finder.find_one(row['bill'])