def test_find(self):
    """Check the bills ``BillFinder.find`` returns for name and bill-number queries."""
    records = (
        {'id': 'Bill:0', 'name': '法律案A'},
        {'id': 'Bill:1', 'name': '法律案B'},
        {'id': 'Bill:2', 'name': '法律案A', 'billNumber': '第1号'},
    )
    bill_finder = BillFinder([Bill(record) for record in records])

    # Each query is paired with the ids expected back, in result order;
    # an empty list means the query is expected to match nothing.
    expectations = (
        ('法律案A', ['Bill:0', 'Bill:2']),
        ('法律案B', ['Bill:1']),
        ('法律案C', []),
        ('第1号', ['Bill:2']),
        ('第2号', []),
    )
    for query, expected_ids in expectations:
        hits = bill_finder.find(query)
        assert [hit.id for hit in hits] == expected_ids
def __init__(self, *args, **kwargs):
    """Initialise the spider, quiet chatty loggers and create clients and finders.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(SpiderTemplate, self).__init__(*args, **kwargs)

    # Raise the threshold of third-party loggers so that only warnings
    # and above are emitted by them.
    for noisy_logger in ('elasticsearch', 'sgqlc'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    # Backend clients.
    self.gql_client = GraphQLClient()
    self.es_client = ElasticsearchClient()

    # Entity finders, created with their default arguments.
    self.bill_finder = BillFinder()
    self.minutes_finder = MinutesFinder()
    self.committee_finder = CommitteeFinder()
    self.member_finder = MemberFinder()
def test_match(self):
    """Exercise ``is_text_match`` on the ``name`` and ``aliases`` search fields."""
    bill_finder = BillFinder(bills=[], search_fields=['name', 'aliases'])

    # (bill fields, query text, exact_match flag, whether a match is expected)
    cases = (
        ({'name': '法律案A'}, '法律案', False, True),
        ({'name': '法律案A'}, '法律案A', True, True),
        ({'name': '法律案A'}, '法律案', True, False),
        ({'aliases': ['猫ちゃん', '法律案']}, '法律案', True, True),
        ({'aliases': ['法律', '案']}, '法律案', True, False),
    )
    for fields, text, exact, should_match in cases:
        outcome = bill_finder.is_text_match(Bill(fields), text=text, exact_match=exact)
        # Compare on truthiness, as a plain ``assert`` / ``assert not`` would.
        assert bool(outcome) is should_match
def test_find(self):
    """Exercise ``find`` and ``find_one`` with text, diet-number and category queries."""
    records = (
        {'id': 'Bill:0', 'name': '法律案A'},
        {'id': 'Bill:1', 'name': '法律案B'},
        {
            'id': 'Bill:2',
            'name': '法律案A',
            'billNumber': '第100回国会閣法第1号',
            'category': 'KAKUHOU',
        },
    )
    bill_finder = BillFinder([Bill(record) for record in records])

    def ids_for(text, **filters):
        """Return the ids of the bills found for *text*, in result order."""
        return [bill.id for bill in bill_finder.find(text, **filters)]

    # Two bills share this name, so find_one is expected to raise.
    assert ids_for('法律案A') == ['Bill:0', 'Bill:2']
    with pytest.raises(ValueError):
        bill_finder.find_one('法律案A')

    # Exactly one bill has this name.
    assert ids_for('法律案B') == ['Bill:1']
    assert bill_finder.find_one('法律案B').id == 'Bill:1'

    # No bill has this name, so find_one is expected to raise as well.
    assert ids_for('法律案C') == []
    with pytest.raises(ValueError):
        bill_finder.find_one('法律案C')

    # Queries that look like bill numbers.
    assert ids_for('第1号') == ['Bill:2']
    assert ids_for('第2号') == []

    # Only the number of hits is checked for these queries.
    assert len(ids_for('法律案')) == 3
    assert len(ids_for('法律案A(成立)')) == 2

    # Extra keyword filters and the number of hits expected with each.
    filter_cases = (
        ({'diet_number': 100}, 1),
        ({'diet_number': 101}, 0),
        ({'category': BillCategory.KAKUHOU}, 1),
        ({'category': BillCategory.SANHOU}, 0),
    )
    for filters, expected_count in filter_cases:
        assert len(ids_for('法律案', **filters)) == expected_count
def test_find(self):
    """Exercise ``find`` and ``find_one`` with name and bill-number queries."""
    records = (
        {'id': 'Bill:0', 'name': '法律案A'},
        {'id': 'Bill:1', 'name': '法律案B'},
        {'id': 'Bill:2', 'name': '法律案A', 'billNumber': '第1号'},
    )
    bill_finder = BillFinder([Bill(record) for record in records])

    def ids_for(text):
        """Return the ids of the bills found for *text*, in result order."""
        return [bill.id for bill in bill_finder.find(text)]

    # Two bills share this name, so find_one is expected to raise.
    assert ids_for('法律案A') == ['Bill:0', 'Bill:2']
    with pytest.raises(ValueError):
        bill_finder.find_one('法律案A')

    # Exactly one bill has this name.
    assert ids_for('法律案B') == ['Bill:1']
    assert bill_finder.find_one('法律案B').id == 'Bill:1'

    # No bill has this name, so find_one is expected to raise as well.
    assert ids_for('法律案C') == []
    with pytest.raises(ValueError):
        bill_finder.find_one('法律案C')

    # Queries that look like bill numbers.
    assert ids_for('第1号') == ['Bill:2']
    assert ids_for('第2号') == []

    # Only the number of hits is checked for these queries.
    for query, expected_count in (('法律案', 3), ('法律案A(成立)', 2)):
        assert len(ids_for(query)) == expected_count
import argparse import logging from urllib.parse import urlparse import pandas as pd from politylink.graphql.client import GraphQLClient from politylink.graphql.schema import Url from politylink.helpers import BillFinder from politylink.idgen import idgen LOGGER = logging.getLogger(__name__) bill_finder = BillFinder() client = GraphQLClient() def build_url(url, title): domain = urlparse(url).netloc.replace('www.', '') url = Url({'url': url, 'title': title, 'domain': domain}) url.id = idgen(url) return url def main(fp): df = pd.read_csv(fp).fillna('') LOGGER.info(f'loaded {len(df)} records from {fp}') urls, from_ids, to_ids = [], [], [] for _, row in df.iterrows(): try: bill = bill_finder.find_one(row['bill'])