def __init__(self, committees=None, **kwargs):
    # note: self.search_fields is expected to be defined by the base finder class
    if committees:
        self.committees = committees
    else:
        client = GraphQLClient(**kwargs)
        self.committees = client.get_all_committees(['id'] + self.search_fields)
def main(fp):
    client = GraphQLClient()
    member_finder = MemberFinder(search_fields=['name', 'name_hira'])
    df = pd.read_csv(fp).fillna('')
    LOGGER.info(f'loaded {len(df)} members from {fp}')
    members = []
    for _, row in df.iterrows():
        member = None
        for search_field in ['name', 'name_hira']:
            try:
                member = member_finder.find_one(row[search_field], exact_match=True)
                break
            except ValueError as e:
                LOGGER.debug(e)
        if not member:
            LOGGER.warning(f'failed to find member for row={row}')
            continue
        for link_field in ['website', 'twitter', 'facebook']:
            if row[link_field]:
                setattr(member, link_field, row[link_field])
        members.append(member)
    client.bulk_merge(members)
    LOGGER.info(f'merged {len(members)} member links')
def __init__(self, bills=None, search_fields=None, **kwargs):
    super().__init__(search_fields or ['name', 'bill_number', 'category'])
    if bills:
        self.bills = bills
    else:
        client = GraphQLClient(**kwargs)
        self.bills = client.get_all_bills(['id'] + self.search_fields)
def __init__(self, members=None, search_fields=None, **kwargs):
    super().__init__(search_fields or ['name', 'name_hira'])
    if members:
        self.members = members
    else:
        client = GraphQLClient(**kwargs)
        self.members = client.get_all_members(['id'] + self.search_fields)
def main():
    es_client = ElasticsearchClient()
    gql_client = GraphQLClient()
    query = args.query
    if args.bill:
        bill = gql_client.get(f'Bill:{args.bill}', fields=['id', 'name'])
        query += bill.name
    news_texts = es_client.search(NewsText,
                                  query=query,
                                  start_date_str=args.start,
                                  end_date_str=args.end)
    for news_text in news_texts:
        try:
            news = gql_client.get(news_text.id)
        except GraphQLException as e:
            LOGGER.warning(e)
            continue
        print(news.id)
        print(news.publisher + '@' + news.published_at.formatted)
        print(news.title)
        print(news_text.body[:100])
        print(news.url)
        print()
def main():
    client = GraphQLClient(url="https://graphql.politylink.jp/")
    bills = client.get_all_bills(fields=['id', 'urls'])
    stats = defaultdict(int)
    s3_client = boto3.client('s3')
    os.makedirs("./image/bill", exist_ok=True)
    for bill in tqdm(bills):
        summary_pdf = next(filter(lambda x: x.title == "概要PDF", bill.urls), None)
        LOGGER.debug(f'Processing ... {bill.id}')
        if summary_pdf:
            stats['process'] += 1
            id_ = bill.id.split(':')[-1]
            local_path = f'./image/bill/{id_}.png'
            try:
                response = requests.get(summary_pdf.url)
                save_thumbnail(response, local_path)
            except Exception as e:
                LOGGER.warning(f'failed to convert summary pdf to png: {e}')
                stats['fail'] += 1
                continue
            if args.publish:
                s3_path = f'bill/{id_}.png'
                s3_client.upload_file(local_path, 'politylink', s3_path,
                                      ExtraArgs={'ContentType': 'image/png'})
            time.sleep(1)
    LOGGER.info('processed {} bills ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
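# save_thumbnail is called above but not defined in this snippet. A minimal
# sketch of what it might look like, assuming pdf2image is available and the
# first PDF page is the desired thumbnail (the real helper may differ):
from pdf2image import convert_from_bytes


def save_thumbnail(response, local_path):
    # render only the first page of the downloaded PDF to an image
    pages = convert_from_bytes(response.content, first_page=1, last_page=1)
    pages[0].save(local_path, 'PNG')
    return local_path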
def __init__(self, committees=None, search_fields=None, **kwargs):
    super().__init__(search_fields or ['name', 'aliases'])
    if committees:
        self.committees = committees
    else:
        client = GraphQLClient(**kwargs)
        self.committees = client.get_all_committees(['id'] + self.search_fields)
def test_bulk_get(self):
    client = GraphQLClient()
    bill = self._build_sample_bill()
    bills = client.bulk_get([bill.id])
    assert len(bills) == 1
    assert bill.id == bills[0].id
def get_member_names(self):
    client = GraphQLClient()
    fields = ['first_name', 'first_name_hira', 'last_name', 'last_name_hira']
    members = client.get_all_members(fields=fields)
    member_names = [m[f] for f in fields for m in members]
    return member_names
def test_bulk_merge(self):
    client = GraphQLClient()
    bill = self._build_sample_bill()
    url = self._build_sample_url()
    data = client.bulk_merge([bill, url])
    assert data['op0']['id'] == bill.id
    assert data['op1']['id'] == url.id
def __init__(self, *args, **kwargs):
    super(SpiderTemplate, self).__init__(*args, **kwargs)
    logging.getLogger('elasticsearch').setLevel(logging.WARNING)
    logging.getLogger('sgqlc').setLevel(logging.WARNING)
    self.gql_client = GraphQLClient()
    self.es_client = ElasticsearchClient()
    self.bill_finder = BillFinder()
    self.minutes_finder = MinutesFinder()
    self.committee_finder = CommitteeFinder()
    self.member_finder = MemberFinder()
def test_merge(self):
    client = GraphQLClient()

    bill = self._build_sample_bill()
    data = client.merge(bill)
    assert data['MergeBill']['id'] == bill.id

    url = self._build_sample_url()
    data = client.merge(url)
    assert data['MergeUrl']['id'] == url.id

    news = self._build_sample_news()
    data = client.merge(news)
    assert data['MergeNews']['id'] == news.id

    minutes = self._build_sample_minutes()
    data = client.merge(minutes)
    assert data['MergeMinutes']['id'] == minutes.id

    committee = self._build_sample_committee()
    data = client.merge(committee)
    assert data['MergeCommittee']['id'] == committee.id

    speech = self._build_sample_speech()
    data = client.merge(speech)
    assert data['MergeSpeech']['id'] == speech.id

    timeline = self._build_sample_timeline()
    data = client.merge(timeline)
    assert data['MergeTimeline']['id'] == timeline.id
def main():
    client = GraphQLClient(url="https://graphql.politylink.jp/")
    members = client.get_all_members(fields=['id', 'image'])
    s3 = boto3.resource('s3')
    for member in tqdm(members):
        response = requests.get(member.image)
        object_key = 'member/{}.jpg'.format(member.id.split(':')[-1])
        s3.Bucket('politylink').put_object(Key=object_key,
                                           Body=response.content,
                                           ContentType="image/jpeg")
        time.sleep(1)
def test_exec(self):
    client = GraphQLClient()
    query = """
    {
        Bill {
            name
            billNumber
        }
    }
    """
    data = client.exec(query)
    assert 'Bill' in data
def __init__(self, bills=None):
    if bills:
        self.bills = bills
    else:
        client = GraphQLClient()
        self.bills = client.get_all_bills(['id', 'name', 'bill_number'])
    self.name2bills = defaultdict(list)
    self.number2bill = defaultdict(list)
    for bill in self.bills:
        if hasattr(bill, 'name'):
            self.name2bills[bill.name].append(bill)
        if hasattr(bill, 'bill_number'):
            self.number2bill[bill.bill_number].append(bill)
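# A hypothetical lookup method consistent with the indexes built above (not
# part of the original snippet): exact match on bill name or bill number.
def find(self, key):
    # .get avoids inserting empty entries into the defaultdicts
    return self.name2bills.get(key) or self.number2bill.get(key) or []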
def find(self, text, dt=None):
    op = Operation(Query)
    minutes_filter = _MinutesFilter(None)
    minutes_filter.name_contains = text
    if dt:
        minutes_filter.start_date_time = _Neo4jDateTimeInput(year=dt.year,
                                                             month=dt.month,
                                                             day=dt.day)
    op.minutes(filter=minutes_filter)
    data = self.client.endpoint(op)
    GraphQLClient.validate_response_or_raise(data)
    minutes = (op + data).minutes
    return minutes
def test_build_input(self):
    bill_id = 'Bill:id'
    bill_input = GraphQLClient.build_input(bill_id)
    assert isinstance(bill_input, _BillInput)
    assert bill_input.id == bill_id

    minutes_id = 'Minutes:id'
    minutes_input = GraphQLClient.build_input(minutes_id)
    assert isinstance(minutes_input, _MinutesInput)
    assert minutes_input.id == minutes_id

    with pytest.raises(GraphQLException):
        GraphQLClient.build_input('invalid:id')
def test_get_all_bills(self):
    client = GraphQLClient()
    bill = self._build_sample_bill()
    client.merge(bill)
    query = '公文書'
    bills = client.get_all_bills(fields=['id', 'name'],
                                 filter_=_BillFilter({'name_contains': query}))
    assert len(bills) > 0
    for bill in bills:
        assert isinstance(bill, Bill)
        assert query in bill.name
def test_bulk_link(self):
    client = GraphQLClient()
    url = self._build_sample_url()
    bill = self._build_sample_bill()
    minutes = self._build_sample_minutes()
    from_ids = [url.id, url.id]
    to_ids = [bill.id, minutes.id]

    data = client.bulk_link(from_ids, to_ids)
    assert data['op0']['from']['id'] == url.id
    assert data['op0']['to']['id'] == bill.id
    assert data['op1']['from']['id'] == url.id
    assert data['op1']['to']['id'] == minutes.id
    assert url.id in map(lambda x: x.id, client.get(bill.id).urls)
    assert url.id in map(lambda x: x.id, client.get(minutes.id).urls)

    data = client.bulk_unlink(from_ids, to_ids)
    assert data['op0']['from']['id'] == url.id
    assert data['op0']['to']['id'] == bill.id
    assert data['op1']['from']['id'] == url.id
    assert data['op1']['to']['id'] == minutes.id
    assert url.id not in map(lambda x: x.id, client.get(bill.id).urls)
    assert url.id not in map(lambda x: x.id, client.get(minutes.id).urls)
def main():
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()
    news_list = gql_client.get_all_news(
        fields=['id', 'title', 'published_at', 'is_timeline'],
        start_date=args.start_date,
        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')
    if args.check_timeline:
        news_list = list(filter(lambda x: x.is_timeline, news_list))
        LOGGER.info(f'filtered {len(news_list)} timeline news')

    stats = defaultdict(int)
    for news in tqdm(news_list):
        LOGGER.info(f'process {news.id}')
        stats['process'] += 1
        try:
            news_text = es_client.get(news.id)
            if not args.skip_minutes:
                LOGGER.debug(f'check Minutes for {news.id}')
                minutes_list = fetch_matched_minutes(news, news_text)
                if minutes_list:
                    gql_client.bulk_link([news.id] * len(minutes_list),
                                         map(lambda x: x['id'], minutes_list))
                    LOGGER.info(f'linked {len(minutes_list)} minutes for {news.id}')
            if not args.skip_bill:
                LOGGER.debug(f'check Bill for {news.id}')
                bill_list = fetch_matched_bills(news, news_text)
                if bill_list:
                    gql_client.bulk_link([news.id] * len(bill_list),
                                         map(lambda x: x['id'], bill_list))
                    LOGGER.info(f'linked {len(bill_list)} bills for {news.id}')
            if not args.skip_timeline:
                LOGGER.debug(f'check Timeline for {news.id}')
                is_timeline = fetch_is_timeline(news, news_text)
                if is_timeline:
                    # need to create new instance to avoid neo4j datetime error
                    updated_news = News(None)
                    updated_news.id = news.id
                    updated_news.is_timeline = is_timeline
                    gql_client.merge(updated_news)
                    LOGGER.info(f'linked {news.id} to timeline')
        except Exception as e:
            stats['fail'] += 1
            if isinstance(e, json.decoder.JSONDecodeError):
                LOGGER.warning(f'failed to parse API response for {news.id}')
            else:
                LOGGER.exception(f'failed to process {news.id}')
    LOGGER.info('processed {} news ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
def main(fp):
    gql_client = GraphQLClient()
    df = pd.read_csv(fp)
    diets = []
    for _, row in df.iterrows():
        diet = Diet(None)
        diet.number = int(row['number'])
        diet.name = f'第{diet.number}回国会'
        diet.category = row['category']
        diet.start_date = to_neo4j_datetime(row['start_date'])
        diet.end_date = to_neo4j_datetime(row['end_date'])
        diet.id = idgen(diet)
        diets.append(diet)
    gql_client.bulk_merge(diets)
    LOGGER.info(f'merged {len(diets)} diets')
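# to_neo4j_datetime is used above but not defined in this snippet. A minimal
# sketch under the assumption that the CSV stores ISO dates like '2020-01-20'
# (the real helper may accept other formats):
from datetime import datetime


def to_neo4j_datetime(date_str):
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return _Neo4jDateTimeInput(year=dt.year, month=dt.month, day=dt.day)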
def get_speech_contexts(json_path=None):
    speech_contexts = []

    # add contexts from json
    if json_path:
        speech_contexts += json.load(open(json_path, 'r')).values()

    # add contexts from GraphQL
    gql_client = GraphQLClient()
    members = gql_client.get_all_members(['name'])
    if members:
        member_context = {
            'phrases': [member.name for member in members],
            'boost': 20.0
        }
        speech_contexts.append(member_context)

    return speech_contexts
def main():
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()
    news_list = gql_client.get_all_news(fields=['id', 'published_at'],
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')
    if news_list:
        news_text_list = [
            NewsText({'id': news.id, 'date': to_date_str(news.published_at)})
            for news in news_list
        ]
        es_client.bulk_index(news_text_list, op_type=OpType.UPDATE)
        LOGGER.info(f're-indexed {len(news_text_list)} news text')
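# to_date_str is assumed above; a hypothetical sketch that formats a Neo4j
# datetime value (assumed to expose year/month/day attributes) as 'YYYY-MM-DD':
def to_date_str(neo4j_dt):
    return f'{neo4j_dt.year:04d}-{neo4j_dt.month:02d}-{neo4j_dt.day:02d}'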
def test_get(self):
    client = GraphQLClient()
    url = self._build_sample_url()
    bill = self._build_sample_bill()
    news = self._build_sample_news()
    speech = self._build_sample_speech()
    minutes = self._build_sample_minutes()
    committee = self._build_sample_committee()
    timeline = self._build_sample_timeline()
    for obj in [url, bill, news, speech, minutes, committee, timeline]:
        client.merge(obj)
        ret = client.get(obj.id)
        assert ret.id == obj.id

    with pytest.raises(GraphQLException):
        client.get('invalid:class')
    with pytest.raises(GraphQLException):
        client.get('bill:invalid')
def main():
    gql_client = GraphQLClient()
    df = pd.read_csv(args.file)
    if args.delete:
        gql_client.bulk_unlink(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'deleted {len(df)} relationships')
    else:
        gql_client.bulk_link(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'merged {len(df)} relationships')
def test_show_ops(self):
    bill = self._build_sample_bill()
    url = self._build_sample_url()
    LOGGER.warning(GraphQLClient.build_merge_operation(bill))
    LOGGER.warning(GraphQLClient.build_link_operation(url.id, bill.id))
    LOGGER.warning(GraphQLClient.build_link_operation(url.id, bill.id, remove=True))
    LOGGER.warning(GraphQLClient.build_get_operation(bill.id, ['id', 'name']))
    LOGGER.warning(GraphQLClient.build_bulk_get_operation(ids=['Bill:1', 'Bill:2'],
                                                          fields=['id', 'name']))
    LOGGER.warning(GraphQLClient.build_delete_operation(bill.id))
    LOGGER.warning(GraphQLClient.build_get_all_operation(
        'bill', ['id', 'name'], _BillFilter({'name_contains': '公文書'})))

    bulk_op = Operation(Mutation)
    bulk_op = GraphQLClient.build_merge_operation(bill, bulk_op)
    bulk_op = GraphQLClient.build_link_operation(url.id, bill.id, bulk_op)
    LOGGER.warning(bulk_op)
def main():
    gql_client = GraphQLClient()
    bill_list = gql_client.get_all_bills(['id'] + BILL_DATE_FIELDS)
    LOGGER.info(f'fetched {len(bill_list)} bills')
    minutes_list = gql_client.get_all_minutes(['id'] + MINUTES_DATE_FIELD)
    LOGGER.info(f'fetched {len(minutes_list)} minutes')
    news_list = gql_client.get_all_news(['id', 'is_timeline'] + NEWS_DATE_FIELD,
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news')

    date2bill = build_date_dict(bill_list, BILL_DATE_FIELDS)
    date2minutes = build_date_dict(minutes_list, MINUTES_DATE_FIELD)
    date2news = build_date_dict(news_list, NEWS_DATE_FIELD)

    dates = [args.start_date + timedelta(i)
             for i in range((args.end_date - args.start_date).days)]
    for date in tqdm(dates):
        timeline = Timeline(None)
        timeline.date = _Neo4jDateTimeInput(year=date.year,
                                            month=date.month,
                                            day=date.day)
        timeline.id = idgen(timeline)
        gql_client.merge(timeline)

        from_ids = []
        for bill in date2bill[date]:
            from_ids.append(bill.id)
        for minutes in date2minutes[date]:
            from_ids.append(minutes.id)
        for news in date2news[date]:
            if news.is_timeline:
                from_ids.append(news.id)
        gql_client.bulk_link(from_ids, [timeline.id] * len(from_ids))
        LOGGER.info(f'linked {len(from_ids)} events to {date}')
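# build_date_dict is used above but not defined in this snippet. A minimal
# sketch, assuming each date field (when present) carries year/month/day
# attributes and that the dict is keyed by datetime.date (the real helper
# may differ):
from collections import defaultdict
from datetime import date as date_type


def build_date_dict(objects, date_fields):
    date2objects = defaultdict(list)
    for obj in objects:
        for field in date_fields:
            value = getattr(obj, field, None)
            if value:
                # index the object under the calendar date of this field
                date2objects[date_type(value.year, value.month, value.day)].append(obj)
    return date2objects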
def test_link(self):
    client = GraphQLClient()
    url = self._build_sample_url()
    bill = self._build_sample_bill()

    data = client.link(url.id, bill.id)
    assert data['MergeUrlReferredBills']['from']['id'] == url.id
    assert data['MergeUrlReferredBills']['to']['id'] == bill.id
    assert url.id in map(lambda x: x.id, client.get(bill.id).urls)

    data = client.unlink(url.id, bill.id)
    assert data['RemoveUrlReferredBills']['from']['id'] == url.id
    assert data['RemoveUrlReferredBills']['to']['id'] == bill.id
    assert url.id not in map(lambda x: x.id, client.get(bill.id).urls)
class MinutesFinder:
    """GraphQL based Minutes finder"""

    def __init__(self, **kwargs):
        self.client = GraphQLClient(**kwargs)

    def find(self, text, dt=None):
        op = Operation(Query)
        minutes_filter = _MinutesFilter(None)
        minutes_filter.name_contains = text
        if dt:
            minutes_filter.start_date_time = _Neo4jDateTimeInput(year=dt.year,
                                                                 month=dt.month,
                                                                 day=dt.day)
        minutes = op.minutes(filter=minutes_filter)
        minutes.id()
        minutes.name()
        data = self.client.endpoint(op)
        GraphQLClient.validate_response_or_raise(data)
        minutes = (op + data).minutes
        return minutes
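# Example usage of MinutesFinder (a sketch; the meeting name and date below
# are illustrative, not taken from the original code):
from datetime import datetime

finder = MinutesFinder()
for minutes in finder.find('本会議', dt=datetime(2020, 1, 20)):
    print(minutes.id, minutes.name)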
def main():
    gql_client = GraphQLClient()
    objects = []
    if args.bill:
        bills = gql_client.get_all_bills(['id', 'news'])
        LOGGER.info(f'fetched {len(bills)} bills to clean')
        objects += bills
    if args.minutes:
        minutes_list = gql_client.get_all_minutes(['id', 'news'])
        LOGGER.info(f'fetched {len(minutes_list)} minutes to clean')
        objects += minutes_list
    LOGGER.info(f'registered {len(objects)} objects to clean')
    for obj in objects:
        news_ids = [news.id for news in obj.news]
        if news_ids:
            gql_client.bulk_unlink(news_ids, [obj.id] * len(news_ids))
            LOGGER.info(f'removed {len(news_ids)} news links from {obj.id}')