def main():
    gql_client = GraphQLClient()
    df = pd.read_csv(args.file)
    if args.delete:
        gql_client.bulk_unlink(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'deleted {len(df)} relationships')
    else:
        gql_client.bulk_link(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'merged {len(df)} relationships')
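# A minimal sketch of the scaffolding main() above assumes. The flag names are
# inferred from the args.file / args.delete references, and the import path of
# GraphQLClient is an assumption, not confirmed by this snippet.
import argparse
import logging

import pandas as pd
from politylink.graphql.client import GraphQLClient  # assumed import path

LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='bulk link/unlink relationships listed in a CSV')
    parser.add_argument('-f', '--file', required=True, help='CSV with from_id and to_id columns')
    parser.add_argument('-d', '--delete', action='store_true', help='unlink instead of link')
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO)
    main()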
def main():
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()
    news_list = gql_client.get_all_news(
        fields=['id', 'title', 'published_at', 'is_timeline'],
        start_date=args.start_date,
        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')
    if args.check_timeline:
        news_list = list(filter(lambda x: x.is_timeline, news_list))
        LOGGER.info(f'filtered {len(news_list)} timeline news')

    stats = defaultdict(int)
    for news in tqdm(news_list):
        LOGGER.info(f'process {news.id}')
        stats['process'] += 1
        try:
            news_text = es_client.get(news.id)
            if not args.skip_minutes:
                LOGGER.debug(f'check Minutes for {news.id}')
                minutes_list = fetch_matched_minutes(news, news_text)
                if minutes_list:
                    gql_client.bulk_link([news.id] * len(minutes_list),
                                         map(lambda x: x['id'], minutes_list))
                    LOGGER.info(f'linked {len(minutes_list)} minutes for {news.id}')
            if not args.skip_bill:
                LOGGER.debug(f'check Bill for {news.id}')
                bill_list = fetch_matched_bills(news, news_text)
                if bill_list:
                    gql_client.bulk_link([news.id] * len(bill_list),
                                         map(lambda x: x['id'], bill_list))
                    LOGGER.info(f'linked {len(bill_list)} bills for {news.id}')
            if not args.skip_timeline:
                LOGGER.debug(f'check Timeline for {news.id}')
                is_timeline = fetch_is_timeline(news, news_text)
                if is_timeline:
                    # need to create new instance to avoid neo4j datetime error
                    updated_news = News(None)
                    updated_news.id = news.id
                    updated_news.is_timeline = is_timeline
                    gql_client.merge(updated_news)
                    LOGGER.info(f'linked {news.id} to timeline')
        except Exception as e:
            stats['fail'] += 1
            if isinstance(e, json.decoder.JSONDecodeError):
                LOGGER.warning(f'failed to parse API response for {news.id}')
            else:
                LOGGER.exception(f'failed to process {news.id}')
    LOGGER.info('processed {} news ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
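# The args.* references above imply this CLI surface (inferred, not confirmed):
# --start-date / --end-date bound the News fetch, --check-timeline restricts
# processing to news already flagged as timeline entries, and --skip-minutes /
# --skip-bill / --skip-timeline disable the corresponding linking passes.
# fetch_matched_minutes, fetch_matched_bills, and fetch_is_timeline are module
# helpers not shown here; the loop only assumes the first two return lists of
# dicts carrying an 'id' key and the last returns a boolean.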
def test_bulk_link(self):
    client = GraphQLClient()
    url = self._build_sample_url()
    bill = self._build_sample_bill()
    minutes = self._build_sample_minutes()
    from_ids = [url.id, url.id]
    to_ids = [bill.id, minutes.id]

    data = client.bulk_link(from_ids, to_ids)
    assert data['op0']['from']['id'] == url.id
    assert data['op0']['to']['id'] == bill.id
    assert data['op1']['from']['id'] == url.id
    assert data['op1']['to']['id'] == minutes.id
    assert url.id in map(lambda x: x.id, client.get(bill.id).urls)
    assert url.id in map(lambda x: x.id, client.get(minutes.id).urls)

    data = client.bulk_unlink(from_ids, to_ids)
    assert data['op0']['from']['id'] == url.id
    assert data['op0']['to']['id'] == bill.id
    assert data['op1']['from']['id'] == url.id
    assert data['op1']['to']['id'] == minutes.id
    assert url.id not in map(lambda x: x.id, client.get(bill.id).urls)
    assert url.id not in map(lambda x: x.id, client.get(minutes.id).urls)
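# Pairwise contract exercised by the test above: bulk_link(from_ids, to_ids)
# creates one edge per index pair from_ids[i] -> to_ids[i] and returns one
# aliased payload per pair ('op0', 'op1', ...); bulk_unlink removes the same
# edges while leaving the nodes themselves in place.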
def main():
    gql_client = GraphQLClient()
    bill_list = gql_client.get_all_bills(['id'] + BILL_DATE_FIELDS)
    LOGGER.info(f'fetched {len(bill_list)} bills')
    minutes_list = gql_client.get_all_minutes(['id'] + MINUTES_DATE_FIELD)
    LOGGER.info(f'fetched {len(minutes_list)} minutes')
    news_list = gql_client.get_all_news(['id', 'is_timeline'] + NEWS_DATE_FIELD,
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news')

    date2bill = build_date_dict(bill_list, BILL_DATE_FIELDS)
    date2minutes = build_date_dict(minutes_list, MINUTES_DATE_FIELD)
    date2news = build_date_dict(news_list, NEWS_DATE_FIELD)

    dates = [
        args.start_date + timedelta(i)
        for i in range((args.end_date - args.start_date).days)
    ]
    for date in tqdm(dates):
        timeline = Timeline(None)
        timeline.date = _Neo4jDateTimeInput(year=date.year, month=date.month, day=date.day)
        timeline.id = idgen(timeline)
        gql_client.merge(timeline)

        from_ids = []
        for bill in date2bill[date]:
            from_ids.append(bill.id)
        for minutes in date2minutes[date]:
            from_ids.append(minutes.id)
        for news in date2news[date]:
            if news.is_timeline:
                from_ids.append(news.id)
        gql_client.bulk_link(from_ids, [timeline.id] * len(from_ids))
        LOGGER.info(f'linked {len(from_ids)} events to {date}')
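# A plausible shape for build_date_dict, which main() above relies on. This is
# an illustrative assumption, not the actual implementation: it buckets each
# object under every date found in its date fields so the per-date loop can
# look bills, minutes, and news up in O(1).
from collections import defaultdict
from datetime import datetime


def build_date_dict(obj_list, date_fields):
    date2objs = defaultdict(list)
    for obj in obj_list:
        for field in date_fields:
            dt = getattr(obj, field, None)
            if dt:  # assume a Neo4j datetime-like value exposing year/month/day
                date2objs[datetime(dt.year, dt.month, dt.day)].append(obj)
    return date2objs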
class SpiderTemplate(scrapy.Spider):
    domain = NotImplemented

    def __init__(self, *args, **kwargs):
        super(SpiderTemplate, self).__init__(*args, **kwargs)
        logging.getLogger('elasticsearch').setLevel(logging.WARNING)
        logging.getLogger('sgqlc').setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        NotImplemented

    def link_urls(self, urls):
        """
        link Url to parent resource
        """
        from_ids, to_ids = [], []
        for url in urls:
            if hasattr(url, 'to_id'):
                from_ids.append(url.id)
                to_ids.append(url.to_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_activities(self, activities):
        """
        link Activity to Member, Bill, and Minutes
        """
        from_ids, to_ids = [], []
        for activity in activities:
            for id_field in ['member_id', 'bill_id', 'minutes_id']:
                if hasattr(activity, id_field):
                    from_ids.append(activity.id)
                    to_ids.append(getattr(activity, id_field))
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_bill_action(self, bill_action_lst):
        """
        link BillAction to Bill, Minutes, and Speech
        """
        from_ids, to_ids = [], []
        for bill_action in bill_action_lst:
            for id_field in ['bill_id', 'minutes_id', 'speech_id']:
                if hasattr(bill_action, id_field):
                    from_ids.append(bill_action.id)
                    to_ids.append(getattr(bill_action, id_field))
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Member, and Committee
        """
        if hasattr(minutes, 'topic_ids'):
            bill_ids = list(filter(lambda x: x, minutes.topic_ids))
            if bill_ids:
                self.gql_client.bulk_link([minutes.id] * len(bill_ids), bill_ids)
                LOGGER.info(f'linked {len(bill_ids)} bills to {minutes.id}')
        if hasattr(minutes, 'speaker_ids'):
            member_ids = list(filter(lambda x: x, minutes.speaker_ids))
            if member_ids:
                self.gql_client.bulk_link(member_ids, [minutes.id] * len(member_ids))
                LOGGER.info(f'linked {len(member_ids)} members to {minutes.id}')
        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)

    def link_speeches(self, speeches):
        from_ids, to_ids = [], []
        for speech in speeches:
            from_ids.append(speech.id)
            to_ids.append(speech.minutes_id)
            if hasattr(speech, 'member_id'):
                from_ids.append(speech.member_id)
                to_ids.append(speech.id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def delete_old_urls(self, src_id, url_title):
        obj = self.gql_client.get(src_id, fields=['urls'])
        for url in obj.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def get_diet(self, diet_number=None):
        if diet_number:
            return self.gql_client.get(f'Diet:{diet_number}', ['id', 'number', 'start_date'])
        else:
            return self.get_latest_diet()

    def get_latest_diet(self):
        diets = sorted(self.gql_client.get_all_diets(['id', 'number', 'start_date']),
                       key=lambda x: x.number)
        return diets[-1]

    def get_topic_ids(self, topics):
        def get_topic_id(topic):
            maybe_bill_number = extract_bill_number_or_none(topic)
            maybe_category = extract_bill_category_or_none(topic)
            try:
                if maybe_bill_number:
                    bill = self.bill_finder.find_one(maybe_bill_number)
                elif maybe_category:
                    bill = self.bill_finder.find_one(topic, category=maybe_category)
                else:
                    bill = self.bill_finder.find_one(topic)
                return bill.id
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when topic does not include bill
                return ''

        return list(map(lambda x: get_topic_id(x), topics))

    def get_speakers_ids(self, speakers):
        def get_speaker_id(speaker):
            try:
                member = self.member_finder.find_one(speaker)
                return member.id
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when speaker is not member
                return ''

        return list(map(lambda x: get_speaker_id(x), speakers))
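# Note on get_topic_ids / get_speakers_ids above: unmatched topics and speakers
# come back as empty strings rather than being dropped, so the returned lists
# stay aligned with the input order; the filter(lambda x: x, ...) calls in
# link_minutes strip those placeholders out before linking.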
class SpiderTemplate(scrapy.Spider):
    domain = NotImplemented

    def __init__(self, *args, **kwargs):
        super(SpiderTemplate, self).__init__(*args, **kwargs)
        logging.getLogger('elasticsearch').setLevel(logging.WARNING)
        logging.getLogger('sgqlc').setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        NotImplemented

    def link_urls(self, urls):
        """
        link Url to parent resource
        """
        from_ids, to_ids = [], []
        for url in urls:
            if hasattr(url, 'to_id'):
                from_ids.append(url.id)
                to_ids.append(url.to_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_activities(self, activities):
        """
        link Activity to Member, Bill, and Minutes
        """
        from_ids, to_ids = [], []
        for activity in activities:
            for id_field in ['member_id', 'bill_id', 'minutes_id']:
                if hasattr(activity, id_field):
                    from_ids.append(activity.id)
                    to_ids.append(getattr(activity, id_field))
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Committee and Member
        """
        self.link_bills_by_topics(minutes)
        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)
        if hasattr(minutes, 'speakers'):
            from_ids = []
            to_ids = []
            for speaker in minutes.speakers:
                try:
                    member = self.member_finder.find_one(speaker)
                except ValueError as e:
                    LOGGER.debug(e)  # this is expected when speaker is not member
                else:
                    from_ids.append(member.id)
                    to_ids.append(minutes.id)
            if from_ids:
                self.gql_client.bulk_link(from_ids, to_ids)

    def link_speeches(self, speeches):
        from_ids, to_ids = [], []
        for speech in speeches:
            from_ids.append(speech.id)
            to_ids.append(speech.minutes_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def store_urls_for_bill(self, urls, bill_query):
        if not urls:
            return
        try:
            bill = self.bill_finder.find_one(bill_query)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.bulk_merge(urls)
            self.gql_client.bulk_link(map(lambda x: x.id, urls), [bill.id] * len(urls))

    def delete_old_urls(self, src_id, url_title):
        obj = self.gql_client.get(src_id)
        for url in obj.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def link_bills_by_topics(self, minutes: Minutes):
        if not hasattr(minutes, 'topics'):
            return
        from_ids, to_ids = [], []
        for topic in minutes.topics:
            try:
                bill = self.bill_finder.find_one(topic)
            except ValueError as e:
                LOGGER.debug(e)  # this is expected when topic does not include bill
            else:
                from_ids.append(minutes.id)
                to_ids.append(bill.id)
                LOGGER.debug(f'link {minutes.id} to {bill.id}')
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)
            LOGGER.info(f'linked {len(from_ids)} bills to {minutes.id}')
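# A hypothetical concrete spider showing how the template hooks above fit
# together; the class name, start URL, and build_minutes helper are
# illustrative assumptions, not part of the codebase.
class ExampleSpider(SpiderTemplate):
    name = 'example'
    domain = 'example.com'
    start_urls = ['https://example.com/minutes']

    def parse(self, response):
        # build a Minutes item from the page (hypothetical helper), merge it,
        # then let the template resolve Bill, Committee, and Member links
        minutes = self.build_minutes(response)
        self.gql_client.merge(minutes)
        self.link_minutes(minutes)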