Example 1
 def __init__(self, committees=None, **kwargs):
     # NOTE: unlike Examples 3, 4 and 7, this variant never sets
     # self.search_fields, so the base class must define it for the
     # else branch below to work
     if committees:
         self.committees = committees
     else:
         client = GraphQLClient(**kwargs)
         self.committees = client.get_all_committees(['id'] +
                                                     self.search_fields)
Example 2
def main(fp):
    client = GraphQLClient()
    member_finder = MemberFinder(search_fields=['name', 'name_hira'])

    df = pd.read_csv(fp).fillna('')
    LOGGER.info(f'load {len(df)} members from {fp}')

    members = []
    for _, row in df.iterrows():
        member = None
        for search_field in ['name', 'name_hira']:
            try:
                member = member_finder.find_one(row[search_field],
                                                exact_match=True)
                break
            except ValueError as e:
                LOGGER.debug(e)
        if not member:
            LOGGER.warning(f'failed to find member for row={row}')
            continue
        for link_field in ['website', 'twitter', 'facebook']:
            if row[link_field]:
                setattr(member, link_field, row[link_field])
        members.append(member)
    client.bulk_merge(members)
    LOGGER.info(f'merged {len(members)} member links')
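
The input CSV for this script needs the two search columns and the three link columns referenced above; a minimal hypothetical input file (column names are taken from the search_field and link_field lists, all values are made up):

import pandas as pd

pd.DataFrame([{
    'name': '山田太郎',            # kanji name, tried first
    'name_hira': 'やまだたろう',    # hiragana reading, tried second
    'website': 'https://example.com',
    'twitter': 'https://twitter.com/example',
    'facebook': '',               # blank cells are fine: fillna('') above
}]).to_csv('members.csv', index=False)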
Example 3
 def __init__(self, bills=None, search_fields=None, **kwargs):
     super().__init__(search_fields or ['name', 'bill_number', 'category'])
     if bills:
         self.bills = bills
     else:
         client = GraphQLClient(**kwargs)
         self.bills = client.get_all_bills(['id'] + self.search_fields)
Example 4
 def __init__(self, members=None, search_fields=None, **kwargs):
     super().__init__(search_fields or ['name', 'name_hira'])
     if members:
         self.members = members
     else:
         client = GraphQLClient(**kwargs)
         self.members = client.get_all_members(['id'] + self.search_fields)
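
Examples 1, 3 and 4 (and Example 7 below) call super().__init__(search_fields or [...]), but the base class itself never appears in this listing. A minimal sketch of what it plausibly looks like, reconstructed from the call sites; the class name Finder, the _records hook and the find_one body are assumptions (Example 2 only shows that find_one takes exact_match and raises ValueError on a miss):

class Finder:
    """Hypothetical base class shared by the *Finder examples."""

    def __init__(self, search_fields):
        # stored so subclasses can request ['id'] + self.search_fields
        self.search_fields = search_fields

    def _records(self):
        # subclasses keep their data in self.bills / self.members /
        # self.committees; a real base class would unify the access
        raise NotImplementedError

    def find_one(self, text, exact_match=False):
        for record in self._records():
            for field in self.search_fields:
                value = getattr(record, field, '') or ''
                if value == text or (not exact_match and text in value):
                    return record
        raise ValueError(f'no record matches {text!r}')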
Example 5
def main():
    es_client = ElasticsearchClient()
    gql_client = GraphQLClient()

    query = args.query
    if args.bill:
        bill = gql_client.get(f'Bill:{args.bill}', fields=['id', 'name'])
        query += bill.name

    news_texts = es_client.search(NewsText,
                                  query=query,
                                  start_date_str=args.start,
                                  end_date_str=args.end)
    for news_text in news_texts:
        try:
            news = gql_client.get(news_text.id)
        except GraphQLException as e:
            LOGGER.warning(e)
            continue
        print(news.id)
        print(news.publisher + '@' + news.published_at.formatted)
        print(news.title)
        print(news_text.body[:100])
        print(news.url)
        print()
Example 6
def main():
    client = GraphQLClient(url="https://graphql.politylink.jp/")
    bills = client.get_all_bills(fields=['id', 'urls'])
    stats = defaultdict(int)
    s3_client = boto3.client('s3')
    os.makedirs("./image/bill", exist_ok=True)

    for bill in tqdm(bills):
        summary_pdf = next(filter(lambda x: x.title == "概要PDF", bill.urls),
                           None)
        LOGGER.debug(f'Processing ... {bill.id}')
        if summary_pdf:
            stats['process'] += 1
            id_ = bill.id.split(':')[-1]
            local_path = f'./image/bill/{id_}.png'
            try:
                response = requests.get(summary_pdf.url)
                thumbnail = save_thumbnail(response, local_path)
            except Exception as e:
                LOGGER.warning(f'failed to convert summary pdf to png: {e}')
                stats['fail'] += 1
                continue
            if args.publish:
                s3_path = f'bill/{id_}.png'
                s3_client.upload_file(local_path,
                                      'politylink',
                                      s3_path,
                                      ExtraArgs={'ContentType': 'image/png'})
            time.sleep(1)
    LOGGER.info('processed {} bills ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
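
save_thumbnail is not defined anywhere in this listing. A hedged sketch of one way to implement it, assuming the pdf2image package (a poppler wrapper) is acceptable:

from pdf2image import convert_from_bytes

def save_thumbnail(response, local_path):
    # render the first page of the downloaded PDF and save it as a PNG
    pages = convert_from_bytes(response.content)
    pages[0].save(local_path, 'PNG')
    return pages[0]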
Example 7
 def __init__(self, committees=None, search_fields=None, **kwargs):
     super().__init__(search_fields or ['name', 'aliases'])
     if committees:
         self.committees = committees
     else:
         client = GraphQLClient(**kwargs)
         self.committees = client.get_all_committees(['id'] +
                                                     self.search_fields)
Example 8
    def test_bulk_get(self):
        client = GraphQLClient()

        bill = self._build_sample_bill()

        bills = client.bulk_get([bill.id])
        assert len(bills) == 1
        assert bill.id == bills[0].id
Example 9
 def get_member_names(self):
     client = GraphQLClient()
     fields = [
         'first_name', 'first_name_hira', 'last_name', 'last_name_hira'
     ]
     members = client.get_all_members(fields=fields)
     # field-major order: every member's first_name, then every
     # member's first_name_hira, and so on
     member_names = [m[f] for f in fields for m in members]
     return member_names
Example 10
    def test_bulk_merge(self):
        client = GraphQLClient()

        bill = self._build_sample_bill()
        url = self._build_sample_url()

        data = client.bulk_merge([bill, url])
        assert data['op0']['id'] == bill.id
        assert data['op1']['id'] == url.id
Example 11
 def __init__(self, *args, **kwargs):
     super(SpiderTemplate, self).__init__(*args, **kwargs)
     logging.getLogger('elasticsearch').setLevel(logging.WARNING)
     logging.getLogger('sgqlc').setLevel(logging.WARNING)
     self.gql_client = GraphQLClient()
     self.es_client = ElasticsearchClient()
     self.bill_finder = BillFinder()
     self.minutes_finder = MinutesFinder()
     self.committee_finder = CommitteeFinder()
     self.member_finder = MemberFinder()
Example 12
    def test_merge(self):
        client = GraphQLClient()

        bill = self._build_sample_bill()
        data = client.merge(bill)
        assert data['MergeBill']['id'] == bill.id

        url = self._build_sample_url()
        data = client.merge(url)
        assert data['MergeUrl']['id'] == url.id

        news = self._build_sample_news()
        data = client.merge(news)
        assert data['MergeNews']['id'] == news.id

        minutes = self._build_sample_minutes()
        data = client.merge(minutes)
        assert data['MergeMinutes']['id'] == minutes.id

        committee = self._build_sample_committee()
        data = client.merge(committee)
        assert data['MergeCommittee']['id'] == committee.id

        speech = self._build_sample_speech()
        data = client.merge(speech)
        assert data['MergeSpeech']['id'] == speech.id

        timeline = self._build_sample_timeline()
        data = client.merge(timeline)
        assert data['MergeTimeline']['id'] == timeline.id
Example 13
def main():
    client = GraphQLClient(url="https://graphql.politylink.jp/")
    members = client.get_all_members(fields=['id', 'image'])
    s3 = boto3.resource('s3')
    for member in tqdm(members):
        response = requests.get(member.image)
        object_key = 'member/{}.jpg'.format(member.id.split(':')[-1])
        s3.Bucket('politylink').put_object(Key=object_key,
                                           Body=response.content,
                                           ContentType="image/jpeg")
        time.sleep(1)
Example 14
 def test_exec(self):
     client = GraphQLClient()
     query = """
     {
         Bill {
             name
             billNumber
         }
     }
     """
     data = client.exec(query)
     assert 'Bill' in data
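
exec returns the decoded data portion of the response as plain dicts, as the assertion suggests; consuming the result would look roughly like this (assuming each entry carries the two fields selected in the query above):

for bill in data['Bill']:
    print(bill['name'], bill['billNumber'])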
Example 15
 def __init__(self, bills=None):
     if bills:
         self.bills = bills
     else:
         client = GraphQLClient()
         self.bills = client.get_all_bills(['id', 'name', 'bill_number'])
     self.name2bills = defaultdict(list)
     self.number2bill = defaultdict(list)
     for bill in self.bills:
         if hasattr(bill, 'name'):
             self.name2bills[bill.name].append(bill)
         if hasattr(bill, 'bill_number'):
             self.number2bill[bill.bill_number].append(bill)
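
The lookup side of these indexes is not part of the example; a hypothetical companion method built on number2bill (the name and error behavior are assumptions, modeled on the ValueError convention seen in Example 2):

 def find_by_number(self, bill_number):
     # exact lookup in the index built in __init__ above
     bills = self.number2bill.get(bill_number, [])
     if not bills:
         raise ValueError(f'no bill found for {bill_number!r}')
     return bills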
Example 16
    def find(self, text, dt=None):
        op = Operation(Query)
        minutes_filter = _MinutesFilter(None)
        minutes_filter.name_contains = text
        if dt:
            minutes_filter.start_date_time = _Neo4jDateTimeInput(
                year=dt.year, month=dt.month, day=dt.day)
        # no explicit field selection; sgqlc then auto-selects fields
        # (compare Example 29, which selects id and name explicitly)
        op.minutes(filter=minutes_filter)

        data = self.client.endpoint(op)
        GraphQLClient.validate_response_or_raise(data)
        minutes = (op + data).minutes
        return minutes
Example 17
    def test_build_input(self):
        bill_id = 'Bill:id'
        bill_input = GraphQLClient.build_input(bill_id)
        assert isinstance(bill_input, _BillInput)
        assert bill_input.id == bill_id

        minutes_id = 'Minutes:id'
        minutes_input = GraphQLClient.build_input(minutes_id)
        assert isinstance(minutes_input, _MinutesInput)
        assert minutes_input.id == minutes_id

        with pytest.raises(GraphQLException):
            GraphQLClient.build_input('invalid:id')
Example 18
    def test_get_all_bills(self):
        client = GraphQLClient()
        bill = self._build_sample_bill()
        client.merge(bill)
        query = '公文書'
        bills = client.get_all_bills(fields=['id', 'name'],
                                     filter_=_BillFilter(
                                         {'name_contains': query}))

        assert len(bills) > 0
        for bill in bills:
            assert isinstance(bill, Bill)
            assert query in bill.name
Example 19
    def test_bulk_link(self):
        client = GraphQLClient()

        url = self._build_sample_url()
        bill = self._build_sample_bill()
        minutes = self._build_sample_minutes()

        from_ids = [url.id, url.id]
        to_ids = [bill.id, minutes.id]

        data = client.bulk_link(from_ids, to_ids)
        assert data['op0']['from']['id'] == url.id
        assert data['op0']['to']['id'] == bill.id
        assert data['op1']['from']['id'] == url.id
        assert data['op1']['to']['id'] == minutes.id
        assert url.id in map(lambda x: x.id, client.get(bill.id).urls)
        assert url.id in map(lambda x: x.id, client.get(minutes.id).urls)

        data = client.bulk_unlink(from_ids, to_ids)
        assert data['op0']['from']['id'] == url.id
        assert data['op0']['to']['id'] == bill.id
        assert data['op1']['from']['id'] == url.id
        assert data['op1']['to']['id'] == minutes.id
        assert url.id not in map(lambda x: x.id, client.get(bill.id).urls)
        assert url.id not in map(lambda x: x.id, client.get(minutes.id).urls)
Example 20
def main():
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()

    news_list = gql_client.get_all_news(
        fields=['id', 'title', 'published_at', 'is_timeline'],
        start_date=args.start_date,
        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')

    if args.check_timeline:
        news_list = list(filter(lambda x: x.is_timeline, news_list))
        LOGGER.info(f'filtered {len(news_list)} timeline news')

    stats = defaultdict(int)
    for news in tqdm(news_list):
        LOGGER.info(f'process {news.id}')
        stats['process'] += 1
        try:
            news_text = es_client.get(news.id)
            if not args.skip_minutes:
                LOGGER.debug(f'check Minutes for {news.id}')
                minutes_list = fetch_matched_minutes(news, news_text)
                if minutes_list:
                    gql_client.bulk_link([news.id] * len(minutes_list),
                                         map(lambda x: x['id'], minutes_list))
                    LOGGER.info(
                        f'linked {len(minutes_list)} minutes for {news.id}')
            if not args.skip_bill:
                LOGGER.debug(f'check Bill for {news.id}')
                bill_list = fetch_matched_bills(news, news_text)
                if bill_list:
                    gql_client.bulk_link([news.id] * len(bill_list),
                                         map(lambda x: x['id'], bill_list))
                    LOGGER.info(f'linked {len(bill_list)} bills for {news.id}')
            if not args.skip_timeline:
                LOGGER.debug(f'check Timeline for {news.id}')
                is_timeline = fetch_is_timeline(news, news_text)
                if is_timeline:
                    # need to create new instance to avoid neo4j datetime error
                    updated_news = News(None)
                    updated_news.id = news.id
                    updated_news.is_timeline = is_timeline
                    gql_client.merge(updated_news)
                    LOGGER.info(f'linked {news.id} to timeline')
        except Exception as e:
            stats['fail'] += 1
            if isinstance(e, json.decoder.JSONDecodeError):
                LOGGER.warning(f'failed to parse API response for {news.id}')
            else:
                LOGGER.exception(f'failed to process {news.id}')
    LOGGER.info('processed {} news ({} success, {} fail)'.format(
        stats['process'], stats['process'] - stats['fail'], stats['fail']))
Example 21
def main(fp):
    gql_client = GraphQLClient()
    df = pd.read_csv(fp)

    diets = []
    for _, row in df.iterrows():
        diet = Diet(None)
        diet.number = int(row['number'])
        diet.name = f'第{diet.number}回国会'
        diet.category = row['category']
        diet.start_date = to_neo4j_datetime(row['start_date'])
        diet.end_date = to_neo4j_datetime(row['end_date'])
        diet.id = idgen(diet)
        diets.append(diet)

    gql_client.bulk_merge(diets)
    LOGGER.info(f'merged {len(diets)} diets')
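
to_neo4j_datetime and idgen are helpers that never appear in this listing. A sketch of the former, assuming the CSV stores ISO dates and reusing the _Neo4jDateTimeInput type seen in Examples 16 and 27:

from datetime import datetime

def to_neo4j_datetime(date_str):
    # hypothetical helper: '2021-01-18' -> _Neo4jDateTimeInput
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return _Neo4jDateTimeInput(year=dt.year, month=dt.month, day=dt.day)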
Example 22
def get_speech_contexts(json_path=None):
    speech_contexts = []

    # add contexts from json
    if json_path:
        with open(json_path, 'r') as f:
            speech_contexts += json.load(f).values()

    # add contexts from GraphQL
    gql_client = GraphQLClient()
    members = gql_client.get_all_members(['name'])
    if members:
        member_context = {
            'phrases': [member.name for member in members],
            'boost': 20.0
        }
        speech_contexts.append(member_context)

    return speech_contexts
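
The phrases/boost dictionaries match the shape of Google Cloud Speech-to-Text speech contexts; if that is the intended consumer, wiring them into a recognition config would look roughly like this (an assumption, not confirmed by the listing):

from google.cloud import speech

contexts = [speech.SpeechContext(**c) for c in get_speech_contexts()]
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    language_code='ja-JP',
    speech_contexts=contexts,
)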
Example 23
def main():
    gql_client = GraphQLClient()
    es_client = ElasticsearchClient()

    news_list = gql_client.get_all_news(fields=['id', 'published_at'],
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news from GraphQL')

    if news_list:
        news_text_list = list(
            map(
                lambda news: NewsText({
                    'id': news.id,
                    'date': to_date_str(news.published_at)
                }), news_list))
        es_client.bulk_index(news_text_list, op_type=OpType.UPDATE)
        LOGGER.info(f're-indexed {len(news_text_list)} news text')
Example 24
    def test_get(self):
        client = GraphQLClient()

        url = self._build_sample_url()
        bill = self._build_sample_bill()
        news = self._build_sample_news()
        speech = self._build_sample_speech()
        minutes = self._build_sample_minutes()
        committee = self._build_sample_committee()
        timeline = self._build_sample_timeline()

        for obj in [url, bill, news, speech, minutes, committee, timeline]:
            client.merge(obj)
            ret = client.get(obj.id)
            assert ret.id == obj.id

        with pytest.raises(GraphQLException):
            client.get('invalid:class')

        with pytest.raises(GraphQLException):
            client.get('bill:invalid')
Example 25
def main():
    gql_client = GraphQLClient()
    df = pd.read_csv(args.file)

    if args.delete:
        gql_client.bulk_unlink(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'deleted {len(df)} relationships')
    else:
        gql_client.bulk_link(from_ids=df['from_id'], to_ids=df['to_id'])
        LOGGER.info(f'merged {len(df)} relationships')
Example 26
    def test_show_ops(self):
        bill = self._build_sample_bill()
        url = self._build_sample_url()

        LOGGER.warning(GraphQLClient.build_merge_operation(bill))
        LOGGER.warning(GraphQLClient.build_link_operation(url.id, bill.id))
        LOGGER.warning(
            GraphQLClient.build_link_operation(url.id, bill.id, remove=True))
        LOGGER.warning(
            GraphQLClient.build_get_operation(bill.id, ['id', 'name']))
        LOGGER.warning(
            GraphQLClient.build_bulk_get_operation(ids=['Bill:1', 'Bill:2'],
                                                   fields=['id', 'name']))
        LOGGER.warning(GraphQLClient.build_delete_operation(bill.id))
        LOGGER.warning(
            GraphQLClient.build_get_all_operation(
                'bill', ['id', 'name'], _BillFilter({'name_contains': '公文書'})))

        bulk_op = Operation(Mutation)
        bulk_op = GraphQLClient.build_merge_operation(bill, bulk_op)
        bulk_op = GraphQLClient.build_link_operation(url.id, bill.id, bulk_op)
        LOGGER.warning(bulk_op)
Example 27
def main():
    gql_client = GraphQLClient()
    bill_list = gql_client.get_all_bills(['id'] + BILL_DATE_FIELDS)
    LOGGER.info(f'fetched {len(bill_list)} bills')
    minutes_list = gql_client.get_all_minutes(['id'] + MINUTES_DATE_FIELD)
    LOGGER.info(f'fetched {len(minutes_list)} minutes')
    news_list = gql_client.get_all_news(['id', 'is_timeline'] +
                                        NEWS_DATE_FIELD,
                                        start_date=args.start_date,
                                        end_date=args.end_date)
    LOGGER.info(f'fetched {len(news_list)} news')
    date2bill = build_date_dict(bill_list, BILL_DATE_FIELDS)
    date2minutes = build_date_dict(minutes_list, MINUTES_DATE_FIELD)
    date2news = build_date_dict(news_list, NEWS_DATE_FIELD)

    dates = [
        args.start_date + timedelta(i)
        for i in range((args.end_date - args.start_date).days)
    ]
    for date in tqdm(dates):
        timeline = Timeline(None)
        timeline.date = _Neo4jDateTimeInput(year=date.year,
                                            month=date.month,
                                            day=date.day)
        timeline.id = idgen(timeline)
        gql_client.merge(timeline)

        from_ids = []
        for bill in date2bill[date]:
            from_ids.append(bill.id)
        for minutes in date2minutes[date]:
            from_ids.append(minutes.id)
        for news in date2news[date]:
            if news.is_timeline:
                from_ids.append(news.id)
        gql_client.bulk_link(from_ids, [timeline.id] * len(from_ids))
        LOGGER.info(f'linked {len(from_ids)} events to {date}')
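
build_date_dict is also not shown; a sketch consistent with its use above, assuming every date field exposes year/month/day attributes and that the dict is keyed by datetime.date (matching the dates list built from args.start_date):

from collections import defaultdict
from datetime import date

def build_date_dict(objects, date_fields):
    # hypothetical helper: index objects by each calendar date they carry
    date_dict = defaultdict(list)
    for obj in objects:
        for field in date_fields:
            value = getattr(obj, field, None)
            if value:
                key = date(value.year, value.month, value.day)
                date_dict[key].append(obj)
    return date_dict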
Example 28
    def test_link(self):
        client = GraphQLClient()

        url = self._build_sample_url()
        bill = self._build_sample_bill()

        data = client.link(url.id, bill.id)
        assert data['MergeUrlReferredBills']['from']['id'] == url.id
        assert data['MergeUrlReferredBills']['to']['id'] == bill.id
        assert url.id in map(lambda x: x.id, client.get(bill.id).urls)

        data = client.unlink(url.id, bill.id)
        assert data['RemoveUrlReferredBills']['from']['id'] == url.id
        assert data['RemoveUrlReferredBills']['to']['id'] == bill.id
        assert url.id not in map(lambda x: x.id, client.get(bill.id).urls)
Example 29
class MinutesFinder:
    """
    GraphQL based Minutes finder
    """

    def __init__(self, **kwargs):
        self.client = GraphQLClient(**kwargs)

    def find(self, text, dt=None):
        op = Operation(Query)
        minutes_filter = _MinutesFilter(None)
        minutes_filter.name_contains = text
        if dt:
            minutes_filter.start_date_time = _Neo4jDateTimeInput(
                year=dt.year, month=dt.month, day=dt.day)
        minutes = op.minutes(filter=minutes_filter)
        minutes.id()
        minutes.name()

        data = self.client.endpoint(op)
        GraphQLClient.validate_response_or_raise(data)
        minutes = (op + data).minutes
        return minutes
Example 30
def main():
    gql_client = GraphQLClient()

    objects = []
    if args.bill:
        bills = gql_client.get_all_bills(['id', 'news'])
        LOGGER.info(f'fetched {len(bills)} bills to clean')
        objects += bills
    if args.minutes:
        minutes_list = gql_client.get_all_minutes(['id', 'news'])
        LOGGER.info(f'fetched {len(minutes_list)} minutes to clean')
        objects += minutes_list
    LOGGER.info(f'registered {len(objects)} objects to clean')

    for obj in objects:
        news_ids = list(map(lambda x: x.id, obj.news))
        if news_ids:
            gql_client.bulk_unlink(news_ids, [obj.id] * len(news_ids))
            LOGGER.info(f'removed {len(news_ids)} news links from {obj.id}')