def main(fp):
    client = GraphQLClient()
    member_finder = MemberFinder(search_fields=['name', 'name_hira'])

    df = pd.read_csv(fp).fillna('')
    LOGGER.info(f'loaded {len(df)} members from {fp}')

    members = []
    for _, row in df.iterrows():
        member = None
        # try the kanji name first, then fall back to the hiragana reading
        for search_field in ['name', 'name_hira']:
            try:
                member = member_finder.find_one(row[search_field],
                                                exact_match=True)
                break
            except ValueError as e:
                LOGGER.debug(e)
        if not member:
            LOGGER.warning(f'failed to find member for row={row}')
            continue
        # copy any provided link columns onto the matched member
        for link_field in ['website', 'twitter', 'facebook']:
            if row[link_field]:
                setattr(member, link_field, row[link_field])
        members.append(member)
    client.bulk_merge(members)
    LOGGER.info(f'merged {len(members)} member links')
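
For reference, a minimal input CSV for this script might look like the following sketch. The column names come from the code above; the values and file name are invented for illustration:

# Hypothetical input for main(); values are invented.
import pandas as pd

df = pd.DataFrame([{
    'name': '山田太郎',           # kanji name, tried first
    'name_hira': 'やまだたろう',  # hiragana reading, tried as a fallback
    'website': 'https://example.com',
    'twitter': '',                # empty columns are skipped
    'facebook': '',
}])
df.to_csv('members.csv', index=False)  # then: main('members.csv')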
Example #2
def main(fp):
    gql_client = GraphQLClient()
    df = pd.read_csv(fp)

    diets = []
    for _, row in df.iterrows():
        diet = Diet(None)  # the id is assigned below, once the fields are set
        diet.number = int(row['number'])
        diet.name = f'第{diet.number}回国会'  # e.g. 第201回国会 = "the 201st session of the National Diet"
        diet.category = row['category']
        diet.start_date = to_neo4j_datetime(row['start_date'])
        diet.end_date = to_neo4j_datetime(row['end_date'])
        diet.id = idgen(diet)  # id is generated from the diet itself, hence set last
        diets.append(diet)

    gql_client.bulk_merge(diets)
    LOGGER.info(f'merged {len(diets)} diets')
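
to_neo4j_datetime is not shown on this page. A minimal sketch of what such a helper could look like, assuming the CSV stores plain 'YYYY-MM-DD' strings and the GraphQL layer accepts year/month/day components (both are assumptions, not the project's actual implementation):

# Hypothetical stand-in for the project's to_neo4j_datetime helper.
# Assumes 'YYYY-MM-DD' input and a year/month/day mapping on output.
from datetime import datetime

def to_neo4j_datetime(date_str):
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return {'year': dt.year, 'month': dt.month, 'day': dt.day}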
Example #3
    def test_bulk_merge(self):
        client = GraphQLClient()

        bill = self._build_sample_bill()
        url = self._build_sample_url()

        data = client.bulk_merge([bill, url])
        assert data['op0']['id'] == bill.id
        assert data['op1']['id'] == url.id
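
The assertions encode the response contract this test relies on: bulk_merge appears to return one 'op{i}' entry per input object, in input order. Generalized over the same bill, url, and client objects as in the test:

# Sketch of the apparent response shape; 'op{i}' keys follow input order.
objects = [bill, url]
data = client.bulk_merge(objects)
for i, obj in enumerate(objects):
    assert data[f'op{i}']['id'] == obj.id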
Example #4
class SpiderTemplate(scrapy.Spider):
    domain = NotImplemented

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logging.getLogger('elasticsearch').setLevel(logging.WARNING)
        logging.getLogger('sgqlc').setLevel(logging.WARNING)
        self.gql_client = GraphQLClient()
        self.es_client = ElasticsearchClient()
        self.bill_finder = BillFinder()
        self.minutes_finder = MinutesFinder()
        self.committee_finder = CommitteeFinder()
        self.member_finder = MemberFinder()

    def parse(self, response):
        raise NotImplementedError

    def link_urls(self, urls):
        """
        link Url to parent resource
        """

        from_ids, to_ids = [], []
        for url in urls:
            if hasattr(url, 'to_id'):
                from_ids.append(url.id)
                to_ids.append(url.to_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)
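
    # Hypothetical call site (illustration only; the Url constructor and
    # ids are invented). Only Urls that carry a to_id get linked:
    #
    #     urls = [Url(id='Url:1', to_id='Bill:1'),  # linked: has to_id
    #             Url(id='Url:2')]                  # skipped: no to_id
    #     spider.link_urls(urls)  # one bulk_link call: Url:1 -> Bill:1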

    def link_activities(self, activities):
        """
        link Activity to Member, Bill, and Minutes
        """

        from_ids, to_ids = [], []
        for activity in activities:
            for id_field in ['member_id', 'bill_id', 'minutes_id']:
                if hasattr(activity, id_field):
                    from_ids.append(activity.id)
                    to_ids.append(getattr(activity, id_field))
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)
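
    # Unlike a Url, one Activity can fan out into several links, one per
    # id field it carries (hypothetical example; constructor and ids are
    # invented):
    #
    #     activity = Activity(id='Activity:1', member_id='Member:1',
    #                         bill_id='Bill:1')  # no minutes_id
    #     spider.link_activities([activity])
    #     # -> bulk_link(['Activity:1', 'Activity:1'], ['Member:1', 'Bill:1'])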

    def link_minutes(self, minutes):
        """
        link Minutes to Bill, Committee and Member
        """

        self.link_bills_by_topics(minutes)

        try:
            committee = self.committee_finder.find_one(minutes.name)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.link(minutes.id, committee.id)

        if hasattr(minutes, 'speakers'):
            from_ids = []
            to_ids = []
            for speaker in minutes.speakers:
                try:
                    member = self.member_finder.find_one(speaker)
                except ValueError as e:
                    LOGGER.debug(e)  # expected when the speaker is not a member
                else:
                    from_ids.append(member.id)
                    to_ids.append(minutes.id)
            if from_ids:
                self.gql_client.bulk_link(from_ids, to_ids)

    def link_speeches(self, speeches):
        from_ids, to_ids = [], []
        for speech in speeches:
            from_ids.append(speech.id)
            to_ids.append(speech.minutes_id)
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)

    def store_urls_for_bill(self, urls, bill_query):
        if not urls:
            return
        try:
            bill = self.bill_finder.find_one(bill_query)
        except ValueError as e:
            LOGGER.warning(e)
        else:
            self.gql_client.bulk_merge(urls)
            self.gql_client.bulk_link([url.id for url in urls], [bill.id] * len(urls))

    def delete_old_urls(self, src_id, url_title):
        obj = self.gql_client.get(src_id)
        for url in obj.urls:
            if url.title == url_title:
                self.gql_client.delete(url.id)
                LOGGER.info(f'deleted {url.id}')

    def link_bills_by_topics(self, minutes: Minutes):
        if not hasattr(minutes, 'topics'):
            return

        from_ids, to_ids = [], []
        for topic in minutes.topics:
            try:
                bill = self.bill_finder.find_one(topic)
            except ValueError as e:
                LOGGER.debug(e)  # expected when the topic does not reference a bill
            else:
                from_ids.append(minutes.id)
                to_ids.append(bill.id)
                LOGGER.debug(f'link {minutes.id} to {bill.id}')
        if from_ids:
            self.gql_client.bulk_link(from_ids, to_ids)
            LOGGER.info(f'linked {len(from_ids)} bills to {minutes.id}')
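
A minimal concrete spider built on this template might look like the sketch below. The spider name, domain, URL, and scrape_minutes helper are all invented for illustration; only the SpiderTemplate hooks come from the code above:

class MinutesSpider(SpiderTemplate):
    name = 'minutes_example'        # invented
    domain = 'example.go.jp'        # invented
    start_urls = ['https://example.go.jp/minutes']  # invented

    def parse(self, response):
        minutes_list = self.scrape_minutes(response)  # hypothetical helper
        self.gql_client.bulk_merge(minutes_list)
        for minutes in minutes_list:
            self.link_minutes(minutes)  # wires Minutes to Bill/Committee/Member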