Beispiel #1
0
def create_table():
    """ テーブル作成SQL
    "CREATE TABLE idol_group_twitter_follower_num (idol_group_id integer, screen_name varchar(255), follower_num integer, recode_date date) WITH OIDS;"
    """
    db = DB("iddata")
    #db.execute_sql("CREATE TABLE idol_group_twitter_follower_num (idol_group_id integer, screen_name varchar(255), follower_num integer, recode_date date) WITH OIDS;")
    db.execute_sql(
        "CREATE TABLE idol_group_twitter_otavector (idol_group_id integer, screen_name varchar(255), follow_userid varchar(255), follow_num integer, recode_date date) WITH OIDS;"
    )
    db.close()
Beispiel #2
0
def create_table():
    """ データベース作成SQL
    "CREATE DATABASE iddata;"
    """
    """ テーブル作成SQL
    "CREATE TABLE idol_group_name (idol_group_id integer PRIMARY KEY, idol_group_name varchar(255)) WITH OIDS;"
    "CREATE TABLE idol_group_wiki_url (idol_group_id integer PRIMARY KEY, url varchar(255)) WITH OIDS;"
    "CREATE TABLE not_idol_group_wiki_url (not_idol_group_name varchar(255), url varchar(255)) WITH OIDS;"
    "CREATE TABLE idol_group_twitter_url (idol_group_id integer, twitter_name varchar(255), url varchar(255), account_type varchar(255)) WITH OIDS;"
    """
    db = DB('iddata')
    db.execute_sql(
        "CREATE TABLE idol_group_name (idol_group_id integer PRIMARY KEY, idol_group_name varchar(255)) WITH OIDS;"
    )
    db.execute_sql(
        "CREATE TABLE idol_group_wiki_url (idol_group_id integer PRIMARY KEY, url varchar(255)) WITH OIDS;"
    )
    db.execute_sql(
        "CREATE TABLE not_idol_group_wiki_url (not_idol_group_name varchar(255), url varchar(255)) WITH OIDS;"
    )
    db.execute_sql(
        "CREATE TABLE idol_group_twitter_url (idol_group_id integer, twitter_name varchar(255), url varchar(255), account_type varchar(255)) WITH OIDS;"
    )
    db.close()
Beispiel #3
0
    def idol_group_twitter_url(self):
        db = DB('iddata')
        id_name_wikiurls = db.select(
            'SELECT N.idol_group_id, N.idol_group_name, W.url FROM idol_group_name AS N INNER JOIN idol_group_wiki_url AS W ON N.idol_group_id = W.idol_group_id'
        )
        for id_name_wikiurl in id_name_wikiurls:

            # wikipediaの個別アイドルグループのURLをクロール
            res = requests.get(id_name_wikiurl[2])
            content = res.content
            soup = BeautifulSoup(content, 'html.parser')

            # twitter.comを含む<a>タグをlistで取得
            twitter_a = soup.find_all('a', href=re.compile("twitter.com"))

            # <a>タグからTwitterURLとツイッター名を取得しlist化
            twitter_name_urls = list()
            for a in twitter_a:
                twitter_url = a.get('href')
                twitter_name = a.text

                # すでにURLがDBに登録されていたらスキップ
                db_twitter_url = db.select(
                    "SELECT url FROM idol_group_twitter_url WHERE idol_group_id = %s AND url = '%s'"
                    % (id_name_wikiurl[0], twitter_url))
                if len(db_twitter_url) > 0:
                    continue

                # URLに特定の文字列が含まれていれば、スキップ
                if '/status' in twitter_url:
                    continue

                twitter_name_urls.append([twitter_name, twitter_url])

            # 追加twitter_name_urlsが空ならスキップ
            if len(twitter_name_urls) == 0:
                continue

            # twitter_name_urlsリストの各先頭にtargetsリストのマッチ度を挿入
            targets = ["公式", "運営", "オフィシャル", "スタッフ", "staff", "OFFICIAL"]
            targets.append(id_name_wikiurl[1])
            for i, name_url in enumerate(twitter_name_urls):
                match_ratio = 0
                for target in targets:
                    match_ratio += difflib.SequenceMatcher(
                        None, name_url[0], target).ratio()
                twitter_name_urls[i].insert(0, match_ratio)

            # [[match_ratio, idol_group_name, twitter_url],..]のリストをマッチ度の高い順にソート
            twitter_match_name_urls = twitter_name_urls
            twitter_match_name_urls.sort(reverse=True)
            print(id_name_wikiurl[0], id_name_wikiurl[1])
            print('データベースのTwitterURLリスト')
            print(
                db.select(
                    'SELECT idol_group_id, twitter_name, url, account_type FROM idol_group_twitter_url WHERE idol_group_id = %s'
                    % id_name_wikiurl[0]))
            print('追加するTwitterURLリスト')
            print(twitter_match_name_urls)

            command = input('%sをofficialにしますか?(y/n) >>' %
                            (twitter_match_name_urls[0]))
            if command is 'y':
                # データベース内の特定アイドルグループのtwitterURLのアカウントタイプをすべてotherに更新
                db.execute_sql(
                    "UPDATE idol_group_twitter_url SET account_type = 'other' WHERE idol_group_id = %s"
                    % id_name_wikiurl[0])

                # TwitterURLを挿入
                for count, match_name_url in enumerate(
                        twitter_match_name_urls):

                    # マッチ度が高いURLはofficialにして挿入
                    if count == 0:
                        db.insert(
                            'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)',
                            [
                                id_name_wikiurl[0], match_name_url[1],
                                match_name_url[2], 'official'
                            ])
                        continue
                    db.insert(
                        'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)',
                        [
                            id_name_wikiurl[0], match_name_url[1],
                            match_name_url[2], 'other'
                        ])
                print('officialで登録しました')
            else:
                # TwitterURLをotherにして挿入
                for match_name_url in twitter_match_name_urls:
                    db.insert(
                        'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)',
                        [
                            id_name_wikiurl[0], match_name_url[1],
                            match_name_url[2], 'other'
                        ])
                print('otherで登録しました')