def create_table(): """ テーブル作成SQL "CREATE TABLE idol_group_twitter_follower_num (idol_group_id integer, screen_name varchar(255), follower_num integer, recode_date date) WITH OIDS;" """ db = DB("iddata") #db.execute_sql("CREATE TABLE idol_group_twitter_follower_num (idol_group_id integer, screen_name varchar(255), follower_num integer, recode_date date) WITH OIDS;") db.execute_sql( "CREATE TABLE idol_group_twitter_otavector (idol_group_id integer, screen_name varchar(255), follow_userid varchar(255), follow_num integer, recode_date date) WITH OIDS;" ) db.close()
def create_table(): """ データベース作成SQL "CREATE DATABASE iddata;" """ """ テーブル作成SQL "CREATE TABLE idol_group_name (idol_group_id integer PRIMARY KEY, idol_group_name varchar(255)) WITH OIDS;" "CREATE TABLE idol_group_wiki_url (idol_group_id integer PRIMARY KEY, url varchar(255)) WITH OIDS;" "CREATE TABLE not_idol_group_wiki_url (not_idol_group_name varchar(255), url varchar(255)) WITH OIDS;" "CREATE TABLE idol_group_twitter_url (idol_group_id integer, twitter_name varchar(255), url varchar(255), account_type varchar(255)) WITH OIDS;" """ db = DB('iddata') db.execute_sql( "CREATE TABLE idol_group_name (idol_group_id integer PRIMARY KEY, idol_group_name varchar(255)) WITH OIDS;" ) db.execute_sql( "CREATE TABLE idol_group_wiki_url (idol_group_id integer PRIMARY KEY, url varchar(255)) WITH OIDS;" ) db.execute_sql( "CREATE TABLE not_idol_group_wiki_url (not_idol_group_name varchar(255), url varchar(255)) WITH OIDS;" ) db.execute_sql( "CREATE TABLE idol_group_twitter_url (idol_group_id integer, twitter_name varchar(255), url varchar(255), account_type varchar(255)) WITH OIDS;" ) db.close()
def idol_group_twitter_url(self): db = DB('iddata') id_name_wikiurls = db.select( 'SELECT N.idol_group_id, N.idol_group_name, W.url FROM idol_group_name AS N INNER JOIN idol_group_wiki_url AS W ON N.idol_group_id = W.idol_group_id' ) for id_name_wikiurl in id_name_wikiurls: # wikipediaの個別アイドルグループのURLをクロール res = requests.get(id_name_wikiurl[2]) content = res.content soup = BeautifulSoup(content, 'html.parser') # twitter.comを含む<a>タグをlistで取得 twitter_a = soup.find_all('a', href=re.compile("twitter.com")) # <a>タグからTwitterURLとツイッター名を取得しlist化 twitter_name_urls = list() for a in twitter_a: twitter_url = a.get('href') twitter_name = a.text # すでにURLがDBに登録されていたらスキップ db_twitter_url = db.select( "SELECT url FROM idol_group_twitter_url WHERE idol_group_id = %s AND url = '%s'" % (id_name_wikiurl[0], twitter_url)) if len(db_twitter_url) > 0: continue # URLに特定の文字列が含まれていれば、スキップ if '/status' in twitter_url: continue twitter_name_urls.append([twitter_name, twitter_url]) # 追加twitter_name_urlsが空ならスキップ if len(twitter_name_urls) == 0: continue # twitter_name_urlsリストの各先頭にtargetsリストのマッチ度を挿入 targets = ["公式", "運営", "オフィシャル", "スタッフ", "staff", "OFFICIAL"] targets.append(id_name_wikiurl[1]) for i, name_url in enumerate(twitter_name_urls): match_ratio = 0 for target in targets: match_ratio += difflib.SequenceMatcher( None, name_url[0], target).ratio() twitter_name_urls[i].insert(0, match_ratio) # [[match_ratio, idol_group_name, twitter_url],..]のリストをマッチ度の高い順にソート twitter_match_name_urls = twitter_name_urls twitter_match_name_urls.sort(reverse=True) print(id_name_wikiurl[0], id_name_wikiurl[1]) print('データベースのTwitterURLリスト') print( db.select( 'SELECT idol_group_id, twitter_name, url, account_type FROM idol_group_twitter_url WHERE idol_group_id = %s' % id_name_wikiurl[0])) print('追加するTwitterURLリスト') print(twitter_match_name_urls) command = input('%sをofficialにしますか?(y/n) >>' % (twitter_match_name_urls[0])) if command is 'y': # データベース内の特定アイドルグループのtwitterURLのアカウントタイプをすべてotherに更新 db.execute_sql( "UPDATE idol_group_twitter_url SET account_type = 'other' WHERE idol_group_id = %s" % id_name_wikiurl[0]) # TwitterURLを挿入 for count, match_name_url in enumerate( twitter_match_name_urls): # マッチ度が高いURLはofficialにして挿入 if count == 0: db.insert( 'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)', [ id_name_wikiurl[0], match_name_url[1], match_name_url[2], 'official' ]) continue db.insert( 'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)', [ id_name_wikiurl[0], match_name_url[1], match_name_url[2], 'other' ]) print('officialで登録しました') else: # TwitterURLをotherにして挿入 for match_name_url in twitter_match_name_urls: db.insert( 'INSERT INTO idol_group_twitter_url (idol_group_id, twitter_name, url, account_type) VALUES (%s,%s,%s,%s)', [ id_name_wikiurl[0], match_name_url[1], match_name_url[2], 'other' ]) print('otherで登録しました')