def get_wiki_pageviews(title):
    p = PageviewsClient(user_agent="<*****@*****.**> multiple movie titles")
    today = datetime.datetime.now().strftime("%Y%m%d")
    try:
        return p.article_views('en.wikipedia', title, start='20130101', end=today)
    except Exception:
        return {}
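# Usage sketch for get_wiki_pageviews above (the article title is illustrative;
# assumes network access to the Wikimedia pageviews API). article_views returns
# a dict keyed by datetime, mapping the title to a daily view count.
views = get_wiki_pageviews('Python (programming language)')
for day, counts in sorted(views.items()):
    print(day.date(), counts)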
def __init__(self):
    self.db_connection = DBConnection()
    self.logger = logging.getLogger(__name__)
    self.api = PageviewsClient(
        "Mozilla/5.0 (X11; Linux x86_64)"
        " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    )
def get_pageviews(site_name, *args, **kwargs):
    if site_name.lower() == 'wikipedia':
        start = ''
        end = ''
        granularity = 'monthly'
        if kwargs.get('article_name') is not None:
            article_name = kwargs['article_name']
            # article_name = self.get_article_name(article_name)
        if kwargs.get('start') is not None:
            start = kwargs['start'].replace('-', '')
        if kwargs.get('end') is not None:
            end = kwargs['end'].replace('-', '')
        if kwargs.get('granularity') is not None:
            granularity = kwargs['granularity']
        p = PageviewsClient(user_agent="<*****@*****.**>")
        if start == '':
            return p.article_views('en.wikipedia', article_name,
                                   granularity=granularity)
        elif end == '':
            return p.article_views('en.wikipedia', article_name,
                                   granularity=granularity,
                                   start=start, end=start)
        else:
            return p.article_views('en.wikipedia', article_name,
                                   granularity=granularity,
                                   start=start, end=end)
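# Usage sketch for get_pageviews above (article name and dates are illustrative;
# dashes in the dates are stripped by the function before the API call).
monthly_views = get_pageviews('wikipedia', article_name='Cat',
                              start='2023-01-01', end='2023-03-31',
                              granularity='monthly')
print(monthly_views)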
def run(self):
    viewer = PageviewsClient(
        user_agent="<*****@*****.**> Selfie, Cat, and Dog analysis"
    )
    self.logger.info('[%s] Starting Wiki thread' % self.Name)
    try:
        for ticker, article in self.Tickers.items():
            End_date = time.strftime('%Y%m%d')
            data = viewer.article_views('en.wikipedia', article,
                                        granularity='daily',
                                        start=self.Start_date, end=End_date)
            for row in data:
                if data[row][article]:
                    wikid = {}
                    wikid['date'] = row.strftime('%m/%d/%Y')
                    wikid['symbol'] = ticker
                    wikid['article'] = article
                    wikid['wiki_views'] = int(data[row][article])
                    queueDoc(wikid)
            self.logger.info('[%s] Collected Info on %s' % (self.Name, ticker))
    except Exception as e:
        self.logger.error('[%s] Error: %s' % (self.Name, e))
    self.logger.info('[%s] Exiting' % self.Name)
    self.Fin = True
def get_groundtruth(lang):
    """Get actual counts of top articles and their pageviews for a Wikipedia
    language edition from yesterday."""
    p = PageviewsClient(
        user_agent="[email protected] -- diff private toolforge")
    try:
        groundtruth = p.top_articles(
            project='{0}.wikipedia'.format(lang),
            access='all-access',
            year=None, month=None, day=None,  # defaults to yesterday
            limit=50)
    except Exception:
        two_days_ago = date.today() - timedelta(days=2)
        groundtruth = p.top_articles(project='{0}.wikipedia'.format(lang),
                                     access='all-access',
                                     year=two_days_ago.year,
                                     month=two_days_ago.month,
                                     day=two_days_ago.day,
                                     limit=50)
    return {
        r['article']: {
            'gt-rank': r['rank'],
            'gt-views': r['views']
        }
        for r in groundtruth
    }
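# Usage sketch for get_groundtruth above: yesterday's top-50 articles on English
# Wikipedia with their ranks and view counts (assumes network access).
groundtruth = get_groundtruth('en')
for title, info in list(groundtruth.items())[:5]:
    print(info['gt-rank'], title, info['gt-views'])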
def _get_snp500_wiki_views(conn, start, end):
    """
    Inserts wiki page views into the daily_views table

    Parameters:
        start (str) : YYYYMMDD
        end (str) : YYYYMMDD

    Returns:
        List[tuple] : (id, date, views, now, now)
    """
    pvc = PageviewsClient()
    symbol_ids_and_titles = _get_symbol_ids_and_wiki_titles(conn)
    title_to_id = {title: id for id, title in symbol_ids_and_titles}
    articles = [title for _, title in symbol_ids_and_titles]
    project = 'en.wikipedia'
    now = datetime.datetime.utcnow()

    # API call
    views_dict = pvc.article_views(project, articles, start=start, end=end)

    # transforming API call to rows (a list of tuples)
    daily_views = []
    for date in views_dict:
        for title in views_dict[date]:
            id, views = title_to_id[title], views_dict[date][title]
            daily_views.append((id, date, views, now, now))
    return daily_views
def top_articles_by_views(articles, top_x):
    """
    returns the top x of the given list of articles based on page views
    for the previous month

    output: [(article1, views), (article2, views)]
    """
    p = PageviewsClient()

    # create date string based on previous month
    now = datetime.datetime.now()
    year = now.year
    previous_month = str(now.month - 1).zfill(2)
    if previous_month == "00":
        previous_month = "12"
        year -= 1  # the previous month of January is December of last year
    start_date = str(year) + previous_month + "0100"
    end_date = str(year) + previous_month + "2800"

    # get views
    result = p.article_views('en.wikipedia', articles,
                             granularity='monthly',
                             start=start_date, end=end_date)

    # clean results (six is used for backwards compatibility with python 2)
    result = six.next(six.itervalues(result))
    sorted_articles = sorted(result.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    return sorted_articles[:top_x]
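# Usage sketch for top_articles_by_views above (the article list is illustrative).
# Returns (article, views) pairs for the previous month, highest first.
print(top_articles_by_views(['Cat', 'Dog', 'New York City'], 2))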
def set_view_counts(self):
    """
    Initializes the `view_count` property for all of the concepts in the
    `ConceptModel`.
    """
    for node in self.nodes():
        p = PageviewsClient().article_views(
            "en.wikipedia", [node.concept.replace(' ', '_')])
        p = [p[key][node.concept.replace(' ', '_')] for key in p.keys()]
        p = int(sum([daily_view_count for daily_view_count in p
                     if daily_view_count]) / len(p))
        node.set_property('view_count', p)
def set_view_count(self):
    """
    Sets the view_count parameter appropriately, using a 30-day average.
    """
    p = PageviewsClient().article_views(
        "en.wikipedia", [self.concept.replace(' ', '_')])
    p = [p[key][self.concept.replace(' ', '_')] for key in p.keys()]
    p = int(sum([daily_view_count for daily_view_count in p
                 if daily_view_count]) / len(p))
    # self.view_count = p
    self.properties['view_count'] = p
    print(self.properties['view_count'])
def getViewsPerDay(self):
    # pageviews own thing
    """Gets a time series dataframe: date (as index), views (column is article name)
    Will be using 'mwviews' package"""
    p = PageviewsClient('en')
    print(self.article)
    data = p.article_views('en.wikipedia', [self.article],
                           granularity='daily',
                           start=self.getPublishDate(),
                           end=datetime.datetime.now().strftime('%Y%m%d'))
    df = pd.DataFrame.from_dict(data, orient='index').dropna()
    return df
def wiki_api(keyword, start, end, agent='user'):
    output_list = []
    # the parameter must be a string; any value works as a user agent
    p = PageviewsClient('what is it..?')
    output_dict = dict(
        p.article_views('en.wikipedia.org', [keyword],
                        start=start, end=end, agent=agent))
    for key, val in output_dict.items():
        tem_dict = {}
        tem_dict['date'] = key.strftime("%Y%m%d")
        tem_dict['view_count'] = val[keyword.replace(" ", "_")]
        output_list.append(tem_dict)
    result = json.dumps(output_list)
    return result
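# Usage sketch for wiki_api above (keyword and dates are illustrative); the
# function returns a JSON string of {"date", "view_count"} records.
print(wiki_api('Machine learning', start='20230101', end='20230131'))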
def get_page_views_dict(links):
    p = PageviewsClient()
    # today = datetime.datetime.today()
    # today = today.strftime('%Y%m%d')
    # p.article_views('{}.wikipedia'.format(lang), title, granularity='monthly', start='20160201', end=today)
    my_dico = p.article_views('{}.wikipedia'.format(lang), links)
    my_dico_by_article = {}
    for article in links:
        my_dico_by_article[article] = 0
    for key_date, sub_dico_value in my_dico.items():
        for article, number in sub_dico_value.items():
            if number is not None:
                my_dico_by_article[article.replace('_', ' ')] += number
    my_dico_by_article = dict(
        sorted(my_dico_by_article.items(),
               key=operator.itemgetter(1),
               reverse=True))
    # need to define a selection choice based on title approximation
    return my_dico_by_article
def get_page_views(article_names, output_path):
    """Query the Wikipedia page views api for the relevant pages

    Keyword arguments:
    article_names -- array of article names to query
    output_path -- output path for the csv file output
    """
    p = PageviewsClient(
        user_agent="[email protected] Selfie, Cat, and Dog analysis")
    values = p.article_views('en.wikipedia', article_names,
                             granularity='monthly',
                             start='20150101', end='20200401')
    all_keys = list(values.keys())
    all_keys.sort()
    val_dict = []
    for x in article_names:
        for key in all_keys:
            val_dict.append({"article_title": x,
                             "timestamp": key,
                             "views": values[key][x]})
    df = pd.DataFrame(val_dict)
    df = df.fillna(0)
    print("Writing Page View Data to -- " + output_path + " -- for "
          + str(len(df.article_title.unique())) + " articles")
    df.to_csv(output_path, mode='w', index=False)
    return df
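# Usage sketch for get_page_views above (article names and output path are
# illustrative); writes a CSV and returns the monthly view counts as a DataFrame.
monthly_df = get_page_views(['Cat', 'Dog'], 'page_views.csv')
print(monthly_df.head())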
import csv
import pageviewapi
import mwviews
import json
import datetime
from mwviews.api import PageviewsClient
import pandas
import pandas_datareader as web
import ystockquote
from pprint import pprint

yesterday = str(
    (datetime.datetime.now() - datetime.timedelta(2)).strftime("%Y%m%d"))
print('Yesterday was', yesterday)

pageViews = PageviewsClient('shivansh')

# FOR Iphone
pv = pageViews.article_views('en.wikipedia', 'IPhone',
                             granularity='daily',
                             start='20150701', end=yesterday)
print(pv)
print('Data points for IPhone: ', len(pv))
rawIphone = list(pv.items())
t = sorted(rawIphone)
out = open('Iphone.csv', 'w')
# add up the total views for a sub page
def addtotalviews(data, a):
    for key, value in data.items():
        v = value.get(t + l)
        if isinstance(v, int):
            a += v
        else:
            pass
    return a


today = datetime.datetime.today()
p = PageviewsClient()

# the first blank spot is reserved for the english (unless modified for
# wikipedia language versions, then add 'en' first) version or
# 'original article page'. Add language codes after.
code = [
    ''
]
# VVVVVVVVVVVVVVV add to code to view all language versions VVVVVVVVVVVVVVVV
# 'es' , 'aa' , 'ab' , 'ae' , 'af' , 'ak' , 'am' , 'an' , 'ar' , 'as' ,
# 'av' , 'ay' , 'az' , 'ba' , 'be' , 'bg' , 'bh' , 'bi' , 'bm' , 'bn' , 'bo' ,
# 'br' , 'bs' , 'ca' , 'ce' , 'ch' , 'co' , 'cr' , 'cs' , 'cu' , 'cv' , 'cy' ,
# 'da' , 'de' , 'dv' , 'dz' , 'ee' , 'el' , 'eo' , 'es' , 'et' , 'eu' , 'fa' ,
# 'ff' , 'fi' , 'fj' , 'fo' , 'fr' , 'fy' , 'ga' , 'gd' , 'gl' , 'gn' , 'gu' ,
# 'gv' , 'ha' , 'he' , 'hi' , 'ho' , 'hr' , 'ht' , 'hu' , 'hy' , 'hz' , 'ia' ,
# 'id' , 'ie' , 'ig' , 'ii' , 'ik' , 'io' , 'is' , 'it' , 'iu' , 'ja' , 'jv' ,
# 'ka' , 'kg' , 'ki' , 'kj' , 'kk' , 'kl' , 'km' , 'kn' , 'ko' , 'kr' , 'ks' ,
# 'ku' , 'kv' , 'kw' , 'ky' , 'la' , 'lb' , 'lg' , 'li' , 'ln' , 'lo' , 'lt' ,
import pageviewapi
import mwviews
import datetime
from mwviews.api import PageviewsClient
import ystockquote
from pprint import pprint

p = PageviewsClient('shivansh')
today = str(datetime.datetime.now().strftime("%Y%m%d"))
print(today)
# print(p.article_views('en.wikipedia', 'IPhone', granularity='daily', start='20160201', end=today))
pprint(ystockquote.get_historical_prices('AAPL', '2013-01-03', today))
# now let's explore the data again
df_2018[df_2018.Player == 'Player']

# counts for each value of Rnd, Pick, Tm, and Position
for column in tabs:
    tab = pd.crosstab(index=df_2018[column],  # Make a crosstab
                      columns="count")        # Name the count column
    print(tab)

df_2018.describe()
# that looks better

#### 2 COLLECT WIKIPEDIA PAGE VIEWS FOR EACH PLAYER

# Sends a descriptive User-Agent header with every request
p = PageviewsClient(user_agent="<ene> NFL draft analysis")

# 2a retrieve page views for each player
# Error occurs as ProFootballReference and Wikipedia handle some initials inconsistently
# Manually correcting this issue
name_correction = {'M.J. Stewart': 'M. J. Stewart',
                   'P.J. Hall': 'P. J. Hall',
                   'R.J. McIntosh': 'R. J. McIntosh'}
df_2018 = df_2018.replace(name_correction)

# 2018 NFL draft took place from April 26 to April 28
# Collect more data than needed at beginning. Dates will be pared down after exploratory analysis
import json
from datetime import datetime
from mwviews.api import PageviewsClient

dump_folder = './Dumps'
dataset_file = dump_folder + '/' + 'dataset.dmp'
stats_file = dump_folder + '/' + 'stats.txt'
semi_final_dataset_file = dump_folder + '/' + 'semi_final_dataset.dmp'

p = PageviewsClient()
i = 0

with open(semi_final_dataset_file, 'w') as semi_final_dataset:
    semi_final_dataset.write('')

with open(dataset_file, 'r') as dataset:
    with open(stats_file, 'r') as stats:
        with open(semi_final_dataset_file, 'a') as semi_final_dataset:
            statistics = stats.readlines()
            statistics = [s for s in statistics if 'ERRORFAIL' not in s]
            current_stat_read_line = 0
            previous_date = None
            data_out = []
            stat_out = []
            for data in dataset:
                data_dic = json.loads(data)
                stat_dic = json.loads(statistics[current_stat_read_line])
                current_stat_read_line += 1
                current_date = data_dic['webPublicationDate'].split(
import datetime

import six
import urllib
from mwviews.api import PageviewsClient

articles = [
    'cat',
    'dog',
    'New York',
]
articles = [
    urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')
]
top_x = 2
p = PageviewsClient(10)

# create date string based on previous month
now = datetime.datetime.now()
year = now.year
previous_month = str(now.month - 1).zfill(2)
if previous_month == "00":
    previous_month = "12"
    year -= 1  # the previous month of January is December of last year
start_date = str(year) + previous_month + "0100"
end_date = str(year) + previous_month + "2800"

# encode in ascii for compatibility with page views api
articles = [article.encode("ascii", 'ignore') for article in articles]

# get views
result = p.article_views('en.wikipedia', articles,
                         granularity='monthly',
                         start=start_date,
from mwviews.api import PageviewsClient
from neo4j import GraphDatabase
import csv

p = PageviewsClient("mark-needham")
driver = GraphDatabase.driver("bolt://localhost", auth=("neo4j", "neo"))

# people = [
#     "Boris Johnson", "Theresa May", "Jacob Rees-Mogg"
# ]

with driver.session() as session:
    result = session.run("""
        MATCH (p:Person)
        RETURN p.name AS person
    """)
    people = [row["person"] for row in result]

# p.article_views("en.wikipedia", people, start="20190325", end="20190330")
views = p.article_views("en.wikipedia", people,
                        start="20160624", end="20190330")

votes = {person: 0 for person in people}
for key in views.keys():
    for person_key in views[key].keys():
        person = person_key.replace("_", " ")
        if views[key][person_key]:
            votes[person] += views[key][person_key]
import mwclient
import os
import glob
import pycountry
from mwviews.api import PageviewsClient
from calendar import monthrange
import logins

# TODO: Reschedule if something went wrong
# TODO: Investigate and fix SSLError when trying to do
#       gspread stuff while pageviews is collecting

__dir__ = os.path.dirname(__file__)

ua = 'Page views collection for The Wikipedia Library. Run by User:Samwalton9'
p = PageviewsClient()

g_client = logins.gspread_login()

# Test sheet - 17Vr9o9ytiv-5l9g3TdUoheEJldWKFxZrUTiIJQI-Ucg
# Live sheet - 1hUbMHmjoewO36kkE_LlTsj2JQL9018vEHTeAP7sR5ik

# Pageviews sheet
g_sheet = g_client.open_by_key('1hUbMHmjoewO36kkE_LlTsj2JQL9018vEHTeAP7sR5ik')
global_sums = g_sheet.worksheet('Global Sums')


def mwclient_login(language, user_agent=ua):
    if language == 'meta':
        p_m = 'm'
    else:
        p_m = 'p'
class wiki_table:
    levenshtein = Levenshtein()
    pvc = PageviewsClient("Looking for songs")

    def __init__(self, decade):
        self.p = Path('C:/Users/tomha/PycharmProjects/GlglzPredictor/DFs')
        self.decade = decade
        self.genres_dict = {}
        self.df = self.create_table()

    def cut_year_from_cell(self, cell):
        try:
            return int(cell.contents[0])
        except Exception:
            try:
                return int(cell.contents[0].split(" ")[1])
            except Exception:
                try:
                    return int(cell.contents[0].split(" ")[2])
                except Exception:
                    return 0

    def append_genre(self, genre):
        for genre_from_dict in self.genres_dict.keys():
            if genre_from_dict[1:len(genre_from_dict)] == genre["title"][1:len(genre["title"])]:
                return genre_from_dict
            elif genre_from_dict[1:len(genre_from_dict)] == genre["title"][1:len(genre["title"])] + " music":
                return genre_from_dict
            elif self.levenshtein.distance(genre_from_dict, genre["title"]) <= 2:
                return genre_from_dict
        return genre["title"]

    def get_year(self, row):
        year = 0
        found = 0
        year_cell = row.find("td", {"class": "plainlist"})
        if year_cell is not None:
            if year_cell.find("li") and found == 0:
                year = self.cut_year_from_cell(year_cell.find("li"))
                if year != 0:
                    print("Taken from List! " + str(year))
                    found = 1
                else:
                    print("year_li: " + str(year_cell.find("li")))
            elif year_cell.find("a") and year_cell.find("a").has_attr("title"):
                year = year_cell.find("a")["title"].split(" ")[0]
                print("Taken from Link! " + str(year))
                found = 1
            elif year_cell.find("span", {"class": "dtstart"}):
                try:
                    year = int(year_cell.find("span", {"class": "dtstart"})
                               .contents[0].split("-")[0])
                    print("Taken from span! " + str(year))
                    found = 1
                except Exception:
                    print(year_cell)
            elif len(year_cell.contents) > 0:
                year = self.cut_year_from_cell(year_cell)
                if year != 0:
                    found = 1
            if found == 0:
                print("year cell: " + str(year_cell))
        return year

    def scrape_info_from_wiki(self, page):
        song = {}
        try:
            page_html = wikipedia.WikipediaPage(page).html()
            prettified = BeautifulSoup(page_html, 'html.parser')
            info_table = prettified.findAll("table", {"class": "infobox"})
            song["result"] = page
            song["year"] = 0
            song["genres"] = []
            song["views"] = 0
            for row in info_table[0].find_all("tr"):
                row_year = row.find(text='Released')
                if row_year:
                    song["year"] = self.get_year(row)
                row_genres = row.find("td", {"class": "category"})
                if row_genres:
                    for genre in row_genres.find_all("a"):
                        if genre.has_attr("title"):
                            song["genres"].append(self.append_genre(genre))
            try:
                pop_dict = self.pvc.article_views('en.wikipedia', [page],
                                                  granularity='monthly',
                                                  start='20190101',
                                                  end='20191001')
                for value in pop_dict.items():
                    for i in value[1]:
                        if value[1][i] is not None:
                            song["views"] = song["views"] + value[1][i]
            except Exception:
                print("Can't Sum Up Views!")
        except Exception as e:
            print(e)
            song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def get_song_from_wikipedia(self, song_name):
        song = {}
        results = wikipedia.search(song_name)
        found = 0
        for result in results:
            if self.levenshtein.distance(
                    result.split("(")[0],
                    song_name.split("-")[0]) <= 5 and found == 0:
                song = self.scrape_info_from_wiki(result)
                found = 1
        if found == 0:
            print("Name: " + song_name)
            print("Available Results: " + str(results))
            selection = int(input("Select the right result"))
            if selection in range(0, len(results)):
                song = self.scrape_info_from_wiki(results[selection])
            else:
                song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def create_table(self):
        spotify_table_name = "DFs/spotify_" + self.decade + ".csv"
        data_from_spotify = pd.read_csv(spotify_table_name)
        wiki_songs = []
        for row in data_from_spotify.iterrows():
            name = (row[1]['name'].split("-")[0].replace('remastered', '')
                    + " - " + row[1]['artist'])
            song = self.get_song_from_wikipedia(name)
            song["spotify_name"] = row[1]['name']
            song["spotify_artist"] = row[1]['artist']
            wiki_songs.append(song)
            wiki_df = pd.DataFrame(wiki_songs)
            table_name = "wiki_" + self.decade
            wiki_df.to_csv(Path(self.p, table_name + '.csv'),
                           index=None, header=True)
            if len(wiki_songs) % 100 == 0:
                print("Fetched " + str(len(wiki_songs)) + " songs")
        wiki_df = pd.DataFrame(wiki_songs)
        table_name = "wiki_" + self.decade
        export_csv = wiki_df.to_csv(Path(self.p, table_name + '.csv'),
                                    index=None, header=True)
        print("Saved table successfully")
        return wiki_df
from mwviews.api import PageviewsClient

### Provides the page views for a Wikipedia page between a specified starting
### and ending date; it returns the number of views. ###

p = PageviewsClient('Amit')

'''
def accessTime(page_name, s_time, e_time):
    test = False
    for daily in p.article_views('en.wikipedia', [page_name],
                                 granularity='daily',
                                 start=s_time, end=e_time).items():
        if daily[1].get(page_name) != 0:
            no_of_views = daily[1].get(page_name)
            return str(no_of_views)
    return "0"
'''


def accessTime(page_name, s_time, e_time):
    no_of_views = 0
    for daily in p.article_views('en.wikipedia', [page_name],
                                 granularity='daily',
                                 start=s_time, end=e_time).items():
        if daily[1].get(page_name) is not None:
            no_of_views = no_of_views + daily[1].get(page_name)
        else:
            pass
    return no_of_views
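# Usage sketch for accessTime above (page name and date range are illustrative):
# total daily views for one page over a week.
print(accessTime('Machine_learning', '20230101', '20230107'))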
class WikiIngest(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        )

    def get_top_articles(self, time_collect=None, historic=False):
        if not historic:
            time_collect = datetime.now() - timedelta(days=1)
        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)
        timestamp = calendar.timegm(time_collect.timetuple())
        articles_to_insert = []
        bulk_op = None
        if historic:
            bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.WIKI_TRENDS)
        for result in results:
            name = result["article"]
            if "_" in name:
                name = name.replace("_", " ")
            doc = {
                WIKI_TREND.NAME: name,
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
                WIKI_TREND.TIMESTAMP: timestamp,
                WIKI_TREND.DATE_OBJECT: time_collect,
                WIKI_TREND.DATE_STRING: time_collect.strftime("%A %B %d %Y"),
                WIKI_TREND.MONTH: time_collect.strftime("%B").lower(),
                WIKI_TREND.WEEKDAY: time_collect.strftime("%A").lower(),
                WIKI_TREND.MONTH_DAY: int(time_collect.strftime("%d")),
                WIKI_TREND.YEAR: time_collect.strftime("%Y")
            }
            if historic:
                self.db_connection.add_to_bulk_upsert(
                    query={
                        "$and": [
                            {WIKI_TREND.NAME: name},
                            {WIKI_TREND.DATE_STRING: time_collect.strftime("%A %B %d %Y")}
                        ]
                    },
                    data=doc,
                    bulk_op=bulk_op)
            else:
                articles_to_insert.append(doc)

        if historic:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)
        else:
            self.db_connection.bulk_insert(data=articles_to_insert,
                                           collection=DB.WIKI_TRENDS)
# -*- coding: utf-8 -*-
import datetime
import operator

import six
import urllib

from mwviews.api import PageviewsClient

articles = ['cat', 'dog', 'New York', ]
articles = [urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')]
top_x = 2
p = PageviewsClient(10)

# create date string based on previous month
now = datetime.datetime.now()
year = now.year
previous_month = str(now.month - 1).zfill(2)
if previous_month == "00":
    previous_month = "12"
    year -= 1  # the previous month of January is December of last year
start_date = str(year) + previous_month + "0100"
end_date = str(year) + previous_month + "2800"

# encode in ascii for compatibility with page views api
articles = [article.encode("ascii", 'ignore') for article in articles]

# get views
result = p.article_views('en.wikipedia', articles,
                         granularity='monthly',
                         start=start_date, end=end_date)

# clean results (six is used for backwards compatibility with python 2)
result = six.next(six.itervalues(result))
sorted_articles = sorted(result.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
# print(sorted_articles[:top_x])
class original_table:
    levenshtein = Levenshtein()
    pvc = PageviewsClient("Looking for songs")
    client_credentials_manager = SpotifyClientCredentials(
        client_id='274d5abed01c455099ac8ad14c6a68e8',
        client_secret='7425a61db8ed45c48d1ccfaa39842e00')

    def __init__(self, decade):
        self.sp = spotipy.Spotify(
            client_credentials_manager=self.client_credentials_manager)
        self.decade = decade
        table_name = input("Please insert the original chart table name")
        self.original_decade_df = pd.read_csv("DFs/" + table_name + ".csv",
                                              encoding="utf-8")
        spoti = input("Add Spotify Features?")
        if spoti == 'Y' or spoti == 'y' or spoti == 'yes':
            self.add_spotify_features()
        wiki = input("Add Wikipedia Features?")
        if wiki == 'Y' or wiki == 'y' or wiki == 'yes':
            self.operate_wikipedia()
        # yearly = input("Find in yearly charts?")
        # if yearly == 'Y' or yearly == 'y' or yearly == 'yes':
        #     self.find_in_yearly_chart()
        p = Path('C:/Users/tomha/PycharmProjects/GlglzPredictor/DFs/')
        new_table_name = input(
            "Please insert the new original chart table name")
        export_csv = self.original_decade_df.to_csv(
            Path(p, new_table_name + '.csv'), index=None, header=True)
        print("Table saved successfully!")

    def add_spotify_features(self):
        spotify_popularity = []
        spotify_valence = []
        spotify_tempo = []
        spotify_instrumentalness = []
        spotify_year = []
        for row in self.original_decade_df.iterrows():
            try:
                result = self.sp.search(q=row[1]['name'],
                                        type='track')['tracks']['items'][0]
                spotify_valence.append(
                    self.sp.audio_features(result['id'])[0]['valence'])
                spotify_tempo.append(
                    self.sp.audio_features(result['id'])[0]['tempo'])
                spotify_instrumentalness.append(
                    self.sp.audio_features(result['id'])[0]['instrumentalness'])
                spotify_popularity.append(result['popularity'])
                spotify_year.append(
                    result['album']['release_date'].split("-")[0])
            except Exception:
                spotify_valence.append('None')
                spotify_tempo.append('None')
                spotify_instrumentalness.append('None')
                spotify_popularity.append('None')
                spotify_year.append('None')
        self.original_decade_df['spotify_popularity'] = spotify_popularity
        self.original_decade_df['spotify_valence'] = spotify_valence
        self.original_decade_df['spotify_tempo'] = spotify_tempo
        self.original_decade_df['spotify_instrumentalness'] = spotify_instrumentalness
        self.original_decade_df['spotify_year'] = spotify_year

    def scrape_info_from_wiki(self, page):
        song = {}
        try:
            page_html = wikipedia.WikipediaPage(page).html()
            prettified = BeautifulSoup(page_html, 'html.parser')
            info_table = prettified.findAll("table", {"class": "infobox"})
            song["result"] = page
            song["year"] = 0
            song["genres"] = []
            song["views"] = 0
            for row in info_table[0].find_all("tr"):
                row_year = row.find(text='Released')
                if row_year:
                    song["year"] = get_year(row)
                row_genres = row.find("td", {"class": "category"})
                if row_genres:
                    for genre in row_genres.find_all("a"):
                        if genre.has_attr("title"):
                            song["genres"].append(genre["title"])
            try:
                pop_dict = self.pvc.article_views('en.wikipedia', [page],
                                                  granularity='monthly',
                                                  start='20190101',
                                                  end='20190731')
                for value in pop_dict.items():
                    for i in value[1]:
                        if value[1][i] is not None:
                            song["views"] = song["views"] + value[1][i]
            except Exception:
                print("Can't Sum Up Views!")
        except Exception as e:
            print(e)
            song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def get_song_from_wikipedia(self, song_name):
        song = {}
        results = wikipedia.search(song_name)
        found = 0
        for result in results:
            if self.levenshtein.distance(
                    result.split("(")[0],
                    song_name.split("-")[0]) <= 5 and found == 0:
                song = self.scrape_info_from_wiki(result)
                found = 1
        if found == 0:
            print("Name: " + song_name)
            print("Available Results: " + str(results))
            selection = int(input("Select the right result"))
            if selection in range(0, len(results)):
                song = self.scrape_info_from_wiki(results[selection])
            else:
                song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def operate_wikipedia(self):
        songs_from_wikipedia = []
        for row in self.original_decade_df.iterrows():
            songs_from_wikipedia.append(
                self.get_song_from_wikipedia(row[1]['name']))
        songs_from_wikipedia = pd.DataFrame(songs_from_wikipedia)
        self.original_decade_df['wikipedia_year'] = songs_from_wikipedia['year']
        self.original_decade_df['genres'] = songs_from_wikipedia['genres']
        self.original_decade_df['views'] = songs_from_wikipedia['views']

    def read_chart_file(self):
        songs = []
        year = 0
        file_name = "DFs/" + self.decade + ".txt"
        file = open(file_name, "r", encoding="utf8")
        for line in file.readlines():
            song = {}
            try:
                year = int(line)
            except Exception:
                try:
                    song["name"] = line[line.find('"') + 1:len(line) - 1]
                    song["location"] = int(line.split(".")[0][0:2])
                    song["year"] = year
                    songs.append(song)
                except Exception:
                    print("Empty Line")
        return songs

    def find_in_yearly_chart(self):
        yearly_positions = []
        songs_from_charts = self.read_chart_file()
        for row in self.original_decade_df.iterrows():
            found_song = 0
            for song in songs_from_charts:
                if self.levenshtein.distance(song['name'], row[1]['name']) <= 1:
                    yearly_positions.append(song['location'])
                    found_song = 1
            if found_song == 0:
                yearly_positions.append(0)
        self.original_decade_df['yearly_position'] = yearly_positions

    def fix_year(self):
        year = []
        year_source = []
        for row in self.original_decade_df.iterrows():
            if int(row[1]['spotify_year']) > 1979 and int(row[1]['spotify_year']) < 1990:
                year.append(int(row[1]['spotify_year']))
                year_source.append('spotify')
            elif int(row[1]['wikipedia_year']) > 1979 and int(row[1]['wikipedia_year']) < 1990:
                year.append(int(row[1]['wikipedia_year']))
                year_source.append('wikipedia')
            else:
                year.append(int(input(row[1]['name'] + " " + row[1]['artist'])))
                year_source.append('manual')
        self.original_decade_df['year'] = year
        self.original_decade_df['year_source'] = year_source
def download_pageviews(entities=None, start='20150701', end=None, access='desktop', agent='user', limit=10000):
    """
    Download pageviews from Wikipedia

    :param entities: A list of entities (Wikipedia pages) to get pageview data for
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for, if no
        entities are specified in the call
    :return: A DataFrame of entities x pageviews by day
    """
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')

    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    # str -> list
    if type(entities) is str:
        entities = [entities]

    # if entities aren't passed in, get the daily top entities for the period
    if entities is None:
        df_pvs = None
        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia', year=d.year,
                                                 month=d.month, day=d.day,
                                                 limit=limit, access=access))
            except Exception:
                continue
            df = df.set_index('article').rename(columns={'views': d})[[d]]
            if df_pvs is None:
                df_pvs = df
            else:
                df_pvs = df_pvs.join(df, how='outer')
        entities = df_pvs.index.values.tolist()

    for i in range(len(entities)):
        try:
            entities[i] = unidecode(wikipedia.page(entities[i]).title)
        except wikipedia.exceptions.DisambiguationError as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            avere = pd.DataFrame(columns=['NONE'])
            return avere
        except wikipedia.exceptions.PageError as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            avere = pd.DataFrame(columns=['NONE'])
            return avere

    search = p.article_views('en.wikipedia', entities, start=start, end=end,
                             access=access, agent=agent)
    df = pd.DataFrame.from_dict(search, orient='index')
    return df
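# Usage sketch for download_pageviews above (entity names and dates are
# illustrative; requires pandas, wikipedia and unidecode as in the function).
pageview_df = download_pageviews(['Cat', 'Dog'], start='20230101', end='20230131')
print(pageview_df.head())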
conn = mdb.connect(host=db_host, port=db_port, user=db_user,
                   passwd=db_password, db=db_db, charset='utf8')

cache = {}

# Handle brand new pages added which haven't had anything downloaded yet.
# Determine pageid
query = "select * from edits where pageid is null"
df = pd.read_sql(query, conn)

p = PageviewsClient(user_agent=wp_agent)

q = "UPDATE edits set pageid=%s WHERE edit_id=%s"
cur = conn.cursor()
for r in range(df.shape[0]):
    row = df.iloc[r]
    editid = row['edit_id']
    title = row['page']
    lang = row['lang']
    url = ('https://' + lang + '.wikipedia.org/w/api.php?action=query&format=json&titles='
           + urllib.parse.quote(title.encode("utf8")))
    print('Getting', url)
    req = requests.get(url)
    j = json.loads(req.text)
    try:
        pages = j['query']['pages']
from mwviews.api import PageviewsClient
import pandas as pd

p = PageviewsClient(user_agent='all-agents')
x = (p.article_views('en.wikipedia', ['Reliance Industries'],
                     granularity='daily', start='20150701', end='20180318'))

df = pd.DataFrame()
df1 = pd.DataFrame()
Y = pd.read_csv("FinalData.csv")

timeslot = []
for date in x:
    items = x[date].items()
    timeslot.append({date: value for (timeslot, value) in items})

Date = []
PageViews = []
for i in timeslot:
    for x, y in i.items():
        Date.append(x.date())
        PageViews.append(y)
# print(Date)
# print(PageViews)

df = pd.DataFrame(Date, columns=['Date'])
df1 = pd.DataFrame(PageViews, columns=['WikiPageViews'])
df = df.merge(df1, left_index=True, right_index=True)