Example #1
def get_wiki_pageviews(title):
    # Daily English Wikipedia pageviews for a title, from 2013-01-01 through today
    p = PageviewsClient(user_agent="<*****@*****.**> multiple movie titles")
    today = datetime.datetime.now().strftime("%Y%m%d")
    try:
        return p.article_views('en.wikipedia', title, start='20130101', end=today)
    except Exception:
        return {}
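A minimal usage sketch for the helper above, assuming mwviews and the datetime import it relies on are available; the article title is illustrative:

views = get_wiki_pageviews('Python_(programming_language)')
# keys are datetime objects; values map the article title to its daily view count
for day, counts in sorted(views.items())[:5]:
    print(day.date(), counts)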
 def __init__(self):
     self.db_connection = DBConnection()
     self.logger = logging.getLogger(__name__)
     self.api = PageviewsClient(
         "Mozilla/5.0 (X11; Linux x86_64)"
         " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
     )
def get_pageviews(site_name, *args, **kwargs):
    if site_name.lower() == 'wikipedia':
        start = ''
        end = ''
        granularity = 'monthly'
        if kwargs.get('article_name') is not None:
            article_name = kwargs['article_name']
        # article_name = self.get_article_name(article_name)
        if kwargs.get('start') is not None:
            start = kwargs['start'].replace('-', '')

        if kwargs.get('end') is not None:
            end = kwargs['end'].replace('-', '')

        if kwargs.get('granularity') is not None:
            granularity = kwargs['granularity']

        p = PageviewsClient(user_agent="<*****@*****.**>")

        if start == '':
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity)
        elif end == '':
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity,
                                   start=start,
                                   end=start)
        else:
            return p.article_views('en.wikipedia',
                                   article_name,
                                   granularity=granularity,
                                   start=start,
                                   end=end)
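A hypothetical call of the wrapper above; the article name and dates are illustrative, and the dashes are stripped before the values reach the pageviews API:

views = get_pageviews('wikipedia',
                      article_name='Cat',
                      start='2023-01-01',
                      end='2023-01-31',
                      granularity='daily')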
Example #4
 def run(self):
     viewer = PageviewsClient(
         user_agent="<*****@*****.**> Selfie, Cat, and Dog analysis"
     )
     self.logger.info('[%s] Starting Wiki thread' % self.Name)
     try:
         for ticker, article in self.Tickers.items():
             End_date = time.strftime('%Y%m%d')
             data = viewer.article_views('en.wikipedia',
                                         article,
                                         granularity='daily',
                                         start=self.Start_date,
                                         end=End_date)
             for row in data:
                 if data[row][article]:
                     wikid = {}
                     wikid['date'] = row.strftime('%m/%d/%Y')
                     wikid['symbol'] = ticker
                     wikid['article'] = article
                     wikid['wiki_views'] = int(data[row][article])
                     queueDoc(wikid)
             self.logger.info('[%s] Collected Info on %s' %
                              (self.Name, ticker))
     except Exception as e:
         self.logger.error('[%s] Error: %s' % (self.Name, e))
     self.logger.info('[%s] Exiting' % self.Name)
     self.Fin = True
Example #5
def get_groundtruth(lang):
    """Get actual counts of top articles and their pageviews from a Wikipedia from yesterday."""
    p = PageviewsClient(
        user_agent="[email protected] -- diff private toolforge")
    try:
        groundtruth = p.top_articles(
            project='{0}.wikipedia'.format(lang),
            access='all-access',
            year=None,
            month=None,
            day=None,  # defaults to yesterday
            limit=50)
    except Exception:
        two_days_ago = date.today() - timedelta(days=2)
        groundtruth = p.top_articles(project='{0}.wikipedia'.format(lang),
                                     access='all-access',
                                     year=two_days_ago.year,
                                     month=two_days_ago.month,
                                     day=two_days_ago.day,
                                     limit=50)
    return {
        r['article']: {
            'gt-rank': r['rank'],
            'gt-views': r['views']
        }
        for r in groundtruth
    }
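A sketch of how the ground-truth helper might be consumed, assuming the PageviewsClient and date imports used above are in scope; output values are invented for illustration:

gt = get_groundtruth('en')
# e.g. {'Some_article': {'gt-rank': 1, 'gt-views': 123456}, ...}
for title, stats in list(gt.items())[:3]:
    print(title, stats['gt-rank'], stats['gt-views'])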
Example #6
def _get_snp500_wiki_views(conn, start, end):
    """
    Inserts wiki page views into the daily_views table

    Parameters:
        start (str) : YYYYMMDD
        end   (str) : YYYYMMDD

    Returns:
        List[tuple] : (id, date, views, now, now)
    """
    pvc = PageviewsClient()
    symbol_ids_and_titles = _get_symbol_ids_and_wiki_titles(conn)
    title_to_id = {title: id for id, title in symbol_ids_and_titles}
    articles = [title for _, title in symbol_ids_and_titles]
    project = 'en.wikipedia'
    now = datetime.datetime.utcnow()

    # API call
    views_dict = pvc.article_views(project, articles, start=start, end=end)
    # transforming API call to rows (a list of tuples)
    daily_views = []
    for date in views_dict:
        for title in views_dict[date]:
            id, views = title_to_id[title], views_dict[date][title]
            daily_views.append((id, date, views, now, now))

    return daily_views
def top_articles_by_views(articles, top_x):
    """
    returns the top x of the given list of articles
        based on page views for the previous month
        output:
            [(article1, views), (article2, views)]
    """
    p = PageviewsClient()

    # create date string based on previous month (roll the year back in January)
    now = datetime.datetime.now()
    year = now.year
    previous_month = str(now.month - 1).zfill(2)
    if previous_month == "00":
        previous_month = "12"
        year -= 1
    start_date = str(year) + previous_month + "0100"
    end_date = str(year) + previous_month + "2800"

    # get views
    result = p.article_views('en.wikipedia',
                             articles,
                             granularity='monthly',
                             start=start_date,
                             end=end_date)
    # clean results (six is used for backwards compatibility with Python 2)
    result = six.next(six.itervalues(result))
    sorted_articles = sorted(result.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    return sorted_articles[:top_x]
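An illustrative call of the helper above, assuming the datetime, operator, six, and mwviews imports it relies on are in scope; article names and counts are examples, not real data:

top_two = top_articles_by_views(['Cat', 'Dog', 'New_York_City'], 2)
print(top_two)  # e.g. [('New_York_City', 250000), ('Cat', 120000)] -- counts invented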
Example #8
 def set_view_counts(self):
     """
     Initializes the `view_count` property for all of the concepts in the `ConceptModel`.
     """
     for node in self.nodes():
         title = node.concept.replace(' ', '_')
         views = PageviewsClient().article_views("en.wikipedia", [title])
         daily_counts = [views[day][title] for day in views.keys()]
         # average the non-empty daily counts across the period returned by the API
         average = int(sum(count for count in daily_counts if count) / len(daily_counts))
         node.set_property('view_count', average)
Example #9
 def set_view_count(self):
     """
     Sets the view_count parameter appropriately, using a 30-day average.
     """
     title = self.concept.replace(' ', '_')
     views = PageviewsClient().article_views("en.wikipedia", [title])
     daily_counts = [views[day][title] for day in views.keys()]
     # average the non-empty daily counts across the period returned by the API
     average = int(sum(count for count in daily_counts if count) / len(daily_counts))
     # self.view_count = average
     self.properties['view_count'] = average
     print(self.properties['view_count'])
Example #10
 def getViewsPerDay(self):  # pageviews own thing
     """Gets a time series dataframe: date (as index), views (column is article name)
     Will be using 'mwviews' package"""
     p = PageviewsClient('en')
     print(self.article)
     data = p.article_views('en.wikipedia', [self.article],
                            granularity='daily',
                            start=self.getPublishDate(),
                            end=datetime.datetime.now().strftime('%Y%m%d'))
     df = pd.DataFrame.from_dict(data, orient='index').dropna()
     return df
Example #11
def wiki_api(keyword, start, end, agent='user'):
    output_list = []

    p = PageviewsClient('what is it..?')  # a string must be passed as the user-agent parameter; any value works
    output_dict = dict(
        p.article_views('en.wikipedia.org', [keyword],
                        start=start,
                        end=end,
                        agent=agent))

    for key, val in output_dict.items():
        tem_dict = {}
        tem_dict['date'] = key.strftime("%Y%m%d")
        tem_dict['view_count'] = val[keyword.replace(" ", "_")]
        output_list.append(tem_dict)

    result = json.dumps(output_list)
    return result
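A hypothetical call of wiki_api; the keyword and date range are illustrative, and the return value is a JSON string of daily records:

payload = wiki_api('Machine learning', start='20230101', end='20230107')
print(payload)  # e.g. '[{"date": "20230101", "view_count": 1234}, ...]' -- counts invented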
Example #12
def get_page_views_dict(links):
    p = PageviewsClient()
    #today = datetime.datetime.today()
    #today = today.strftime('%Y%m%d')
    #p.article_views('{}.wikipedia'.format(lang), title, granularity='monthly', start='20160201', end=today)
    my_dico = p.article_views('{}.wikipedia'.format(lang), links)
    my_dico_by_article = {}
    for article in links:
        my_dico_by_article[article] = 0

    for key_date, sub_dico_value in my_dico.items():
        for article, number in sub_dico_value.items():
            if number is not None:
                my_dico_by_article[article.replace('_', ' ')] += number
    my_dico_by_article = dict(
        sorted(my_dico_by_article.items(),
               key=operator.itemgetter(1),
               reverse=True))
    # TODO: define a selection heuristic based on title approximation
    return my_dico_by_article
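A hypothetical usage of get_page_views_dict; `lang` is a module-level global in the original snippet, so it is set here purely for illustration:

lang = 'en'
print(get_page_views_dict(['Cat', 'Dog']))  # e.g. {'Dog': 3456789, 'Cat': 2345678} -- totals invented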
Example #13
def get_page_views(article_names, output_path):
    """Query the Wikipedia page views api for the relevant pages

    Keyword arguments:
    article_names -- array of article names to query
    output_path -- output path for the csv file output
    """
    p = PageviewsClient(user_agent="[email protected] Selfie, Cat, and Dog analysis")
            
    values = p.article_views('en.wikipedia',article_names, granularity='monthly', start='20150101', end='20200401')
    all_keys = list(values.keys())
    all_keys.sort()
    val_dict = []
    for x in article_names:
        for key in all_keys:
            val_dict.append({"article_title":x,"timestamp":key, "views":values[key][x]})
    df = pd.DataFrame(val_dict)
    df = df.fillna(0)
    
    print("Writing Page View Data to -- " + output_path + " -- for " + str(len(df.article_title.unique())) + " articles")
    
    df.to_csv(output_path, mode='w', index=False)
    
    return df
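An illustrative invocation of get_page_views; the article list and output path are examples only:

df = get_page_views(['Cat', 'Dog'], 'page_views.csv')
print(df.head())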
def top_articles_by_views(articles, top_x):
    """
    returns the top x of the given list of articles
        based on page views for the previous month
        output:
            [(article1, views), (article2, views)]
    """
    p = PageviewsClient()

    # create date string based on previous month (roll the year back in January)
    now = datetime.datetime.now()
    year = now.year
    previous_month = str(now.month - 1).zfill(2)
    if previous_month == "00":
        previous_month = "12"
        year -= 1
    start_date = str(year) + previous_month + "0100"
    end_date = str(year) + previous_month + "2800"

    # get views
    result = p.article_views('en.wikipedia', articles, 
            granularity='monthly', start=start_date, end=end_date)
    # clean results (six is used for backwards compatibility with Python 2)
    result = six.next(six.itervalues(result))
    sorted_articles = sorted(result.items(), 
            key=operator.itemgetter(1), reverse=True)
    return sorted_articles[:top_x]
Example #15
import csv
import pageviewapi
import mwviews
import json
import datetime
from mwviews.api import PageviewsClient
import pandas
import pandas_datareader as web
import ystockquote
from pprint import pprint

yesterday = str(
    (datetime.datetime.now() - datetime.timedelta(2)).strftime("%Y%m%d"))  # note: this is actually two days ago

print('Yesterday was', yesterday)
pageViews = PageviewsClient('shivansh')

#FOR Iphone
pv = pageViews.article_views('en.wikipedia',
                             'IPhone',
                             granularity='daily',
                             start='20150701',
                             end=yesterday)
print(pv)
print('Data points for IPhone: ', len(pv))

rawIphone = list(pv.items())

t = sorted(rawIphone)

out = open('Iphone.csv', 'w')
# add up the total views for a sub page
def addtotalviews(data, a):
	# `t` and `l` are defined elsewhere in the original script
	for key, value in data.items():

		v = value.get(t + l)

		if isinstance(v, int):
			a += v

	return a

today = datetime.datetime.today() 

p = PageviewsClient()

# the first blank spot is reserved for the English (unless modified for Wikipedia language versions, then add 'en' first)
#	version or 'original article page'. Add language codes after.
code = [ '' ]

# VVVVVVVVVVVVVVV add to code to view all language versions VVVVVVVVVVVVVVVV
#  'es' , 'aa' , 'ab' , 'ae' , 'af' , 'ak' , 'am' , 'an' , 'ar' , 'as' , 
# 'av' , 'ay' , 'az' , 'ba' , 'be' , 'bg' , 'bh' , 'bi' , 'bm' , 'bn' , 'bo' , 
# 'br' , 'bs' , 'ca' , 'ce' , 'ch' , 'co' , 'cr' , 'cs' , 'cu' , 'cv' , 'cy' , 
# 'da' , 'de' , 'dv' , 'dz' , 'ee' , 'el' , 'eo' , 'es' , 'et' , 'eu' , 'fa' , 
# 'ff' , 'fi' , 'fj' , 'fo' , 'fr' , 'fy' , 'ga' , 'gd' , 'gl' , 'gn' , 'gu' , 
# 'gv' , 'ha' , 'he' , 'hi' , 'ho' , 'hr' , 'ht' , 'hu' , 'hy' , 'hz' , 'ia' , 
# 'id' , 'ie' , 'ig' , 'ii' , 'ik' , 'io' , 'is' , 'it' , 'iu' , 'ja' , 'jv' , 
# 'ka' , 'kg' , 'ki' , 'kj' , 'kk' , 'kl' , 'km' , 'kn' , 'ko' , 'kr' , 'ks' , 
# 'ku' , 'kv' , 'kw' , 'ky' , 'la' , 'lb' , 'lg' , 'li' , 'ln' , 'lo' , 'lt' , 
Example #17
import pageviewapi
import mwviews
import datetime
from mwviews.api import PageviewsClient
import ystockquote
from pprint import pprint

p = PageviewsClient('shivansh')

today = str(datetime.datetime.now().strftime("%Y%m%d"))
# ystockquote expects dashed dates (YYYY-MM-DD), unlike the pageviews API
today_dashed = datetime.datetime.now().strftime("%Y-%m-%d")
print(today)
#print(p.article_views('en.wikipedia', 'IPhone', granularity='daily', start='20160201', end=today))

pprint(ystockquote.get_historical_prices('AAPL', '2013-01-03', today_dashed))
#now let's explore the data again
df_2018[df_2018.Player=='Player']
#counts for each value of Rnd, Pick, Tm, and Position
for column in tabs:
    tab = pd.crosstab(index=df_2018[column],  # Make a crosstab
                              columns="count")      # Name the count column
    print(tab)
df_2018.describe()
#that looks better


####2 COLLECT WIKIPEDIA PAGE VIEWS FOR EACH PLAYER


# Sends a descriptive User-Agent header with every request
p = PageviewsClient(user_agent="<ene> NFL draft analysis")

#2a retrieve page views for each player


#Error occurs as ProFootballReference and Wikipedia handle some initials inconsistently
#Manually correcting this issue

name_correction = {'M.J. Stewart':'M. J. Stewart',
                   'P.J. Hall':'P. J. Hall',
                   'R.J. McIntosh':'R. J. McIntosh'
                  }
df_2018 = df_2018.replace(name_correction)

#2018 NFL draft took place from April 26 to April 28
#Collect more data than needed at beginning. Dates will be pared down after exploratory analysis
Example #19
import json
from datetime import datetime
from mwviews.api import PageviewsClient

dump_folder = './Dumps'

dataset_file = dump_folder + '/' + 'dataset.dmp'
stats_file = dump_folder + '/' + 'stats.txt'
semi_final_dataset_file = dump_folder + '/' + 'semi_final_dataset.dmp'

p = PageviewsClient()

i = 0

with open(semi_final_dataset_file, 'w') as semi_final_dataset:
    semi_final_dataset.write('')

with open(dataset_file, 'r') as dataset:
    with open(stats_file, 'r') as stats:
        with open(semi_final_dataset_file, 'a') as semi_final_dataset:
            statistics = stats.readlines()
            statistics = [s for s in statistics if 'ERRORFAIL' not in s]
            current_stat_read_line = 0
            previous_date = None
            data_out = []
            stat_out = []
            for data in dataset:
                data_dic = json.loads(data)
                stat_dic = json.loads(statistics[current_stat_read_line])
                current_stat_read_line += 1
                current_date = data_dic['webPublicationDate'].split(
Example #20
import datetime
import operator
import six
import urllib
from mwviews.api import PageviewsClient

articles = [
    'cat',
    'dog',
    'New York',
]
articles = [
    urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')
]

top_x = 2

p = PageviewsClient(10)

# create date string based on previous month (roll the year back in January)
now = datetime.datetime.now()
year = now.year
previous_month = str(now.month - 1).zfill(2)
if previous_month == "00":
    previous_month = "12"
    year -= 1
start_date = str(year) + previous_month + "0100"
end_date = str(year) + previous_month + "2800"

# encode in ascii for compatibility with page views api
articles = [article.encode("ascii", 'ignore') for article in articles]
# get views
result = p.article_views('en.wikipedia',
                         articles,
                         granularity='monthly',
                         start=start_date,
Example #21
from mwviews.api import PageviewsClient
from neo4j import GraphDatabase
import csv

p = PageviewsClient("mark-needham")
driver = GraphDatabase.driver("bolt://localhost", auth=("neo4j", "neo"))

# people = [
#     "Boris Johnson", "Theresa May", "Jacob Rees-Mogg"
# ]

with driver.session() as session:
    result = session.run("""
  MATCH (p:Person)
  RETURN p.name AS person
  """)
    people = [row["person"] for row in result]

# p.article_views("en.wikipedia", people,  start="20190325", end="20190330")
views = p.article_views("en.wikipedia",
                        people,
                        start="20160624",
                        end="20190330")
votes = {person: 0 for person in people}

for key in views.keys():
    for person_key in views[key].keys():
        person = person_key.replace("_", " ")
        if views[key][person_key]:
            votes[person] += views[key][person_key]
Example #22
import mwclient
import os
import glob
import pycountry
from mwviews.api import PageviewsClient
from calendar import monthrange
import logins

# TODO: Reschedule if something went wrong
# TODO: Investigate and fix SSLError when trying to do
#       gpsread stuff while pageviews is collecting

__dir__ = os.path.dirname(__file__)

ua = 'Page views collection for The Wikipedia Library. Run by User:Samwalton9'
p = PageviewsClient()

g_client = logins.gspread_login()
# Test sheet - 17Vr9o9ytiv-5l9g3TdUoheEJldWKFxZrUTiIJQI-Ucg
# Live sheet - 1hUbMHmjoewO36kkE_LlTsj2JQL9018vEHTeAP7sR5ik
# Pageviews sheet
g_sheet = g_client.open_by_key('1hUbMHmjoewO36kkE_LlTsj2JQL9018vEHTeAP7sR5ik')
global_sums = g_sheet.worksheet('Global Sums')


def mwclient_login(language, user_agent=ua):

    if language == 'meta':
        p_m = 'm'
    else:
        p_m = 'p'
class wiki_table:
    levenshtein = Levenshtein()
    pvc = PageviewsClient("Looking for songs")

    def __init__(self, decade):
        self.p = Path('C:/Users/tomha/PycharmProjects/GlglzPredictor/DFs')
        self.decade = decade
        self.genres_dict = {}
        self.df = self.create_table()

    def cut_year_from_cell(self, cell):
        # fall back through successive space-separated tokens until one parses as a year
        try:
            return int(cell.contents[0])
        except Exception:
            try:
                return int(cell.contents[0].split(" ")[1])
            except Exception:
                try:
                    return int(cell.contents[0].split(" ")[2])
                except Exception:
                    return 0

    def append_genre(self, genre):
        for genre_from_dict in self.genres_dict.keys():
            if genre_from_dict[1:] == genre["title"][1:]:
                return genre_from_dict
            elif genre_from_dict[1:] == genre["title"][1:] + " music":
                return genre_from_dict
            elif self.levenshtein.distance(genre_from_dict,
                                           genre["title"]) <= 2:
                return genre_from_dict
        return genre["title"]

    def get_year(self, row):
        year = 0
        found = 0
        year_cell = row.find("td", {"class": "plainlist"})
        if year_cell is not None:
            if year_cell.find("li") and found == 0:
                year = self.cut_year_from_cell(year_cell.find("li"))
                if year != 0:
                    print("Taken from List! " + str(year))
                    found = 1
                else:
                    print("year_li: " + str(year_cell.find("li")))
            elif year_cell.find("a") and year_cell.find("a").has_attr("title"):
                year = year_cell.find("a")["title"].split(" ")[0]
                print("Taken from Link! " + str(year))
                found = 1
            elif year_cell.find("span", {"class": "dtstart"}):
                try:
                    year = int(
                        year_cell.find("span", {
                            "class": "dtstart"
                        }).contents[0].split("-")[0])
                    print("Taken from span! " + str(year))
                    found = 1
                except Exception:
                    print(year_cell)
            elif len(year_cell.contents) > 0:
                year = self.cut_year_from_cell(year_cell)
                if year != 0:
                    found = 1
            if found == 0:
                print("year cell: " + str(year_cell))
        return year

    def scrape_info_from_wiki(self, page):
        song = {}
        try:
            page_html = wikipedia.WikipediaPage(page).html()
            prettified = BeautifulSoup(page_html, 'html.parser')
            info_table = prettified.findAll("table", {"class": "infobox"})
            song["result"] = page
            song["year"] = 0
            song["genres"] = []
            song["views"] = 0
            for row in info_table[0].find_all("tr"):
                row_year = row.find(text='Released')
                if row_year:
                    song["year"] = self.get_year(row)
                row_genres = row.find("td", {"class": "category"})
                if row_genres:
                    for genre in row_genres.find_all("a"):
                        if genre.has_attr("title"):
                            song["genres"].append(self.append_genre(genre))
            try:
                pop_dict = self.pvc.article_views('en.wikipedia', [page],
                                                  granularity='monthly',
                                                  start='20190101',
                                                  end='20191001')
                for value in pop_dict.items():
                    for i in value[1]:
                        if value[1][i] is not None:
                            song["views"] = song["views"] + value[1][i]
            except Exception:
                print("Can't Sum Up Views!")
        except Exception as e:
            print(e)
            song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def get_song_from_wikipedia(self, song_name):
        song = {}
        results = wikipedia.search(song_name)
        found = 0
        for result in results:
            if self.levenshtein.distance(
                    result.split("(")[0],
                    song_name.split("-")[0]) <= 5 and found == 0:
                song = self.scrape_info_from_wiki(result)
                found = 1
        if found == 0:
            print("Name: " + song_name)
            print("Available Results: " + str(results))
            selection = int(input("Select the right result"))
            if selection in range(0, len(results)):
                song = self.scrape_info_from_wiki(results[selection])
            else:
                song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def create_table(self):
        spotify_table_name = "DFs/spotify_" + self.decade + ".csv"
        data_from_spotify = pd.read_csv(spotify_table_name)
        wiki_songs = []
        for row in data_from_spotify.iterrows():
            name = row[1]['name'].split("-")[0].replace(
                'remastered', '') + " - " + row[1]['artist']
            song = self.get_song_from_wikipedia(name)
            song["spotify_name"] = row[1]['name']
            song["spotify_artist"] = row[1]['artist']
            wiki_songs.append(song)
            wiki_df = pd.DataFrame(wiki_songs)
            table_name = "wiki_" + self.decade
            wiki_df.to_csv(Path(self.p, table_name + '.csv'),
                           index=None,
                           header=True)
            if len(wiki_songs) % 100 == 0:
                print("Fetched " + str(len(wiki_songs)) + " songs")
        wiki_df = pd.DataFrame(wiki_songs)
        table_name = "wiki_" + self.decade
        export_csv = wiki_df.to_csv(Path(self.p, table_name + '.csv'),
                                    index=None,
                                    header=True)
        print("Saved table successfully")
        return wiki_df
Example #24
from mwviews.api import PageviewsClient

### Provides the number of views for a Wikipedia page between a specified start and end date ###
p = PageviewsClient('Amit')
'''
def accessTime(page_name, s_time, e_time):
    test = False
    for daily in p.article_views('en.wikipedia', [page_name], granularity='daily', start=s_time, end=e_time).items():
        if daily[1].get(page_name) != 0:
            no_of_views = daily[1].get(page_name)
            return str(no_of_views)
    return "0"
'''


def accessTime(page_name, s_time, e_time):
    no_of_views = 0
    for daily in p.article_views('en.wikipedia', [page_name],
                                 granularity='daily',
                                 start=s_time,
                                 end=e_time).items():
        if daily[1].get(page_name) is not None:
            no_of_views = no_of_views + daily[1].get(page_name)
    # return the total only after summing every day in the range
    return no_of_views
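A hypothetical call of accessTime; the page name and date range are illustrative:

total_views = accessTime('Machine_learning', '20230101', '20230131')
print(total_views)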
class WikiIngest(object):
    def __init__(self):
        self.db_connection = DBConnection()
        self.logger = logging.getLogger(__name__)
        self.api = PageviewsClient(
            "Mozilla/5.0 (X11; Linux x86_64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        )

    def get_top_articles(self, time_collect=None, historic=False):
        if not historic:
            time_collect = datetime.now() - timedelta(days=1)

        results = self.api.top_articles(project=WIKI_SOURCES.ENGLISH_WIKIPEDIA,
                                        year=time_collect.year,
                                        month=time_collect.month,
                                        day=time_collect.day)

        timestamp = calendar.timegm(time_collect.timetuple())
        articles_to_insert = []
        bulk_op = None
        if historic:
            bulk_op = self.db_connection.start_bulk_upsert(
                collection=DB.WIKI_TRENDS)
        for result in results:
            name = result["article"]
            if "_" in name:
                name = name.replace("_", " ")

            doc = {
                WIKI_TREND.NAME: name,
                WIKI_TREND.RANK: int(result["rank"]),
                WIKI_TREND.VIEWS: int(result["views"]),
                WIKI_TREND.TIMESTAMP: timestamp,
                WIKI_TREND.DATE_OBJECT: time_collect,
                WIKI_TREND.DATE_STRING: time_collect.strftime("%A %B %d %Y"),
                WIKI_TREND.MONTH: time_collect.strftime("%B").lower(),
                WIKI_TREND.WEEKDAY: time_collect.strftime("%A").lower(),
                WIKI_TREND.MONTH_DAY: int(time_collect.strftime("%d")),
                WIKI_TREND.YEAR: time_collect.strftime("%Y")
            }

            if historic:
                self.db_connection.add_to_bulk_upsert(query={
                    "$and": [{
                        WIKI_TREND.NAME: name
                    }, {
                        WIKI_TREND.DATE_STRING:
                        time_collect.strftime("%A %B %d %Y")
                    }]
                },
                                                      data=doc,
                                                      bulk_op=bulk_op)

            else:
                articles_to_insert.append(doc)

        if historic:
            self.db_connection.end_bulk_upsert(bulk_op=bulk_op)

        else:
            self.db_connection.bulk_insert(data=articles_to_insert,
                                           collection=DB.WIKI_TRENDS)
# -*- coding: utf-8 -*-
import datetime
import operator
import six
import urllib
from mwviews.api import PageviewsClient

articles = ['cat', 'dog', 'New York', ]
articles = [urllib.parse.quote('Park Güell'.encode('utf-8', 'ignore'), safe='')]

top_x = 2

p = PageviewsClient(10)

# create date string based on previous month (roll the year back in January)
now = datetime.datetime.now()
year = now.year
previous_month = str(now.month - 1).zfill(2)
if previous_month == "00":
    previous_month = "12"
    year -= 1
start_date = str(year) + previous_month + "0100"
end_date = str(year) + previous_month + "2800"

# encode in ascii for compatibility with page views api 
articles = [article.encode("ascii", 'ignore') for article in articles]
# get views
result = p.article_views('en.wikipedia', articles, 
        granularity='monthly', start=start_date, end=end_date)
# clean results (six is used for backwards compatibility with Python 2)
result = six.next(six.itervalues(result))
sorted_articles = sorted(result.items(), 
        key=operator.itemgetter(1), reverse=True)
# print sorted_articles[:top_x]
class original_table:
    levenshtein = Levenshtein()
    pvc = PageviewsClient("Looking for songs")
    client_credentials_manager = SpotifyClientCredentials(
        client_id='274d5abed01c455099ac8ad14c6a68e8',
        client_secret='7425a61db8ed45c48d1ccfaa39842e00')

    def __init__(self, decade):
        self.sp = spotipy.Spotify(
            client_credentials_manager=self.client_credentials_manager)
        self.decade = decade
        table_name = input("Please insert the original chart table name")
        self.original_decade_df = pd.read_csv("DFs/" + table_name + ".csv",
                                              encoding="utf-8")
        spoti = input("Add Spotify Features?")
        if spoti == 'Y' or spoti == 'y' or spoti == 'yes':
            self.add_spotify_features()
        wiki = input("Add Wikipedia Features?")
        if wiki == 'Y' or wiki == 'y' or wiki == 'yes':
            self.operate_wikipedia()
        #yearly = input("Find in yearly charts?")
        #if yearly == 'Y' or yearly == 'y' or yearly == 'yes':
        #    self.find_in_yearly_chart()
        p = Path('C:/Users/tomha/PycharmProjects/GlglzPredictor/DFs/')
        new_table_name = input(
            "Please insert the new original chart table name")
        export_csv = self.original_decade_df.to_csv(Path(
            p, new_table_name + '.csv'),
                                                    index=None,
                                                    header=True)
        print("Table saved successfully!")

    def add_spotify_features(self):
        spotify_popularity = []
        spotify_valence = []
        spotify_tempo = []
        spotify_instrumentalness = []
        spotify_year = []
        for row in self.original_decade_df.iterrows():
            try:
                result = self.sp.search(q=row[1]['name'],
                                        type='track')['tracks']['items'][0]
                spotify_valence.append(
                    self.sp.audio_features(result['id'])[0]['valence'])
                spotify_tempo.append(
                    self.sp.audio_features(result['id'])[0]['tempo'])
                spotify_instrumentalness.append(
                    self.sp.audio_features(
                        result['id'])[0]['instrumentalness'])
                spotify_popularity.append(result['popularity'])
                spotify_year.append(
                    result['album']['release_date'].split("-")[0])
            except Exception:
                spotify_valence.append('None')
                spotify_tempo.append('None')
                spotify_instrumentalness.append('None')
                spotify_popularity.append('None')
                spotify_year.append('None')
        self.original_decade_df['spotify_popularity'] = spotify_popularity
        self.original_decade_df['spotify_valence'] = spotify_valence
        self.original_decade_df['spotify_tempo'] = spotify_tempo
        self.original_decade_df[
            'spotify_instrumentalness'] = spotify_instrumentalness
        self.original_decade_df['spotify_year'] = spotify_year

    def scrape_info_from_wiki(self, page):
        song = {}
        try:
            page_html = wikipedia.WikipediaPage(page).html()
            prettified = BeautifulSoup(page_html, 'html.parser')
            info_table = prettified.findAll("table", {"class": "infobox"})
            song["result"] = page
            song["year"] = 0
            song["genres"] = []
            song["views"] = 0
            for row in info_table[0].find_all("tr"):
                row_year = row.find(text='Released')
                if row_year:
                    song["year"] = get_year(row)
                row_genres = row.find("td", {"class": "category"})
                if row_genres:
                    for genre in row_genres.find_all("a"):
                        if genre.has_attr("title"):
                            song["genres"].append(genre["title"])
            try:
                pop_dict = self.pvc.article_views('en.wikipedia', [page],
                                                  granularity='monthly',
                                                  start='20190101',
                                                  end='20190731')
                for value in pop_dict.items():
                    for i in value[1]:
                        if value[1][i] is not None:
                            song["views"] = song["views"] + value[1][i]
            except Exception:
                print("Can't Sum Up Views!")
        except Exception as e:
            print(e)
            song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def get_song_from_wikipedia(self, song_name):
        song = {}
        results = wikipedia.search(song_name)
        found = 0
        for result in results:
            if self.levenshtein.distance(
                    result.split("(")[0],
                    song_name.split("-")[0]) <= 5 and found == 0:
                song = self.scrape_info_from_wiki(result)
                found = 1
        if found == 0:
            print("Name: " + song_name)
            print("Available Results: " + str(results))
            selection = int(input("Select the right result"))
            if selection in range(0, len(results)):
                song = self.scrape_info_from_wiki(results[selection])
            else:
                song = {'result': 'None', 'year': 0, 'genres': [], 'views': 0}
        return song

    def operate_wikipedia(self):
        songs_from_wikipedia = []
        for row in self.original_decade_df.iterrows():
            songs_from_wikipedia.append(
                self.get_song_from_wikipedia(row[1]['name']))
        songs_from_wikipedia = pd.DataFrame(songs_from_wikipedia)
        self.original_decade_df['wikipedia_year'] = songs_from_wikipedia[
            'year']
        self.original_decade_df['genres'] = songs_from_wikipedia['genres']
        self.original_decade_df['views'] = songs_from_wikipedia['views']

    def read_chart_file(self):
        songs = []
        year = 0
        file_name = "DFs/" + self.decade + ".txt"
        file = open(file_name, "r", encoding="utf8")

        for line in file.readlines():
            song = {}
            try:
                year = int(line)
            except ValueError:
                try:
                    song["name"] = line[line.find('"') + 1:len(line) - 1]
                    song["location"] = int(line.split(".")[0][0:2])
                    song["year"] = year
                    songs.append(song)
                except Exception:
                    print("Empty Line")
        return songs

    def find_in_yearly_chart(self):
        yearly_positions = []
        songs_from_charts = self.read_chart_file()
        for row in self.original_decade_df.iterrows():
            found_song = 0
            for song in songs_from_charts:
                if self.levenshtein.distance(song['name'],
                                             row[1]['name']) <= 1:
                    yearly_positions.append(song['location'])
                    found_song = 1
            if found_song == 0:
                yearly_positions.append(0)
        self.original_decade_df['yearly_position'] = yearly_positions

    def fix_year(self):
        year = []
        year_source = []
        for row in self.original_decade_df.iterrows():
            if int(row[1]['spotify_year']) > 1979 and int(
                    row[1]['spotify_year']) < 1990:
                year.append(int(row[1]['spotify_year']))
                year_source.append('spotify')
            elif int(row[1]['wikipedia_year']) > 1979 and int(
                    row[1]['wikipedia_year']) < 1990:
                year.append(int(row[1]['wikipedia_year']))
                year_source.append('wikipedia')
            else:
                year.append(int(input(row[1]['name'] + " " +
                                      row[1]['artist'])))
                year_source.append('manual')
        self.original_decade_df['year'] = year
        self.original_decade_df['year_source'] = year_source
Example #28
def download_pageviews(entities=None, start='20150701', end=None, access='desktop', agent='user', limit=10000):

    """
    Download pageviews from Wikipedia

    :param entities: A list of entities (Wikipedia pages) to get pageview data for
    :param start: The start date of the range over which to collect data;
        2015-07-01 is the earliest supported by the API
    :param end: The end date of the range, or None for today
    :param access: The method by which Wikipedia was accessed (default: desktop)
    :param agent: The user agent accessing Wikipedia (default: user)
    :param limit: The number of most-trafficked entities to return data for, if no entities are specified in the call
    :return: A DataFrame of entities x pageviews by day
    """
    
    if end is None:
        end = datetime.date.today().strftime('%Y%m%d')
    
    p = PageviewsClient()
    dates = pd.date_range(start=start, end=end)

    #str -> list
    if isinstance(entities, str):
        
        entities = [entities]
    
    # if entities aren't passed in, get the daily top entities for the period
    if entities is None:
        df_pvs = None
    
        for d in dates:
            try:
                df = pd.DataFrame(p.top_articles('en.wikipedia', year=d.year, month=d.month,\
                                                 day=d.day, limit=limit, access=access))
            except Exception:
                continue

            df = df.set_index('article').rename(columns={'views': d})[[d]]

            if df_pvs is None:
                df_pvs = df
            else:
                df_pvs = df_pvs.join(df, how='outer')

        entities = df_pvs.index.values.tolist()
    
    for i in range(len(entities)):
        try:
            entities[i] = unidecode(wikipedia.page(entities[i]).title)
        except wikipedia.exceptions.DisambiguationError as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            avere = pd.DataFrame(columns=['NONE'])
            return avere
        except wikipedia.exceptions.PageError as e:
            print('I could not understand that, please check your spelling or be more specific')
            print('Error: {0}'.format(e))
            avere = pd.DataFrame(columns=['NONE'])
            return avere
        
    search = p.article_views('en.wikipedia', entities, start=start, end=end, access=access, agent=agent)
    df = pd.DataFrame.from_dict(search, orient='index')
    
    return df
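Two illustrative ways the downloader above might be called, assuming the pandas, wikipedia, unidecode, and mwviews imports it relies on are available; entity names and dates are examples, not from the original script:

df_single = download_pageviews('Cat', start='20200101', end='20200131')
df_top = download_pageviews(limit=100)  # daily top entities over the default date range (can be slow)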
Example #29
conn = mdb.connect(host=db_host,
                   port=db_port,
                   user=db_user,
                   passwd=db_password,
                   db=db_db,
                   charset='utf8')

cache = {}

# Handle brand new pages added which haven't had anything downloaded yet.  Determine pageid

query = "select * from edits where pageid is null"
df = pd.read_sql(query, conn)

p = PageviewsClient(user_agent=wp_agent)

q = "UPDATE edits set pageid=%s WHERE edit_id=%s"
cur = conn.cursor()
for r in range(df.shape[0]):
    row = df.iloc[r]
    editid = row['edit_id']
    title = row['page']
    lang = row['lang']
    url = 'https://' + lang + '.wikipedia.org/w/api.php?action=query&format=json&titles=' + urllib.parse.quote(
        title.encode("utf8"))
    print('Getting', url)
    req = requests.get(url)
    j = json.loads(req.text)
    try:
        pages = j['query']['pages']
Example #30
from mwviews.api import PageviewsClient
import pandas as pd

p = PageviewsClient(user_agent='all-agents')

x = (p.article_views('en.wikipedia', ['Reliance Industries'],
                     granularity='daily',
                     start='20150701',
                     end='20180318'))

df = pd.DataFrame()
df1 = pd.DataFrame()
Y = pd.read_csv("FinalData.csv")

timeslot = []
for date in x:
    items = x[date].items()
    # a single article per date, so this collects {date: views}
    timeslot.append({date: views for (_article, views) in items})
Date = []
PageViews = []
for i in timeslot:
    for day, views in i.items():
        Date.append(day.date())
        PageViews.append(views)
#print(Date)
#print(PageViews)

df = pd.DataFrame(Date, columns=['Date'])
df1 = pd.DataFrame(PageViews, columns=['WikiPageViews'])

df = df.merge(df1, left_index=True, right_index=True)