def __init__(self, dir_path=None, develop=True):
    """Initialize a GTITAN instance rooted at the desired directory.

    :param dir_path: path where to create/load the working directory.
        If None, the package's own directory (the directory containing
        this file) is used; a fresh path gets the default directory
        structure and a copy of the default config.
    :param develop: stored on the instance as ``self.develop``;
        its effect is not visible in this method.
    """
    self.develop = develop
    if dir_path is None:
        #This queries 1 path up
        # Default: run out of the package's own directory.
        self.dir_path = os.path.dirname(os.path.abspath(__file__))
        # Ensure the "checkpath" subdirectory exists (purpose not
        # visible here — presumably used by later methods; TODO confirm).
        if not os.path.exists(os.path.join(self.dir_path, "checkpath")):
            os.makedirs(os.path.join(self.dir_path, "checkpath"))
    else:
        #TODO: make all folders if new folder indicated: copy default files there such as the config.
        self.dir_path = dir_path
        if not os.path.exists(dir_path):
            # New directory requested: build the expected layout and
            # seed it with the package's default config file.
            default_path = os.path.dirname(os.path.abspath(__file__))
            # creating directory structure
            os.makedirs(os.path.join(self.dir_path, "config"))
            os.makedirs(os.path.join(self.dir_path, "data", "Translations"))
            os.makedirs(
                os.path.join(self.dir_path, "data", "Trend_indices"))
            # copying defaults
            shutil.copyfile(
                os.path.join(default_path, "config", "config_py.json"),
                os.path.join(self.dir_path, "config", "config_py.json"))
        else:
            print("Directory already exists, loading data from it.")
    print(f"Using directory2 '{self.dir_path}'", __file__)
    # Load the JSON config; this assumes config/config_py.json exists
    # under dir_path (guaranteed above only for freshly created dirs).
    with open(os.path.join(self.dir_path, "config", "config_py.json"),
              'r') as fp:
        self.CONFIG = json.load(fp)
    #set configuring settings:
    # requests-style timeout is stored as a list in JSON; the consumer
    # expects a (connect, read) tuple.
    self.CONFIG['CONN']['timeout'] = tuple(self.CONFIG['CONN']['timeout'])
    self.t_sleep = 0.2  #also get it from
    # 14 hours in seconds — presumably the Google rate-limit backoff
    # window; TODO confirm.
    self.google_timeout = 14 * 60 * 60
    self.t_block = 0
    #Call instances of used packages
    self.translator = google_translator()
    #self.pytrend = TrendReq(hl='en-US', **self.CONFIG['CONN'])
    self.pytrend = TrendReq()
    self.gtab = gtab.GTAB()
    # sets default anchorbank
    default_project = "frontex"
    self.set_active_project(default_project)
def __init__(self, timeframe, geo_code=''):
    """Set up a gtab instance for the given timeframe and region.

    :param timeframe: Google Trends timeframe string, e.g.
        "2019-01-01 2020-08-01".
    :param geo_code: two-letter region code (case-insensitive) or ''
        for worldwide.

    The matching anchorbank is created on first use (slow: it queries
    Google Trends) and then activated.
    """
    self.timeframe = timeframe
    geo_code = geo_code.upper()
    self.anchorbank = (
        f'google_anchorbank_geo={geo_code}_timeframe={timeframe}.tsv')
    self.t = gtab.GTAB(self.GTAB_DIR)
    self.t.set_options(
        pytrends_config={'geo': geo_code, 'timeframe': timeframe})
    bank_file = os.path.join(self.GTAB_DIR, 'output/google_anchorbanks/',
                             self.anchorbank)
    if not os.path.exists(bank_file):
        print("Creating Anchorbank...")
        self.t.create_anchorbank()
    self.t.set_active_gtab(self.anchorbank)
def main():
    """Compute gtab popularity for every player in player.csv and write
    the result (with a new 'popularity_trends' column) to
    player_with_pop.csv.
    """
    t = gtab.GTAB()
    t.set_active_gtab(
        "google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv")
    df = pd.read_csv('player.csv')
    inputs = range(df.shape[0])
    func = partial(processInput, df, t)
    # removing processes argument makes the code run on all available cores
    # Context manager guarantees the worker processes are terminated and
    # joined; the original leaked the pool (no close()/join()).
    with mp.Pool(processes=2) as pool:
        pops = pool.map(func, inputs)
    df['popularity_trends'] = pops
    df.to_csv('player_with_pop.csv', index=False)
def google_trends(df, d1, d2):
    """Pulls movie data from google trends and merges.

    :param df: DataFrame with 'Release' (movie title) and 'Date' columns;
        a 'google trends' column is (re)initialized to 0 and filled in place.
    :param d1: start date of the query window.
    :param d2: end date of the query window.
    :return: the same DataFrame, with per-date 'max_ratio' values from
        gtab written into 'google trends' for each movie.
    """
    df.loc[:, 'google trends'] = 0
    for movie in df['Release'].unique():
        try:
            t = gtab.GTAB()
            t.set_options(
                pytrends_config={"timeframe": f"{str(d1)} {str(d2)}"})
            query = t.new_query(movie)
            #print(query)
            for date in query.index:
                print(query.loc[date, 'max_ratio'])
                df.loc[df['Release'].eq(movie) & df['Date'].eq(date),
                       'google trends'] = query.loc[date, 'max_ratio']
        except Exception:
            # Best-effort: skip movies whose query fails. A bare
            # `except:` (as before) would also swallow KeyboardInterrupt
            # and SystemExit, making the loop impossible to interrupt.
            continue
    return df
def create_and_set_gtab(start_timeframe, end_timeframe, geo,
                        gtab_path="gtab_data"):
    """
    Creates and sets a gtab to the required options.

    This functions takes a lot of time if the anchorbank was not yet created.
    It also creates a directory if needed to the gtab_path

    Args:
    -------
        start_timeframe (str): start of the timeframe for the queries (included)
        end_timeframe (str): end of the timeframe for the queries (included)
        geo (str): geolocalisation of the search query (Two Uppercase letter
            (ex: US, CH...) or "" for worldwide)
        gtab_path (str): path to already existing data

    Returns:
    -------
        t (GTAB): GoogleTrendsAnchorBank to use for the queries, consistent
            with the provided options
    """
    t = gtab.GTAB(dir_path=gtab_path)

    # Combine the two dates into the single timeframe string gtab expects.
    timeframe = f"{start_timeframe} {end_timeframe}"
    t.set_options(pytrends_config={"geo": geo, "timeframe": timeframe})

    # Slow on first run: this queries Google Trends to build the anchorbank.
    t.create_anchorbank()

    # Activate the anchorbank matching the options we just set.
    t.set_active_gtab(f"google_anchorbank_geo={geo}_timeframe={timeframe}.tsv")
    return t
import pandas as pd

import gtab

# Batch-create gtab anchorbanks for every country code in the input CSV
# (iterated in reverse order), for a fixed 2019-2020 timeframe.
df = pd.read_csv("./data/country_codes_filtered.csv")

# this creates a new folder to the gtab!
my_path = "./anchorbanks"
t = gtab.GTAB(dir_path=my_path)

for i in df.Code.values[::-1]:
    try:
        t.set_options(pytrends_config={"geo": i,
                                       "timeframe": "2019-01-01 2020-12-31"})
        t.create_anchorbank()  # takes a while to run since it queries Google Trends.
    except Exception:
        # Best-effort per country. The original bare `except:` also
        # swallowed KeyboardInterrupt, making the long loop unstoppable.
        print("error", i)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib
import gtab

# Query gtab-calibrated popularity for three terms and plot a comparison
# figure (result1.png) on a log scale.
t = gtab.GTAB()
query_swaziland = t.new_query("swaziland")
query_facebook = t.new_query("Facebook")
query_google = t.new_query("Google")

# result1.png
# Simulated "before calibration" curve: rescaled and rounded max_ratio.
# (Dead `* 1` multiplier from the original removed.)
plt.plot(query_swaziland.index,
         np.ceil(query_swaziland.max_ratio.values / 1.52),
         color="#F44336",
         label="Swaziland Before Calibration",
         ls=":")
plt.plot(query_swaziland.max_ratio, color="#F44336", label="Swaziland")
plt.plot(query_facebook.max_ratio, color="#2196F3", label="Facebook")
plt.yscale("log")
plt.xlabel("Date")
plt.ylabel("Popularity")
plt.legend()
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(10, 3.5)
plt.title("gtab-corrected trends")
plt.xlim([pd.to_datetime("2019-09-01"), pd.to_datetime("2020-08-01")])
plt.show()